csci5451/assignments/04/km_cuda.cu

// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

#define CUDACHECK(err)                                                         \
  do {                                                                         \
    cuda_check((err), __FILE__, __LINE__);                                     \
  } while (false)
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
  if (error_code != cudaSuccess) {
    fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
            cudaGetErrorString(error_code), file, line);
    fflush(stderr);
    exit(error_code);
  }
}

#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))

// #define ENSURE_int(i) _Generic((i), int: (i))
// #define ENSURE_float(f) _Generic((f), float: (f))

// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))

/**
 * @brief Return the number of seconds since an unspecified time (e.g., Unix
 *        epoch). This is accomplished with a high-resolution monotonic timer,
 *        suitable for performance timing.
 *
 * @return The number of seconds.
 */
static inline double monotonic_seconds() {
  /* Linux systems */
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

/**
 * @brief Output the seconds elapsed while clustering.
 *
 * @param seconds Seconds spent on k-means clustering, excluding IO.
 */
static void print_time(double const seconds) {
  printf("k-means clustering time: %0.04fs\n", seconds);
}

__global__ void findDistanceToCentroid(int N, int K, int dim,
                                       float *centroidDistances, float *data,
                                       float *centroids, int tOffset) {
  int t = blockIdx.x + tOffset; // data index
  int c = threadIdx.x;          // cluster index

  float sum = 0;
  for (int d = 0; d < dim; ++d) {
    float delta = data[t * dim + d] - centroids[c * dim + d];
    sum += delta * delta;
  }

  centroidDistances[t * K + c] = sqrt(sum);
}

__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
                                      float *centroidDistances,
                                      int *clusterMap) {
  int t = blockIdx.x;
  int minIdx = 0;
  float minValue = INFINITY;

  for (int c = 0; c < K; ++c) {
    float dist = centroidDistances[t * K + c];
    if (dist < minValue) {
      minValue = dist;
      minIdx = c;
    }
  }

  // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
  int oldMinIdx = clusterMap[t];
  clusterMap[t] = minIdx;
  if (oldMinIdx != minIdx) {
    atomicOr(dirtyBit, 1);
  }
}

__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
                                        float *centroids, int *clusterMap,
                                        unsigned int *clusterCount) {
  int t = blockIdx.x;  // data index
  int c = threadIdx.x; // cluster index
  int assignedCluster = clusterMap[t];

  if (assignedCluster != c)
    return;

  atomicAdd((unsigned int *)&clusterCount[c], 1);
  for (int d = 0; d < dim; ++d) {
    atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
  }
}

__global__ void recentralizeCentroidDiv(int dim, float *centroids,
                                        unsigned int *clusterCount) {
  int c = threadIdx.x; // cluster index
  for (int d = 0; d < dim; ++d) {
    centroids[c * dim + d] /= clusterCount[c];
  }
}

int main(int argc, char **argv) {
  char *data_file = argv[1];
  int num_clusters = atoi(argv[2]);
  int num_thread_blocks = atoi(argv[3]);
  int num_threads_per_block = atoi(argv[4]);

  int N, dim;
  float *centroids,       // centroids[cluster][dimension]
      *data,              // data[t][dimension]
      *centroidDistances; // centroidDistances[t][cluster]
  int *clusterMap, *dirtyBit;
  unsigned int *clusterCount;

#pragma region Read in data
  {
    FILE *fp = fopen(data_file, "r");

    // Read first line
    size_t n;
    char *line = NULL;
    if (!getline(&line, &n, fp))
      return -1;

    sscanf(line, "%d %d", &N, &dim);
    free(line);
    line = NULL;

    // Allocate memory on the GPU
    CUDACHECK(
        cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
    CUDACHECK(cudaMallocManaged((void **)&clusterCount,
                                num_clusters * sizeof(unsigned int)));
    CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
    CUDACHECK(cudaMalloc((void **)&centroidDistances,
                         N * num_clusters * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));

    // Initialize all the cluster mappings to -1 so the first iteration is
    // always fully dirty
    CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));

    // Read the rest of the lines
    {
      // Buffer for copying
      float *currentLine = (float *)malloc(dim * sizeof(float));
      for (int i = 0; i < N; ++i) {
        if (!getline(&line, &n, fp))
          return -1;

        for (int j = 0; j < dim; ++j)
          sscanf(line, "%f", &currentLine[j]);

        CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
                             cudaMemcpyHostToDevice));
      }
      free(currentLine);
    }

    fclose(fp);
  }
#pragma endregion

  double start_time = monotonic_seconds();

#pragma region Select the initial K centroids
  {
    CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
                         cudaMemcpyDeviceToDevice));
  }
#pragma endregion

#pragma region Assign each data point to the closest centroid,                 \
    measured via Euclidean distance.
  {
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids,
            j * num_thread_blocks);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

#pragma region Iteration
  int it = 0;
  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);

    // Update each centroid to be the average coordinate of all contained data
    // points
    CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
    CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
    recentralizeCentroidSum<<<N, num_clusters>>>(
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Print out the cluster compositions
    for (int i = 0; i < num_clusters; ++i)
      printf("%d ", clusterCount[i]);
    printf("\n");

    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Assign all data points to the closest centroid (measured via Euclidean
    // distance).
    // findDistanceToCentroid<<<N, num_clusters>>>(
    //     N, num_clusters, dim, centroidDistances, data, centroids);
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids,
            j * num_thread_blocks);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

  double end_time = monotonic_seconds();
  print_time(end_time - start_time);

#pragma region
  {
    FILE *fp = fopen("clusters.txt", "w");
    for (int i = 0; i < N; ++i)
      fprintf(fp, "%d\n", clusterMap[i]);
    fclose(fp);
  }

  {
    FILE *fp = fopen("centroids.txt", "w");
    fprintf(fp, "%d %d\n", num_clusters, dim);
    float *line = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < num_clusters; ++i) {
      CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
                           cudaMemcpyDeviceToHost));
      for (int d = 0; d < dim; ++d)
        fprintf(fp, "%.3f ", line[d]);
      fprintf(fp, "\n");
    }
    free(line);
    fclose(fp);
  }

#pragma endregion

  return 0;
}