d

2023-12-15 17:13:47 -06:00 · 2023-12-15 17:13:47 -06:00 · 0b179f38bd
commit 0b179f38bd
parent 18b1c2b6bc
5 changed files with 340 additions and 27 deletions
--- a/assignments/04/.gitignore
+++ b/assignments/04/.gitignore
@ -3,4 +3,5 @@ km_cuda

 clusters.txt
 centroids.txt
-report.pdf
+report.pdf
+cluster.txt
--- a/assignments/04/km_cuda.cu
+++ b/assignments/04/km_cuda.cu
@ -15,15 +15,23 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
  }
 }

+#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
+
+// #define ENSURE_int(i) _Generic((i), int: (i))
+// #define ENSURE_float(f) _Generic((f), float: (f))
+
+// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
+// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
+
 /**
-* @brief Return the number of seconds since an unspecified time (e.g., Unix
-*        epoch). This is accomplished with a high-resolution monotonic timer,
-*        suitable for performance timing.
-*
-* @return The number of seconds.
-*/
-static inline double monotonic_seconds()
-{
+ * @brief Return the number of seconds since an unspecified time (e.g., Unix
+ *        epoch). This is accomplished with a high-resolution monotonic timer,
+ *        suitable for performance timing.
+ *
+ * @return The number of seconds.
+ */
+static inline double monotonic_seconds() {
  /* Linux systems */
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
@ -31,20 +39,19 @@ static inline double monotonic_seconds()
 }

 /**
-* @brief Output the seconds elapsed while clustering.
-*
-* @param seconds Seconds spent on k-means clustering, excluding IO.
-*/
-static void print_time(double const seconds)
-{
+ * @brief Output the seconds elapsed while clustering.
+ *
+ * @param seconds Seconds spent on k-means clustering, excluding IO.
+ */
+static void print_time(double const seconds) {
  printf("k-means clustering time: %0.04fs\n", seconds);
 }

 __global__ void findDistanceToCentroid(int N, int K, int dim,
                                       float *centroidDistances, float *data,
-                                       float *centroids) {
-  int t = blockIdx.x;  // data index
-  int c = threadIdx.x; // cluster index
+                                       float *centroids, int tOffset) {
+  int t = blockIdx.x + tOffset; // data index
+  int c = threadIdx.x;          // cluster index

  float sum = 0;
  for (int d = 0; d < dim; ++d) {
@ -177,8 +184,17 @@ int main(int argc, char **argv) {
 #pragma region Assign each data point to the closest centroid,                 \
    measured via Euclidean distance.
  {
-    findDistanceToCentroid<<<N, num_clusters>>>(
-        N, num_clusters, dim, centroidDistances, data, centroids);
+    if (num_thread_blocks == -1) {
+      findDistanceToCentroid<<<N, num_clusters>>>(
+          N, num_clusters, dim, centroidDistances, data, centroids, 0);
+    } else {
+      for (int j = 0; j < N; j += num_thread_blocks) {
+        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
+        findDistanceToCentroid<<<n, num_clusters>>>(
+            N, num_clusters, dim, centroidDistances, data, centroids,
+            j * num_thread_blocks);
+      }
+    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
@ -190,7 +206,7 @@ int main(int argc, char **argv) {

 #pragma region Iteration
  int it = 0;
-  for (int it=0;it < 20 && *dirtyBit; ++it) {
+  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);

    // Update each centroid to be the average coordinate of all contained data
@ -201,7 +217,7 @@ int main(int argc, char **argv) {
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

-// Print out the cluster compositions
+    // Print out the cluster compositions
    for (int i = 0; i < num_clusters; ++i)
      printf("%d ", clusterCount[i]);
    printf("\n");
@ -211,8 +227,19 @@ int main(int argc, char **argv) {

    // Assign all data points to the closest centroid (measured via Euclidean
    // distance).
-    findDistanceToCentroid<<<N, num_clusters>>>(
-        N, num_clusters, dim, centroidDistances, data, centroids);
+    // findDistanceToCentroid<<<N, num_clusters>>>(
+    //     N, num_clusters, dim, centroidDistances, data, centroids);
+    if (num_thread_blocks == -1) {
+      findDistanceToCentroid<<<N, num_clusters>>>(
+          N, num_clusters, dim, centroidDistances, data, centroids, 0);
+    } else {
+      for (int j = 0; j < N; j += num_thread_blocks) {
+        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
+        findDistanceToCentroid<<<n, num_clusters>>>(
+            N, num_clusters, dim, centroidDistances, data, centroids,
+            j * num_thread_blocks);
+      }
+    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
@ -222,8 +249,8 @@ int main(int argc, char **argv) {
  }
 #pragma endregion

-double end_time = monotonic_seconds();
-print_time(end_time - start_time);
+  double end_time = monotonic_seconds();
+  print_time(end_time - start_time);

 #pragma region
  {
--- a/assignments/04/km_cuda_fixed_n.cu
+++ b/assignments/04/km_cuda_fixed_n.cu
@ -0,0 +1,281 @@
+// #define _POSIX_C_SOURCE 200809L
+#include <stdio.h>
+#include <time.h>
+
+#define CUDACHECK(err)                                                         \
+  do {                                                                         \
+    cuda_check((err), __FILE__, __LINE__);                                     \
+  } while (false)
+inline void cuda_check(cudaError_t error_code, const char *file, int line) {
+  if (error_code != cudaSuccess) {
+    fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
+            cudaGetErrorString(error_code), file, line);
+    fflush(stderr);
+    exit(error_code);
+  }
+}
+
+#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
+
+// #define ENSURE_int(i) _Generic((i), int: (i))
+// #define ENSURE_float(f) _Generic((f), float: (f))
+
+// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
+// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
+
+/**
+ * @brief Return the number of seconds since an unspecified time (e.g., Unix
+ *        epoch). This is accomplished with a high-resolution monotonic timer,
+ *        suitable for performance timing.
+ *
+ * @return The number of seconds.
+ */
+static inline double monotonic_seconds() {
+  /* Linux systems */
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec + ts.tv_nsec * 1e-9;
+}
+
+/**
+ * @brief Output the seconds elapsed while clustering.
+ *
+ * @param seconds Seconds spent on k-means clustering, excluding IO.
+ */
+static void print_time(double const seconds) {
+  printf("k-means clustering time: %0.04fs\n", seconds);
+}
+
+__global__ void findDistanceToCentroid(int N, int K, int dim,
+                                       float *centroidDistances, float *data,
+                                       float *centroids, int tOffset) {
+  int t = blockIdx.x + tOffset; // data index
+  int c = threadIdx.x;          // cluster index
+
+  float sum = 0;
+  for (int d = 0; d < dim; ++d) {
+    float delta = data[t * dim + d] - centroids[c * dim + d];
+    sum += delta * delta;
+  }
+
+  centroidDistances[t * K + c] = sqrt(sum);
+}
+
+__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
+                                      float *centroidDistances,
+                                      int *clusterMap) {
+  int t = blockIdx.x;
+  int minIdx = 0;
+  float minValue = INFINITY;
+
+  for (int c = 0; c < K; ++c) {
+    float dist = centroidDistances[t * K + c];
+    if (dist < minValue) {
+      minValue = dist;
+      minIdx = c;
+    }
+  }
+
+  // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
+  int oldMinIdx = clusterMap[t];
+  clusterMap[t] = minIdx;
+  if (oldMinIdx != minIdx) {
+    atomicOr(dirtyBit, 1);
+  }
+}
+
+__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
+                                        float *centroids, int *clusterMap,
+                                        unsigned int *clusterCount) {
+  int t = blockIdx.x;  // data index
+  int c = threadIdx.x; // cluster index
+  int assignedCluster = clusterMap[t];
+
+  if (assignedCluster != c)
+    return;
+
+  atomicAdd((unsigned int *)&clusterCount[c], 1);
+  for (int d = 0; d < dim; ++d) {
+    atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
+  }
+}
+
+__global__ void recentralizeCentroidDiv(int dim, float *centroids,
+                                        unsigned int *clusterCount) {
+  int c = threadIdx.x; // cluster index
+  for (int d = 0; d < dim; ++d) {
+    centroids[c * dim + d] /= clusterCount[c];
+  }
+}
+
+int main(int argc, char **argv) {
+  char *data_file = argv[1];
+  int num_clusters = atoi(argv[2]);
+  int num_thread_blocks = atoi(argv[3]);
+  int num_threads_per_block = atoi(argv[4]);
+
+  int N, dim;
+  float *centroids,       // centroids[cluster][dimension]
+      *data,              // data[t][dimension]
+      *centroidDistances; // centroidDistances[t][cluster]
+  int *clusterMap, *dirtyBit;
+  unsigned int *clusterCount;
+
+#pragma region Read in data
+  {
+    FILE *fp = fopen(data_file, "r");
+
+    // Read first line
+    size_t n;
+    char *line = NULL;
+    if (!getline(&line, &n, fp))
+      return -1;
+
+    sscanf(line, "%d %d", &N, &dim);
+    free(line);
+    line = NULL;
+
+    // Allocate memory on the GPU
+    CUDACHECK(
+        cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
+    CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
+    CUDACHECK(cudaMallocManaged((void **)&clusterCount,
+                                num_clusters * sizeof(unsigned int)));
+    CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
+    CUDACHECK(cudaMalloc((void **)&centroidDistances,
+                         N * num_clusters * sizeof(float)));
+    CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
+
+    // Initialize all the cluster mappings to -1 so the first iteration is
+    // always fully dirty
+    CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
+
+    // Read the rest of the lines
+    {
+      // Buffer for copying
+      float *currentLine = (float *)malloc(dim * sizeof(float));
+      for (int i = 0; i < N; ++i) {
+        if (!getline(&line, &n, fp))
+          return -1;
+
+        for (int j = 0; j < dim; ++j)
+          sscanf(line, "%f", &currentLine[j]);
+
+        CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
+                             cudaMemcpyHostToDevice));
+      }
+      free(currentLine);
+    }
+
+    fclose(fp);
+  }
+#pragma endregion
+
+  double start_time = monotonic_seconds();
+
+#pragma region Select the initial K centroids
+  {
+    CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
+                         cudaMemcpyDeviceToDevice));
+  }
+#pragma endregion
+
+#pragma region Assign each data point to the closest centroid,                 \
+    measured via Euclidean distance.
+  {
+    if (num_thread_blocks == -1) {
+      findDistanceToCentroid<<<N, num_clusters>>>(
+          N, num_clusters, dim, centroidDistances, data, centroids, 0);
+    } else {
+      for (int j = 0; j < N; j += num_thread_blocks) {
+        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
+        findDistanceToCentroid<<<n, num_clusters>>>(
+            N, num_clusters, dim, centroidDistances, data, centroids,
+            j * num_thread_blocks);
+      }
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+
+    *dirtyBit = 0;
+    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
+                                    centroidDistances, clusterMap);
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+#pragma endregion
+
+#pragma region Iteration
+  int it = 0;
+  for (int it = 0; it < 20 && *dirtyBit; ++it) {
+    // printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
+
+    // Update each centroid to be the average coordinate of all contained data
+    // points
+    CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
+    CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
+    recentralizeCentroidSum<<<N, num_clusters>>>(
+        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
+    CUDACHECK(cudaDeviceSynchronize());
+
+    // Print out the cluster compositions
+    // for (int i = 0; i < num_clusters; ++i)
+    //   printf("%d ", clusterCount[i]);
+    // printf("\n");
+
+    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
+    CUDACHECK(cudaDeviceSynchronize());
+
+    // Assign all data points to the closest centroid (measured via Euclidean
+    // distance).
+    // findDistanceToCentroid<<<N, num_clusters>>>(
+    //     N, num_clusters, dim, centroidDistances, data, centroids);
+    if (num_thread_blocks == -1) {
+      findDistanceToCentroid<<<N, num_clusters>>>(
+          N, num_clusters, dim, centroidDistances, data, centroids, 0);
+    } else {
+      for (int j = 0; j < N; j += num_thread_blocks) {
+        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
+        findDistanceToCentroid<<<n, num_clusters>>>(
+            N, num_clusters, dim, centroidDistances, data, centroids,
+            j * num_thread_blocks);
+      }
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+
+    *dirtyBit = 0;
+    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
+                                    centroidDistances, clusterMap);
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+#pragma endregion
+
+  double end_time = monotonic_seconds();
+  print_time(end_time - start_time);
+
+#pragma region
+  {
+    FILE *fp = fopen("clusters.txt", "w");
+    for (int i = 0; i < N; ++i)
+      fprintf(fp, "%d\n", clusterMap[i]);
+    fclose(fp);
+  }
+
+  {
+    FILE *fp = fopen("centroids.txt", "w");
+    fprintf(fp, "%d %d\n", num_clusters, dim);
+    float *line = (float *)malloc(dim * sizeof(float));
+    for (int i = 0; i < num_clusters; ++i) {
+      CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
+                           cudaMemcpyDeviceToHost));
+      for (int d = 0; d < dim; ++d)
+        fprintf(fp, "%.3f ", line[d]);
+      fprintf(fp, "\n");
+    }
+    free(line);
+    fclose(fp);
+  }
+
+#pragma endregion
+
+  return 0;
+}
--- a/assignments/04/report.typ
+++ b/assignments/04/report.typ
@ -1,3 +1,5 @@
+#set page(margin: 1.5em)
+
 == Homework 4

 Michael Zhang \<zhan4854\@umn.edu\>
@ -30,6 +32,8 @@ Michael Zhang \<zhan4854\@umn.edu\>

 4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.*

+  I tried using a fixed number of thread blocks, but for the 256 cluster case it ended up performing almost twice as slow (at 0.4516s). To the best of my knowledge, CUDA is able to schedule the extra kernel instances it needs to compute to happen sequentially after, if there are more thread blocks requested than there are available.
+
 5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.*

  - 256: 26.8258s
--- a/assignments/04/run.sh
+++ b/assignments/04/run.sh
@ -3,7 +3,7 @@ HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
 rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4

 CLUSTERS=${1:-512}
-BLOCKS=-1
+BLOCKS=512
 THREADS=-1
 DATAFILE="large_cpd.txt"
 # DATAFILE="small_gaussian.txt"