snapshot

2023-12-11 22:11:02 -06:00 · 2023-12-11 22:11:02 -06:00 · 9f6a80a09f
commit 9f6a80a09f
parent 442638205c
5 changed files with 68 additions and 15 deletions
--- a/assignments/04/.gitignore
+++ b/assignments/04/.gitignore
@ -1,2 +1,6 @@
 dataset/large_cpd.txt
 km_cuda
+
+clusters.txt
+centroids.txt
+report.pdf
--- a/assignments/04/Makefile
+++ b/assignments/04/Makefile
@ -1,7 +1,7 @@
 .PHONY: clean

 km_cuda: km_cuda.cu
-	nvcc -g -o $@ $<
+	nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<

 clean:
 	rm -f km_cuda
--- a/assignments/04/km_cuda.cu
+++ b/assignments/04/km_cuda.cu
@ -1,5 +1,6 @@
 // #define _POSIX_C_SOURCE 200809L
 #include <stdio.h>
+#include <time.h>

 #define CUDACHECK(err)                                                         \
  do {                                                                         \
@ -14,6 +15,31 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
  }
 }

+/**
+* @brief Return the number of seconds elapsed since an unspecified fixed
+*        starting point, read from CLOCK_MONOTONIC via clock_gettime().
+*        The absolute value carries no calendar meaning; callers subtract
+*        two readings to obtain an elapsed interval, which makes this
+*        suitable for performance timing.
+*
+* @note The return value of clock_gettime() is not checked, and `ts` has no
+*       initializer, so a failed call would yield an indeterminate result.
+*
+* @return Whole seconds plus the nanosecond remainder, combined into a
+*         single double.
+*/
+static inline double monotonic_seconds()
+{
+  /* Linux systems */
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec + ts.tv_nsec * 1e-9; /* scale nanoseconds to seconds */
+}
+
+/**
+* @brief Output the seconds elapsed while clustering.
+*
+*        Writes one line to stdout of the form
+*        `k-means clustering time: <seconds>s`, with the value printed
+*        using four digits after the decimal point.
+*
+* @param seconds Seconds spent on k-means clustering, excluding IO.
+*/
+static void print_time(double const seconds)
+{
+  printf("k-means clustering time: %0.04fs\n", seconds);
+}
+
 __global__ void findDistanceToCentroid(int N, int K, int dim,
                                       float *centroidDistances, float *data,
                                       float *centroids) {
@ -77,12 +103,6 @@ __global__ void recentralizeCentroidDiv(int dim, float *centroids,
 }

 int main(int argc, char **argv) {
-  int runtimeVersion, driverVersion;
-  cudaRuntimeGetVersion(&runtimeVersion);
-  cudaDriverGetVersion(&driverVersion);
-  printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion,
-         driverVersion);
-
  char *data_file = argv[1];
  int num_clusters = atoi(argv[2]);
  int num_thread_blocks = atoi(argv[3]);
@ -141,12 +161,12 @@ int main(int argc, char **argv) {
      free(currentLine);
    }

-    printf("Done copying.\n");
-
    fclose(fp);
  }
 #pragma endregion

+  double start_time = monotonic_seconds();
+
 #pragma region Select the initial K centroids
  {
    CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
@ -166,8 +186,6 @@ int main(int argc, char **argv) {
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
-
-  printf("Is dirty: %d\n", *dirtyBit);
 #pragma endregion

 #pragma region Iteration
@ -182,9 +200,10 @@ int main(int argc, char **argv) {
    recentralizeCentroidSum<<<N, num_clusters>>>(
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());
-    for (int i = 0; i < num_clusters; ++i) {
+
+// Print out the cluster compositions
+    for (int i = 0; i < num_clusters; ++i)
      printf("%d ", clusterCount[i]);
-    }
    printf("\n");

    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
@ -205,6 +224,9 @@ int main(int argc, char **argv) {
  }
 #pragma endregion

+double end_time = monotonic_seconds();
+print_time(end_time - start_time);
+
 #pragma region
  {
    FILE *fp = fopen("clusters.txt", "w");
@ -228,7 +250,6 @@ int main(int argc, char **argv) {
    fclose(fp);
  }

-  printf("Done.\n");
 #pragma endregion

  return 0;
--- a/assignments/04/report.typ
+++ b/assignments/04/report.typ
@ -0,0 +1,27 @@
+== Homework 4
+
+Michael Zhang \<zhan4854\@umn.edu\>
+
+1. A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
+
+  My parallelized program included the following procedures:
+
+  - `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.
+
+  - `assignClosestCentroid` - This reduces the distances to find the closest centroid for each data point, and records it in an $N times 1$ vector.
+
+  - `recentralizeCentroidSum` - This computes a sum for the purposes of averaging, and also counts the number of elements in each cluster.
+
+  - `recentralizeCentroidDiv` - This uses the count from the previous step and divides everything in parallel.
+
+  I tried to make sure each thread computes approximately a single for-loop's worth of work, most of the time over the $d$ axis.
+
+2. Give details about how many elements and how the computations in your kernels are handled by a thread.
+
+  Threads are dynamically allocated.
+
+3. Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
+
+4. You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
+
+5. You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
--- a/assignments/04/run.sh
+++ b/assignments/04/run.sh
@ -1,4 +1,5 @@
 set -euo pipefail
 HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
 rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
-ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'
+ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 2 64 128'
+rsync -azPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .