From 0b179f38bd2ed3e63c42ba2656f126d8d6814fe9 Mon Sep 17 00:00:00 2001 From: Michael Zhang Date: Fri, 15 Dec 2023 17:13:47 -0600 Subject: [PATCH] d --- assignments/04/.gitignore | 3 +- assignments/04/km_cuda.cu | 77 +++++--- assignments/04/km_cuda_fixed_n.cu | 281 ++++++++++++++++++++++++++++++ assignments/04/report.typ | 4 + assignments/04/run.sh | 2 +- 5 files changed, 340 insertions(+), 27 deletions(-) create mode 100644 assignments/04/km_cuda_fixed_n.cu diff --git a/assignments/04/.gitignore b/assignments/04/.gitignore index eb1edeb..eacc0b0 100644 --- a/assignments/04/.gitignore +++ b/assignments/04/.gitignore @@ -3,4 +3,5 @@ km_cuda clusters.txt centroids.txt -report.pdf \ No newline at end of file +report.pdf +cluster.txt \ No newline at end of file diff --git a/assignments/04/km_cuda.cu b/assignments/04/km_cuda.cu index 917307f..27f0ffc 100644 --- a/assignments/04/km_cuda.cu +++ b/assignments/04/km_cuda.cu @@ -15,15 +15,23 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) { } } +#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y)) + +// #define ENSURE_int(i) _Generic((i), int: (i)) +// #define ENSURE_float(f) _Generic((f), float: (f)) + +// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y)) +// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y)) + /** -* @brief Return the number of seconds since an unspecified time (e.g., Unix -* epoch). This is accomplished with a high-resolution monotonic timer, -* suitable for performance timing. -* -* @return The number of seconds. -*/ -static inline double monotonic_seconds() -{ + * @brief Return the number of seconds since an unspecified time (e.g., Unix + * epoch). This is accomplished with a high-resolution monotonic timer, + * suitable for performance timing. + * + * @return The number of seconds. + */ +static inline double monotonic_seconds() { /* Linux systems */ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -31,20 +39,19 @@ static inline double monotonic_seconds() } /** -* @brief Output the seconds elapsed while clustering. -* -* @param seconds Seconds spent on k-means clustering, excluding IO. -*/ -static void print_time(double const seconds) -{ + * @brief Output the seconds elapsed while clustering. + * + * @param seconds Seconds spent on k-means clustering, excluding IO. + */ +static void print_time(double const seconds) { printf("k-means clustering time: %0.04fs\n", seconds); } __global__ void findDistanceToCentroid(int N, int K, int dim, float *centroidDistances, float *data, - float *centroids) { - int t = blockIdx.x; // data index - int c = threadIdx.x; // cluster index + float *centroids, int tOffset) { + int t = blockIdx.x + tOffset; // data index + int c = threadIdx.x; // cluster index float sum = 0; for (int d = 0; d < dim; ++d) { @@ -177,8 +184,17 @@ int main(int argc, char **argv) { #pragma region Assign each data point to the closest centroid, \ measured via Euclidean distance. { - findDistanceToCentroid<<>>( - N, num_clusters, dim, centroidDistances, data, centroids); + if (num_thread_blocks == -1) { + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, 0); + } else { + for (int j = 0; j < N; j += num_thread_blocks) { + int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, + j * num_thread_blocks); + } + } CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; @@ -190,7 +206,7 @@ int main(int argc, char **argv) { #pragma region Iteration int it = 0; - for (int it=0;it < 20 && *dirtyBit; ++it) { + for (int it = 0; it < 20 && *dirtyBit; ++it) { printf("Iteration %d (dirty=%d)\n", it, *dirtyBit); // Update each centroid to be the average coordinate of all contained data @@ -201,7 +217,7 @@ int main(int argc, char **argv) { N, num_clusters, dim, data, centroids, clusterMap, clusterCount); CUDACHECK(cudaDeviceSynchronize()); -// Print out the cluster compositions + // Print out the cluster compositions for (int i = 0; i < num_clusters; ++i) printf("%d ", clusterCount[i]); printf("\n"); @@ -211,8 +227,19 @@ int main(int argc, char **argv) { // Assign all data points to the closest centroid (measured via Euclidean // distance). - findDistanceToCentroid<<>>( - N, num_clusters, dim, centroidDistances, data, centroids); + // findDistanceToCentroid<<>>( + // N, num_clusters, dim, centroidDistances, data, centroids); + if (num_thread_blocks == -1) { + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, 0); + } else { + for (int j = 0; j < N; j += num_thread_blocks) { + int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, + j * num_thread_blocks); + } + } CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; @@ -222,8 +249,8 @@ int main(int argc, char **argv) { } #pragma endregion -double end_time = monotonic_seconds(); -print_time(end_time - start_time); + double end_time = monotonic_seconds(); + print_time(end_time - start_time); #pragma region { diff --git a/assignments/04/km_cuda_fixed_n.cu b/assignments/04/km_cuda_fixed_n.cu new file mode 100644 index 0000000..b46c53d --- /dev/null +++ b/assignments/04/km_cuda_fixed_n.cu @@ -0,0 +1,281 @@ +// #define _POSIX_C_SOURCE 200809L +#include +#include + +#define CUDACHECK(err) \ + do { \ + cuda_check((err), __FILE__, __LINE__); \ + } while (false) +inline void cuda_check(cudaError_t error_code, const char *file, int line) { + if (error_code != cudaSuccess) { + fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code, + cudaGetErrorString(error_code), file, line); + fflush(stderr); + exit(error_code); + } +} + +#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y)) + +// #define ENSURE_int(i) _Generic((i), int: (i)) +// #define ENSURE_float(f) _Generic((f), float: (f)) + +// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y)) +// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y)) + +/** + * @brief Return the number of seconds since an unspecified time (e.g., Unix + * epoch). This is accomplished with a high-resolution monotonic timer, + * suitable for performance timing. + * + * @return The number of seconds. + */ +static inline double monotonic_seconds() { + /* Linux systems */ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +/** + * @brief Output the seconds elapsed while clustering. + * + * @param seconds Seconds spent on k-means clustering, excluding IO. + */ +static void print_time(double const seconds) { + printf("k-means clustering time: %0.04fs\n", seconds); +} + +__global__ void findDistanceToCentroid(int N, int K, int dim, + float *centroidDistances, float *data, + float *centroids, int tOffset) { + int t = blockIdx.x + tOffset; // data index + int c = threadIdx.x; // cluster index + + float sum = 0; + for (int d = 0; d < dim; ++d) { + float delta = data[t * dim + d] - centroids[c * dim + d]; + sum += delta * delta; + } + + centroidDistances[t * K + c] = sqrt(sum); +} + +__global__ void assignClosestCentroid(int N, int K, int *dirtyBit, + float *centroidDistances, + int *clusterMap) { + int t = blockIdx.x; + int minIdx = 0; + float minValue = INFINITY; + + for (int c = 0; c < K; ++c) { + float dist = centroidDistances[t * K + c]; + if (dist < minValue) { + minValue = dist; + minIdx = c; + } + } + + // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx); + int oldMinIdx = clusterMap[t]; + clusterMap[t] = minIdx; + if (oldMinIdx != minIdx) { + atomicOr(dirtyBit, 1); + } +} + +__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data, + float *centroids, int *clusterMap, + unsigned int *clusterCount) { + int t = blockIdx.x; // data index + int c = threadIdx.x; // cluster index + int assignedCluster = clusterMap[t]; + + if (assignedCluster != c) + return; + + atomicAdd((unsigned int *)&clusterCount[c], 1); + for (int d = 0; d < dim; ++d) { + atomicAdd(¢roids[c * dim + d], data[t * dim + d]); + } +} + +__global__ void recentralizeCentroidDiv(int dim, float *centroids, + unsigned int *clusterCount) { + int c = threadIdx.x; // cluster index + for (int d = 0; d < dim; ++d) { + centroids[c * dim + d] /= clusterCount[c]; + } +} + +int main(int argc, char **argv) { + char *data_file = argv[1]; + int num_clusters = atoi(argv[2]); + int num_thread_blocks = atoi(argv[3]); + int num_threads_per_block = atoi(argv[4]); + + int N, dim; + float *centroids, // centroids[cluster][dimension] + *data, // data[t][dimension] + *centroidDistances; // centroidDistances[t][cluster] + int *clusterMap, *dirtyBit; + unsigned int *clusterCount; + +#pragma region Read in data + { + FILE *fp = fopen(data_file, "r"); + + // Read first line + size_t n; + char *line = NULL; + if (!getline(&line, &n, fp)) + return -1; + + sscanf(line, "%d %d", &N, &dim); + free(line); + line = NULL; + + // Allocate memory on the GPU + CUDACHECK( + cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float))); + CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int))); + CUDACHECK(cudaMallocManaged((void **)&clusterCount, + num_clusters * sizeof(unsigned int))); + CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float))); + CUDACHECK(cudaMalloc((void **)¢roidDistances, + N * num_clusters * sizeof(float))); + CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int))); + + // Initialize all the cluster mappings to -1 so the first iteration is + // always fully dirty + CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int))); + + // Read the rest of the lines + { + // Buffer for copying + float *currentLine = (float *)malloc(dim * sizeof(float)); + for (int i = 0; i < N; ++i) { + if (!getline(&line, &n, fp)) + return -1; + + for (int j = 0; j < dim; ++j) + sscanf(line, "%f", ¤tLine[j]); + + CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), + cudaMemcpyHostToDevice)); + } + free(currentLine); + } + + fclose(fp); + } +#pragma endregion + + double start_time = monotonic_seconds(); + +#pragma region Select the initial K centroids + { + CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), + cudaMemcpyDeviceToDevice)); + } +#pragma endregion + +#pragma region Assign each data point to the closest centroid, \ + measured via Euclidean distance. + { + if (num_thread_blocks == -1) { + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, 0); + } else { + for (int j = 0; j < N; j += num_thread_blocks) { + int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, + j * num_thread_blocks); + } + } + CUDACHECK(cudaDeviceSynchronize()); + + *dirtyBit = 0; + assignClosestCentroid<<>>(N, num_clusters, dirtyBit, + centroidDistances, clusterMap); + CUDACHECK(cudaDeviceSynchronize()); + } +#pragma endregion + +#pragma region Iteration + int it = 0; + for (int it = 0; it < 20 && *dirtyBit; ++it) { + // printf("Iteration %d (dirty=%d)\n", it, *dirtyBit); + + // Update each centroid to be the average coordinate of all contained data + // points + CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int))); + CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float))); + recentralizeCentroidSum<<>>( + N, num_clusters, dim, data, centroids, clusterMap, clusterCount); + CUDACHECK(cudaDeviceSynchronize()); + + // Print out the cluster compositions + // for (int i = 0; i < num_clusters; ++i) + // printf("%d ", clusterCount[i]); + // printf("\n"); + + recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount); + CUDACHECK(cudaDeviceSynchronize()); + + // Assign all data points to the closest centroid (measured via Euclidean + // distance). + // findDistanceToCentroid<<>>( + // N, num_clusters, dim, centroidDistances, data, centroids); + if (num_thread_blocks == -1) { + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, 0); + } else { + for (int j = 0; j < N; j += num_thread_blocks) { + int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids, + j * num_thread_blocks); + } + } + CUDACHECK(cudaDeviceSynchronize()); + + *dirtyBit = 0; + assignClosestCentroid<<>>(N, num_clusters, dirtyBit, + centroidDistances, clusterMap); + CUDACHECK(cudaDeviceSynchronize()); + } +#pragma endregion + + double end_time = monotonic_seconds(); + print_time(end_time - start_time); + +#pragma region + { + FILE *fp = fopen("clusters.txt", "w"); + for (int i = 0; i < N; ++i) + fprintf(fp, "%d\n", clusterMap[i]); + fclose(fp); + } + + { + FILE *fp = fopen("centroids.txt", "w"); + fprintf(fp, "%d %d\n", num_clusters, dim); + float *line = (float *)malloc(dim * sizeof(float)); + for (int i = 0; i < num_clusters; ++i) { + CUDACHECK(cudaMemcpy(line, ¢roids[i * dim], dim * sizeof(float), + cudaMemcpyDeviceToHost)); + for (int d = 0; d < dim; ++d) + fprintf(fp, "%.3f ", line[d]); + fprintf(fp, "\n"); + } + free(line); + fclose(fp); + } + +#pragma endregion + + return 0; +} diff --git a/assignments/04/report.typ b/assignments/04/report.typ index 1bfed7d..dc3be65 100644 --- a/assignments/04/report.typ +++ b/assignments/04/report.typ @@ -1,3 +1,5 @@ +#set page(margin: 1.5em) + == Homework 4 Michael Zhang \ @@ -30,6 +32,8 @@ Michael Zhang \ 4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.* + I tried using a fixed number of thread blocks, but for the 256 cluster case it ended up performing almost twice as slow (at 0.4516s). To the best of my knowledge, CUDA is able to schedule the extra kernel instances it needs to compute to happen sequentially after, if there are more thread blocks requested than there are available. + 5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.* - 256: 26.8258s diff --git a/assignments/04/run.sh b/assignments/04/run.sh index 7a91e31..ea63168 100755 --- a/assignments/04/run.sh +++ b/assignments/04/run.sh @@ -3,7 +3,7 @@ HOST="zhan4854@csel-cuda-02.cselabs.umn.edu" rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4 CLUSTERS=${1:-512} -BLOCKS=-1 +BLOCKS=512 THREADS=-1 DATAFILE="large_cpd.txt" # DATAFILE="small_gaussian.txt"