From d8f5fe13b6e834b5b85679bf6fe82b48869a6351 Mon Sep 17 00:00:00 2001 From: Michael Zhang Date: Sun, 10 Dec 2023 16:57:24 -0600 Subject: [PATCH] shiet --- assignments/04/km_cuda.cu | 146 ++++++++++++++++++++++++++++++++------ assignments/04/run.sh | 4 ++ flake.nix | 6 +- 3 files changed, 134 insertions(+), 22 deletions(-) create mode 100755 assignments/04/run.sh diff --git a/assignments/04/km_cuda.cu b/assignments/04/km_cuda.cu index 1b4d57a..938e9f7 100644 --- a/assignments/04/km_cuda.cu +++ b/assignments/04/km_cuda.cu @@ -14,8 +14,66 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) { } } -__global__ void findDistanceToCentroid(float *centroidDistances, float *data) { - centroidDistances[blockIdx.x] = blockIdx.x; +__global__ void findDistanceToCentroid(int N, int K, int dim, + float *centroidDistances, float *data, + float *centroids) { + int t = blockIdx.x; // data index + int c = threadIdx.x; // cluster index + + float sum = 0; + for (int d = 0; d < dim; ++d) { + float delta = data[t * dim + d] - centroids[c * dim + d]; + sum += delta * delta; + } + + centroidDistances[t * K + c] = sqrt(sum); +} + +__global__ void assignClosestCentroid(int N, int K, int *dirtyBit, + float *centroidDistances, + int *clusterMap) { + int t = blockIdx.x; + int minIdx = 0; + float minValue = INFINITY; + + for (int c = 0; c < K; ++c) { + float dist = centroidDistances[t * K + c]; + if (dist < minValue) { + minValue = dist; + minIdx = c; + } + } + + // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx); + int oldMinIdx = clusterMap[t]; + clusterMap[t] = minIdx; + if (oldMinIdx != minIdx) { + atomicOr(dirtyBit, 1); + } +} + +__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data, + float *centroids, int *clusterMap, + unsigned int *clusterCount) { + int t = blockIdx.x; // data index + int c = threadIdx.x; // cluster index + int assignedCluster = clusterMap[t]; + + if (assignedCluster != c) + return; + + atomicAdd((unsigned int *)&clusterCount[c], 1); + for (int d = 0; d < dim; ++d) { + atomicAdd(¢roids[c * dim + d], data[t * dim + d]); + } +} + +__global__ void recentralizeCentroidDiv(int dim, float *centroids, + unsigned int *clusterCount) { + int c = threadIdx.x; // cluster index + for (int d = 0; d < dim; ++d) { + centroids[c * dim + d] /= clusterCount[c]; + } } int main(int argc, char **argv) { @@ -31,8 +89,11 @@ int main(int argc, char **argv) { int num_threads_per_block = atoi(argv[4]); int N, dim; - float *centroids, *data, *centroidDistances; - int *clusterMap; + float *centroids, // centroids[cluster][dimension] + *data, // data[t][dimension] + *centroidDistances; // centroidDistances[t][cluster] + int *clusterMap, *dirtyBit; + unsigned int *clusterCount; #pragma region Read in data { @@ -51,23 +112,31 @@ int main(int argc, char **argv) { // Allocate memory on the GPU CUDACHECK( cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float))); - cudaMalloc((void **)&clusterMap, N * sizeof(int)); - cudaMalloc((void **)&data, N * dim * sizeof(float)); - cudaMalloc((void **)¢roidDistances, N * num_clusters * sizeof(float)); + CUDACHECK(cudaMalloc((void **)&clusterMap, N * sizeof(int))); + CUDACHECK(cudaMallocManaged((void **)&clusterCount, + num_clusters * sizeof(unsigned int))); + CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float))); + CUDACHECK(cudaMalloc((void **)¢roidDistances, + N * num_clusters * sizeof(float))); + CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int))); + + // Initialize all the cluster mappings to -1 so the first iteration is + // always fully dirty + CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int))); // Read the rest of the lines { // Buffer for copying - int *currentLine = (int *)malloc(dim * sizeof(int)); + float *currentLine = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < N; ++i) { if (!getline(&line, &n, fp)) return -1; for (int j = 0; j < dim; ++j) - sscanf(line, "%d", ¤tLine[j]); + sscanf(line, "%f", ¤tLine[j]); - cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), - cudaMemcpyHostToDevice); + CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), + cudaMemcpyHostToDevice)); } free(currentLine); } @@ -80,24 +149,59 @@ int main(int argc, char **argv) { #pragma region Select the initial K centroids { - cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), - cudaMemcpyDeviceToDevice); + CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), + cudaMemcpyDeviceToDevice)); } #pragma endregion #pragma region Assign each data point to the closest centroid, \ measured via Euclidean distance. { - findDistanceToCentroid<<<10, 10>>>(centroidDistances, data); - cudaDeviceSynchronize(); - printf("Shiet\n"); + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids); + CUDACHECK(cudaDeviceSynchronize()); - float wtf[10]; - cudaMemcpy(wtf, centroidDistances, 10 * sizeof(float), - cudaMemcpyDeviceToHost); - for (int i = 0; i < 10; ++i) { - printf("asdf %d %f\n", i, wtf[i]); + *dirtyBit = 0; + assignClosestCentroid<<>>(N, num_clusters, dirtyBit, + centroidDistances, clusterMap); + CUDACHECK(cudaDeviceSynchronize()); + } + + printf("Is dirty: %d\n", *dirtyBit); +#pragma endregion + +#pragma region + int it = 0; + while (*dirtyBit) { + printf("Iteration %d (dirty=%d)\n", it, *dirtyBit); + + // Update each centroid to be the average coordinate of all contained data + // points + CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int))); + CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float))); + recentralizeCentroidSum<<>>( + N, num_clusters, dim, data, centroids, clusterMap, clusterCount); + CUDACHECK(cudaDeviceSynchronize()); + for (int i = 0; i < num_clusters; ++i) { + printf("%d ", clusterCount[i]); } + printf("\n"); + + recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount); + CUDACHECK(cudaDeviceSynchronize()); + + // Assign all data points to the closest centroid (measured via Euclidean + // distance). + findDistanceToCentroid<<>>( + N, num_clusters, dim, centroidDistances, data, centroids); + CUDACHECK(cudaDeviceSynchronize()); + + *dirtyBit = 0; + assignClosestCentroid<<>>(N, num_clusters, dirtyBit, + centroidDistances, clusterMap); + CUDACHECK(cudaDeviceSynchronize()); + + it++; } #pragma endregion diff --git a/assignments/04/run.sh b/assignments/04/run.sh new file mode 100755 index 0000000..875dd63 --- /dev/null +++ b/assignments/04/run.sh @@ -0,0 +1,4 @@ +set -euo pipefail +HOST="zhan4854@csel-cuda-02.cselabs.umn.edu" +rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4 +ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128' diff --git a/flake.nix b/flake.nix index f96165d..08711fe 100644 --- a/flake.nix +++ b/flake.nix @@ -4,9 +4,13 @@ outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let - pkgs = import nixpkgs { inherit system; }; + pkgs = import nixpkgs { + inherit system; + config.cudaSupport = true; + }; pkgsUnstable = import nixpkgsUnstable { inherit system; + config.cudaSupport = true; config.allowUnfreePredicate = pkg: builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ]; };