This commit is contained in:
Michael Zhang 2023-12-10 16:57:24 -06:00
parent 68a1e749dc
commit d8f5fe13b6
Signed by: michael
GPG key ID: BDA47A31A3C8EE6B
3 changed files with 134 additions and 22 deletions

View file

@ -14,8 +14,66 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
} }
} }
__global__ void findDistanceToCentroid(float *centroidDistances, float *data) { __global__ void findDistanceToCentroid(int N, int K, int dim,
centroidDistances[blockIdx.x] = blockIdx.x; float *centroidDistances, float *data,
float *centroids) {
int t = blockIdx.x; // data index
int c = threadIdx.x; // cluster index
float sum = 0;
for (int d = 0; d < dim; ++d) {
float delta = data[t * dim + d] - centroids[c * dim + d];
sum += delta * delta;
}
centroidDistances[t * K + c] = sqrt(sum);
}
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
float *centroidDistances,
int *clusterMap) {
int t = blockIdx.x;
int minIdx = 0;
float minValue = INFINITY;
for (int c = 0; c < K; ++c) {
float dist = centroidDistances[t * K + c];
if (dist < minValue) {
minValue = dist;
minIdx = c;
}
}
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
int oldMinIdx = clusterMap[t];
clusterMap[t] = minIdx;
if (oldMinIdx != minIdx) {
atomicOr(dirtyBit, 1);
}
}
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
float *centroids, int *clusterMap,
unsigned int *clusterCount) {
int t = blockIdx.x; // data index
int c = threadIdx.x; // cluster index
int assignedCluster = clusterMap[t];
if (assignedCluster != c)
return;
atomicAdd((unsigned int *)&clusterCount[c], 1);
for (int d = 0; d < dim; ++d) {
atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
}
}
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
unsigned int *clusterCount) {
int c = threadIdx.x; // cluster index
for (int d = 0; d < dim; ++d) {
centroids[c * dim + d] /= clusterCount[c];
}
} }
int main(int argc, char **argv) { int main(int argc, char **argv) {
@ -31,8 +89,11 @@ int main(int argc, char **argv) {
int num_threads_per_block = atoi(argv[4]); int num_threads_per_block = atoi(argv[4]);
int N, dim; int N, dim;
float *centroids, *data, *centroidDistances; float *centroids, // centroids[cluster][dimension]
int *clusterMap; *data, // data[t][dimension]
*centroidDistances; // centroidDistances[t][cluster]
int *clusterMap, *dirtyBit;
unsigned int *clusterCount;
#pragma region Read in data #pragma region Read in data
{ {
@ -51,23 +112,31 @@ int main(int argc, char **argv) {
// Allocate memory on the GPU // Allocate memory on the GPU
CUDACHECK( CUDACHECK(
cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float))); cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
cudaMalloc((void **)&clusterMap, N * sizeof(int)); CUDACHECK(cudaMalloc((void **)&clusterMap, N * sizeof(int)));
cudaMalloc((void **)&data, N * dim * sizeof(float)); CUDACHECK(cudaMallocManaged((void **)&clusterCount,
cudaMalloc((void **)&centroidDistances, N * num_clusters * sizeof(float)); num_clusters * sizeof(unsigned int)));
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
CUDACHECK(cudaMalloc((void **)&centroidDistances,
N * num_clusters * sizeof(float)));
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
// Initialize all the cluster mappings to -1 so the first iteration is
// always fully dirty
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
// Read the rest of the lines // Read the rest of the lines
{ {
// Buffer for copying // Buffer for copying
int *currentLine = (int *)malloc(dim * sizeof(int)); float *currentLine = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
if (!getline(&line, &n, fp)) if (!getline(&line, &n, fp))
return -1; return -1;
for (int j = 0; j < dim; ++j) for (int j = 0; j < dim; ++j)
sscanf(line, "%d", &currentLine[j]); sscanf(line, "%f", &currentLine[j]);
cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice));
} }
free(currentLine); free(currentLine);
} }
@ -80,24 +149,59 @@ int main(int argc, char **argv) {
#pragma region Select the initial K centroids #pragma region Select the initial K centroids
{ {
cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
cudaMemcpyDeviceToDevice); cudaMemcpyDeviceToDevice));
} }
#pragma endregion #pragma endregion
#pragma region Assign each data point to the closest centroid, \ #pragma region Assign each data point to the closest centroid, \
measured via Euclidean distance. measured via Euclidean distance.
{ {
findDistanceToCentroid<<<10, 10>>>(centroidDistances, data); findDistanceToCentroid<<<N, num_clusters>>>(
cudaDeviceSynchronize(); N, num_clusters, dim, centroidDistances, data, centroids);
printf("Shiet\n"); CUDACHECK(cudaDeviceSynchronize());
float wtf[10]; *dirtyBit = 0;
cudaMemcpy(wtf, centroidDistances, 10 * sizeof(float), assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
cudaMemcpyDeviceToHost); centroidDistances, clusterMap);
for (int i = 0; i < 10; ++i) { CUDACHECK(cudaDeviceSynchronize());
printf("asdf %d %f\n", i, wtf[i]); }
printf("Is dirty: %d\n", *dirtyBit);
#pragma endregion
#pragma region
int it = 0;
while (*dirtyBit) {
printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
// Update each centroid to be the average coordinate of all contained data
// points
CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
recentralizeCentroidSum<<<N, num_clusters>>>(
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
CUDACHECK(cudaDeviceSynchronize());
for (int i = 0; i < num_clusters; ++i) {
printf("%d ", clusterCount[i]);
} }
printf("\n");
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
CUDACHECK(cudaDeviceSynchronize());
// Assign all data points to the closest centroid (measured via Euclidean
// distance).
findDistanceToCentroid<<<N, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids);
CUDACHECK(cudaDeviceSynchronize());
*dirtyBit = 0;
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
centroidDistances, clusterMap);
CUDACHECK(cudaDeviceSynchronize());
it++;
} }
#pragma endregion #pragma endregion

4
assignments/04/run.sh Executable file
View file

@ -0,0 +1,4 @@
set -euo pipefail
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'

View file

@ -4,9 +4,13 @@
outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }: outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
flake-utils.lib.eachDefaultSystem (system: flake-utils.lib.eachDefaultSystem (system:
let let
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs {
inherit system;
config.cudaSupport = true;
};
pkgsUnstable = import nixpkgsUnstable { pkgsUnstable = import nixpkgsUnstable {
inherit system; inherit system;
config.cudaSupport = true;
config.allowUnfreePredicate = pkg: config.allowUnfreePredicate = pkg:
builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ]; builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
}; };