// #define _POSIX_C_SOURCE 200809L #include #define CUDACHECK(err) \ do { \ cuda_check((err), __FILE__, __LINE__); \ } while (false) inline void cuda_check(cudaError_t error_code, const char *file, int line) { if (error_code != cudaSuccess) { fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code, cudaGetErrorString(error_code), file, line); fflush(stderr); exit(error_code); } } __global__ void findDistanceToCentroid(int N, int K, int dim, float *centroidDistances, float *data, float *centroids) { int t = blockIdx.x; // data index int c = threadIdx.x; // cluster index float sum = 0; for (int d = 0; d < dim; ++d) { float delta = data[t * dim + d] - centroids[c * dim + d]; sum += delta * delta; } centroidDistances[t * K + c] = sqrt(sum); } __global__ void assignClosestCentroid(int N, int K, int *dirtyBit, float *centroidDistances, int *clusterMap) { int t = blockIdx.x; int minIdx = 0; float minValue = INFINITY; for (int c = 0; c < K; ++c) { float dist = centroidDistances[t * K + c]; if (dist < minValue) { minValue = dist; minIdx = c; } } // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx); int oldMinIdx = clusterMap[t]; clusterMap[t] = minIdx; if (oldMinIdx != minIdx) { atomicOr(dirtyBit, 1); } } __global__ void recentralizeCentroidSum(int N, int K, int dim, float *data, float *centroids, int *clusterMap, unsigned int *clusterCount) { int t = blockIdx.x; // data index int c = threadIdx.x; // cluster index int assignedCluster = clusterMap[t]; if (assignedCluster != c) return; atomicAdd((unsigned int *)&clusterCount[c], 1); for (int d = 0; d < dim; ++d) { atomicAdd(¢roids[c * dim + d], data[t * dim + d]); } } __global__ void recentralizeCentroidDiv(int dim, float *centroids, unsigned int *clusterCount) { int c = threadIdx.x; // cluster index for (int d = 0; d < dim; ++d) { centroids[c * dim + d] /= clusterCount[c]; } } int main(int argc, char **argv) { int runtimeVersion, driverVersion; cudaRuntimeGetVersion(&runtimeVersion); cudaDriverGetVersion(&driverVersion); printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion, driverVersion); char *data_file = argv[1]; int num_clusters = atoi(argv[2]); int num_thread_blocks = atoi(argv[3]); int num_threads_per_block = atoi(argv[4]); int N, dim; float *centroids, // centroids[cluster][dimension] *data, // data[t][dimension] *centroidDistances; // centroidDistances[t][cluster] int *clusterMap, *dirtyBit; unsigned int *clusterCount; #pragma region Read in data { FILE *fp = fopen(data_file, "r"); // Read first line size_t n; char *line = NULL; if (!getline(&line, &n, fp)) return -1; sscanf(line, "%d %d", &N, &dim); free(line); line = NULL; // Allocate memory on the GPU CUDACHECK( cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float))); CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int))); CUDACHECK(cudaMallocManaged((void **)&clusterCount, num_clusters * sizeof(unsigned int))); CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float))); CUDACHECK(cudaMalloc((void **)¢roidDistances, N * num_clusters * sizeof(float))); CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int))); // Initialize all the cluster mappings to -1 so the first iteration is // always fully dirty CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int))); // Read the rest of the lines { // Buffer for copying float *currentLine = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < N; ++i) { if (!getline(&line, &n, fp)) return -1; for (int j = 0; j < dim; ++j) sscanf(line, "%f", ¤tLine[j]); CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), cudaMemcpyHostToDevice)); } free(currentLine); } printf("Done copying.\n"); fclose(fp); } #pragma endregion #pragma region Select the initial K centroids { CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), cudaMemcpyDeviceToDevice)); } #pragma endregion #pragma region Assign each data point to the closest centroid, \ measured via Euclidean distance. { findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids); CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; assignClosestCentroid<<>>(N, num_clusters, dirtyBit, centroidDistances, clusterMap); CUDACHECK(cudaDeviceSynchronize()); } printf("Is dirty: %d\n", *dirtyBit); #pragma endregion #pragma region Iteration int it = 0; while (*dirtyBit) { printf("Iteration %d (dirty=%d)\n", it, *dirtyBit); // Update each centroid to be the average coordinate of all contained data // points CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int))); CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float))); recentralizeCentroidSum<<>>( N, num_clusters, dim, data, centroids, clusterMap, clusterCount); CUDACHECK(cudaDeviceSynchronize()); for (int i = 0; i < num_clusters; ++i) { printf("%d ", clusterCount[i]); } printf("\n"); recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount); CUDACHECK(cudaDeviceSynchronize()); // Assign all data points to the closest centroid (measured via Euclidean // distance). findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids); CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; assignClosestCentroid<<>>(N, num_clusters, dirtyBit, centroidDistances, clusterMap); CUDACHECK(cudaDeviceSynchronize()); it++; } #pragma endregion #pragma region { FILE *fp = fopen("clusters.txt", "w"); for (int i = 0; i < N; ++i) fprintf(fp, "%d\n", clusterMap[i]); fclose(fp); } { FILE *fp = fopen("centroids.txt", "w"); fprintf(fp, "%d %d\n", num_clusters, dim); float *line = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < num_clusters; ++i) { CUDACHECK(cudaMemcpy(line, ¢roids[i * dim], dim * sizeof(float), cudaMemcpyDeviceToHost)); for (int d = 0; d < dim; ++d) fprintf(fp, "%.3f ", line[d]); fprintf(fp, "\n"); } free(line); fclose(fp); } printf("Done.\n"); #pragma endregion return 0; }