// #define _POSIX_C_SOURCE 200809L #include #include #define CUDACHECK(err) \ do { \ cuda_check((err), __FILE__, __LINE__); \ } while (false) inline void cuda_check(cudaError_t error_code, const char *file, int line) { if (error_code != cudaSuccess) { fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code, cudaGetErrorString(error_code), file, line); fflush(stderr); exit(error_code); } } #define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y)) #define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y)) // #define ENSURE_int(i) _Generic((i), int: (i)) // #define ENSURE_float(f) _Generic((f), float: (f)) // #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y)) // #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y)) /** * @brief Return the number of seconds since an unspecified time (e.g., Unix * epoch). This is accomplished with a high-resolution monotonic timer, * suitable for performance timing. * * @return The number of seconds. */ static inline double monotonic_seconds() { /* Linux systems */ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } /** * @brief Output the seconds elapsed while clustering. * * @param seconds Seconds spent on k-means clustering, excluding IO. */ static void print_time(double const seconds) { printf("k-means clustering time: %0.04fs\n", seconds); } __global__ void findDistanceToCentroid(int N, int K, int dim, float *centroidDistances, float *data, float *centroids, int tOffset) { int t = blockIdx.x + tOffset; // data index int c = threadIdx.x; // cluster index float sum = 0; for (int d = 0; d < dim; ++d) { float delta = data[t * dim + d] - centroids[c * dim + d]; sum += delta * delta; } centroidDistances[t * K + c] = sqrt(sum); } __global__ void assignClosestCentroid(int N, int K, int *dirtyBit, float *centroidDistances, int *clusterMap) { int t = blockIdx.x; int minIdx = 0; float minValue = INFINITY; for (int c = 0; c < K; ++c) { float dist = centroidDistances[t * K + c]; if (dist < minValue) { minValue = dist; minIdx = c; } } // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx); int oldMinIdx = clusterMap[t]; clusterMap[t] = minIdx; if (oldMinIdx != minIdx) { atomicOr(dirtyBit, 1); } } __global__ void recentralizeCentroidSum(int N, int K, int dim, float *data, float *centroids, int *clusterMap, unsigned int *clusterCount) { int t = blockIdx.x; // data index int c = threadIdx.x; // cluster index int assignedCluster = clusterMap[t]; if (assignedCluster != c) return; atomicAdd((unsigned int *)&clusterCount[c], 1); for (int d = 0; d < dim; ++d) { atomicAdd(¢roids[c * dim + d], data[t * dim + d]); } } __global__ void recentralizeCentroidDiv(int dim, float *centroids, unsigned int *clusterCount) { int c = threadIdx.x; // cluster index for (int d = 0; d < dim; ++d) { centroids[c * dim + d] /= clusterCount[c]; } } int main(int argc, char **argv) { char *data_file = argv[1]; int num_clusters = atoi(argv[2]); int num_thread_blocks = atoi(argv[3]); int num_threads_per_block = atoi(argv[4]); int N, dim; float *centroids, // centroids[cluster][dimension] *data, // data[t][dimension] *centroidDistances; // centroidDistances[t][cluster] int *clusterMap, *dirtyBit; unsigned int *clusterCount; #pragma region Read in data { FILE *fp = fopen(data_file, "r"); // Read first line size_t n; char *line = NULL; if (!getline(&line, &n, fp)) return -1; sscanf(line, "%d %d", &N, &dim); free(line); line = NULL; // Allocate memory on the GPU CUDACHECK( cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float))); CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int))); CUDACHECK(cudaMallocManaged((void **)&clusterCount, num_clusters * sizeof(unsigned int))); CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float))); CUDACHECK(cudaMalloc((void **)¢roidDistances, N * num_clusters * sizeof(float))); CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int))); // Initialize all the cluster mappings to -1 so the first iteration is // always fully dirty CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int))); // Read the rest of the lines { // Buffer for copying float *currentLine = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < N; ++i) { if (!getline(&line, &n, fp)) return -1; for (int j = 0; j < dim; ++j) sscanf(line, "%f", ¤tLine[j]); CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float), cudaMemcpyHostToDevice)); } free(currentLine); } fclose(fp); } #pragma endregion double start_time = monotonic_seconds(); #pragma region Select the initial K centroids { CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float), cudaMemcpyDeviceToDevice)); } #pragma endregion #pragma region Assign each data point to the closest centroid, \ measured via Euclidean distance. { if (num_thread_blocks == -1) { findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids, 0); } else { for (int j = 0; j < N; j += num_thread_blocks) { int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids, j * num_thread_blocks); } } CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; assignClosestCentroid<<>>(N, num_clusters, dirtyBit, centroidDistances, clusterMap); CUDACHECK(cudaDeviceSynchronize()); } #pragma endregion #pragma region Iteration int it = 0; for (int it = 0; it < 20 && *dirtyBit; ++it) { // printf("Iteration %d (dirty=%d)\n", it, *dirtyBit); // Update each centroid to be the average coordinate of all contained data // points CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int))); CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float))); recentralizeCentroidSum<<>>( N, num_clusters, dim, data, centroids, clusterMap, clusterCount); CUDACHECK(cudaDeviceSynchronize()); // Print out the cluster compositions // for (int i = 0; i < num_clusters; ++i) // printf("%d ", clusterCount[i]); // printf("\n"); recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount); CUDACHECK(cudaDeviceSynchronize()); // Assign all data points to the closest centroid (measured via Euclidean // distance). // findDistanceToCentroid<<>>( // N, num_clusters, dim, centroidDistances, data, centroids); if (num_thread_blocks == -1) { findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids, 0); } else { for (int j = 0; j < N; j += num_thread_blocks) { int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks); findDistanceToCentroid<<>>( N, num_clusters, dim, centroidDistances, data, centroids, j * num_thread_blocks); } } CUDACHECK(cudaDeviceSynchronize()); *dirtyBit = 0; assignClosestCentroid<<>>(N, num_clusters, dirtyBit, centroidDistances, clusterMap); CUDACHECK(cudaDeviceSynchronize()); } #pragma endregion double end_time = monotonic_seconds(); print_time(end_time - start_time); #pragma region { FILE *fp = fopen("clusters.txt", "w"); for (int i = 0; i < N; ++i) fprintf(fp, "%d\n", clusterMap[i]); fclose(fp); } { FILE *fp = fopen("centroids.txt", "w"); fprintf(fp, "%d %d\n", num_clusters, dim); float *line = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < num_clusters; ++i) { CUDACHECK(cudaMemcpy(line, ¢roids[i * dim], dim * sizeof(float), cudaMemcpyDeviceToHost)); for (int d = 0; d < dim; ++d) fprintf(fp, "%.3f ", line[d]); fprintf(fp, "\n"); } free(line); fclose(fp); } #pragma endregion return 0; }