281 lines
8.9 KiB
Text
281 lines
8.9 KiB
Text
// #define _POSIX_C_SOURCE 200809L
|
|
#include <stdio.h>
|
|
#include <time.h>
|
|
|
|
#define CUDACHECK(err) \
|
|
do { \
|
|
cuda_check((err), __FILE__, __LINE__); \
|
|
} while (false)
|
|
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
|
if (error_code != cudaSuccess) {
|
|
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
|
|
cudaGetErrorString(error_code), file, line);
|
|
fflush(stderr);
|
|
exit(error_code);
|
|
}
|
|
}
|
|
|
|
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
|
|
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
|
|
|
|
// #define ENSURE_int(i) _Generic((i), int: (i))
|
|
// #define ENSURE_float(f) _Generic((f), float: (f))
|
|
|
|
// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
|
|
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
|
|
|
|
/**
|
|
* @brief Return the number of seconds since an unspecified time (e.g., Unix
|
|
* epoch). This is accomplished with a high-resolution monotonic timer,
|
|
* suitable for performance timing.
|
|
*
|
|
* @return The number of seconds.
|
|
*/
|
|
static inline double monotonic_seconds() {
|
|
/* Linux systems */
|
|
struct timespec ts;
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
|
}
|
|
|
|
/**
|
|
* @brief Output the seconds elapsed while clustering.
|
|
*
|
|
* @param seconds Seconds spent on k-means clustering, excluding IO.
|
|
*/
|
|
static void print_time(double const seconds) {
|
|
printf("k-means clustering time: %0.04fs\n", seconds);
|
|
}
|
|
|
|
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
|
float *centroidDistances, float *data,
|
|
float *centroids, int tOffset) {
|
|
int t = blockIdx.x + tOffset; // data index
|
|
int c = threadIdx.x; // cluster index
|
|
|
|
float sum = 0;
|
|
for (int d = 0; d < dim; ++d) {
|
|
float delta = data[t * dim + d] - centroids[c * dim + d];
|
|
sum += delta * delta;
|
|
}
|
|
|
|
centroidDistances[t * K + c] = sqrt(sum);
|
|
}
|
|
|
|
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
|
|
float *centroidDistances,
|
|
int *clusterMap) {
|
|
int t = blockIdx.x;
|
|
int minIdx = 0;
|
|
float minValue = INFINITY;
|
|
|
|
for (int c = 0; c < K; ++c) {
|
|
float dist = centroidDistances[t * K + c];
|
|
if (dist < minValue) {
|
|
minValue = dist;
|
|
minIdx = c;
|
|
}
|
|
}
|
|
|
|
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
|
|
int oldMinIdx = clusterMap[t];
|
|
clusterMap[t] = minIdx;
|
|
if (oldMinIdx != minIdx) {
|
|
atomicOr(dirtyBit, 1);
|
|
}
|
|
}
|
|
|
|
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
|
|
float *centroids, int *clusterMap,
|
|
unsigned int *clusterCount) {
|
|
int t = blockIdx.x; // data index
|
|
int c = threadIdx.x; // cluster index
|
|
int assignedCluster = clusterMap[t];
|
|
|
|
if (assignedCluster != c)
|
|
return;
|
|
|
|
atomicAdd((unsigned int *)&clusterCount[c], 1);
|
|
for (int d = 0; d < dim; ++d) {
|
|
atomicAdd(¢roids[c * dim + d], data[t * dim + d]);
|
|
}
|
|
}
|
|
|
|
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
|
unsigned int *clusterCount) {
|
|
int c = threadIdx.x; // cluster index
|
|
for (int d = 0; d < dim; ++d) {
|
|
centroids[c * dim + d] /= clusterCount[c];
|
|
}
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
char *data_file = argv[1];
|
|
int num_clusters = atoi(argv[2]);
|
|
int num_thread_blocks = atoi(argv[3]);
|
|
int num_threads_per_block = atoi(argv[4]);
|
|
|
|
int N, dim;
|
|
float *centroids, // centroids[cluster][dimension]
|
|
*data, // data[t][dimension]
|
|
*centroidDistances; // centroidDistances[t][cluster]
|
|
int *clusterMap, *dirtyBit;
|
|
unsigned int *clusterCount;
|
|
|
|
#pragma region Read in data
|
|
{
|
|
FILE *fp = fopen(data_file, "r");
|
|
|
|
// Read first line
|
|
size_t n;
|
|
char *line = NULL;
|
|
if (!getline(&line, &n, fp))
|
|
return -1;
|
|
|
|
sscanf(line, "%d %d", &N, &dim);
|
|
free(line);
|
|
line = NULL;
|
|
|
|
// Allocate memory on the GPU
|
|
CUDACHECK(
|
|
cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float)));
|
|
CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
|
|
CUDACHECK(cudaMallocManaged((void **)&clusterCount,
|
|
num_clusters * sizeof(unsigned int)));
|
|
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
|
|
CUDACHECK(cudaMalloc((void **)¢roidDistances,
|
|
N * num_clusters * sizeof(float)));
|
|
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
|
|
|
|
// Initialize all the cluster mappings to -1 so the first iteration is
|
|
// always fully dirty
|
|
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
|
|
|
|
// Read the rest of the lines
|
|
{
|
|
// Buffer for copying
|
|
float *currentLine = (float *)malloc(dim * sizeof(float));
|
|
for (int i = 0; i < N; ++i) {
|
|
if (!getline(&line, &n, fp))
|
|
return -1;
|
|
|
|
for (int j = 0; j < dim; ++j)
|
|
sscanf(line, "%f", ¤tLine[j]);
|
|
|
|
CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
|
|
cudaMemcpyHostToDevice));
|
|
}
|
|
free(currentLine);
|
|
}
|
|
|
|
fclose(fp);
|
|
}
|
|
#pragma endregion
|
|
|
|
double start_time = monotonic_seconds();
|
|
|
|
#pragma region Select the initial K centroids
|
|
{
|
|
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
|
cudaMemcpyDeviceToDevice));
|
|
}
|
|
#pragma endregion
|
|
|
|
#pragma region Assign each data point to the closest centroid, \
|
|
measured via Euclidean distance.
|
|
{
|
|
if (num_thread_blocks == -1) {
|
|
findDistanceToCentroid<<<N, num_clusters>>>(
|
|
N, num_clusters, dim, centroidDistances, data, centroids, 0);
|
|
} else {
|
|
for (int j = 0; j < N; j += num_thread_blocks) {
|
|
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
|
|
findDistanceToCentroid<<<n, num_clusters>>>(
|
|
N, num_clusters, dim, centroidDistances, data, centroids,
|
|
j * num_thread_blocks);
|
|
}
|
|
}
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
|
|
*dirtyBit = 0;
|
|
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
|
centroidDistances, clusterMap);
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
}
|
|
#pragma endregion
|
|
|
|
#pragma region Iteration
|
|
int it = 0;
|
|
for (int it = 0; it < 20 && *dirtyBit; ++it) {
|
|
// printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
|
|
|
|
// Update each centroid to be the average coordinate of all contained data
|
|
// points
|
|
CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
|
|
CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
|
|
recentralizeCentroidSum<<<N, num_clusters>>>(
|
|
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
|
|
// Print out the cluster compositions
|
|
// for (int i = 0; i < num_clusters; ++i)
|
|
// printf("%d ", clusterCount[i]);
|
|
// printf("\n");
|
|
|
|
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
|
|
// Assign all data points to the closest centroid (measured via Euclidean
|
|
// distance).
|
|
// findDistanceToCentroid<<<N, num_clusters>>>(
|
|
// N, num_clusters, dim, centroidDistances, data, centroids);
|
|
if (num_thread_blocks == -1) {
|
|
findDistanceToCentroid<<<N, num_clusters>>>(
|
|
N, num_clusters, dim, centroidDistances, data, centroids, 0);
|
|
} else {
|
|
for (int j = 0; j < N; j += num_thread_blocks) {
|
|
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
|
|
findDistanceToCentroid<<<n, num_clusters>>>(
|
|
N, num_clusters, dim, centroidDistances, data, centroids,
|
|
j * num_thread_blocks);
|
|
}
|
|
}
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
|
|
*dirtyBit = 0;
|
|
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
|
centroidDistances, clusterMap);
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
}
|
|
#pragma endregion
|
|
|
|
double end_time = monotonic_seconds();
|
|
print_time(end_time - start_time);
|
|
|
|
#pragma region
|
|
{
|
|
FILE *fp = fopen("clusters.txt", "w");
|
|
for (int i = 0; i < N; ++i)
|
|
fprintf(fp, "%d\n", clusterMap[i]);
|
|
fclose(fp);
|
|
}
|
|
|
|
{
|
|
FILE *fp = fopen("centroids.txt", "w");
|
|
fprintf(fp, "%d %d\n", num_clusters, dim);
|
|
float *line = (float *)malloc(dim * sizeof(float));
|
|
for (int i = 0; i < num_clusters; ++i) {
|
|
CUDACHECK(cudaMemcpy(line, ¢roids[i * dim], dim * sizeof(float),
|
|
cudaMemcpyDeviceToHost));
|
|
for (int d = 0; d < dim; ++d)
|
|
fprintf(fp, "%.3f ", line[d]);
|
|
fprintf(fp, "\n");
|
|
}
|
|
free(line);
|
|
fclose(fp);
|
|
}
|
|
|
|
#pragma endregion
|
|
|
|
return 0;
|
|
}
|