This commit is contained in:
Michael Zhang 2023-12-15 17:13:47 -06:00
parent 18b1c2b6bc
commit 0b179f38bd
5 changed files with 340 additions and 27 deletions

View file

@ -4,3 +4,4 @@ km_cuda
clusters.txt
centroids.txt
report.pdf
cluster.txt

View file

@ -15,6 +15,15 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
}
}
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
// #define ENSURE_int(i) _Generic((i), int: (i))
// #define ENSURE_float(f) _Generic((f), float: (f))
// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
/**
* @brief Return the number of seconds since an unspecified time (e.g., Unix
* epoch). This is accomplished with a high-resolution monotonic timer,
@ -22,8 +31,7 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
*
* @return The number of seconds.
*/
static inline double monotonic_seconds()
{
static inline double monotonic_seconds() {
/* Linux systems */
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
@ -35,15 +43,14 @@ static inline double monotonic_seconds()
*
* @param seconds Seconds spent on k-means clustering, excluding IO.
*/
static void print_time(double const seconds)
{
static void print_time(double const seconds) {
printf("k-means clustering time: %0.04fs\n", seconds);
}
__global__ void findDistanceToCentroid(int N, int K, int dim,
float *centroidDistances, float *data,
float *centroids) {
int t = blockIdx.x; // data index
float *centroids, int tOffset) {
int t = blockIdx.x + tOffset; // data index
int c = threadIdx.x; // cluster index
float sum = 0;
@ -177,8 +184,17 @@ int main(int argc, char **argv) {
#pragma region Assign each data point to the closest centroid, \
measured via Euclidean distance.
{
if (num_thread_blocks == -1) {
findDistanceToCentroid<<<N, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids);
N, num_clusters, dim, centroidDistances, data, centroids, 0);
} else {
for (int j = 0; j < N; j += num_thread_blocks) {
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
findDistanceToCentroid<<<n, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids,
j * num_thread_blocks);
}
}
CUDACHECK(cudaDeviceSynchronize());
*dirtyBit = 0;
@ -211,8 +227,19 @@ int main(int argc, char **argv) {
// Assign all data points to the closest centroid (measured via Euclidean
// distance).
// findDistanceToCentroid<<<N, num_clusters>>>(
// N, num_clusters, dim, centroidDistances, data, centroids);
if (num_thread_blocks == -1) {
findDistanceToCentroid<<<N, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids);
N, num_clusters, dim, centroidDistances, data, centroids, 0);
} else {
for (int j = 0; j < N; j += num_thread_blocks) {
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
findDistanceToCentroid<<<n, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids,
j * num_thread_blocks);
}
}
CUDACHECK(cudaDeviceSynchronize());
*dirtyBit = 0;

View file

@ -0,0 +1,281 @@
// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>
#define CUDACHECK(err) \
do { \
cuda_check((err), __FILE__, __LINE__); \
} while (false)
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
if (error_code != cudaSuccess) {
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
cudaGetErrorString(error_code), file, line);
fflush(stderr);
exit(error_code);
}
}
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
// #define ENSURE_int(i) _Generic((i), int: (i))
// #define ENSURE_float(f) _Generic((f), float: (f))
// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
/**
* @brief Return the number of seconds since an unspecified time (e.g., Unix
* epoch). This is accomplished with a high-resolution monotonic timer,
* suitable for performance timing.
*
* @return The number of seconds.
*/
static inline double monotonic_seconds() {
/* Linux systems */
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec + ts.tv_nsec * 1e-9;
}
/**
* @brief Output the seconds elapsed while clustering.
*
* @param seconds Seconds spent on k-means clustering, excluding IO.
*/
static void print_time(double const seconds) {
printf("k-means clustering time: %0.04fs\n", seconds);
}
__global__ void findDistanceToCentroid(int N, int K, int dim,
float *centroidDistances, float *data,
float *centroids, int tOffset) {
int t = blockIdx.x + tOffset; // data index
int c = threadIdx.x; // cluster index
float sum = 0;
for (int d = 0; d < dim; ++d) {
float delta = data[t * dim + d] - centroids[c * dim + d];
sum += delta * delta;
}
centroidDistances[t * K + c] = sqrt(sum);
}
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
float *centroidDistances,
int *clusterMap) {
int t = blockIdx.x;
int minIdx = 0;
float minValue = INFINITY;
for (int c = 0; c < K; ++c) {
float dist = centroidDistances[t * K + c];
if (dist < minValue) {
minValue = dist;
minIdx = c;
}
}
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
int oldMinIdx = clusterMap[t];
clusterMap[t] = minIdx;
if (oldMinIdx != minIdx) {
atomicOr(dirtyBit, 1);
}
}
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
float *centroids, int *clusterMap,
unsigned int *clusterCount) {
int t = blockIdx.x; // data index
int c = threadIdx.x; // cluster index
int assignedCluster = clusterMap[t];
if (assignedCluster != c)
return;
atomicAdd((unsigned int *)&clusterCount[c], 1);
for (int d = 0; d < dim; ++d) {
atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
}
}
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
unsigned int *clusterCount) {
int c = threadIdx.x; // cluster index
for (int d = 0; d < dim; ++d) {
centroids[c * dim + d] /= clusterCount[c];
}
}
int main(int argc, char **argv) {
char *data_file = argv[1];
int num_clusters = atoi(argv[2]);
int num_thread_blocks = atoi(argv[3]);
int num_threads_per_block = atoi(argv[4]);
int N, dim;
float *centroids, // centroids[cluster][dimension]
*data, // data[t][dimension]
*centroidDistances; // centroidDistances[t][cluster]
int *clusterMap, *dirtyBit;
unsigned int *clusterCount;
#pragma region Read in data
{
FILE *fp = fopen(data_file, "r");
// Read first line
size_t n;
char *line = NULL;
if (!getline(&line, &n, fp))
return -1;
sscanf(line, "%d %d", &N, &dim);
free(line);
line = NULL;
// Allocate memory on the GPU
CUDACHECK(
cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
CUDACHECK(cudaMallocManaged((void **)&clusterCount,
num_clusters * sizeof(unsigned int)));
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
CUDACHECK(cudaMalloc((void **)&centroidDistances,
N * num_clusters * sizeof(float)));
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
// Initialize all the cluster mappings to -1 so the first iteration is
// always fully dirty
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
// Read the rest of the lines
{
// Buffer for copying
float *currentLine = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < N; ++i) {
if (!getline(&line, &n, fp))
return -1;
for (int j = 0; j < dim; ++j)
sscanf(line, "%f", &currentLine[j]);
CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
cudaMemcpyHostToDevice));
}
free(currentLine);
}
fclose(fp);
}
#pragma endregion
double start_time = monotonic_seconds();
#pragma region Select the initial K centroids
{
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
cudaMemcpyDeviceToDevice));
}
#pragma endregion
#pragma region Assign each data point to the closest centroid, \
measured via Euclidean distance.
{
if (num_thread_blocks == -1) {
findDistanceToCentroid<<<N, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids, 0);
} else {
for (int j = 0; j < N; j += num_thread_blocks) {
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
findDistanceToCentroid<<<n, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids,
j * num_thread_blocks);
}
}
CUDACHECK(cudaDeviceSynchronize());
*dirtyBit = 0;
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
centroidDistances, clusterMap);
CUDACHECK(cudaDeviceSynchronize());
}
#pragma endregion
#pragma region Iteration
int it = 0;
for (int it = 0; it < 20 && *dirtyBit; ++it) {
// printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
// Update each centroid to be the average coordinate of all contained data
// points
CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
recentralizeCentroidSum<<<N, num_clusters>>>(
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
CUDACHECK(cudaDeviceSynchronize());
// Print out the cluster compositions
// for (int i = 0; i < num_clusters; ++i)
// printf("%d ", clusterCount[i]);
// printf("\n");
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
CUDACHECK(cudaDeviceSynchronize());
// Assign all data points to the closest centroid (measured via Euclidean
// distance).
// findDistanceToCentroid<<<N, num_clusters>>>(
// N, num_clusters, dim, centroidDistances, data, centroids);
if (num_thread_blocks == -1) {
findDistanceToCentroid<<<N, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids, 0);
} else {
for (int j = 0; j < N; j += num_thread_blocks) {
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
findDistanceToCentroid<<<n, num_clusters>>>(
N, num_clusters, dim, centroidDistances, data, centroids,
j * num_thread_blocks);
}
}
CUDACHECK(cudaDeviceSynchronize());
*dirtyBit = 0;
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
centroidDistances, clusterMap);
CUDACHECK(cudaDeviceSynchronize());
}
#pragma endregion
double end_time = monotonic_seconds();
print_time(end_time - start_time);
#pragma region
{
FILE *fp = fopen("clusters.txt", "w");
for (int i = 0; i < N; ++i)
fprintf(fp, "%d\n", clusterMap[i]);
fclose(fp);
}
{
FILE *fp = fopen("centroids.txt", "w");
fprintf(fp, "%d %d\n", num_clusters, dim);
float *line = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < num_clusters; ++i) {
CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
cudaMemcpyDeviceToHost));
for (int d = 0; d < dim; ++d)
fprintf(fp, "%.3f ", line[d]);
fprintf(fp, "\n");
}
free(line);
fclose(fp);
}
#pragma endregion
return 0;
}

View file

@ -1,3 +1,5 @@
#set page(margin: 1.5em)
== Homework 4
Michael Zhang \<zhan4854\@umn.edu\>
@ -30,6 +32,8 @@ Michael Zhang \<zhan4854\@umn.edu\>
4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.*
I tried using a fixed number of thread blocks, but for the 256 cluster case it ended up performing almost twice as slow (at 0.4516s). To the best of my knowledge, CUDA is able to schedule the extra kernel instances it needs to compute to happen sequentially after, if there are more thread blocks requested than there are available.
5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.*
- 256: 26.8258s

View file

@ -3,7 +3,7 @@ HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
CLUSTERS=${1:-512}
BLOCKS=-1
BLOCKS=512
THREADS=-1
DATAFILE="large_cpd.txt"
# DATAFILE="small_gaussian.txt"