This commit is contained in:
Michael Zhang 2023-12-11 22:11:02 -06:00
parent 442638205c
commit 9f6a80a09f
5 changed files with 68 additions and 15 deletions

View file

@ -1,2 +1,6 @@
dataset/large_cpd.txt
km_cuda
clusters.txt
centroids.txt
report.pdf

View file

@ -1,7 +1,7 @@
# `clean` names an action rather than a file that gets built.
.PHONY: clean
# Compile km_cuda.cu into the km_cuda executable with nvcc
# ($@ expands to the target name, $< to the first prerequisite).
km_cuda: km_cuda.cu
nvcc -g -o $@ $<
nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<
# Delete the built executable.
clean:
rm -f km_cuda

View file

@ -1,5 +1,6 @@
// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>
#define CUDACHECK(err) \
do { \
@ -14,6 +15,31 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
}
}
/**
 * @brief Read the high-resolution monotonic clock and express the reading in
 *        seconds. The clock's origin is unspecified, so only the difference
 *        between two readings is meaningful (e.g., for performance timing).
 *
 * @return The clock reading in seconds, including the fractional part.
 */
static inline double monotonic_seconds()
{
  /* Linux systems: CLOCK_MONOTONIC through clock_gettime. */
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  /* Whole seconds plus the nanosecond remainder scaled down to seconds. */
  double const whole_seconds = now.tv_sec;
  double const fractional_seconds = now.tv_nsec * 1e-9;
  return whole_seconds + fractional_seconds;
}
/**
 * @brief Report the elapsed clustering time on standard output.
 *
 * @param seconds Seconds spent on k-means clustering, excluding IO. Printed
 *                with four digits after the decimal point.
 */
static void print_time(double const seconds)
{
  fprintf(stdout, "k-means clustering time: %0.04fs\n", seconds);
}
__global__ void findDistanceToCentroid(int N, int K, int dim,
float *centroidDistances, float *data,
float *centroids) {
@ -77,12 +103,6 @@ __global__ void recentralizeCentroidDiv(int dim, float *centroids,
}
int main(int argc, char **argv) {
int runtimeVersion, driverVersion;
cudaRuntimeGetVersion(&runtimeVersion);
cudaDriverGetVersion(&driverVersion);
printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion,
driverVersion);
char *data_file = argv[1];
int num_clusters = atoi(argv[2]);
int num_thread_blocks = atoi(argv[3]);
@ -141,12 +161,12 @@ int main(int argc, char **argv) {
free(currentLine);
}
printf("Done copying.\n");
fclose(fp);
}
#pragma endregion
double start_time = monotonic_seconds();
#pragma region Select the initial K centroids
{
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
@ -166,8 +186,6 @@ int main(int argc, char **argv) {
centroidDistances, clusterMap);
CUDACHECK(cudaDeviceSynchronize());
}
printf("Is dirty: %d\n", *dirtyBit);
#pragma endregion
#pragma region Iteration
@ -182,9 +200,10 @@ int main(int argc, char **argv) {
recentralizeCentroidSum<<<N, num_clusters>>>(
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
CUDACHECK(cudaDeviceSynchronize());
for (int i = 0; i < num_clusters; ++i) {
// Print out the cluster compositions
for (int i = 0; i < num_clusters; ++i)
printf("%d ", clusterCount[i]);
}
printf("\n");
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
@ -205,6 +224,9 @@ int main(int argc, char **argv) {
}
#pragma endregion
double end_time = monotonic_seconds();
print_time(end_time - start_time);
#pragma region
{
FILE *fp = fopen("clusters.txt", "w");
@ -228,7 +250,6 @@ int main(int argc, char **argv) {
fclose(fp);
}
printf("Done.\n");
#pragma endregion
return 0;

27
assignments/04/report.typ Normal file
View file

@ -0,0 +1,27 @@
== Homework 4
Michael Zhang \<zhan4854\@umn.edu\>
1. A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
My parallelized program included the following procedures:
- `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.
- `assignClosestCentroid` - This reduces the distances to find the minimum distance for each data point, and stores the closest centroid for each point in an $N times 1$ vector.
- `recentralizeCentroidSum` - This computes a sum for the purposes of averaging, and also counts the number of elements in each cluster.
- `recentralizeCentroidDiv` - This uses the count from the previous step and divides everything in parallel.
I tried to make sure that each thread computes approximately a single for-loop's worth of data, most of the time iterating over the $d$ axis.
2. Give details about how many elements and how the computations in your kernels are handled by a thread.
Threads are dynamically allocated
3. Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
4. You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
5. You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.

View file

@ -1,4 +1,5 @@
# Abort on any failing command, unset variable, or failed pipeline stage.
set -euo pipefail
# Remote login used by the rsync push and the ssh calls below.
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
# Push the current directory to ~/hwk4 on the remote host, skipping large_cpd.txt.
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
# On the remote host: load the CUDA module, rebuild from scratch, list the
# directory, and run km_cuda on the small Gaussian dataset.
# NOTE(review): ssh joins its arguments into a single string for the remote
# shell, so the quoted script may not reach `bash -c` as one argument — confirm.
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 2 64 128'
# Pull the remote ~/hwk4 directory back into the current directory
# (large_cpd.txt is excluded in this direction as well).
rsync -azPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .