snapshot
This commit is contained in:
parent
442638205c
commit
9f6a80a09f
5 changed files with 68 additions and 15 deletions
4
assignments/04/.gitignore
vendored
4
assignments/04/.gitignore
vendored
|
@ -1,2 +1,6 @@
|
||||||
dataset/large_cpd.txt
|
dataset/large_cpd.txt
|
||||||
km_cuda
|
km_cuda
|
||||||
|
|
||||||
|
clusters.txt
|
||||||
|
centroids.txt
|
||||||
|
report.pdf
|
|
@ -1,7 +1,7 @@
|
||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
|
|
||||||
km_cuda: km_cuda.cu
|
km_cuda: km_cuda.cu
|
||||||
nvcc -g -o $@ $<
|
nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f km_cuda
|
rm -f km_cuda
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
// #define _POSIX_C_SOURCE 200809L
|
// #define _POSIX_C_SOURCE 200809L
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
#define CUDACHECK(err) \
|
#define CUDACHECK(err) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -14,6 +15,31 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @brief Return the number of seconds since an unspecified time (e.g., Unix
 *        epoch). This is accomplished with a high-resolution monotonic timer,
 *        suitable for performance timing.
 *
 * Only differences between two return values are meaningful.
 *
 * @return The number of seconds, or 0.0 if the monotonic clock could not be
 *         read (the failure is reported on stderr via perror).
 */
static inline double monotonic_seconds()
{
  /* Linux systems */
  /* Zero-initialise so a failed clock read never exposes stack garbage. */
  struct timespec ts = {0, 0};
  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
    perror("clock_gettime");
    return 0.0;
  }
  /* Convert each field explicitly before combining seconds and nanoseconds. */
  return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}
|
||||||
|
|
||||||
|
/**
 * @brief Report the elapsed clustering time on standard output.
 *
 * @param seconds Time spent on k-means clustering, in seconds (intended to
 *                exclude file IO).
 */
static void print_time(double const seconds)
{
  /* Fixed-point output with four digits after the decimal point. */
  fprintf(stdout, "k-means clustering time: %0.04fs\n", seconds);
}
|
||||||
|
|
||||||
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
||||||
float *centroidDistances, float *data,
|
float *centroidDistances, float *data,
|
||||||
float *centroids) {
|
float *centroids) {
|
||||||
|
@ -77,12 +103,6 @@ __global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int runtimeVersion, driverVersion;
|
|
||||||
cudaRuntimeGetVersion(&runtimeVersion);
|
|
||||||
cudaDriverGetVersion(&driverVersion);
|
|
||||||
printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion,
|
|
||||||
driverVersion);
|
|
||||||
|
|
||||||
char *data_file = argv[1];
|
char *data_file = argv[1];
|
||||||
int num_clusters = atoi(argv[2]);
|
int num_clusters = atoi(argv[2]);
|
||||||
int num_thread_blocks = atoi(argv[3]);
|
int num_thread_blocks = atoi(argv[3]);
|
||||||
|
@ -141,12 +161,12 @@ int main(int argc, char **argv) {
|
||||||
free(currentLine);
|
free(currentLine);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Done copying.\n");
|
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
|
double start_time = monotonic_seconds();
|
||||||
|
|
||||||
#pragma region Select the initial K centroids
|
#pragma region Select the initial K centroids
|
||||||
{
|
{
|
||||||
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
||||||
|
@ -166,8 +186,6 @@ int main(int argc, char **argv) {
|
||||||
centroidDistances, clusterMap);
|
centroidDistances, clusterMap);
|
||||||
CUDACHECK(cudaDeviceSynchronize());
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Is dirty: %d\n", *dirtyBit);
|
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
#pragma region Iteration
|
#pragma region Iteration
|
||||||
|
@ -182,9 +200,10 @@ int main(int argc, char **argv) {
|
||||||
recentralizeCentroidSum<<<N, num_clusters>>>(
|
recentralizeCentroidSum<<<N, num_clusters>>>(
|
||||||
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
||||||
CUDACHECK(cudaDeviceSynchronize());
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
for (int i = 0; i < num_clusters; ++i) {
|
|
||||||
|
// Print out the cluster compositions
|
||||||
|
for (int i = 0; i < num_clusters; ++i)
|
||||||
printf("%d ", clusterCount[i]);
|
printf("%d ", clusterCount[i]);
|
||||||
}
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
||||||
|
@ -205,6 +224,9 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
|
double end_time = monotonic_seconds();
|
||||||
|
print_time(end_time - start_time);
|
||||||
|
|
||||||
#pragma region
|
#pragma region
|
||||||
{
|
{
|
||||||
FILE *fp = fopen("clusters.txt", "w");
|
FILE *fp = fopen("clusters.txt", "w");
|
||||||
|
@ -228,7 +250,6 @@ int main(int argc, char **argv) {
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Done.\n");
|
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
27
assignments/04/report.typ
Normal file
27
assignments/04/report.typ
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
== Homework 4
|
||||||
|
|
||||||
|
Michael Zhang \<zhan4854\@umn.edu\>
|
||||||
|
|
||||||
|
1. A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
|
||||||
|
|
||||||
|
My parallelized program included the following procedures:
|
||||||
|
|
||||||
|
- `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.
|
||||||
|
|
||||||
|
- `assignClosestCentroid` - This reduces the distance matrix to find, for each data point, the centroid at minimum distance, and stores the index of that closest centroid in an $N times 1$ vector.
|
||||||
|
|
||||||
|
- `recentralizeCentroidSum` - This computes a sum for the purposes of averaging, and also counts the number of elements in each cluster.
|
||||||
|
|
||||||
|
- `recentralizeCentroidDiv` - This uses the count from the previous step and divides everything in parallel.
|
||||||
|
|
||||||
|
I tried to make sure that every thread computes approximately a single for-loop's worth of work, in most cases a loop over the $d$ axis.
|
||||||
|
|
||||||
|
2. Give details about how many elements and how the computations in your kernels are handled by a thread.
|
||||||
|
|
||||||
|
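Threads are allocated dynamically: the launch configuration is computed at runtime from $N$ and $K$ (for example, `recentralizeCentroidSum` is launched with $N$ thread blocks of $K$ threads each).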
|
||||||
|
|
||||||
|
3. Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
|
||||||
|
|
||||||
|
4. You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
|
||||||
|
|
||||||
|
5. You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
|
|
@ -1,4 +1,5 @@
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
|
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
|
||||||
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
|
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
|
||||||
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'
|
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 2 64 128'
|
||||||
|
rsync -azPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
|
||||||
|
|
Loading…
Reference in a new issue