snapshot
This commit is contained in:
parent
442638205c
commit
9f6a80a09f
5 changed files with 68 additions and 15 deletions
4
assignments/04/.gitignore
vendored
4
assignments/04/.gitignore
vendored
|
@ -1,2 +1,6 @@
|
|||
dataset/large_cpd.txt
|
||||
km_cuda
|
||||
|
||||
clusters.txt
|
||||
centroids.txt
|
||||
report.pdf
|
|
@ -1,7 +1,7 @@
|
|||
.PHONY: clean
|
||||
|
||||
km_cuda: km_cuda.cu
|
||||
nvcc -g -o $@ $<
|
||||
nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<
|
||||
|
||||
clean:
|
||||
rm -f km_cuda
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
// #define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
#define CUDACHECK(err) \
|
||||
do { \
|
||||
|
@ -14,6 +15,31 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Read a high-resolution monotonic clock and report it as seconds
 * since an unspecified epoch (e.g., Unix epoch). Suitable for measuring
 * elapsed wall time of the clustering phase; not a calendar timestamp.
 *
 * @return Elapsed seconds as a double, with nanoseconds folded in.
 */
static inline double monotonic_seconds()
{
  /* Linux systems: CLOCK_MONOTONIC is the steady, non-adjustable clock. */
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  return now.tv_sec + now.tv_nsec * 1e-9;
}
|
||||
|
||||
/**
 * @brief Report the wall-clock time spent on k-means clustering to stdout.
 * I/O time is excluded by the caller; this only formats and prints.
 *
 * @param seconds Elapsed clustering time, in seconds.
 */
static void print_time(double const seconds)
{
  printf("k-means clustering time: %0.04fs\n", seconds);
}
|
||||
|
||||
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
||||
float *centroidDistances, float *data,
|
||||
float *centroids) {
|
||||
|
@ -77,12 +103,6 @@ __global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
|||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int runtimeVersion, driverVersion;
|
||||
cudaRuntimeGetVersion(&runtimeVersion);
|
||||
cudaDriverGetVersion(&driverVersion);
|
||||
printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion,
|
||||
driverVersion);
|
||||
|
||||
char *data_file = argv[1];
|
||||
int num_clusters = atoi(argv[2]);
|
||||
int num_thread_blocks = atoi(argv[3]);
|
||||
|
@ -141,12 +161,12 @@ int main(int argc, char **argv) {
|
|||
free(currentLine);
|
||||
}
|
||||
|
||||
printf("Done copying.\n");
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
double start_time = monotonic_seconds();
|
||||
|
||||
#pragma region Select the initial K centroids
|
||||
{
|
||||
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
||||
|
@ -166,8 +186,6 @@ int main(int argc, char **argv) {
|
|||
centroidDistances, clusterMap);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
printf("Is dirty: %d\n", *dirtyBit);
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Iteration
|
||||
|
@ -182,9 +200,10 @@ int main(int argc, char **argv) {
|
|||
recentralizeCentroidSum<<<N, num_clusters>>>(
|
||||
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
for (int i = 0; i < num_clusters; ++i) {
|
||||
|
||||
// Print out the cluster compositions
|
||||
for (int i = 0; i < num_clusters; ++i)
|
||||
printf("%d ", clusterCount[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
||||
|
@ -205,6 +224,9 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
#pragma endregion
|
||||
|
||||
double end_time = monotonic_seconds();
|
||||
print_time(end_time - start_time);
|
||||
|
||||
#pragma region
|
||||
{
|
||||
FILE *fp = fopen("clusters.txt", "w");
|
||||
|
@ -228,7 +250,6 @@ int main(int argc, char **argv) {
|
|||
fclose(fp);
|
||||
}
|
||||
|
||||
printf("Done.\n");
|
||||
#pragma endregion
|
||||
|
||||
return 0;
|
||||
|
|
27
assignments/04/report.typ
Normal file
27
assignments/04/report.typ
Normal file
|
@ -0,0 +1,27 @@
|
|||
== Homework 4
|
||||
|
||||
Michael Zhang \<zhan4854\@umn.edu\>
|
||||
|
||||
1. A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
|
||||
|
||||
My parallelized program included the following procedures:
|
||||
|
||||
- `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.
|
||||
|
||||
- `assignClosestCentroid` - This reduces the distances to find the minimum distance for each centroid, and assigns the closest one to an $N times 1$ vector.
|
||||
|
||||
- `recentralizeCentroidSum` - This computes a sum for the purposes of averaging, and also counts the number of elements in each cluster.
|
||||
|
||||
- `recentralizeCentroidDiv` - This uses the count from the previous step and divides everything in parallel.
|
||||
|
||||
I tried to make sure every thread is computing approximately one single for-loop's worth of data, most of the time iterating over the $d$ axis.
|
||||
|
||||
2. Give details about how many elements and how the computations in your kernels are handled by a thread.
|
||||
|
||||
Threads are dynamically allocated.
|
||||
|
||||
3. Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
|
||||
|
||||
4. You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
|
||||
|
||||
5. You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
|
|
@ -1,4 +1,5 @@
|
|||
set -euo pipefail
|
||||
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
|
||||
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
|
||||
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'
|
||||
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 2 64 128'
|
||||
rsync -azPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
|
||||
|
|
Loading…
Reference in a new issue