cuda shit

This commit is contained in:
Michael Zhang 2023-12-10 15:40:31 -06:00
parent 0728eb468f
commit 68a1e749dc
Signed by: michael
GPG key ID: BDA47A31A3C8EE6B
11 changed files with 2527 additions and 1 deletions

1
.envrc Normal file
View file

@ -0,0 +1 @@
use flake

1
.gitignore vendored
View file

@ -1,2 +1,3 @@
.DS_Store
/target
.direnv

2
assignments/04/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
dataset/large_cpd.txt
km_cuda

View file

@ -0,0 +1,235 @@
Introduction
The purpose of this assignment is for you to become familiar with GPU programming and the CUDA API by parallelizing a common algorithm in data mining: k-means clustering.
You need to write a program called km_cuda that will accept the following parameters as input: a filename of data points, the number of clusters, the number of thread blocks, and the number of threads per block. Your program should read in the specified file, cluster the given data points, and output the cluster assignments.
K-Means Clustering Algorithm
The k-means clustering algorithm clusters N data points into K clusters. Each cluster is characterized by a centroid, which is a point representing the average of all data points within the cluster. The algorithm proceeds as follows:
1. Select the initial K centroids.
a. For reproducibility, use points 0, 1, ..., K-1.
2. Assign each data point to the closest centroid (measured via Euclidean distance).
3. While not converged:
a. Update each centroid to be the average coordinate of all contained data points.
b. Assign all data points to the closest centroid (measured via Euclidean distance).
Convergence is detected when no data points change their cluster assignment, or if the maximum number of iterations have been executed. For this assignment, you should set the maximum number of iterations to twenty. Additional material on the k-means algorithm can be found in K-Means.pdf
Download K-Means.pdf.
Input/Output Formats
Each program will take as input one file (the list of data points), and output two files ("clusters.txt": the cluster assignments and "centroids.txt": the centers of each cluster).
The input file contains N+1 lines. The first line contains two space-separated integers: the number of data points (N), and the dimensionality of each data point (D). The following N lines each contain D space-separated floating-point numbers which represent the coordinates of the current data point. Each floating-point number contains at least one digit after the decimal point. For example, an input with four two-dimensional data points would be stored in a file as:
4 2
502.1 505.9
504.0 489.4
515.2 514.7
496.7 498.3
The output file cluster assignments must be called clusters.txt and contain N lines. Each line should contain a single zero-indexed integer which specifies the cluster that the current data point belongs to. For example, a clustering of the above input file into two clusters may look like:
0
0
1
0
The second output file, centroids.txt, should follow the same format as the input data file. It should contain K data points, one for each cluster. Each coordinate should be written with at least three digits after the decimal point.
Your program must also print the clustering time to standard out. You should use a high-precision, monotonic, wall-clock timer and also omit the time spent reading and writing to files. We recommend the function clock_gettime() when on a Linux system. Here is a function that you may use for timing:
```c
/* Gives us high-resolution timers. */
#define _POSIX_C_SOURCE 199309L
#include <time.h>
/* OSX timer includes */
#ifdef __MACH__
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
/**
* @brief Return the number of seconds since an unspecified time (e.g., Unix
* epoch). This is accomplished with a high-resolution monotonic timer,
* suitable for performance timing.
*
* @return The number of seconds.
*/
static inline double monotonic_seconds() {
#ifdef __MACH__
/* OSX */
static mach_timebase_info_data_t info;
static double seconds_per_unit;
if(seconds_per_unit == 0) {
mach_timebase_info(&info);
seconds_per_unit = (info.numer / info.denom) / 1e9;
}
return seconds_per_unit * mach_absolute_time();
#else
/* Linux systems */
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec + ts.tv_nsec * 1e-9;
#endif
}
```
You should use the following function to output your clustering time:
```c
/**
* @brief Output the seconds elapsed while clustering.
*
* @param seconds Seconds spent on k-means clustering, excluding IO.
*/
static void print_time(double const seconds)
{
printf("k-means clustering time: %0.04fs\n", seconds);
}
```
Timing information should be the ONLY thing printed to standard out.
Failure to follow any of these output instructions will result in significant loss of points.
Testing
Test inputs can be found in /export/scratch/csci5451_f23/hw4_data on any of the cuda lab machines. We provide two test files: small_gaussian.txt and large_cpd.txt. The former is a small two-dimensional dataset that you should use for testing the correctness of your code.
The cuda machines are located at csel-cuda-0{1..5}.cselabs.umn.edu. You must first load the cuda modules with the following commands before using nvcc.
module load soft/cuda/local
module initadd soft/cuda/local
If the command 'module' cannot be found. Add the following lines into your ~/.bashrc file.
```bash
MODULESINIT="/usr/local/modules-tcl/init/bash"
if [ -f $MODULESINIT ]; then
. $MODULESINIT
module load java system soft/ocaml soft/cuda/local
fi
unset MODULESINIT
```
After adding, run
```
source ~/.bashrc
```
More info on testing/using the cuda machines can be found at Parallel Lab Systems.
You should only run your code on one of the cuda machines. You will use the last digit of your student ID to select the machine. ID's ending in {0, 5} should use cuda01, ID's ending in {1, 6} should use cuda02, ID's ending in {3, 7} should use cuda03, ID's ending in {4, 8} should use cuda04, and ID's ending in {5, 9} should use cuda05.
We provide a short script for plotting clustered 2D data (such as small_gaussian.txt). Download and extract plot_clusters.tar.gz
Download plot_clusters.tar.gz. We provide plotting scripts and small_gaussian.txt inside the package. The plotting scripts rely on Octave, an open source alternative to Matlab. The Octave package can be loaded via 'module load math/octave'. Cluster the data with two clusters, place your files centroids.txt and clusters.txt inside of the plot/ directory, and run:
$ ./plot.sh data/small_gaussian.txt
If your clustering is correct, you should see a plot similar to the one below. Data points inside the same cluster are colored the same. Centroids are marked clearly in the center of each cluster.
clusters.png
For the longer running tests, you should look into CUDA compiler optimization flags as well as the screen
Links to an external site. command as these will be helpful.
Remember that the TA will be evaluating your data with a different data sets than those provided for testing.
What you need to turn in
Download What you need to turn in
The source code of your programs.
A short report including the following parts:
A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
Give details about how many elements and how the computations in your kernels are handled by a thread.
Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the #threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
Remember, speed counts. Programs that fail to use the GPU efficiently will lose significant points.
Do NOT include the test files; TAs will have their own test files for grading. You will lose significant points for including test files.
Additional specifications related to assignment submission
A makefile must be provided to compile and generate the executable. The executable should be named:
km_cuda
Program invocation: Your programs should take as an argument the input file to be read from, the number of clusters, the number of thread blocks, and the number of threads per block to be used for parallel execution.
For example, with 64 thread blocks and 128 threads, your program would be invoked as follows:
./km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 64 128
NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks during invocation. Example:
km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 -1 128
All files (code + report) MUST be in a single directory and the directory's name MUST be your UMN login ID (e.g., your UMN email or Moodle username). Your submission directory MUST include at least the following files (other auxiliary files may also be included):
<UMN ID>/km_cuda.c
<UMN ID>/Makefile
<UMN ID>/report.pdf
If you choose to code in C++, then replace the .c suffixes with .cpp or .cc.
Submission MUST be in .tar.gz
The following sequence of commands should work on your submission file:
tar xzvf <UMN ID>.tar.gz
cd <UMN ID>
make
ls -ld km_cuda
This ensures that your submission is packaged correctly, your directory is named correctly, your makefile works correctly, and your output executables are named correctly. If any of these does not work, modify it so that you do not lose points. The TAs can answer questions about correctly formatting your submission BEFORE the assignment is due. Do not expect them to answer questions the night it is due.
Failure to follow any of these submission instructions will result in significant loss of points.
Evaluation criteria
The goal for this assignment is for you to become familiar with the APIs and not so much for developing the most efficient parallel program (this will be done later). As such, full points will be given to the programs that:
follows the assignment directions;
solve the problem correctly;
do so in parallel (i.e., both clustering sub-steps are parallelized);
The speedups obtained will probably depend on the size of the input file. It is not expected that you to get good speedups for small files, but you should be able to get speedups for large files.

7
assignments/04/Makefile Normal file
View file

@ -0,0 +1,7 @@
.PHONY: clean
km_cuda: km_cuda.cu
nvcc -g -o $@ $<
clean:
rm -f km_cuda

File diff suppressed because it is too large Load diff

105
assignments/04/km_cuda.cu Normal file
View file

@ -0,0 +1,105 @@
// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#define CUDACHECK(err) \
do { \
cuda_check((err), __FILE__, __LINE__); \
} while (false)
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
if (error_code != cudaSuccess) {
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
cudaGetErrorString(error_code), file, line);
fflush(stderr);
exit(error_code);
}
}
__global__ void findDistanceToCentroid(float *centroidDistances, float *data) {
centroidDistances[blockIdx.x] = blockIdx.x;
}
int main(int argc, char **argv) {
int runtimeVersion, driverVersion;
cudaRuntimeGetVersion(&runtimeVersion);
cudaDriverGetVersion(&driverVersion);
printf("Runtime Version: %d, Driver Version: %d\n", runtimeVersion,
driverVersion);
char *data_file = argv[1];
int num_clusters = atoi(argv[2]);
int num_thread_blocks = atoi(argv[3]);
int num_threads_per_block = atoi(argv[4]);
int N, dim;
float *centroids, *data, *centroidDistances;
int *clusterMap;
#pragma region Read in data
{
FILE *fp = fopen(data_file, "r");
// Read first line
size_t n;
char *line = NULL;
if (!getline(&line, &n, fp))
return -1;
sscanf(line, "%d %d", &N, &dim);
free(line);
line = NULL;
// Allocate memory on the GPU
CUDACHECK(
cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
cudaMalloc((void **)&clusterMap, N * sizeof(int));
cudaMalloc((void **)&data, N * dim * sizeof(float));
cudaMalloc((void **)&centroidDistances, N * num_clusters * sizeof(float));
// Read the rest of the lines
{
// Buffer for copying
int *currentLine = (int *)malloc(dim * sizeof(int));
for (int i = 0; i < N; ++i) {
if (!getline(&line, &n, fp))
return -1;
for (int j = 0; j < dim; ++j)
sscanf(line, "%d", &currentLine[j]);
cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
cudaMemcpyHostToDevice);
}
free(currentLine);
}
printf("Done copying.\n");
fclose(fp);
}
#pragma endregion
#pragma region Select the initial K centroids
{
cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
cudaMemcpyDeviceToDevice);
}
#pragma endregion
#pragma region Assign each data point to the closest centroid, \
measured via Euclidean distance.
{
findDistanceToCentroid<<<10, 10>>>(centroidDistances, data);
cudaDeviceSynchronize();
printf("Shiet\n");
float wtf[10];
cudaMemcpy(wtf, centroidDistances, 10 * sizeof(float),
cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; ++i) {
printf("asdf %d %f\n", i, wtf[i]);
}
}
#pragma endregion
return 0;
}

10
assignments/04/plot.sh Executable file
View file

@ -0,0 +1,10 @@
#!/bin/bash
if [ "$#" -ne 1 ]; then
echo "usage: ./plot.sh <data file>";
echo " NOTE: 'clusters.txt' must be in working directory.";
exit 1;
fi
octave-cli --eval "plot_clusters2D('$1', 'clusters.txt', 'centroids.txt')"

View file

@ -0,0 +1,23 @@
function plot_clusters2D(points_file, clusters_file, centroids_file)
X = load(points_file);
X(1,:) = []; % remove metadata
clusters = load(clusters_file);
centroids = load(centroids_file);
centroids(1,:) = []; % remove metadata
f = figure();
hold on
nclusters = size(centroids,1);
for c=1:nclusters
points = X(clusters == c-1, :);
scatter(points(:,1), points(:,2));
scatter(centroids(c,1), centroids(c,2), '+k', 'LineWidth', 5, 'SizeData', 100);
end
uiwait(f);

74
flake.lock Normal file
View file

@ -0,0 +1,74 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1701680307,
"narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "4022d587cbbfd70fe950c1e2083a02621806a725",
"type": "github"
},
"original": {
"id": "flake-utils",
"type": "indirect"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1663551060,
"narHash": "sha256-e2SR4cVx9p7aW/XnVsGsWZBplApA9ZJUjc0fejJhnYo=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "8a5b9ee7b7a2b38267c9481f5c629c015108ab0d",
"type": "github"
},
"original": {
"id": "nixpkgs",
"type": "indirect"
}
},
"nixpkgsUnstable": {
"locked": {
"lastModified": 1702237358,
"narHash": "sha256-PagQSuIdXAueAaAujhtqecP2sjXSYDdYfp2UVwqbkP8=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "7eb0ff576d1bde14a3353ef85f8fba6fd57d32c7",
"type": "github"
},
"original": {
"owner": "nixos",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"nixpkgsUnstable": "nixpkgsUnstable"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

19
flake.nix Normal file
View file

@ -0,0 +1,19 @@
{
inputs.nixpkgsUnstable.url = "github:nixos/nixpkgs";
outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs { inherit system; };
pkgsUnstable = import nixpkgsUnstable {
inherit system;
config.allowUnfreePredicate = pkg:
builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
};
in {
devShell = pkgs.mkShell {
packages = (with pkgs; [ clang-tools gdb octave ])
++ (with pkgsUnstable.cudaPackages_12; [ cudatoolkit ]);
};
});
}