shiet
This commit is contained in:
parent
68a1e749dc
commit
d8f5fe13b6
3 changed files with 134 additions and 22 deletions
|
@ -14,8 +14,66 @@ inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void findDistanceToCentroid(float *centroidDistances, float *data) {
|
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
||||||
centroidDistances[blockIdx.x] = blockIdx.x;
|
float *centroidDistances, float *data,
|
||||||
|
float *centroids) {
|
||||||
|
int t = blockIdx.x; // data index
|
||||||
|
int c = threadIdx.x; // cluster index
|
||||||
|
|
||||||
|
float sum = 0;
|
||||||
|
for (int d = 0; d < dim; ++d) {
|
||||||
|
float delta = data[t * dim + d] - centroids[c * dim + d];
|
||||||
|
sum += delta * delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
centroidDistances[t * K + c] = sqrt(sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
|
||||||
|
float *centroidDistances,
|
||||||
|
int *clusterMap) {
|
||||||
|
int t = blockIdx.x;
|
||||||
|
int minIdx = 0;
|
||||||
|
float minValue = INFINITY;
|
||||||
|
|
||||||
|
for (int c = 0; c < K; ++c) {
|
||||||
|
float dist = centroidDistances[t * K + c];
|
||||||
|
if (dist < minValue) {
|
||||||
|
minValue = dist;
|
||||||
|
minIdx = c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
|
||||||
|
int oldMinIdx = clusterMap[t];
|
||||||
|
clusterMap[t] = minIdx;
|
||||||
|
if (oldMinIdx != minIdx) {
|
||||||
|
atomicOr(dirtyBit, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
|
||||||
|
float *centroids, int *clusterMap,
|
||||||
|
unsigned int *clusterCount) {
|
||||||
|
int t = blockIdx.x; // data index
|
||||||
|
int c = threadIdx.x; // cluster index
|
||||||
|
int assignedCluster = clusterMap[t];
|
||||||
|
|
||||||
|
if (assignedCluster != c)
|
||||||
|
return;
|
||||||
|
|
||||||
|
atomicAdd((unsigned int *)&clusterCount[c], 1);
|
||||||
|
for (int d = 0; d < dim; ++d) {
|
||||||
|
atomicAdd(¢roids[c * dim + d], data[t * dim + d]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
||||||
|
unsigned int *clusterCount) {
|
||||||
|
int c = threadIdx.x; // cluster index
|
||||||
|
for (int d = 0; d < dim; ++d) {
|
||||||
|
centroids[c * dim + d] /= clusterCount[c];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
@ -31,8 +89,11 @@ int main(int argc, char **argv) {
|
||||||
int num_threads_per_block = atoi(argv[4]);
|
int num_threads_per_block = atoi(argv[4]);
|
||||||
|
|
||||||
int N, dim;
|
int N, dim;
|
||||||
float *centroids, *data, *centroidDistances;
|
float *centroids, // centroids[cluster][dimension]
|
||||||
int *clusterMap;
|
*data, // data[t][dimension]
|
||||||
|
*centroidDistances; // centroidDistances[t][cluster]
|
||||||
|
int *clusterMap, *dirtyBit;
|
||||||
|
unsigned int *clusterCount;
|
||||||
|
|
||||||
#pragma region Read in data
|
#pragma region Read in data
|
||||||
{
|
{
|
||||||
|
@ -51,23 +112,31 @@ int main(int argc, char **argv) {
|
||||||
// Allocate memory on the GPU
|
// Allocate memory on the GPU
|
||||||
CUDACHECK(
|
CUDACHECK(
|
||||||
cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float)));
|
cudaMalloc((void **)¢roids, num_clusters * dim * sizeof(float)));
|
||||||
cudaMalloc((void **)&clusterMap, N * sizeof(int));
|
CUDACHECK(cudaMalloc((void **)&clusterMap, N * sizeof(int)));
|
||||||
cudaMalloc((void **)&data, N * dim * sizeof(float));
|
CUDACHECK(cudaMallocManaged((void **)&clusterCount,
|
||||||
cudaMalloc((void **)¢roidDistances, N * num_clusters * sizeof(float));
|
num_clusters * sizeof(unsigned int)));
|
||||||
|
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
|
||||||
|
CUDACHECK(cudaMalloc((void **)¢roidDistances,
|
||||||
|
N * num_clusters * sizeof(float)));
|
||||||
|
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
|
||||||
|
|
||||||
|
// Initialize all the cluster mappings to -1 so the first iteration is
|
||||||
|
// always fully dirty
|
||||||
|
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
|
||||||
|
|
||||||
// Read the rest of the lines
|
// Read the rest of the lines
|
||||||
{
|
{
|
||||||
// Buffer for copying
|
// Buffer for copying
|
||||||
int *currentLine = (int *)malloc(dim * sizeof(int));
|
float *currentLine = (float *)malloc(dim * sizeof(float));
|
||||||
for (int i = 0; i < N; ++i) {
|
for (int i = 0; i < N; ++i) {
|
||||||
if (!getline(&line, &n, fp))
|
if (!getline(&line, &n, fp))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
for (int j = 0; j < dim; ++j)
|
for (int j = 0; j < dim; ++j)
|
||||||
sscanf(line, "%d", ¤tLine[j]);
|
sscanf(line, "%f", ¤tLine[j]);
|
||||||
|
|
||||||
cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
|
CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
|
||||||
cudaMemcpyHostToDevice);
|
cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
free(currentLine);
|
free(currentLine);
|
||||||
}
|
}
|
||||||
|
@ -80,24 +149,59 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
#pragma region Select the initial K centroids
|
#pragma region Select the initial K centroids
|
||||||
{
|
{
|
||||||
cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
||||||
cudaMemcpyDeviceToDevice);
|
cudaMemcpyDeviceToDevice));
|
||||||
}
|
}
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
#pragma region Assign each data point to the closest centroid, \
|
#pragma region Assign each data point to the closest centroid, \
|
||||||
measured via Euclidean distance.
|
measured via Euclidean distance.
|
||||||
{
|
{
|
||||||
findDistanceToCentroid<<<10, 10>>>(centroidDistances, data);
|
findDistanceToCentroid<<<N, num_clusters>>>(
|
||||||
cudaDeviceSynchronize();
|
N, num_clusters, dim, centroidDistances, data, centroids);
|
||||||
printf("Shiet\n");
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
float wtf[10];
|
*dirtyBit = 0;
|
||||||
cudaMemcpy(wtf, centroidDistances, 10 * sizeof(float),
|
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
||||||
cudaMemcpyDeviceToHost);
|
centroidDistances, clusterMap);
|
||||||
for (int i = 0; i < 10; ++i) {
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
printf("asdf %d %f\n", i, wtf[i]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printf("Is dirty: %d\n", *dirtyBit);
|
||||||
|
#pragma endregion
|
||||||
|
|
||||||
|
#pragma region
|
||||||
|
int it = 0;
|
||||||
|
while (*dirtyBit) {
|
||||||
|
printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
|
||||||
|
|
||||||
|
// Update each centroid to be the average coordinate of all contained data
|
||||||
|
// points
|
||||||
|
CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
|
||||||
|
CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
|
||||||
|
recentralizeCentroidSum<<<N, num_clusters>>>(
|
||||||
|
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
||||||
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
|
for (int i = 0; i < num_clusters; ++i) {
|
||||||
|
printf("%d ", clusterCount[i]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
||||||
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
// Assign all data points to the closest centroid (measured via Euclidean
|
||||||
|
// distance).
|
||||||
|
findDistanceToCentroid<<<N, num_clusters>>>(
|
||||||
|
N, num_clusters, dim, centroidDistances, data, centroids);
|
||||||
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
*dirtyBit = 0;
|
||||||
|
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
||||||
|
centroidDistances, clusterMap);
|
||||||
|
CUDACHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
it++;
|
||||||
}
|
}
|
||||||
#pragma endregion
|
#pragma endregion
|
||||||
|
|
||||||
|
|
4
assignments/04/run.sh
Executable file
4
assignments/04/run.sh
Executable file
|
@ -0,0 +1,4 @@
|
||||||
|
set -euo pipefail
|
||||||
|
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
|
||||||
|
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
|
||||||
|
ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 512 64 128'
|
|
@ -4,9 +4,13 @@
|
||||||
outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
|
outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
|
||||||
flake-utils.lib.eachDefaultSystem (system:
|
flake-utils.lib.eachDefaultSystem (system:
|
||||||
let
|
let
|
||||||
pkgs = import nixpkgs { inherit system; };
|
pkgs = import nixpkgs {
|
||||||
|
inherit system;
|
||||||
|
config.cudaSupport = true;
|
||||||
|
};
|
||||||
pkgsUnstable = import nixpkgsUnstable {
|
pkgsUnstable = import nixpkgsUnstable {
|
||||||
inherit system;
|
inherit system;
|
||||||
|
config.cudaSupport = true;
|
||||||
config.allowUnfreePredicate = pkg:
|
config.allowUnfreePredicate = pkg:
|
||||||
builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
|
builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue