This commit is contained in:
parent 9f6a80a09f
commit 18b1c2b6bc
3 changed files with 26 additions and 11 deletions
@@ -190,7 +190,7 @@ int main(int argc, char **argv) {
 #pragma region Iteration
   int it = 0;
-  while (*dirtyBit) {
+  for (int it = 0; it < 20 && *dirtyBit; ++it) {
     printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
 
     // Update each centroid to be the average coordinate of all contained data
 
@@ -219,8 +219,6 @@ int main(int argc, char **argv) {
     assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                     centroidDistances, clusterMap);
     CUDACHECK(cudaDeviceSynchronize());
 
-    it++;
-
   }
 #pragma endregion
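For context, the change above caps k-means at 20 iterations while keeping the early exit once no assignment changes. A minimal sketch of that pattern, assuming `dirtyBit` lives in CUDA managed memory so the host can read it between launches (the kernel body is a stand-in, not the project's actual code):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in kernel: a real assignment kernel would set *dirtyBit = 1
// whenever some point switches clusters.
__global__ void stepOnce(int *dirtyBit) { *dirtyBit = 0; }

int main() {
  int *dirtyBit;
  cudaMallocManaged(&dirtyBit, sizeof(int));  // visible to host and device
  *dirtyBit = 1;                              // force the first iteration

  // At most 20 iterations, but stop early once an iteration changes nothing.
  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
    stepOnce<<<1, 1>>>(dirtyBit);
    cudaDeviceSynchronize();  // host must sync before reading *dirtyBit again
  }
  cudaFree(dirtyBit);
  return 0;
}
```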
@@ -2,7 +2,7 @@
 
 Michael Zhang \<zhan4854\@umn.edu\>
 
-1. A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
+1. *A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.*
 
    My parallelized program included the following procedures:
 
@@ -16,12 +16,22 @@ Michael Zhang \<zhan4854\@umn.edu\>
 
    I tried to make sure every thread is computing approximately one single for-loop's worth of data, most of the time over the $d$ axis.
 
-2. Give details about how many elements and how the computations in your kernels are handled by a thread.
+2. *Give details about how many elements and how the computations in your kernels are handled by a thread.*
 
-   Threads are dynamically allocated
+   I used dynamic thread allocation based on the size of the data.
+
+   For most of the kernels, the computation is very simple: perform a row-reduction into a different array. Since all the accesses are disjoint, I don't synchronize between threads.
+
+   However, for averaging the datapoints, I unfortunately need to run an $N \times K \times D$ operation that involves a sum reduction. I tried a tree-based approach after doing some bitwise math to avoid the conditional of whether a point is in the same class, but the plain approach is simpler and I did not get the tree-based one to work.
 
-3. Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
+3. *Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or 3D fashion in a thread-block, and whether the thread-blocks are arranged in a 1D, 2D, or 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.*
+
+   I used a 1D thread hierarchy. This is because all my accesses already run along the "good" axis, so I'm not doing any strides along other dimensions.
 
-4. You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
+4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.*
 
-5. You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
+5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.*
+
+   - 256: 26.8258s
+   - 512: 62.1212s
+   - 1024: 163.4022s
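The writeup's "row-reduction into a different array" pattern is easiest to see as a sketch: one thread owns one output row, so every write is disjoint and no synchronization or atomics are needed. The names and shapes below are illustrative assumptions, not the assignment's actual kernels:

```cuda
// Each thread reduces one row of an n x d matrix into out[row].
// 1D grid of 1D blocks, matching the writeup's 1D thread hierarchy.
__global__ void rowReduce(int n, int d, const float *data, float *out) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= n) return;          // guard the ragged last block
  float sum = 0.0f;
  for (int j = 0; j < d; ++j)    // roughly "one for-loop's worth" per thread
    sum += data[row * d + j];
  out[row] = sum;                // disjoint write: no atomics, no __syncthreads
}
```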
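Similarly, a plain (non-tree) version of the $N \times K \times D$ centroid-averaging step could look like the following: one thread per (cluster, dimension) pair scans all $N$ points, keeping the class conditional that the tree-based attempt tried to remove. Again a hypothetical sketch, not the submitted kernel:

```cuda
// One thread per (centroid c, coordinate j): total work is N * K * D.
__global__ void averageCentroids(int n, int k, int d, const float *data,
                                 const int *clusterMap, float *centroids) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= k * d) return;
  int c = idx / d;               // which centroid
  int j = idx % d;               // which coordinate
  float sum = 0.0f;
  int count = 0;
  for (int i = 0; i < n; ++i) {
    if (clusterMap[i] == c) {    // the branch a tree reduction would avoid
      sum += data[i * d + j];
      ++count;
    }
  }
  if (count > 0) centroids[c * d + j] = sum / count;  // empty cluster: keep old centroid
}
```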
@@ -1,5 +1,12 @@
 set -euo pipefail
 HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
 rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4
-ssh $HOST bash -c 'set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/small_gaussian.txt 2 64 128'
-rsync -azPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
+
+CLUSTERS=${1:-512}
+BLOCKS=-1
+THREADS=-1
+DATAFILE="large_cpd.txt"
+# DATAFILE="small_gaussian.txt"
+
+ssh $HOST bash -c "set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/$DATAFILE $CLUSTERS 64 128"
+rsync -qazPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
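The script sets BLOCKS=-1 and THREADS=-1 per the assignment's convention that -1 means "compute the launch dimensions at runtime". One plausible host-side interpretation of that sentinel (the 128-thread default and the ceiling division are assumptions, not taken from the actual km_cuda source):

```cuda
#include <cstdio>
#include <cstdlib>

// Resolve -1 sentinel values into concrete launch dimensions.
static void resolveLaunchDims(int n, int *blocks, int *threads) {
  if (*threads == -1) *threads = 128;                          // assumed default block size
  if (*blocks == -1) *blocks = (n + *threads - 1) / *threads;  // ceil(n / threads)
}

int main(int argc, char **argv) {
  int n = 100000;  // e.g. the number of data points, read from the dataset
  int blocks = (argc > 1) ? atoi(argv[1]) : -1;
  int threads = (argc > 2) ? atoi(argv[2]) : -1;
  resolveLaunchDims(n, &blocks, &threads);
  printf("would launch <<<%d, %d>>>\n", blocks, threads);
  return 0;
}
```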