Compare commits

12 commits: dfa2024001, 9ce233b671, 0b179f38bd, 18b1c2b6bc, 9f6a80a09f, 442638205c, d75da8de6d, 584a919ff1, d8f5fe13b6, 68a1e749dc, 0728eb468f, 5fe38262c5

24 changed files with 3340 additions and 164 deletions
.envrc (new file, +1)
@@ -0,0 +1 @@
+use flake
.gitignore (vendored)
@@ -1,2 +1,3 @@
 .DS_Store
 /target
+.direnv
Dockerfile
@@ -12,6 +12,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
   clang \
   curl \
   direnv \
+  gdb \
   git \
   libomp-dev \
   libopenmpi-dev \
@@ -21,6 +22,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
   pkg-config \
   python3 \
   python3-pip \
+  python3-venv \
   texlive-latex-base \
   texlive-latex-extra \
   valgrind \
@@ -31,6 +33,6 @@ COPY --from=typst /bin/typst /usr/bin/typst
 
 RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
 RUN /root/.cargo/bin/cargo install cargo-watch
-RUN /root/.cargo/bin/cargo install watchexec
+# RUN /root/.cargo/bin/cargo install watchexec-cli
 
 RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc
assignments/03/.envrc (new file, +3)
@@ -0,0 +1,3 @@
+layout python3
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
assignments/03/.gitignore (vendored)
@@ -2,4 +2,9 @@ lpa
 compile_commands.json
 .cache
 report.pdf
 *.tar.gz
+out.txt
+
+dataset/gen_*.txt
+.direnv
+labels.txt
assignments/03/Makefile
@@ -1,7 +1,7 @@
 .PHONY: run clean
 
-# CFLAGS += -O3
-CFLAGS += -DFMT_HEADER_ONLY -g
+CFLAGS += -g
+# CFLAGS += -DFMT_HEADER_ONLY -g
 # LDFLAGS += $(shell pkg-config --libs fmt)
 
 lpa: lpa.cpp Makefile
@@ -16,7 +16,7 @@ run:
 report.pdf: report.typ
 	typst compile $< $@
 
-zhan4854.tar.gz: Makefile ASSIGNMENT.md lpa.cpp report.pdf
+zhan4854.tar.gz: Makefile ASSIGNMENT.md lpa.cpp report.pdf dataset/gen2.py
 	mkdir -p zhan4854
 	cp $^ zhan4854
 	tar -czvf $@ zhan4854
assignments/03/bench.sh (new executable file, +7)
for dataset in $(echo "1000.txt" "10000.txt" "100000.txt" "1000000.txt"); do
  for processors in $(echo 1 2 4 8 16 | tr ' ' '\n'); do
    # file="dataset/both_$dataset"
    file="/export/scratch/CSCI5451_F23/assignment-3/dataset/$dataset"
    mpirun -n $processors ./lpa $file "graph_out/$dataset-$processors.txt" >> "stdout_out/$dataset-$processors.txt"
  done
done
assignments/03/dataset/gen2.py (new file, +25)
import igraph as ig
import random
import sys

try:
    N = int(sys.argv[1])
except:
    N = 1000

random.seed(0)
g = ig.Graph.Growing_Random(N, 5)
components = g.connected_components(mode='weak')
print(len(components))

with open(f"dataset/gen_{N}.txt", "w") as f:

    both_edges = []
    for edge in g.es:
        both_edges.append((edge.source, edge.target))
        both_edges.append((edge.target, edge.source))

    num_edges = len(both_edges)
    f.write(f"{N} {num_edges}\n")
    for v1, v2 in sorted(both_edges):
        f.write(f"{v1} {v2}\n")
assignments/03/lpa.cpp
@@ -14,22 +14,13 @@
 #include <unistd.h>
 #include <utility>
 
-#ifdef FMT_HEADER_ONLY
-#include <fmt/format.h>
-#include <fmt/ranges.h>
-#endif
+// #include <fmt/format.h>
+// #include <fmt/ranges.h>
 
 #define TAG_SEND_NUM_EDGES 1001
 #define TAG_SEND_EDGES 1002
 #define TAG_SEND_FINAL_RESULT 1003
 
-#define MIN(a, b) \
-  ({ \
-    __typeof__(a) _a = (a); \
-    __typeof__(b) _b = (b); \
-    _a < _b ? _a : _b; \
-  })
-
 typedef struct {
   int fst;
   int snd;
@@ -45,18 +36,18 @@ void pair_vector_init(struct pair_vector *);
 void pair_vector_clear(struct pair_vector *);
 void pair_vector_push(struct pair_vector *v, int fst, int snd);
 
-pair compute_node_range(int p, int total_num_nodes, int each_num_nodes,
-                        int process);
-int lookup_assignment(int *base_node_assignment, pair my_node_range,
-                      std::map<int, std::set<int>> recv_map, int *recvbuf,
-                      int *recv_counts, int *recv_displs, int each_num_nodes,
-                      int rank, int node_number);
+/**
+ * @brief Write a vector of labels to a file.
+ *
+ * @param filename The name of the file to write to.
+ * @param labels The array of labels.
+ * @param nlabels How many labels to write.
+ */
+static void print_labels(char *filename, int *labels, size_t const nlabels);
 
 int main(int argc, char **argv) {
-  MPI_Init(&argc, &argv);
-  int rank, p;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &p);
+  MPI::Init(argc, argv);
+  int rank = MPI::COMM_WORLD.Get_rank(), p = MPI::COMM_WORLD.Get_size();
 
   MPI_Datatype IntPairType;
   init_pair_type(&IntPairType);
@@ -71,15 +62,14 @@ int main(int argc, char **argv) {
   pair params;
 
   if (rank == 0) {
+    printf("Processors: %d, file: %s\n", p, argv[1]);
     fp = fopen(argv[1], "r");
-    // Read the first line
-    if (getline(&line, &len, fp) != -1)
+    if ((read = getline(&line, &len, fp)) != -1)
       sscanf(line, "%d %d", &params.fst, &params.snd);
   }
 
   // Send the params
-  MPI_Bcast(&params, 1, IntPairType, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&params, 1, IntPairType, 0, MPI::COMM_WORLD);
   int total_num_nodes = params.fst;
   int total_num_edges = params.snd;
   int each_num_nodes = total_num_nodes / p;
@@ -89,9 +79,12 @@ int main(int argc, char **argv) {
       rank == p - 1 ? total_num_nodes - rank * each_num_nodes : each_num_nodes;
   int my_nodes[num_my_nodes];
 
-  pair node_ranges[p];
-  for (int i = 0; i < p; ++i)
-    node_ranges[i] = compute_node_range(p, total_num_nodes, each_num_nodes, i);
+  std::function<std::pair<int, int>(int)> node_range =
+      [p, total_num_nodes, each_num_nodes](int process) {
+        int start = process * each_num_nodes;
+        int end = process == p - 1 ? total_num_nodes : start + each_num_nodes;
+        return std::make_pair(start, end);
+      };
 
   // Read the edges
   int num_my_edges;
@@ -105,31 +98,30 @@ int main(int argc, char **argv) {
 
     // For the current process, what's the last node we're expecting to see?
     int current_process = 0;
-    pair current_node_range = node_ranges[current_process];
+    std::pair<int, int> current_node_range = node_range(current_process);
    int edge_counter = 0;
 
    for (int i = 0; i < total_num_edges; ++i) {
-      if (getline(&line, &len, fp) == -1)
-        break;
+      getline(&line, &len, fp);
 
      int fst, snd;
      sscanf(line, "%d %d", &fst, &snd);
 
-      if (fst >= current_node_range.snd) {
+      if (fst >= current_node_range.second) {
        if (current_process == 0) {
          num_my_edges = edge_counter;
          my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
          memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
        } else {
          MPI_Send(&edge_counter, 1, MPI_INT, current_process,
-                   TAG_SEND_NUM_EDGES, MPI_COMM_WORLD);
+                   TAG_SEND_NUM_EDGES, MPI::COMM_WORLD);
          MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
-                   TAG_SEND_EDGES, MPI_COMM_WORLD);
+                   TAG_SEND_EDGES, MPI::COMM_WORLD);
        }
 
        // We're starting on the next process
        current_process += 1;
-        current_node_range = node_ranges[current_process];
+        current_node_range = node_range(current_process);
        edge_counter = 0;
        pair_vector_clear(&all_edges);
      }
@@ -140,18 +132,33 @@ int main(int argc, char **argv) {
 
    // We have to send the last one again here, since it didn't get caught in
    // the loop above
-    MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
-             MPI_COMM_WORLD);
-    MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
-             TAG_SEND_EDGES, MPI_COMM_WORLD);
-
-    free(all_edges.ptr);
+    if (current_process == 0) {
+      num_my_edges = edge_counter;
+      my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
+      memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
+    } else {
+      MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
+               MPI::COMM_WORLD);
+      MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
+               TAG_SEND_EDGES, MPI::COMM_WORLD);
+    }
  } else {
-    MPI_Recv(&num_my_edges, 1, MPI_INT, 0, TAG_SEND_NUM_EDGES, MPI_COMM_WORLD,
+    MPI_Recv(&num_my_edges, 1, MPI_INT, 0, TAG_SEND_NUM_EDGES, MPI::COMM_WORLD,
             NULL);
    my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
    MPI_Recv(my_edges, num_my_edges, IntPairType, 0, TAG_SEND_EDGES,
-             MPI_COMM_WORLD, NULL);
+             MPI::COMM_WORLD, NULL);
+  }
+
+  char *buf = (char *)calloc(sizeof(char), 1000);
+  int offset = 0; // Keep track of the current position in the buffer
+  for (int i = 0; i < std::min(num_my_edges, 5); i++) {
+    offset +=
+        sprintf(buf + offset, "(%d, %d)", my_edges[i].fst, my_edges[i].snd);
+    if (i < len - 1) {
+      // Add a separator (e.g., comma or space) if it's not the last
+      offset += sprintf(buf + offset, " ");
+    }
  }
 
  if (rank == 0) {
@@ -162,21 +169,18 @@ int main(int argc, char **argv) {
 #pragma endregion
 
  // STEP 2 TIMER STARTS HERE
-  MPI_Barrier(MPI_COMM_WORLD);
-  double step_2_start_time;
-  if (rank == 0)
-    step_2_start_time = MPI_Wtime();
+  MPI::COMM_WORLD.Barrier();
+  double step_2_start_time = MPI::Wtime();
 
 // Each process analyzes the non-local edges that are contained in its portion
 // of the graph.
 #pragma region
-  int node_label_assignment_vec[num_my_nodes];
-  // std::map<int, int> node_label_assignment;
-  pair my_node_range = node_ranges[rank];
+  std::map<int, int> node_label_assignment;
+  std::pair<int, int> my_node_range = node_range(rank);
 
  // Initial node assignment
-  for (int idx = 0; idx < num_my_nodes; ++idx) {
-    node_label_assignment_vec[idx] = my_node_range.fst + idx;
+  for (int i = my_node_range.first; i < my_node_range.second; ++i) {
+    node_label_assignment[i] = i;
  }
 
  std::map<int, std::set<int>> adj;
@@ -187,12 +191,12 @@ int main(int argc, char **argv) {
    pair edge = my_edges[i];
    adj[edge.fst].insert(edge.snd);
 
-    if (!(my_node_range.fst <= edge.fst && edge.fst < my_node_range.snd)) {
+    if (!(my_node_range.first <= edge.fst && edge.fst < my_node_range.second)) {
      non_local_nodes.insert(edge.fst);
      non_local_edges.insert(std::make_pair(edge.snd, edge.fst));
    }
 
-    if (!(my_node_range.fst <= edge.snd && edge.snd < my_node_range.snd)) {
+    if (!(my_node_range.first <= edge.snd && edge.snd < my_node_range.second)) {
      non_local_nodes.insert(edge.snd);
      non_local_edges.insert(std::make_pair(edge.fst, edge.snd));
    }
@@ -208,28 +212,26 @@ int main(int argc, char **argv) {
  for (auto entry : non_local_edges) {
    int local_node = entry.first, remote_node = entry.second;
 
-    int remote_process = remote_node / each_num_nodes;
+    int corresponding_process = remote_node / each_num_nodes;
    // The last process gets some extra nodes
-    if (remote_process >= p)
-      remote_process = p - 1;
+    if (corresponding_process >= p)
+      corresponding_process = p - 1;
 
-    send_map[remote_process].insert(local_node);
-    recv_map[remote_process].insert(remote_node);
+    send_map[corresponding_process].insert(local_node);
+    recv_map[corresponding_process].insert(remote_node);
  }
 #pragma endregion
 
 // All the processes are communicating to figure out which process needs to
 // send what data to the other processes.
 #pragma region
+  // Nothing needs to be done here, I'm using the fact that everything is sent
+  // in sorted order to ensure that both sides are referring to the same thing
 #pragma endregion
 
  // STEP 5 TIMER STARTS HERE
-  MPI_Barrier(MPI_COMM_WORLD);
-  double step_5_start_time;
-  if (rank == 0) {
-    step_5_start_time = MPI_Wtime();
-    printf("STARTING STEP 5: %0.04fs\n", step_5_start_time - step_2_start_time);
-  }
+  MPI::COMM_WORLD.Barrier();
+  double step_5_start_time = MPI::Wtime();
 
  // The processes perform the transfers of non-local labels and updates of
  // local labels until convergence.
@@ -247,9 +249,9 @@ int main(int argc, char **argv) {
    int offset = 0;
    for (int i = 0; i < p; ++i) {
      int count = send_map[i].size();
-      for (auto local_node : send_map[i]) {
-        sendbuf.push_back(
-            node_label_assignment_vec[local_node - my_node_range.fst]);
+      // std::sort(send_map[i].begin(), send_map[i].end());
+      for (auto k : send_map[i]) {
+        sendbuf.push_back(node_label_assignment[k]);
      }
      send_counts.push_back(count);
      send_displs.push_back(offset);
@@ -259,6 +261,7 @@ int main(int argc, char **argv) {
    offset = 0;
    for (int i = 0; i < p; ++i) {
      int count = recv_map[i].size();
+      // std::sort(recv_map[i].begin(), recv_map[i].end());
      recv_counts.push_back(count);
      recv_displs.push_back(offset);
      offset += count;
@@ -267,25 +270,29 @@ int main(int argc, char **argv) {
    }
 
    std::vector<int> recvbuf(recv_total, 0);
-    MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
-                  MPI_INT, recvbuf.data(), recv_counts.data(),
-                  recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
+    MPI::COMM_WORLD.Alltoallv(sendbuf.data(), send_counts.data(),
+                              send_displs.data(), MPI_INT, recvbuf.data(),
+                              recv_counts.data(), recv_displs.data(), MPI_INT);
+
+    std::map<int, int> total_node_label_assignment(node_label_assignment);
+    for (int i = 0; i < p; ++i) {
+      std::vector<int> ouais(recv_map[i].begin(), recv_map[i].end());
+      for (int j = 0; j < recv_counts[i]; ++j) {
+        int remote_node = ouais[j];
+        int remote_value = recvbuf[recv_displs[i] + j];
+        total_node_label_assignment[remote_node] = remote_value;
+      }
+    }
 
    // For each local node, determine the minimum label out of its neighbors
    std::map<int, int> new_labels;
-    for (int i = 0; i < num_my_nodes; ++i) {
-      int node = my_node_range.fst + i;
-
-      // int current_value = total_node_label_assignment[i];
-      int current_value = node_label_assignment_vec[i];
+    for (int i = my_node_range.first; i < my_node_range.second; ++i) {
+      int current_value = total_node_label_assignment[i];
      int min = current_value;
 
-      for (auto neighbor : adj[node]) {
-        int neighbor_value = lookup_assignment(
-            node_label_assignment_vec, my_node_range, recv_map, recvbuf.data(),
-            recv_counts.data(), recv_displs.data(), each_num_nodes, rank,
-            neighbor);
-        min = MIN(min, neighbor_value);
+      for (auto neighbor : adj[i]) {
+        if (total_node_label_assignment[neighbor] < min)
+          min = total_node_label_assignment[neighbor];
      }
 
      if (min < current_value) {
@@ -296,8 +303,8 @@ int main(int argc, char **argv) {
    // Have there been any changes in the labels?
    int num_changes = new_labels.size();
    int total_changes;
-    MPI_Allreduce(&num_changes, &total_changes, 1, MPI_INT, MPI_SUM,
-                  MPI_COMM_WORLD);
+    MPI::COMM_WORLD.Allreduce(&num_changes, &total_changes, 1, MPI_INT,
+                              MPI::SUM);
 
    if (total_changes == 0) {
      break;
@@ -305,19 +312,14 @@ int main(int argc, char **argv) {
 
    // Update the original node assignment
    for (auto entry : new_labels) {
-      node_label_assignment_vec[entry.first] = entry.second;
+      node_label_assignment[entry.first] = entry.second;
    }
-
-    if (rank == 0)
-      printf("total changes: %d\n", total_changes);
  }
 #pragma endregion
 
  // END TIMERS
-  MPI_Barrier(MPI_COMM_WORLD);
-  double end_time;
-  if (rank == 0)
-    end_time = MPI_Wtime();
+  MPI::COMM_WORLD.Barrier();
+  double end_time = MPI::Wtime();
 
  if (rank == 0) {
    printf("2-5 Time: %0.04fs\n", end_time - step_2_start_time);
@@ -329,34 +331,35 @@ int main(int argc, char **argv) {
 #pragma region
  if (rank == 0) {
    std::vector<int> all_assignments(total_num_nodes);
-    std::map<int, int> label_count;
+    // std::map<int, int> label_count;
    int ctr = 0;
-    for (int process_idx = 0; process_idx < p; ++process_idx) {
-      pair this_node_range = node_ranges[process_idx];
-      int count = this_node_range.snd - this_node_range.fst;
-      if (process_idx == 0) {
+    for (int i = 0; i < p; ++i) {
+      std::pair<int, int> this_node_range = node_range(i);
+      int count = this_node_range.second - this_node_range.first;
+      if (i == 0) {
        for (int j = 0; j < count; ++j) {
-          all_assignments[this_node_range.fst + j] =
-              node_label_assignment_vec[j];
-          label_count[all_assignments[this_node_range.fst + j]]++;
+          all_assignments[this_node_range.first + j] =
+              node_label_assignment[this_node_range.first + j];
+          // label_count[all_assignments[this_node_range.first + j]]++;
        }
      } else {
-        MPI_Recv(&all_assignments[this_node_range.fst], count, MPI_INT,
-                 process_idx, TAG_SEND_FINAL_RESULT, MPI_COMM_WORLD, NULL);
-        for (int j = 0; j < count; ++j) {
-          label_count[all_assignments[this_node_range.fst + j]]++;
-        }
+        MPI::COMM_WORLD.Recv(&all_assignments[this_node_range.first], count,
+                             MPI::INT, i, TAG_SEND_FINAL_RESULT);
      }
    }
 
-    std::cout << "Done! " << label_count.size() << std::endl;
+    print_labels(argv[2], all_assignments.data(), all_assignments.size());
  } else {
-    MPI_Send(node_label_assignment_vec, num_my_nodes, MPI_INT, 0,
-             TAG_SEND_FINAL_RESULT, MPI_COMM_WORLD);
+    std::vector<int> flat_assignments;
+    for (int i = my_node_range.first; i < my_node_range.second; ++i) {
+      flat_assignments.push_back(node_label_assignment[i]);
+    }
+    MPI::COMM_WORLD.Send(flat_assignments.data(), flat_assignments.size(),
+                         MPI::INT, 0, TAG_SEND_FINAL_RESULT);
  }
 #pragma endregion
 
-  MPI_Finalize();
+  MPI::Finalize();
  return 0;
 }
@@ -398,54 +401,20 @@ void pair_vector_push(struct pair_vector *v, int fst, int snd) {
  v->len++;
 }
 
-pair compute_node_range(int p, int total_num_nodes, int each_num_nodes,
-                        int process) {
-  int start = process * each_num_nodes;
-  int end = process == p - 1 ? total_num_nodes : start + each_num_nodes;
-  return {.fst = start, .snd = end};
-}
+static void print_labels(char *filename, int *labels, size_t const nlabels) {
+  size_t i;
+  FILE *fout;
 
-int lookup_assignment(int *base_node_assignment, pair my_node_range,
-                      std::map<int, std::set<int>> recv_map, int *recvbuf,
-                      int *recv_counts, int *recv_displs, int each_num_nodes,
-                      int rank, int node_number) {
-  int process_from = node_number / each_num_nodes;
-
-  // Just return from local if local
-  if (process_from == rank)
-    return base_node_assignment[node_number - my_node_range.fst];
-
-  int count = recv_counts[process_from];
-  int displs = recv_displs[process_from];
-
-  // Determine what index this node is
-  int index = -1, ctr = 0;
-  std::vector<int> inner(recv_map[process_from].begin(),
-                         recv_map[process_from].end());
-  // {
-  //   int lo = 0, hi = count;
-  //   while (lo < hi) {
-  //     int mid = (lo + hi) / 2;
-  //     int midk = inner[mid];
-  //     if (node_number < midk)
-  //       hi = mid;
-  //     else if (node_number > midk)
-  //       lo = mid;
-  //     else {
-  //       index = mid;
-  //       break;
-  //     }
-  //   }
-  // }
-  for (int i = 0; i < count; ++i) {
-    int remote_node = inner[i];
-    if (node_number == remote_node) {
-      index = ctr;
-      break;
-    }
-    ctr++;
+  /* open file */
+  if ((fout = fopen(filename, "w")) == NULL) {
+    fprintf(stderr, "error opening '%s'\n", filename);
+    abort();
  }
 
-  // Pull the corresponding value from the map
-  return recvbuf[recv_displs[process_from] + index];
-}
+  /* write labels to fout */
+  for (i = 0; i < nlabels; ++i) {
+    fprintf(fout, "%u\n", labels[i]);
+  }
+
+  fclose(fout);
+}
assignments/03/process.py (new file, +33)
import re

WTF = re.compile(r".*: (\d+),.*dataset/(\d+).txt")

by_size = dict()
with open("stdout.txt") as f:
    while True:
        line1 = f.readline().strip()
        if not line1: break
        m = WTF.match(line1)
        processors = int(m.group(1))
        size = int(m.group(2))

        if size not in by_size: by_size[size] = dict()

        line2 = f.readline().strip()
        line3 = f.readline().strip()

        time2 = line2.split(": ")[1]
        time5 = line3.split(": ")[1]

        if processors not in by_size[size]: by_size[size][processors] = (time2, time5)

print("#table(")
print("  columns: (auto, auto, auto, auto, auto, auto),")
columns = [1, 2, 4, 8, 16]
print("  [], ", ", ".join(map(lambda c: f"[{c}]", columns)), ",")
for size, entries in sorted(by_size.items()):
    print(f"  [{size}],")
    for processors, (time2, time5) in sorted(entries.items()):
        print(f"  [{time2} #linebreak() {time5}],", end = None)
    print()
print(")")
assignments/03/report.typ (new file, +79)

== Step 2-4

For steps 2-4, I calculated all of each process's outgoing nodes, sorted them,
and used each node's sorted position as a way to identify which nodes are being
sent.

This saves an extra communication and lets me index the same items on each
loop iteration.
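To make the sorted-position trick concrete, here is a minimal C++ sketch (not the code from lpa.cpp itself; the names are illustrative): because a `std::set` always iterates in ascending order, the j-th label packed by the sender corresponds to the j-th node in the set the receiver iterates, so no node ids need to be shipped alongside the labels.

```cpp
#include <map>
#include <set>
#include <vector>

// send_map[p] holds the local nodes that process p needs from us.
// Both sides build their sets from the same edge list, so sorted iteration
// order lines up on the sender and the receiver.
std::vector<int> pack_labels(const std::map<int, std::set<int>> &send_map,
                             const std::map<int, int> &labels, int p) {
  std::vector<int> sendbuf;
  for (int i = 0; i < p; ++i) {
    auto it = send_map.find(i);
    if (it == send_map.end())
      continue;
    for (int node : it->second)           // sorted iteration
      sendbuf.push_back(labels.at(node)); // position j == receiver's j-th node
  }
  return sendbuf;
}
```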
== Step 5

I exchanged data using the unstructured communication approach, doing an
all-to-all transfer.

To read the result efficiently, I tried using the approach given in the slides.
I also tried to use binary search since this would yield $log(n)$ time.
However, this was taking a long time (up to 45 seconds for the 10,000 case), and
it was the bottleneck. Using STL's `std::map` proved to be orders of magnitude
faster.

== Other remarks

On the original example dataset, it performed poorly with larger numbers of
processors. I have an explanation for this after looking at the performance
characteristics of the run: it completes in one iteration, in which every single
edge is assigned. The data distribution also indicates that almost everything is
connected to the first node, which isn't balanced.

I've written a generation script in Python using the `igraph` library.

- 1,000: 93 components
- 10,000: 947 components
- 100,000: 9,423 components
- 1,000,000: 92,880 components

Using this data, I was able to achieve much better speedup. I didn't attach the
actual data files, but they can be generated with the same script (seeded for
reproducibility).

*NOTE:* I noticed that afterwards, the data was changed again, with a more
balanced graph this time. So the numbers will not reflect the poorer
performance.
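Below is a compressed sketch of the step 5 exchange using the plain MPI C API (`MPI_Alltoallv`); the code in lpa.cpp uses the equivalent call from the C++ bindings. The helper name, buffer layout, and `recv_nodes` argument are illustrative, not the exact code.

```cpp
#include <mpi.h>
#include <map>
#include <vector>

// Exchange one label per boundary node with every other process, then fold
// the received labels into a map so that neighbor lookups are simple
// key-based accesses instead of searches over raw receive buffers.
void exchange_labels(const std::vector<int> &sendbuf,
                     const std::vector<int> &send_counts,
                     const std::vector<int> &send_displs,
                     const std::vector<int> &recv_counts,
                     const std::vector<int> &recv_displs,
                     const std::vector<int> &recv_nodes, // sorted remote node ids
                     std::map<int, int> &labels) {
  int recv_total = 0;
  for (int c : recv_counts)
    recv_total += c;
  std::vector<int> recvbuf(recv_total, 0);

  MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(), MPI_INT,
                recvbuf.data(), recv_counts.data(), recv_displs.data(), MPI_INT,
                MPI_COMM_WORLD);

  // recv_nodes[j] is the node whose label arrived at recvbuf[j].
  for (int j = 0; j < recv_total; ++j)
    labels[recv_nodes[j]] = recvbuf[j];
}
```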
== Timing on example dataset

This experiment was performed on CSELabs by using my bench script, and the table
was generated with another script.

#table(
  columns: (auto, auto, auto, auto, auto, auto),
  [], [1], [2], [4], [8], [16],
  [1000],
  [0.0249s #linebreak() 0.0151s],
  [0.0234s #linebreak() 0.0122s],
  [0.0206s #linebreak() 0.0099s],
  [0.0491s #linebreak() 0.0248s],
  [0.0177s #linebreak() 0.0106s],

  [10000],
  [0.2929s #linebreak() 0.1830s],
  [0.2933s #linebreak() 0.1540s],
  [0.2457s #linebreak() 0.1178s],
  [0.3793s #linebreak() 0.1328s],
  [0.2473s #linebreak() 0.1197s],

  [100000],
  [3.7888s #linebreak() 2.4881s],
  [3.7592s #linebreak() 2.0212s],
  [3.3819s #linebreak() 1.6036s],
  [2.9485s #linebreak() 1.3954s],
  [2.8593s #linebreak() 1.3107s],

  [1000000],
  [46.7895s #linebreak() 31.9648s],
  [45.2284s #linebreak() 24.8540s],
  [40.3994s #linebreak() 20.2851s],
  [36.9628s #linebreak() 17.6794s],
  [35.7110s #linebreak() 16.6276s],
)
@@ -1,5 +1,5 @@
 set pagination off
-run dataset/both_1000.txt
+run dataset/both_1000.txt out.txt
 bt
 frame 3
 print k
assignments/04/.gitignore (vendored, new file, +9)
dataset/large_cpd.txt
km_cuda

clusters.txt
centroids.txt
report.pdf
cluster.txt

zhan4854.tar.gz
assignments/04/ASSIGNMENT.md (new file, +235)

Introduction

The purpose of this assignment is for you to become familiar with GPU programming and the CUDA API by parallelizing a common algorithm in data mining: k-means clustering.

You need to write a program called km_cuda that will accept the following parameters as input: a filename of data points, the number of clusters, the number of thread blocks, and the number of threads per block. Your program should read in the specified file, cluster the given data points, and output the cluster assignments.

K-Means Clustering Algorithm

The k-means clustering algorithm clusters N data points into K clusters. Each cluster is characterized by a centroid, which is a point representing the average of all data points within the cluster. The algorithm proceeds as follows:

1. Select the initial K centroids.
   a. For reproducibility, use points 0, 1, ..., K-1.
2. Assign each data point to the closest centroid (measured via Euclidean distance).
3. While not converged:
   a. Update each centroid to be the average coordinate of all contained data points.
   b. Assign all data points to the closest centroid (measured via Euclidean distance).

Convergence is detected when no data points change their cluster assignment, or if the maximum number of iterations have been executed. For this assignment, you should set the maximum number of iterations to twenty. Additional material on the k-means algorithm can be found in K-Means.pdf.
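As a point of reference for the algorithm described above, here is a short sequential C++ sketch of one assignment pass. It is illustrative only; the assignment requires a CUDA implementation, and the names here are not part of the hand-out.

```cpp
#include <cmath>
#include <vector>

// Assign every point to its nearest centroid by squared Euclidean distance.
// Returns how many points changed cluster, which is the convergence signal.
int assign_points(const std::vector<float> &data,      // N x D, row-major
                  const std::vector<float> &centroids, // K x D, row-major
                  std::vector<int> &assignment, int N, int K, int D) {
  int changes = 0;
  for (int i = 0; i < N; ++i) {
    int best = 0;
    float best_dist = INFINITY;
    for (int k = 0; k < K; ++k) {
      float dist = 0;
      for (int d = 0; d < D; ++d) {
        float delta = data[i * D + d] - centroids[k * D + d];
        dist += delta * delta;
      }
      if (dist < best_dist) {
        best_dist = dist;
        best = k;
      }
    }
    if (assignment[i] != best)
      ++changes;
    assignment[i] = best;
  }
  return changes;
}
```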
Input/Output Formats

Each program will take as input one file (the list of data points), and output two files ("clusters.txt": the cluster assignments and "centroids.txt": the centers of each cluster).

The input file contains N+1 lines. The first line contains two space-separated integers: the number of data points (N), and the dimensionality of each data point (D). The following N lines each contain D space-separated floating-point numbers which represent the coordinates of the current data point. Each floating-point number contains at least one digit after the decimal point. For example, an input with four two-dimensional data points would be stored in a file as:

4 2
502.1 505.9
504.0 489.4
515.2 514.7
496.7 498.3

The output file of cluster assignments must be called clusters.txt and contain N lines. Each line should contain a single zero-indexed integer which specifies the cluster that the current data point belongs to. For example, a clustering of the above input file into two clusters may look like:

0
0
1
0

The second output file, centroids.txt, should follow the same format as the input data file. It should contain K data points, one for each cluster. Each coordinate should be written with at least three digits after the decimal point.
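For illustration, a minimal C++ reader for the input format described above (a hypothetical helper, not part of the assignment hand-out):

```cpp
#include <cstdio>
#include <vector>

// Read the header (N, D) and then N*D floats into a flat row-major buffer.
// Path and error handling are illustrative.
bool read_points(const char *path, int &N, int &D, std::vector<float> &data) {
  FILE *fp = fopen(path, "r");
  if (!fp)
    return false;
  if (fscanf(fp, "%d %d", &N, &D) != 2) {
    fclose(fp);
    return false;
  }
  data.resize((size_t)N * D);
  for (long i = 0; i < (long)N * D; ++i) {
    if (fscanf(fp, "%f", &data[i]) != 1) {
      fclose(fp);
      return false;
    }
  }
  fclose(fp);
  return true;
}
```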
Your program must also print the clustering time to standard out. You should use a high-precision, monotonic, wall-clock timer and also omit the time spent reading and writing to files. We recommend the function clock_gettime() when on a Linux system. Here is a function that you may use for timing:

```c
/* Gives us high-resolution timers. */
#define _POSIX_C_SOURCE 199309L
#include <time.h>

/* OSX timer includes */
#ifdef __MACH__
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif

/**
 * @brief Return the number of seconds since an unspecified time (e.g., Unix
 *        epoch). This is accomplished with a high-resolution monotonic timer,
 *        suitable for performance timing.
 *
 * @return The number of seconds.
 */
static inline double monotonic_seconds() {
#ifdef __MACH__
  /* OSX */
  static mach_timebase_info_data_t info;
  static double seconds_per_unit;
  if (seconds_per_unit == 0) {
    mach_timebase_info(&info);
    seconds_per_unit = (info.numer / info.denom) / 1e9;
  }
  return seconds_per_unit * mach_absolute_time();
#else
  /* Linux systems */
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
#endif
}
```

You should use the following function to output your clustering time:

```c
/**
 * @brief Output the seconds elapsed while clustering.
 *
 * @param seconds Seconds spent on k-means clustering, excluding IO.
 */
static void print_time(double const seconds)
{
  printf("k-means clustering time: %0.04fs\n", seconds);
}
```

Timing information should be the ONLY thing printed to standard out.

Failure to follow any of these output instructions will result in significant loss of points.

Testing

Test inputs can be found in /export/scratch/csci5451_f23/hw4_data on any of the cuda lab machines. We provide two test files: small_gaussian.txt and large_cpd.txt. The former is a small two-dimensional dataset that you should use for testing the correctness of your code.

The cuda machines are located at csel-cuda-0{1..5}.cselabs.umn.edu. You must first load the cuda modules with the following commands before using nvcc.

module load soft/cuda/local
module initadd soft/cuda/local

If the command 'module' cannot be found, add the following lines to your ~/.bashrc file.

```bash
MODULESINIT="/usr/local/modules-tcl/init/bash"

if [ -f $MODULESINIT ]; then
  . $MODULESINIT
  module load java system soft/ocaml soft/cuda/local
fi

unset MODULESINIT
```

After adding, run

```
source ~/.bashrc
```

More info on testing/using the cuda machines can be found at Parallel Lab Systems.

You should only run your code on one of the cuda machines. You will use the last digit of your student ID to select the machine. ID's ending in {0, 5} should use cuda01, ID's ending in {1, 6} should use cuda02, ID's ending in {3, 7} should use cuda03, ID's ending in {4, 8} should use cuda04, and ID's ending in {5, 9} should use cuda05.

We provide a short script for plotting clustered 2D data (such as small_gaussian.txt). Download and extract plot_clusters.tar.gz. We provide plotting scripts and small_gaussian.txt inside the package. The plotting scripts rely on Octave, an open source alternative to Matlab. The Octave package can be loaded via 'module load math/octave'. Cluster the data with two clusters, place your files centroids.txt and clusters.txt inside of the plot/ directory, and run:

$ ./plot.sh data/small_gaussian.txt

If your clustering is correct, you should see a plot similar to the one below. Data points inside the same cluster are colored the same. Centroids are marked clearly in the center of each cluster.

(Figure: clusters.png)

For the longer running tests, you should look into CUDA compiler optimization flags as well as the screen command, as these will be helpful.

Remember that the TA will be evaluating your code with different data sets than those provided for testing.

What you need to turn in

The source code of your programs.
A short report including the following parts:
- A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
- Give details about how many elements and how the computations in your kernels are handled by a thread.
- Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or 3D fashion in a thread-block, and whether the thread-blocks are arranged in a 1D, 2D, or 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
- You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the #threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
- You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.

Remember, speed counts. Programs that fail to use the GPU efficiently will lose significant points.

Do NOT include the test files; TAs will have their own test files for grading. You will lose significant points for including test files.

Additional specifications related to assignment submission

A makefile must be provided to compile and generate the executable. The executable should be named: km_cuda

Program invocation: Your programs should take as arguments the input file to be read from, the number of clusters, the number of thread blocks, and the number of threads per block to be used for parallel execution.

For example, with 64 thread blocks and 128 threads, your program would be invoked as follows:

./km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 64 128

NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks during invocation. Example:

km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 -1 128

All files (code + report) MUST be in a single directory and the directory's name MUST be your UMN login ID (e.g., your UMN email or Moodle username). Your submission directory MUST include at least the following files (other auxiliary files may also be included):

<UMN ID>/km_cuda.c
<UMN ID>/Makefile
<UMN ID>/report.pdf

If you choose to code in C++, then replace the .c suffixes with .cpp or .cc.

Submission MUST be in .tar.gz

The following sequence of commands should work on your submission file:

tar xzvf <UMN ID>.tar.gz
cd <UMN ID>
make
ls -ld km_cuda

This ensures that your submission is packaged correctly, your directory is named correctly, your makefile works correctly, and your output executables are named correctly. If any of these does not work, modify it so that you do not lose points. The TAs can answer questions about correctly formatting your submission BEFORE the assignment is due. Do not expect them to answer questions the night it is due.

Failure to follow any of these submission instructions will result in significant loss of points.

Evaluation criteria

The goal for this assignment is for you to become familiar with the APIs and not so much for developing the most efficient parallel program (this will be done later). As such, full points will be given to programs that:

- follow the assignment directions;
- solve the problem correctly;
- do so in parallel (i.e., both clustering sub-steps are parallelized).

The speedups obtained will probably depend on the size of the input file. It is not expected that you will get good speedups for small files, but you should be able to get speedups for large files.
assignments/04/Makefile (new file, +13)
.PHONY: clean

zhan4854.tar.gz: Makefile ASSIGNMENT.md km_cuda.cu km_cuda_fixed_n.cu report.pdf
	mkdir -p zhan4854
	cp $^ zhan4854
	tar -czvf $@ zhan4854
	rm -r zhan4854

km_cuda: km_cuda.cu
	nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<

clean:
	rm -f km_cuda
assignments/04/dataset/small_gaussian.txt (new executable file, 2049 lines)
File diff suppressed because it is too large.
assignments/04/km_cuda.cu (new file, +281)
// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

#define CUDACHECK(err) \
  do { \
    cuda_check((err), __FILE__, __LINE__); \
  } while (false)
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
  if (error_code != cudaSuccess) {
    fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
            cudaGetErrorString(error_code), file, line);
    fflush(stderr);
    exit(error_code);
  }
}

#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))

// #define ENSURE_int(i) _Generic((i), int: (i))
// #define ENSURE_float(f) _Generic((f), float: (f))

// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))

/**
 * @brief Return the number of seconds since an unspecified time (e.g., Unix
 *        epoch). This is accomplished with a high-resolution monotonic timer,
 *        suitable for performance timing.
 *
 * @return The number of seconds.
 */
static inline double monotonic_seconds() {
  /* Linux systems */
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

/**
 * @brief Output the seconds elapsed while clustering.
 *
 * @param seconds Seconds spent on k-means clustering, excluding IO.
 */
static void print_time(double const seconds) {
  printf("k-means clustering time: %0.04fs\n", seconds);
}

__global__ void findDistanceToCentroid(int N, int K, int dim,
                                       float *centroidDistances, float *data,
                                       float *centroids, int tOffset) {
  int t = blockIdx.x + tOffset; // data index
  int c = threadIdx.x;          // cluster index

  float sum = 0;
  for (int d = 0; d < dim; ++d) {
    float delta = data[t * dim + d] - centroids[c * dim + d];
    sum += delta * delta;
  }

  centroidDistances[t * K + c] = sqrt(sum);
}

__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
                                      float *centroidDistances,
                                      int *clusterMap) {
  int t = blockIdx.x;
  int minIdx = 0;
  float minValue = INFINITY;

  for (int c = 0; c < K; ++c) {
    float dist = centroidDistances[t * K + c];
    if (dist < minValue) {
      minValue = dist;
      minIdx = c;
    }
  }

  // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
  int oldMinIdx = clusterMap[t];
  clusterMap[t] = minIdx;
  if (oldMinIdx != minIdx) {
    atomicOr(dirtyBit, 1);
  }
}

__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
                                        float *centroids, int *clusterMap,
                                        unsigned int *clusterCount) {
  int t = blockIdx.x;  // data index
  int c = threadIdx.x; // cluster index
  int assignedCluster = clusterMap[t];

  if (assignedCluster != c)
    return;

  atomicAdd((unsigned int *)&clusterCount[c], 1);
  for (int d = 0; d < dim; ++d) {
    atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
  }
}

__global__ void recentralizeCentroidDiv(int dim, float *centroids,
                                        unsigned int *clusterCount) {
  int c = threadIdx.x; // cluster index
  for (int d = 0; d < dim; ++d) {
    centroids[c * dim + d] /= clusterCount[c];
  }
}

int main(int argc, char **argv) {
  char *data_file = argv[1];
  int num_clusters = atoi(argv[2]);
  int num_thread_blocks = atoi(argv[3]);
  int num_threads_per_block = atoi(argv[4]);

  int N, dim;
  float *centroids,       // centroids[cluster][dimension]
      *data,              // data[t][dimension]
      *centroidDistances; // centroidDistances[t][cluster]
  int *clusterMap, *dirtyBit;
  unsigned int *clusterCount;

#pragma region Read in data
  {
    FILE *fp = fopen(data_file, "r");

    // Read first line
    size_t n;
    char *line = NULL;
    if (!getline(&line, &n, fp))
      return -1;

    sscanf(line, "%d %d", &N, &dim);
    free(line);
    line = NULL;

    // Allocate memory on the GPU
    CUDACHECK(
        cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
    CUDACHECK(cudaMallocManaged((void **)&clusterCount,
                                num_clusters * sizeof(unsigned int)));
    CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
    CUDACHECK(cudaMalloc((void **)&centroidDistances,
                         N * num_clusters * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));

    // Initialize all the cluster mappings to -1 so the first iteration is
    // always fully dirty
    CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));

    // Read the rest of the lines
    {
      // Buffer for copying
      float *currentLine = (float *)malloc(dim * sizeof(float));
      for (int i = 0; i < N; ++i) {
        if (!getline(&line, &n, fp))
          return -1;

        for (int j = 0; j < dim; ++j)
          sscanf(line, "%f", &currentLine[j]);

        CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
                             cudaMemcpyHostToDevice));
      }
      free(currentLine);
    }

    fclose(fp);
  }
#pragma endregion

  double start_time = monotonic_seconds();

#pragma region Select the initial K centroids
  {
    CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
                         cudaMemcpyDeviceToDevice));
  }
#pragma endregion

#pragma region Assign each data point to the closest centroid, \
    measured via Euclidean distance.
  {
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids,
            j * num_thread_blocks);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

#pragma region Iteration
  int it = 0;
  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);

    // Update each centroid to be the average coordinate of all contained data
    // points
    CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
    CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
    recentralizeCentroidSum<<<N, num_clusters>>>(
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Print out the cluster compositions
    for (int i = 0; i < num_clusters; ++i)
      printf("%d ", clusterCount[i]);
    printf("\n");

    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Assign all data points to the closest centroid (measured via Euclidean
    // distance).
    // findDistanceToCentroid<<<N, num_clusters>>>(
    //     N, num_clusters, dim, centroidDistances, data, centroids);
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids,
            j * num_thread_blocks);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

  double end_time = monotonic_seconds();
  print_time(end_time - start_time);

#pragma region
  {
    FILE *fp = fopen("clusters.txt", "w");
    for (int i = 0; i < N; ++i)
      fprintf(fp, "%d\n", clusterMap[i]);
    fclose(fp);
  }

  {
    FILE *fp = fopen("centroids.txt", "w");
    fprintf(fp, "%d %d\n", num_clusters, dim);
    float *line = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < num_clusters; ++i) {
      CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
                           cudaMemcpyDeviceToHost));
      for (int d = 0; d < dim; ++d)
        fprintf(fp, "%.3f ", line[d]);
      fprintf(fp, "\n");
    }
    free(line);
    fclose(fp);
  }

#pragma endregion

  return 0;
}
281 assignments/04/km_cuda_fixed_n.cu Normal file
@ -0,0 +1,281 @@
// #define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define CUDACHECK(err)                                                         \
  do {                                                                         \
    cuda_check((err), __FILE__, __LINE__);                                     \
  } while (false)
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
  if (error_code != cudaSuccess) {
    fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
            cudaGetErrorString(error_code), file, line);
    fflush(stderr);
    exit(error_code);
  }
}

#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))

// #define ENSURE_int(i) _Generic((i), int: (i))
// #define ENSURE_float(f) _Generic((f), float: (f))

// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))

/**
 * @brief Return the number of seconds since an unspecified time (e.g., Unix
 * epoch). This is accomplished with a high-resolution monotonic timer,
 * suitable for performance timing.
 *
 * @return The number of seconds.
 */
static inline double monotonic_seconds() {
  /* Linux systems */
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

/**
 * @brief Output the seconds elapsed while clustering.
 *
 * @param seconds Seconds spent on k-means clustering, excluding IO.
 */
static void print_time(double const seconds) {
  printf("k-means clustering time: %0.04fs\n", seconds);
}

__global__ void findDistanceToCentroid(int N, int K, int dim,
                                       float *centroidDistances, float *data,
                                       float *centroids, int tOffset) {
  int t = blockIdx.x + tOffset; // data index
  int c = threadIdx.x;          // cluster index

  float sum = 0;
  for (int d = 0; d < dim; ++d) {
    float delta = data[t * dim + d] - centroids[c * dim + d];
    sum += delta * delta;
  }

  centroidDistances[t * K + c] = sqrt(sum);
}

__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
                                      float *centroidDistances,
                                      int *clusterMap) {
  int t = blockIdx.x;
  int minIdx = 0;
  float minValue = INFINITY;

  for (int c = 0; c < K; ++c) {
    float dist = centroidDistances[t * K + c];
    if (dist < minValue) {
      minValue = dist;
      minIdx = c;
    }
  }

  // printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
  int oldMinIdx = clusterMap[t];
  clusterMap[t] = minIdx;
  if (oldMinIdx != minIdx) {
    atomicOr(dirtyBit, 1);
  }
}

__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
                                        float *centroids, int *clusterMap,
                                        unsigned int *clusterCount) {
  int t = blockIdx.x;  // data index
  int c = threadIdx.x; // cluster index
  int assignedCluster = clusterMap[t];

  if (assignedCluster != c)
    return;

  atomicAdd((unsigned int *)&clusterCount[c], 1);
  for (int d = 0; d < dim; ++d) {
    atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
  }
}

__global__ void recentralizeCentroidDiv(int dim, float *centroids,
                                        unsigned int *clusterCount) {
  int c = threadIdx.x; // cluster index
  for (int d = 0; d < dim; ++d) {
    centroids[c * dim + d] /= clusterCount[c];
  }
}

int main(int argc, char **argv) {
  char *data_file = argv[1];
  int num_clusters = atoi(argv[2]);
  int num_thread_blocks = atoi(argv[3]);
  int num_threads_per_block = atoi(argv[4]);

  int N, dim;
  float *centroids,       // centroids[cluster][dimension]
      *data,              // data[t][dimension]
      *centroidDistances; // centroidDistances[t][cluster]
  int *clusterMap, *dirtyBit;
  unsigned int *clusterCount;

#pragma region Read in data
  {
    FILE *fp = fopen(data_file, "r");

    // Read first line
    size_t n;
    char *line = NULL;
    if (getline(&line, &n, fp) == -1)
      return -1;

    sscanf(line, "%d %d", &N, &dim);
    free(line);
    line = NULL;

    // Allocate memory on the GPU
    CUDACHECK(
        cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
    CUDACHECK(cudaMallocManaged((void **)&clusterCount,
                                num_clusters * sizeof(unsigned int)));
    CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
    CUDACHECK(cudaMalloc((void **)&centroidDistances,
                         N * num_clusters * sizeof(float)));
    CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));

    // Initialize all the cluster mappings to -1 so the first iteration is
    // always fully dirty
    CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));

    // Read the rest of the lines
    {
      // Buffer for copying
      float *currentLine = (float *)malloc(dim * sizeof(float));
      for (int i = 0; i < N; ++i) {
        if (getline(&line, &n, fp) == -1)
          return -1;

        // Parse the dim whitespace-separated floats on this line, advancing
        // the cursor past each value.
        char *cursor = line;
        for (int j = 0; j < dim; ++j)
          currentLine[j] = strtof(cursor, &cursor);

        CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
                             cudaMemcpyHostToDevice));
      }
      free(currentLine);
    }

    fclose(fp);
  }
#pragma endregion

  double start_time = monotonic_seconds();

#pragma region Select the initial K centroids
  {
    CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
                         cudaMemcpyDeviceToDevice));
  }
#pragma endregion

#pragma region Assign each data point to the closest centroid, \
    measured via Euclidean distance.
  {
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids, j);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

#pragma region Iteration
  int it = 0;
  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    // printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);

    // Update each centroid to be the average coordinate of all contained data
    // points
    CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
    CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
    recentralizeCentroidSum<<<N, num_clusters>>>(
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Print out the cluster compositions
    // for (int i = 0; i < num_clusters; ++i)
    //   printf("%d ", clusterCount[i]);
    // printf("\n");

    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Assign all data points to the closest centroid (measured via Euclidean
    // distance).
    // findDistanceToCentroid<<<N, num_clusters>>>(
    //     N, num_clusters, dim, centroidDistances, data, centroids);
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids, j);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

  double end_time = monotonic_seconds();
  print_time(end_time - start_time);

#pragma region
  {
    FILE *fp = fopen("clusters.txt", "w");
    for (int i = 0; i < N; ++i)
      fprintf(fp, "%d\n", clusterMap[i]);
    fclose(fp);
  }

  {
    FILE *fp = fopen("centroids.txt", "w");
    fprintf(fp, "%d %d\n", num_clusters, dim);
    float *line = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < num_clusters; ++i) {
      CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
                           cudaMemcpyDeviceToHost));
      for (int d = 0; d < dim; ++d)
        fprintf(fp, "%.3f ", line[d]);
      fprintf(fp, "\n");
    }
    free(line);
    fclose(fp);
  }

#pragma endregion

  return 0;
}
10 assignments/04/plot.sh Executable file
@ -0,0 +1,10 @@
#!/bin/bash

if [ "$#" -ne 1 ]; then
  echo "usage: ./plot.sh <data file>";
  echo "  NOTE: 'clusters.txt' must be in working directory.";
  exit 1;
fi

octave-cli --eval "plot_clusters2D('$1', 'clusters.txt', 'centroids.txt')"
23 assignments/04/plot_clusters2D.m Normal file
@ -0,0 +1,23 @@

function plot_clusters2D(points_file, clusters_file, centroids_file)

X = load(points_file);
X(1,:) = []; % remove metadata

clusters = load(clusters_file);

centroids = load(centroids_file);
centroids(1,:) = []; % remove metadata

f = figure();
hold on

nclusters = size(centroids,1);
for c=1:nclusters
  points = X(clusters == c-1, :);
  scatter(points(:,1), points(:,2));

  scatter(centroids(c,1), centroids(c,2), '+k', 'LineWidth', 5, 'SizeData', 100);
end

uiwait(f);
41 assignments/04/report.typ Normal file
@ -0,0 +1,41 @@
#set page(margin: 1.5em)

== Homework 4

Michael Zhang \<zhan4854\@umn.edu\>

1. *A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.*

My parallelized program included the following procedures:

- `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.

- `assignClosestCentroid` - This reduces the distances to find the minimum distance for each data point, and records the index of the closest centroid in an $N times 1$ vector.

- `recentralizeCentroidSum` - This computes the per-cluster coordinate sums for averaging, and also counts the number of elements in each cluster.

- `recentralizeCentroidDiv` - This uses the counts from the previous step and performs the division in parallel.

I tried to make sure every thread computes approximately one for-loop's worth of data, most of the time over the $d$ axis.
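For reference, the assignment step that `findDistanceToCentroid` and `assignClosestCentroid` perform together can be sanity-checked against a small serial version. The sketch below is not part of the submitted code; the points and centroids are made up, and it is plain host code that happens to compile as a `.cu` file.

```cuda
#include <math.h>
#include <stdio.h>

// Serial reference for the assignment step: for each of N points, compute the
// Euclidean distance to each of K centroids and record the closest one.
int main(void) {
  const int N = 4, K = 2, dim = 2;
  float data[N * dim] = {0, 0, 0, 1, 10, 10, 10, 11}; // made-up points
  float centroids[K * dim] = {0, 0, 10, 10};          // made-up centroids
  int clusterMap[N];

  for (int t = 0; t < N; ++t) {
    int minIdx = 0;
    float minValue = INFINITY;
    for (int c = 0; c < K; ++c) {
      float sum = 0;
      for (int d = 0; d < dim; ++d) {
        float delta = data[t * dim + d] - centroids[c * dim + d];
        sum += delta * delta;
      }
      float dist = sqrtf(sum);
      if (dist < minValue) {
        minValue = dist;
        minIdx = c;
      }
    }
    clusterMap[t] = minIdx;
    printf("point %d -> cluster %d\n", t, minIdx);
  }
  return 0;
}
```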
2. *Give details about how many elements and how the computations in your kernels are handled by a thread.*

I used the dynamic thread allocation method, sizing the grid based on the size of the data.

For most of the kernels the computation is very simple: perform a row-reduction into a different array. Since all the accesses are disjoint, I don't synchronize between threads.

However, for averaging the data points I unfortunately need to run an $N times K times D$ operation that involves a sum reduction. I tried a tree-based approach, using some bitwise math to avoid the conditional on whether a point belongs to the cluster, but the plain atomic approach is simpler and I did not get the tree-based version to work.
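A minimal sketch of the kind of shared-memory tree reduction mentioned above is shown below. This is not the kernel used in the submission: the kernel name, sizes, and data are illustrative, and it assumes the block size is a power of two.

```cuda
#include <stdio.h>

// Illustrative tree reduction: each block sums a chunk of `in` in shared
// memory by halving the number of active threads each step, then thread 0
// writes the block's partial sum to `out`.
__global__ void treeReduceSum(const float *in, float *out, int n) {
  extern __shared__ float buf[];
  int tid = threadIdx.x;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  buf[tid] = (i < n) ? in[i] : 0.0f;
  __syncthreads();

  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride)
      buf[tid] += buf[tid + stride];
    __syncthreads();
  }

  if (tid == 0)
    out[blockIdx.x] = buf[0];
}

int main(void) {
  const int n = 1 << 10, threads = 256;
  const int blocks = (n + threads - 1) / threads;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, blocks * sizeof(float));
  for (int i = 0; i < n; ++i)
    in[i] = 1.0f;

  treeReduceSum<<<blocks, threads, threads * sizeof(float)>>>(in, out, n);
  cudaDeviceSynchronize();

  float total = 0;
  for (int b = 0; b < blocks; ++b)
    total += out[b];
  printf("sum = %.1f (expected %d)\n", total, n);

  cudaFree(in);
  cudaFree(out);
  return 0;
}
```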
3. *Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.*

I used a 1D arrangement for both the thread blocks and the threads within each block. My accesses already run along the "good" (contiguous) axis, so I'm not doing any strides along other dimensions.

4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.*

I tried using a fixed number of thread blocks, but for the 256-cluster case it ran almost twice as slow (0.4516s) as the dynamically sized launch. To the best of my knowledge, when more thread blocks are requested than the device can run at once, CUDA schedules the extra blocks to run sequentially after earlier ones complete.
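To make the fixed-block-budget case concrete, the host-only sketch below prints the launch schedule implied by a fixed grid size: each launch covers a chunk of at most `num_thread_blocks` data points starting at offset `j`. The numbers are made up, and this is not the submitted program.

```cuda
#include <stdio.h>

// Prints the sequence of (block count, starting data point) pairs that a
// fixed per-launch block budget produces for N data points.
int main(void) {
  const int N = 1000;                // number of data points (made up)
  const int num_thread_blocks = 256; // per-launch block budget (made up)

  for (int j = 0; j < N; j += num_thread_blocks) {
    int n = (num_thread_blocks < N - j) ? num_thread_blocks : N - j;
    printf("launch: %3d blocks starting at data point %4d\n", n, j);
  }
  return 0;
}
```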
5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.*

- 256: 26.8258s
- 512: 62.1212s
- 1024: 163.4022s
12 assignments/04/run.sh Executable file
@ -0,0 +1,12 @@
set -euo pipefail
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4

CLUSTERS=${1:-512}
BLOCKS=512
THREADS=-1
DATAFILE="large_cpd.txt"
# DATAFILE="small_gaussian.txt"

ssh $HOST bash -c "set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/$DATAFILE $CLUSTERS 64 128"
rsync -qazPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
74 flake.lock Normal file
@ -0,0 +1,74 @@
{
  "nodes": {
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
        "lastModified": 1701680307,
        "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "4022d587cbbfd70fe950c1e2083a02621806a725",
        "type": "github"
      },
      "original": {
        "id": "flake-utils",
        "type": "indirect"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1663551060,
        "narHash": "sha256-e2SR4cVx9p7aW/XnVsGsWZBplApA9ZJUjc0fejJhnYo=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "8a5b9ee7b7a2b38267c9481f5c629c015108ab0d",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "nixpkgsUnstable": {
      "locked": {
        "lastModified": 1702237358,
        "narHash": "sha256-PagQSuIdXAueAaAujhtqecP2sjXSYDdYfp2UVwqbkP8=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "7eb0ff576d1bde14a3353ef85f8fba6fd57d32c7",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs",
        "nixpkgsUnstable": "nixpkgsUnstable"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}
23 flake.nix Normal file
@ -0,0 +1,23 @@
{
  inputs.nixpkgsUnstable.url = "github:nixos/nixpkgs";

  outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          config.cudaSupport = true;
        };
        pkgsUnstable = import nixpkgsUnstable {
          inherit system;
          config.cudaSupport = true;
          config.allowUnfreePredicate = pkg:
            builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
        };
      in {
        devShell = pkgs.mkShell {
          packages = (with pkgs; [ clang-tools gdb octave ])
            ++ (with pkgsUnstable.cudaPackages_12; [ cudatoolkit ]);
        };
      });
}