Compare commits

12 commits:

- dfa2024001
- 9ce233b671
- 0b179f38bd
- 18b1c2b6bc
- 9f6a80a09f
- 442638205c
- d75da8de6d
- 584a919ff1
- d8f5fe13b6
- 68a1e749dc
- 0728eb468f
- 5fe38262c5

24 changed files with 3340 additions and 164 deletions
.envrc (new file, +1)

```
use flake
```
.gitignore (vendored, +1)

```
@@ -1,2 +1,3 @@
.DS_Store
/target
.direnv
```
```
@@ -12,6 +12,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
clang \
curl \
direnv \
gdb \
git \
libomp-dev \
libopenmpi-dev \
@@ -21,6 +22,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
pkg-config \
python3 \
python3-pip \
python3-venv \
texlive-latex-base \
texlive-latex-extra \
valgrind \
@@ -31,6 +33,6 @@ COPY --from=typst /bin/typst /usr/bin/typst

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN /root/.cargo/bin/cargo install cargo-watch
RUN /root/.cargo/bin/cargo install watchexec
# RUN /root/.cargo/bin/cargo install watchexec-cli

RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc
```
assignments/03/.envrc (new file, +3)

```bash
layout python3
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
```
assignments/03/.gitignore (vendored, +5)

```
@@ -3,3 +3,8 @@ compile_commands.json
.cache
report.pdf
*.tar.gz
out.txt

dataset/gen_*.txt
.direnv
labels.txt
```
```
@@ -1,7 +1,7 @@
.PHONY: run clean

# CFLAGS += -O3
CFLAGS += -DFMT_HEADER_ONLY -g
CFLAGS += -g
# CFLAGS += -DFMT_HEADER_ONLY -g
# LDFLAGS += $(shell pkg-config --libs fmt)

lpa: lpa.cpp Makefile
@@ -16,7 +16,7 @@ run:
report.pdf: report.typ
	typst compile $< $@

zhan4854.tar.gz: Makefile ASSIGNMENT.md lpa.cpp report.pdf
zhan4854.tar.gz: Makefile ASSIGNMENT.md lpa.cpp report.pdf dataset/gen2.py
	mkdir -p zhan4854
	cp $^ zhan4854
	tar -czvf $@ zhan4854
```
assignments/03/bench.sh (new executable file, +7)

```bash
for dataset in $(echo "1000.txt" "10000.txt" "100000.txt" "1000000.txt"); do
  for processors in $(echo 1 2 4 8 16 | tr ' ' '\n'); do
    # file="dataset/both_$dataset"
    file="/export/scratch/CSCI5451_F23/assignment-3/dataset/$dataset"
    mpirun -n $processors ./lpa $file "graph_out/$dataset-$processors.txt" >> "stdout_out/$dataset-$processors.txt"
  done
done
```
assignments/03/dataset/gen2.py (new file, +25)

```python
import igraph as ig
import random
import sys

try:
    N = int(sys.argv[1])
except:
    N = 1000

random.seed(0)
g = ig.Graph.Growing_Random(N, 5)
components = g.connected_components(mode='weak')
print(len(components))

with open(f"dataset/gen_{N}.txt", "w") as f:

    both_edges = []
    for edge in g.es:
        both_edges.append((edge.source, edge.target))
        both_edges.append((edge.target, edge.source))

    num_edges = len(both_edges)
    f.write(f"{N} {num_edges}\n")
    for v1, v2 in sorted(both_edges):
        f.write(f"{v1} {v2}\n")
```
|
@ -14,22 +14,13 @@
|
|||
#include <unistd.h>
|
||||
#include <utility>
|
||||
|
||||
#ifdef FMT_HEADER_ONLY
|
||||
#include <fmt/format.h>
|
||||
#include <fmt/ranges.h>
|
||||
#endif
|
||||
// #include <fmt/format.h>
|
||||
// #include <fmt/ranges.h>
|
||||
|
||||
#define TAG_SEND_NUM_EDGES 1001
|
||||
#define TAG_SEND_EDGES 1002
|
||||
#define TAG_SEND_FINAL_RESULT 1003
|
||||
|
||||
#define MIN(a, b) \
|
||||
({ \
|
||||
__typeof__(a) _a = (a); \
|
||||
__typeof__(b) _b = (b); \
|
||||
_a < _b ? _a : _b; \
|
||||
})
|
||||
|
||||
typedef struct {
|
||||
int fst;
|
||||
int snd;
|
||||
|
@ -45,18 +36,18 @@ void pair_vector_init(struct pair_vector *);
|
|||
void pair_vector_clear(struct pair_vector *);
|
||||
void pair_vector_push(struct pair_vector *v, int fst, int snd);
|
||||
|
||||
pair compute_node_range(int p, int total_num_nodes, int each_num_nodes,
|
||||
int process);
|
||||
int lookup_assignment(int *base_node_assignment, pair my_node_range,
|
||||
std::map<int, std::set<int>> recv_map, int *recvbuf,
|
||||
int *recv_counts, int *recv_displs, int each_num_nodes,
|
||||
int rank, int node_number);
|
||||
/**
|
||||
* @brief Write a vector of labels to a file.
|
||||
*
|
||||
* @param filename The name of the file to write to.
|
||||
* @param labels The array of labels.
|
||||
* @param nlabels How many labels to write.
|
||||
*/
|
||||
static void print_labels(char *filename, int *labels, size_t const nlabels);
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
MPI_Init(&argc, &argv);
|
||||
int rank, p;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &p);
|
||||
MPI::Init(argc, argv);
|
||||
int rank = MPI::COMM_WORLD.Get_rank(), p = MPI::COMM_WORLD.Get_size();
|
||||
|
||||
MPI_Datatype IntPairType;
|
||||
init_pair_type(&IntPairType);
|
||||
|
@ -71,15 +62,14 @@ int main(int argc, char **argv) {
|
|||
pair params;
|
||||
|
||||
if (rank == 0) {
|
||||
printf("Processors: %d, file: %s\n", p, argv[1]);
|
||||
fp = fopen(argv[1], "r");
|
||||
|
||||
// Read the first line
|
||||
if (getline(&line, &len, fp) != -1)
|
||||
if ((read = getline(&line, &len, fp)) != -1)
|
||||
sscanf(line, "%d %d", &params.fst, &params.snd);
|
||||
}
|
||||
|
||||
// Send the params
|
||||
MPI_Bcast(&params, 1, IntPairType, 0, MPI_COMM_WORLD);
|
||||
MPI_Bcast(&params, 1, IntPairType, 0, MPI::COMM_WORLD);
|
||||
int total_num_nodes = params.fst;
|
||||
int total_num_edges = params.snd;
|
||||
int each_num_nodes = total_num_nodes / p;
|
||||
|
@ -89,9 +79,12 @@ int main(int argc, char **argv) {
|
|||
rank == p - 1 ? total_num_nodes - rank * each_num_nodes : each_num_nodes;
|
||||
int my_nodes[num_my_nodes];
|
||||
|
||||
pair node_ranges[p];
|
||||
for (int i = 0; i < p; ++i)
|
||||
node_ranges[i] = compute_node_range(p, total_num_nodes, each_num_nodes, i);
|
||||
std::function<std::pair<int, int>(int)> node_range =
|
||||
[p, total_num_nodes, each_num_nodes](int process) {
|
||||
int start = process * each_num_nodes;
|
||||
int end = process == p - 1 ? total_num_nodes : start + each_num_nodes;
|
||||
return std::make_pair(start, end);
|
||||
};
|
||||
|
||||
// Read the edges
|
||||
int num_my_edges;
|
||||
|
@ -105,31 +98,30 @@ int main(int argc, char **argv) {
|
|||
|
||||
// For the current process, what's the last node we're expecting to see?
|
||||
int current_process = 0;
|
||||
pair current_node_range = node_ranges[current_process];
|
||||
std::pair<int, int> current_node_range = node_range(current_process);
|
||||
int edge_counter = 0;
|
||||
|
||||
for (int i = 0; i < total_num_edges; ++i) {
|
||||
if (getline(&line, &len, fp) == -1)
|
||||
break;
|
||||
getline(&line, &len, fp);
|
||||
|
||||
int fst, snd;
|
||||
sscanf(line, "%d %d", &fst, &snd);
|
||||
|
||||
if (fst >= current_node_range.snd) {
|
||||
if (fst >= current_node_range.second) {
|
||||
if (current_process == 0) {
|
||||
num_my_edges = edge_counter;
|
||||
my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
|
||||
memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
|
||||
} else {
|
||||
MPI_Send(&edge_counter, 1, MPI_INT, current_process,
|
||||
TAG_SEND_NUM_EDGES, MPI_COMM_WORLD);
|
||||
TAG_SEND_NUM_EDGES, MPI::COMM_WORLD);
|
||||
MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
|
||||
TAG_SEND_EDGES, MPI_COMM_WORLD);
|
||||
TAG_SEND_EDGES, MPI::COMM_WORLD);
|
||||
}
|
||||
|
||||
// We're starting on the next process
|
||||
current_process += 1;
|
||||
current_node_range = node_ranges[current_process];
|
||||
current_node_range = node_range(current_process);
|
||||
edge_counter = 0;
|
||||
pair_vector_clear(&all_edges);
|
||||
}
|
||||
|
@ -140,18 +132,33 @@ int main(int argc, char **argv) {
|
|||
|
||||
// We have to send the last one again here, since it didn't get caught in
|
||||
// the loop above
|
||||
MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
|
||||
MPI_COMM_WORLD);
|
||||
MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
|
||||
TAG_SEND_EDGES, MPI_COMM_WORLD);
|
||||
|
||||
free(all_edges.ptr);
|
||||
if (current_process == 0) {
|
||||
num_my_edges = edge_counter;
|
||||
my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
|
||||
memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
|
||||
} else {
|
||||
MPI_Recv(&num_my_edges, 1, MPI_INT, 0, TAG_SEND_NUM_EDGES, MPI_COMM_WORLD,
|
||||
MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
|
||||
MPI::COMM_WORLD);
|
||||
MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
|
||||
TAG_SEND_EDGES, MPI::COMM_WORLD);
|
||||
}
|
||||
} else {
|
||||
MPI_Recv(&num_my_edges, 1, MPI_INT, 0, TAG_SEND_NUM_EDGES, MPI::COMM_WORLD,
|
||||
NULL);
|
||||
my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
|
||||
MPI_Recv(my_edges, num_my_edges, IntPairType, 0, TAG_SEND_EDGES,
|
||||
MPI_COMM_WORLD, NULL);
|
||||
MPI::COMM_WORLD, NULL);
|
||||
}
|
||||
|
||||
char *buf = (char *)calloc(sizeof(char), 1000);
|
||||
int offset = 0; // Keep track of the current position in the buffer
|
||||
for (int i = 0; i < std::min(num_my_edges, 5); i++) {
|
||||
offset +=
|
||||
sprintf(buf + offset, "(%d, %d)", my_edges[i].fst, my_edges[i].snd);
|
||||
if (i < len - 1) {
|
||||
// Add a separator (e.g., comma or space) if it's not the last
|
||||
offset += sprintf(buf + offset, " ");
|
||||
}
|
||||
}
|
||||
|
||||
if (rank == 0) {
|
||||
|
@ -162,21 +169,18 @@ int main(int argc, char **argv) {
|
|||
#pragma endregion
|
||||
|
||||
// STEP 2 TIMER STARTS HERE
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
double step_2_start_time;
|
||||
if (rank == 0)
|
||||
step_2_start_time = MPI_Wtime();
|
||||
MPI::COMM_WORLD.Barrier();
|
||||
double step_2_start_time = MPI::Wtime();
|
||||
|
||||
// Each process analyzes the non-local edges that are contained in its portion
|
||||
// of the graph.
|
||||
#pragma region
|
||||
int node_label_assignment_vec[num_my_nodes];
|
||||
// std::map<int, int> node_label_assignment;
|
||||
pair my_node_range = node_ranges[rank];
|
||||
std::map<int, int> node_label_assignment;
|
||||
std::pair<int, int> my_node_range = node_range(rank);
|
||||
|
||||
// Initial node assignment
|
||||
for (int idx = 0; idx < num_my_nodes; ++idx) {
|
||||
node_label_assignment_vec[idx] = my_node_range.fst + idx;
|
||||
for (int i = my_node_range.first; i < my_node_range.second; ++i) {
|
||||
node_label_assignment[i] = i;
|
||||
}
|
||||
|
||||
std::map<int, std::set<int>> adj;
|
||||
|
@ -187,12 +191,12 @@ int main(int argc, char **argv) {
|
|||
pair edge = my_edges[i];
|
||||
adj[edge.fst].insert(edge.snd);
|
||||
|
||||
if (!(my_node_range.fst <= edge.fst && edge.fst < my_node_range.snd)) {
|
||||
if (!(my_node_range.first <= edge.fst && edge.fst < my_node_range.second)) {
|
||||
non_local_nodes.insert(edge.fst);
|
||||
non_local_edges.insert(std::make_pair(edge.snd, edge.fst));
|
||||
}
|
||||
|
||||
if (!(my_node_range.fst <= edge.snd && edge.snd < my_node_range.snd)) {
|
||||
if (!(my_node_range.first <= edge.snd && edge.snd < my_node_range.second)) {
|
||||
non_local_nodes.insert(edge.snd);
|
||||
non_local_edges.insert(std::make_pair(edge.fst, edge.snd));
|
||||
}
|
||||
|
@ -208,28 +212,26 @@ int main(int argc, char **argv) {
|
|||
for (auto entry : non_local_edges) {
|
||||
int local_node = entry.first, remote_node = entry.second;
|
||||
|
||||
int remote_process = remote_node / each_num_nodes;
|
||||
int corresponding_process = remote_node / each_num_nodes;
|
||||
// The last process gets some extra nodes
|
||||
if (remote_process >= p)
|
||||
remote_process = p - 1;
|
||||
if (corresponding_process >= p)
|
||||
corresponding_process = p - 1;
|
||||
|
||||
send_map[remote_process].insert(local_node);
|
||||
recv_map[remote_process].insert(remote_node);
|
||||
send_map[corresponding_process].insert(local_node);
|
||||
recv_map[corresponding_process].insert(remote_node);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
// All the processes are communicating to figure out which process needs to
|
||||
// send what data to the other processes.
|
||||
#pragma region
|
||||
// Nothing needs to be done here, I'm using the fact that everything is sent
|
||||
// in sorted order to ensure that both sides are referring to the same thing
|
||||
#pragma endregion
|
||||
|
||||
// STEP 5 TIMER STARTS HERE
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
double step_5_start_time;
|
||||
if (rank == 0) {
|
||||
step_5_start_time = MPI_Wtime();
|
||||
printf("STARTING STEP 5: %0.04fs\n", step_5_start_time - step_2_start_time);
|
||||
}
|
||||
MPI::COMM_WORLD.Barrier();
|
||||
double step_5_start_time = MPI::Wtime();
|
||||
|
||||
// The processes perform the transfers of non-local labels and updates of
|
||||
// local labels until convergence.
|
||||
|
@ -247,9 +249,9 @@ int main(int argc, char **argv) {
|
|||
int offset = 0;
|
||||
for (int i = 0; i < p; ++i) {
|
||||
int count = send_map[i].size();
|
||||
for (auto local_node : send_map[i]) {
|
||||
sendbuf.push_back(
|
||||
node_label_assignment_vec[local_node - my_node_range.fst]);
|
||||
// std::sort(send_map[i].begin(), send_map[i].end());
|
||||
for (auto k : send_map[i]) {
|
||||
sendbuf.push_back(node_label_assignment[k]);
|
||||
}
|
||||
send_counts.push_back(count);
|
||||
send_displs.push_back(offset);
|
||||
|
@ -259,6 +261,7 @@ int main(int argc, char **argv) {
|
|||
offset = 0;
|
||||
for (int i = 0; i < p; ++i) {
|
||||
int count = recv_map[i].size();
|
||||
// std::sort(recv_map[i].begin(), recv_map[i].end());
|
||||
recv_counts.push_back(count);
|
||||
recv_displs.push_back(offset);
|
||||
offset += count;
|
||||
|
@ -267,25 +270,29 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
std::vector<int> recvbuf(recv_total, 0);
|
||||
MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
|
||||
MPI_INT, recvbuf.data(), recv_counts.data(),
|
||||
recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
|
||||
MPI::COMM_WORLD.Alltoallv(sendbuf.data(), send_counts.data(),
|
||||
send_displs.data(), MPI_INT, recvbuf.data(),
|
||||
recv_counts.data(), recv_displs.data(), MPI_INT);
|
||||
|
||||
std::map<int, int> total_node_label_assignment(node_label_assignment);
|
||||
for (int i = 0; i < p; ++i) {
|
||||
std::vector<int> ouais(recv_map[i].begin(), recv_map[i].end());
|
||||
for (int j = 0; j < recv_counts[i]; ++j) {
|
||||
int remote_node = ouais[j];
|
||||
int remote_value = recvbuf[recv_displs[i] + j];
|
||||
total_node_label_assignment[remote_node] = remote_value;
|
||||
}
|
||||
}
|
||||
|
||||
// For each local node, determine the minimum label out of its neighbors
|
||||
std::map<int, int> new_labels;
|
||||
for (int i = 0; i < num_my_nodes; ++i) {
|
||||
int node = my_node_range.fst + i;
|
||||
|
||||
// int current_value = total_node_label_assignment[i];
|
||||
int current_value = node_label_assignment_vec[i];
|
||||
for (int i = my_node_range.first; i < my_node_range.second; ++i) {
|
||||
int current_value = total_node_label_assignment[i];
|
||||
int min = current_value;
|
||||
|
||||
for (auto neighbor : adj[node]) {
|
||||
int neighbor_value = lookup_assignment(
|
||||
node_label_assignment_vec, my_node_range, recv_map, recvbuf.data(),
|
||||
recv_counts.data(), recv_displs.data(), each_num_nodes, rank,
|
||||
neighbor);
|
||||
min = MIN(min, neighbor_value);
|
||||
for (auto neighbor : adj[i]) {
|
||||
if (total_node_label_assignment[neighbor] < min)
|
||||
min = total_node_label_assignment[neighbor];
|
||||
}
|
||||
|
||||
if (min < current_value) {
|
||||
|
@ -296,8 +303,8 @@ int main(int argc, char **argv) {
|
|||
// Have there been any changes in the labels?
|
||||
int num_changes = new_labels.size();
|
||||
int total_changes;
|
||||
MPI_Allreduce(&num_changes, &total_changes, 1, MPI_INT, MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
MPI::COMM_WORLD.Allreduce(&num_changes, &total_changes, 1, MPI_INT,
|
||||
MPI::SUM);
|
||||
|
||||
if (total_changes == 0) {
|
||||
break;
|
||||
|
@ -305,19 +312,14 @@ int main(int argc, char **argv) {
|
|||
|
||||
// Update the original node assignment
|
||||
for (auto entry : new_labels) {
|
||||
node_label_assignment_vec[entry.first] = entry.second;
|
||||
node_label_assignment[entry.first] = entry.second;
|
||||
}
|
||||
|
||||
if (rank == 0)
|
||||
printf("total changes: %d\n", total_changes);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
// END TIMERS
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
double end_time;
|
||||
if (rank == 0)
|
||||
end_time = MPI_Wtime();
|
||||
MPI::COMM_WORLD.Barrier();
|
||||
double end_time = MPI::Wtime();
|
||||
|
||||
if (rank == 0) {
|
||||
printf("2-5 Time: %0.04fs\n", end_time - step_2_start_time);
|
||||
|
@ -329,34 +331,35 @@ int main(int argc, char **argv) {
|
|||
#pragma region
|
||||
if (rank == 0) {
|
||||
std::vector<int> all_assignments(total_num_nodes);
|
||||
std::map<int, int> label_count;
|
||||
// std::map<int, int> label_count;
|
||||
int ctr = 0;
|
||||
for (int process_idx = 0; process_idx < p; ++process_idx) {
|
||||
pair this_node_range = node_ranges[process_idx];
|
||||
int count = this_node_range.snd - this_node_range.fst;
|
||||
if (process_idx == 0) {
|
||||
for (int i = 0; i < p; ++i) {
|
||||
std::pair<int, int> this_node_range = node_range(i);
|
||||
int count = this_node_range.second - this_node_range.first;
|
||||
if (i == 0) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
all_assignments[this_node_range.fst + j] =
|
||||
node_label_assignment_vec[j];
|
||||
label_count[all_assignments[this_node_range.fst + j]]++;
|
||||
all_assignments[this_node_range.first + j] =
|
||||
node_label_assignment[this_node_range.first + j];
|
||||
// label_count[all_assignments[this_node_range.first + j]]++;
|
||||
}
|
||||
} else {
|
||||
MPI_Recv(&all_assignments[this_node_range.fst], count, MPI_INT,
|
||||
process_idx, TAG_SEND_FINAL_RESULT, MPI_COMM_WORLD, NULL);
|
||||
for (int j = 0; j < count; ++j) {
|
||||
label_count[all_assignments[this_node_range.fst + j]]++;
|
||||
}
|
||||
MPI::COMM_WORLD.Recv(&all_assignments[this_node_range.first], count,
|
||||
MPI::INT, i, TAG_SEND_FINAL_RESULT);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Done! " << label_count.size() << std::endl;
|
||||
print_labels(argv[2], all_assignments.data(), all_assignments.size());
|
||||
} else {
|
||||
MPI_Send(node_label_assignment_vec, num_my_nodes, MPI_INT, 0,
|
||||
TAG_SEND_FINAL_RESULT, MPI_COMM_WORLD);
|
||||
std::vector<int> flat_assignments;
|
||||
for (int i = my_node_range.first; i < my_node_range.second; ++i) {
|
||||
flat_assignments.push_back(node_label_assignment[i]);
|
||||
}
|
||||
MPI::COMM_WORLD.Send(flat_assignments.data(), flat_assignments.size(),
|
||||
MPI::INT, 0, TAG_SEND_FINAL_RESULT);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
MPI_Finalize();
|
||||
MPI::Finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -398,54 +401,20 @@ void pair_vector_push(struct pair_vector *v, int fst, int snd) {
|
|||
v->len++;
|
||||
}
|
||||
|
||||
pair compute_node_range(int p, int total_num_nodes, int each_num_nodes,
|
||||
int process) {
|
||||
int start = process * each_num_nodes;
|
||||
int end = process == p - 1 ? total_num_nodes : start + each_num_nodes;
|
||||
return {.fst = start, .snd = end};
|
||||
}
|
||||
static void print_labels(char *filename, int *labels, size_t const nlabels) {
|
||||
size_t i;
|
||||
FILE *fout;
|
||||
|
||||
int lookup_assignment(int *base_node_assignment, pair my_node_range,
|
||||
std::map<int, std::set<int>> recv_map, int *recvbuf,
|
||||
int *recv_counts, int *recv_displs, int each_num_nodes,
|
||||
int rank, int node_number) {
|
||||
int process_from = node_number / each_num_nodes;
|
||||
|
||||
// Just return from local if local
|
||||
if (process_from == rank)
|
||||
return base_node_assignment[node_number - my_node_range.fst];
|
||||
|
||||
int count = recv_counts[process_from];
|
||||
int displs = recv_displs[process_from];
|
||||
|
||||
// Determine what index this node is
|
||||
int index = -1, ctr = 0;
|
||||
std::vector<int> inner(recv_map[process_from].begin(),
|
||||
recv_map[process_from].end());
|
||||
// {
|
||||
// int lo = 0, hi = count;
|
||||
// while (lo < hi) {
|
||||
// int mid = (lo + hi) / 2;
|
||||
// int midk = inner[mid];
|
||||
// if (node_number < midk)
|
||||
// hi = mid;
|
||||
// else if (node_number > midk)
|
||||
// lo = mid;
|
||||
// else {
|
||||
// index = mid;
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int remote_node = inner[i];
|
||||
if (node_number == remote_node) {
|
||||
index = ctr;
|
||||
break;
|
||||
}
|
||||
ctr++;
|
||||
/* open file */
|
||||
if ((fout = fopen(filename, "w")) == NULL) {
|
||||
fprintf(stderr, "error opening '%s'\n", filename);
|
||||
abort();
|
||||
}
|
||||
|
||||
// Pull the corresponding value from the map
|
||||
return recvbuf[recv_displs[process_from] + index];
|
||||
/* write labels to fout */
|
||||
for (i = 0; i < nlabels; ++i) {
|
||||
fprintf(fout, "%u\n", labels[i]);
|
||||
}
|
||||
|
||||
fclose(fout);
|
||||
}
|
assignments/03/process.py (new file, +33)

```python
import re

WTF = re.compile(r".*: (\d+),.*dataset/(\d+).txt")

by_size = dict()
with open("stdout.txt") as f:
    while True:
        line1 = f.readline().strip()
        if not line1: break
        m = WTF.match(line1)
        processors = int(m.group(1))
        size = int(m.group(2))

        if size not in by_size: by_size[size] = dict()

        line2 = f.readline().strip()
        line3 = f.readline().strip()

        time2 = line2.split(": ")[1]
        time5 = line3.split(": ")[1]

        if processors not in by_size[size]: by_size[size][processors] = (time2, time5)

print("#table(")
print(" columns: (auto, auto, auto, auto, auto, auto),")
columns = [1, 2, 4, 8, 16]
print(" [], ", ", ".join(map(lambda c: f"[{c}]", columns)), ",")
for size, entries in sorted(by_size.items()):
    print(f" [{size}],")
    for processors, (time2, time5) in sorted(entries.items()):
        print(f" [{time2} #linebreak() {time5}],", end = None)
    print()
print(")")
```
@@ -0,0 +1,79 @@

== Step 2-4

For steps 2-4, I calculated all of each process's outgoing nodes, sorted them
in order, and used each node's sorted position as the way to identify which
nodes are being sent.

This saves an extra communication and lets me index the same items on each
loop.
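An illustrative sketch of that sorted-position convention (not the verbatim code from `lpa.cpp`; it assumes a per-peer `std::set<int>` of node ids and a `std::map<int, int>` of labels, named here for clarity):

```cpp
#include <map>
#include <set>
#include <vector>

// Both sides build the same std::set<int> of node ids for a given peer, and a
// std::set iterates in sorted order, so "the i-th value in the buffer" means
// "the label of the i-th (sorted) node" on the sender and the receiver alike.
std::vector<int> pack_labels_for_peer(const std::set<int> &nodes_for_peer,
                                      const std::map<int, int> &labels) {
  std::vector<int> buf;
  buf.reserve(nodes_for_peer.size());
  for (int node : nodes_for_peer)   // sorted iteration order
    buf.push_back(labels.at(node)); // position i <-> i-th node of the set
  return buf;
}
```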
== Step 5

I exchanged data using the unstructured communication approach, doing an
all-to-all transfer.

To read the result efficiently, I tried using the approach given in the slides.
I also tried to use binary search since this would yield $log(n)$ time.
However, this was taking a long time (up to 45 seconds for the 10,000 case), and
it was the bottleneck. Using STL's `std::map` proved to be orders of magnitude
faster.
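A sketch of that `std::map`-based lookup (illustrative only, with hypothetical parameter names; the real code keeps these values in `recv_map`, `recvbuf`, `recv_counts`, and `recv_displs` around the `MPI_Alltoallv` call): everything received is folded into one map keyed by global node id, so each neighbor lookup is a single `O(log n)` access instead of a search through the receive buffers.

```cpp
#include <cstddef>
#include <map>
#include <set>
#include <vector>

// Fold the labels delivered by the all-to-all exchange into one map, so
// looking up a neighbor's label no longer requires searching the per-process
// receive buffers (the bottleneck described above).
std::map<int, int> merge_remote_labels(std::map<int, int> labels, // local labels (copied)
                                       const std::vector<std::set<int>> &recv_map,
                                       const std::vector<int> &recvbuf,
                                       const std::vector<int> &recv_counts,
                                       const std::vector<int> &recv_displs) {
  for (std::size_t p = 0; p < recv_map.size(); ++p) {
    // Same sorted-position convention as in the send path.
    std::vector<int> nodes(recv_map[p].begin(), recv_map[p].end());
    for (int j = 0; j < recv_counts[p]; ++j)
      labels[nodes[j]] = recvbuf[recv_displs[p] + j];
  }
  return labels; // labels[node] now answers every lookup directly
}
```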
== Other remarks

On the original example dataset, it performed poorly with larger numbers of
processors. I have an explanation for this after looking at the performance
characteristics of the run: it completes in one iteration where every single
edge is assigned. The data distribution also indicates that almost everything
is connected to the first node, which isn't balanced.

I've written a generation script in Python using the `igraph` library.

- 1,000: 93 components
- 10,000: 947 components
- 100,000: 9,423 components
- 1,000,000: 92,880 components

Using this data, I was able to achieve much better speedup. I didn't attach the
actual data files but they can be generated with the same script (seeded for
reproducibility).

*NOTE:* I noticed that afterwards, the data was changed again, with a more
balanced graph this time. So the numbers will not reflect the poorer
performance.

== Timing on example dataset

This experiment was performed on CSELabs using my bench script, and the table
was generated with another script.

#table(
  columns: (auto, auto, auto, auto, auto, auto),
  [], [1], [2], [4], [8], [16],
  [1000],
  [0.0249s #linebreak() 0.0151s],
  [0.0234s #linebreak() 0.0122s],
  [0.0206s #linebreak() 0.0099s],
  [0.0491s #linebreak() 0.0248s],
  [0.0177s #linebreak() 0.0106s],

  [10000],
  [0.2929s #linebreak() 0.1830s],
  [0.2933s #linebreak() 0.1540s],
  [0.2457s #linebreak() 0.1178s],
  [0.3793s #linebreak() 0.1328s],
  [0.2473s #linebreak() 0.1197s],

  [100000],
  [3.7888s #linebreak() 2.4881s],
  [3.7592s #linebreak() 2.0212s],
  [3.3819s #linebreak() 1.6036s],
  [2.9485s #linebreak() 1.3954s],
  [2.8593s #linebreak() 1.3107s],

  [1000000],
  [46.7895s #linebreak() 31.9648s],
  [45.2284s #linebreak() 24.8540s],
  [40.3994s #linebreak() 20.2851s],
  [36.9628s #linebreak() 17.6794s],
  [35.7110s #linebreak() 16.6276s],
)
```
@@ -1,5 +1,5 @@
set pagination off
run dataset/both_1000.txt
run dataset/both_1000.txt out.txt
bt
frame 3
print k
```
assignments/04/.gitignore (vendored, new file, +9)

```
dataset/large_cpd.txt
km_cuda

clusters.txt
centroids.txt
report.pdf
cluster.txt

zhan4854.tar.gz
```
assignments/04/ASSIGNMENT.md (new file, +235)

Introduction

The purpose of this assignment is for you to become familiar with GPU programming and the CUDA API by parallelizing a common algorithm in data mining: k-means clustering.

You need to write a program called km_cuda that will accept the following parameters as input: a filename of data points, the number of clusters, the number of thread blocks, and the number of threads per block. Your program should read in the specified file, cluster the given data points, and output the cluster assignments.

K-Means Clustering Algorithm

The k-means clustering algorithm clusters N data points into K clusters. Each cluster is characterized by a centroid, which is a point representing the average of all data points within the cluster. The algorithm proceeds as follows:

1. Select the initial K centroids.
   a. For reproducibility, use points 0, 1, ..., K-1.
2. Assign each data point to the closest centroid (measured via Euclidean distance).
3. While not converged:
   a. Update each centroid to be the average coordinate of all contained data points.
   b. Assign all data points to the closest centroid (measured via Euclidean distance).

Convergence is detected when no data points change their cluster assignment, or if the maximum number of iterations have been executed. For this assignment, you should set the maximum number of iterations to twenty. Additional material on the k-means algorithm can be found in K-Means.pdf.

Download K-Means.pdf.
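For reference, a minimal serial sketch of the loop just described (an illustration only, assuming row-major `points` and `centroids` arrays; it is not part of the assignment handout and not the CUDA implementation from this repository):

```cpp
#include <cmath>
#include <vector>

// points: N x D row-major; writes cluster ids into `assign` and the final
// centroids (K x D row-major) into `centroids`.
void kmeans_serial(const std::vector<float> &points, int N, int D, int K,
                   std::vector<float> &centroids, std::vector<int> &assign) {
  // Step 1: take points 0..K-1 as the initial centroids (reproducible start).
  centroids.assign(points.begin(), points.begin() + K * D);
  assign.assign(N, -1);

  auto assign_all = [&]() {
    bool changed = false;
    for (int i = 0; i < N; ++i) {
      int best = 0;
      float best_d = INFINITY;
      for (int c = 0; c < K; ++c) {
        float d = 0;
        for (int j = 0; j < D; ++j) {
          float delta = points[i * D + j] - centroids[c * D + j];
          d += delta * delta; // squared Euclidean distance suffices for argmin
        }
        if (d < best_d) { best_d = d; best = c; }
      }
      if (assign[i] != best) { assign[i] = best; changed = true; }
    }
    return changed;
  };

  assign_all();                           // Step 2: initial assignment
  for (int iter = 0; iter < 20; ++iter) { // Step 3, capped at 20 iterations
    // 3a: recompute each centroid as the mean of its assigned points.
    std::vector<float> sums(K * D, 0.0f);
    std::vector<int> counts(K, 0);
    for (int i = 0; i < N; ++i) {
      ++counts[assign[i]];
      for (int j = 0; j < D; ++j) sums[assign[i] * D + j] += points[i * D + j];
    }
    for (int c = 0; c < K; ++c)
      if (counts[c] > 0)
        for (int j = 0; j < D; ++j) centroids[c * D + j] = sums[c * D + j] / counts[c];
    // 3b: reassign; converged when no point changes cluster.
    if (!assign_all()) break;
  }
}
```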
Input/Output Formats
|
||||
|
||||
Each program will take as input one file (the list of data points), and output two files ("clusters.txt": the cluster assignments and "centroids.txt": the centers of each cluster).
|
||||
|
||||
The input file contains N+1 lines. The first line contains two space-separated integers: the number of data points (N), and the dimensionality of each data point (D). The following N lines each contain D space-separated floating-point numbers which represent the coordinates of the current data point. Each floating-point number contains at least one digit after the decimal point. For example, an input with four two-dimensional data points would be stored in a file as:
|
||||
|
||||
|
||||
|
||||
4 2
|
||||
502.1 505.9
|
||||
504.0 489.4
|
||||
515.2 514.7
|
||||
496.7 498.3
|
||||
|
||||
|
||||
|
||||
The output file cluster assignments must be called clusters.txt and contain N lines. Each line should contain a single zero-indexed integer which specifies the cluster that the current data point belongs to. For example, a clustering of the above input file into two clusters may look like:
|
||||
|
||||
|
||||
|
||||
0
|
||||
0
|
||||
1
|
||||
0
|
||||
|
||||
|
||||
|
||||
The second output file, centroids.txt, should follow the same format as the input data file. It should contain K data points, one for each cluster. Each coordinate should be written with at least three digits after the decimal point.
|
||||
|
||||
|
||||
|
||||
Your program must also print the clustering time to standard out. You should use a high-precision, monotonic, wall-clock timer and also omit the time spent reading and writing to files. We recommend the function clock_gettime() when on a Linux system. Here is a function that you may use for timing:
|
||||
|
||||
|
||||
|
||||
```c
|
||||
/* Gives us high-resolution timers. */
|
||||
#define _POSIX_C_SOURCE 199309L
|
||||
#include <time.h>
|
||||
|
||||
/* OSX timer includes */
|
||||
#ifdef __MACH__
|
||||
#include <mach/mach.h>
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Return the number of seconds since an unspecified time (e.g., Unix
|
||||
* epoch). This is accomplished with a high-resolution monotonic timer,
|
||||
* suitable for performance timing.
|
||||
*
|
||||
* @return The number of seconds.
|
||||
*/
|
||||
static inline double monotonic_seconds() {
|
||||
#ifdef __MACH__
|
||||
/* OSX */
|
||||
static mach_timebase_info_data_t info;
|
||||
static double seconds_per_unit;
|
||||
if(seconds_per_unit == 0) {
|
||||
mach_timebase_info(&info);
|
||||
seconds_per_unit = (info.numer / info.denom) / 1e9;
|
||||
}
|
||||
return seconds_per_unit * mach_absolute_time();
|
||||
#else
|
||||
/* Linux systems */
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
#endif
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
You should use the following function to output your clustering time:
|
||||
|
||||
```c
|
||||
/**
|
||||
* @brief Output the seconds elapsed while clustering.
|
||||
*
|
||||
* @param seconds Seconds spent on k-means clustering, excluding IO.
|
||||
*/
|
||||
static void print_time(double const seconds)
|
||||
{
|
||||
printf("k-means clustering time: %0.04fs\n", seconds);
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
Timing information should be the ONLY thing printed to standard out.
|
||||
|
||||
|
||||
|
||||
Failure to follow any of these output instructions will result in significant loss of points.
|
||||
Testing
|
||||
|
||||
Test inputs can be found in /export/scratch/csci5451_f23/hw4_data on any of the cuda lab machines. We provide two test files: small_gaussian.txt and large_cpd.txt. The former is a small two-dimensional dataset that you should use for testing the correctness of your code.
|
||||
|
||||
The cuda machines are located at csel-cuda-0{1..5}.cselabs.umn.edu. You must first load the cuda modules with the following commands before using nvcc.
|
||||
|
||||
module load soft/cuda/local
|
||||
module initadd soft/cuda/local
|
||||
|
||||
If the command 'module' cannot be found. Add the following lines into your ~/.bashrc file.
|
||||
|
||||
```bash
|
||||
MODULESINIT="/usr/local/modules-tcl/init/bash"
|
||||
|
||||
if [ -f $MODULESINIT ]; then
|
||||
|
||||
. $MODULESINIT
|
||||
|
||||
module load java system soft/ocaml soft/cuda/local
|
||||
|
||||
fi
|
||||
|
||||
unset MODULESINIT
|
||||
```
|
||||
|
||||
After adding, run
|
||||
|
||||
```
|
||||
source ~/.bashrc
|
||||
```
|
||||
|
||||
More info on testing/using the cuda machines can be found at Parallel Lab Systems.
|
||||
|
||||
You should only run your code on one of the cuda machines. You will use the last digit of your student ID to select the machine. ID's ending in {0, 5} should use cuda01, ID's ending in {1, 6} should use cuda02, ID's ending in {3, 7} should use cuda03, ID's ending in {4, 8} should use cuda04, and ID's ending in {5, 9} should use cuda05.
|
||||
|
||||
We provide a short script for plotting clustered 2D data (such as small_gaussian.txt). Download and extract plot_clusters.tar.gz
|
||||
|
||||
Download plot_clusters.tar.gz. We provide plotting scripts and small_gaussian.txt inside the package. The plotting scripts rely on Octave, an open source alternative to Matlab. The Octave package can be loaded via 'module load math/octave'. Cluster the data with two clusters, place your files centroids.txt and clusters.txt inside of the plot/ directory, and run:
|
||||
|
||||
|
||||
|
||||
$ ./plot.sh data/small_gaussian.txt
|
||||
|
||||
|
||||
If your clustering is correct, you should see a plot similar to the one below. Data points inside the same cluster are colored the same. Centroids are marked clearly in the center of each cluster.
|
||||
|
||||
clusters.png
|
||||
|
||||
For the longer running tests, you should look into CUDA compiler optimization flags as well as the screen
|
||||
|
||||
Links to an external site. command as these will be helpful.
|
||||
|
||||
Remember that the TA will be evaluating your data with a different data sets than those provided for testing.
|
||||
What you need to turn in
|
||||
Download What you need to turn in
|
||||
|
||||
The source code of your programs.
|
||||
A short report including the following parts:
|
||||
A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.
|
||||
Give details about how many elements and how the computations in your kernels are handled by a thread.
|
||||
Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.
|
||||
You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the #threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.
|
||||
You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.
|
||||
Remember, speed counts. Programs that fail to use the GPU efficiently will lose significant points.
|
||||
|
||||
Do NOT include the test files; TAs will have their own test files for grading. You will lose significant points for including test files.
|
||||
|
||||
|
||||
|
||||
Additional specifications related to assignment submission
|
||||
|
||||
A makefile must be provided to compile and generate the executable. The executable should be named:
|
||||
km_cuda
|
||||
Program invocation: Your programs should take as an argument the input file to be read from, the number of clusters, the number of thread blocks, and the number of threads per block to be used for parallel execution.
|
||||
For example, with 64 thread blocks and 128 threads, your program would be invoked as follows:
|
||||
|
||||
./km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 64 128
|
||||
|
||||
NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks during invocation. Example:
|
||||
|
||||
km_cuda /export/scratch/CSCI-5451/assignment-1/large_cpd.txt 512 -1 128
|
||||
|
||||
All files (code + report) MUST be in a single directory and the directory's name MUST be your UMN login ID (e.g., your UMN email or Moodle username). Your submission directory MUST include at least the following files (other auxiliary files may also be included):
|
||||
|
||||
<UMN ID>/km_cuda.c
|
||||
<UMN ID>/Makefile
|
||||
<UMN ID>/report.pdf
|
||||
|
||||
If you choose to code in C++, then replace the .c suffixes with .cpp or .cc.
|
||||
|
||||
Submission MUST be in .tar.gz
|
||||
|
||||
The following sequence of commands should work on your submission file:
|
||||
|
||||
tar xzvf <UMN ID>.tar.gz
|
||||
cd <UMN ID>
|
||||
make
|
||||
ls -ld km_cuda
|
||||
|
||||
This ensures that your submission is packaged correctly, your directory is named correctly, your makefile works correctly, and your output executables are named correctly. If any of these does not work, modify it so that you do not lose points. The TAs can answer questions about correctly formatting your submission BEFORE the assignment is due. Do not expect them to answer questions the night it is due.
|
||||
|
||||
|
||||
|
||||
Failure to follow any of these submission instructions will result in significant loss of points.
|
||||
|
||||
|
||||
|
||||
|
||||
Evaluation criteria
|
||||
|
||||
The goal for this assignment is for you to become familiar with the APIs and not so much for developing the most efficient parallel program (this will be done later). As such, full points will be given to the programs that:
|
||||
|
||||
follows the assignment directions;
|
||||
solve the problem correctly;
|
||||
do so in parallel (i.e., both clustering sub-steps are parallelized);
|
||||
|
||||
The speedups obtained will probably depend on the size of the input file. It is not expected that you to get good speedups for small files, but you should be able to get speedups for large files.
|
assignments/04/Makefile (new file, +13)

```makefile
.PHONY: clean

zhan4854.tar.gz: Makefile ASSIGNMENT.md km_cuda.cu km_cuda_fixed_n.cu report.pdf
	mkdir -p zhan4854
	cp $^ zhan4854
	tar -czvf $@ zhan4854
	rm -r zhan4854

km_cuda: km_cuda.cu
	nvcc -Xptxas -O3,-v -use_fast_math -o $@ $<

clean:
	rm -f km_cuda
```
assignments/04/dataset/small_gaussian.txt (new executable file, +2049; file diff suppressed because it is too large)
assignments/04/km_cuda.cu (new file, +281)
@ -0,0 +1,281 @@
|
|||
// #define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
#define CUDACHECK(err) \
|
||||
do { \
|
||||
cuda_check((err), __FILE__, __LINE__); \
|
||||
} while (false)
|
||||
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
||||
if (error_code != cudaSuccess) {
|
||||
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
|
||||
cudaGetErrorString(error_code), file, line);
|
||||
fflush(stderr);
|
||||
exit(error_code);
|
||||
}
|
||||
}
|
||||
|
||||
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
|
||||
// #define ENSURE_int(i) _Generic((i), int: (i))
|
||||
// #define ENSURE_float(f) _Generic((f), float: (f))
|
||||
|
||||
// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
|
||||
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
|
||||
|
||||
/**
|
||||
* @brief Return the number of seconds since an unspecified time (e.g., Unix
|
||||
* epoch). This is accomplished with a high-resolution monotonic timer,
|
||||
* suitable for performance timing.
|
||||
*
|
||||
* @return The number of seconds.
|
||||
*/
|
||||
static inline double monotonic_seconds() {
|
||||
/* Linux systems */
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Output the seconds elapsed while clustering.
|
||||
*
|
||||
* @param seconds Seconds spent on k-means clustering, excluding IO.
|
||||
*/
|
||||
static void print_time(double const seconds) {
|
||||
printf("k-means clustering time: %0.04fs\n", seconds);
|
||||
}
|
||||
|
||||
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
||||
float *centroidDistances, float *data,
|
||||
float *centroids, int tOffset) {
|
||||
int t = blockIdx.x + tOffset; // data index
|
||||
int c = threadIdx.x; // cluster index
|
||||
|
||||
float sum = 0;
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
float delta = data[t * dim + d] - centroids[c * dim + d];
|
||||
sum += delta * delta;
|
||||
}
|
||||
|
||||
centroidDistances[t * K + c] = sqrt(sum);
|
||||
}
|
||||
|
||||
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
|
||||
float *centroidDistances,
|
||||
int *clusterMap) {
|
||||
int t = blockIdx.x;
|
||||
int minIdx = 0;
|
||||
float minValue = INFINITY;
|
||||
|
||||
for (int c = 0; c < K; ++c) {
|
||||
float dist = centroidDistances[t * K + c];
|
||||
if (dist < minValue) {
|
||||
minValue = dist;
|
||||
minIdx = c;
|
||||
}
|
||||
}
|
||||
|
||||
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
|
||||
int oldMinIdx = clusterMap[t];
|
||||
clusterMap[t] = minIdx;
|
||||
if (oldMinIdx != minIdx) {
|
||||
atomicOr(dirtyBit, 1);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
|
||||
float *centroids, int *clusterMap,
|
||||
unsigned int *clusterCount) {
|
||||
int t = blockIdx.x; // data index
|
||||
int c = threadIdx.x; // cluster index
|
||||
int assignedCluster = clusterMap[t];
|
||||
|
||||
if (assignedCluster != c)
|
||||
return;
|
||||
|
||||
atomicAdd((unsigned int *)&clusterCount[c], 1);
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
atomicAdd(&centroids[c * dim + d], data[t * dim + d]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
||||
unsigned int *clusterCount) {
|
||||
int c = threadIdx.x; // cluster index
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
centroids[c * dim + d] /= clusterCount[c];
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *data_file = argv[1];
|
||||
int num_clusters = atoi(argv[2]);
|
||||
int num_thread_blocks = atoi(argv[3]);
|
||||
int num_threads_per_block = atoi(argv[4]);
|
||||
|
||||
int N, dim;
|
||||
float *centroids, // centroids[cluster][dimension]
|
||||
*data, // data[t][dimension]
|
||||
*centroidDistances; // centroidDistances[t][cluster]
|
||||
int *clusterMap, *dirtyBit;
|
||||
unsigned int *clusterCount;
|
||||
|
||||
#pragma region Read in data
|
||||
{
|
||||
FILE *fp = fopen(data_file, "r");
|
||||
|
||||
// Read first line
|
||||
size_t n;
|
||||
char *line = NULL;
|
||||
if (!getline(&line, &n, fp))
|
||||
return -1;
|
||||
|
||||
sscanf(line, "%d %d", &N, &dim);
|
||||
free(line);
|
||||
line = NULL;
|
||||
|
||||
// Allocate memory on the GPU
|
||||
CUDACHECK(
|
||||
cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&clusterCount,
|
||||
num_clusters * sizeof(unsigned int)));
|
||||
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
|
||||
CUDACHECK(cudaMalloc((void **)&centroidDistances,
|
||||
N * num_clusters * sizeof(float)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
|
||||
|
||||
// Initialize all the cluster mappings to -1 so the first iteration is
|
||||
// always fully dirty
|
||||
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
|
||||
|
||||
// Read the rest of the lines
|
||||
{
|
||||
// Buffer for copying
|
||||
float *currentLine = (float *)malloc(dim * sizeof(float));
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (!getline(&line, &n, fp))
|
||||
return -1;
|
||||
|
||||
for (int j = 0; j < dim; ++j)
|
||||
sscanf(line, "%f", &currentLine[j]);
|
||||
|
||||
CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
free(currentLine);
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
double start_time = monotonic_seconds();
|
||||
|
||||
#pragma region Select the initial K centroids
|
||||
{
|
||||
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Assign each data point to the closest centroid, \
|
||||
measured via Euclidean distance.
|
||||
{
|
||||
if (num_thread_blocks == -1) {
|
||||
findDistanceToCentroid<<<N, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids, 0);
|
||||
} else {
|
||||
for (int j = 0; j < N; j += num_thread_blocks) {
|
||||
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
|
||||
findDistanceToCentroid<<<n, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids,
|
||||
j * num_thread_blocks);
|
||||
}
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
*dirtyBit = 0;
|
||||
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
||||
centroidDistances, clusterMap);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Iteration
|
||||
int it = 0;
|
||||
for (int it = 0; it < 20 && *dirtyBit; ++it) {
|
||||
printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
|
||||
|
||||
// Update each centroid to be the average coordinate of all contained data
|
||||
// points
|
||||
CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
|
||||
CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
|
||||
recentralizeCentroidSum<<<N, num_clusters>>>(
|
||||
N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
// Print out the cluster compositions
|
||||
for (int i = 0; i < num_clusters; ++i)
|
||||
printf("%d ", clusterCount[i]);
|
||||
printf("\n");
|
||||
|
||||
recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
// Assign all data points to the closest centroid (measured via Euclidean
|
||||
// distance).
|
||||
// findDistanceToCentroid<<<N, num_clusters>>>(
|
||||
// N, num_clusters, dim, centroidDistances, data, centroids);
|
||||
if (num_thread_blocks == -1) {
|
||||
findDistanceToCentroid<<<N, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids, 0);
|
||||
} else {
|
||||
for (int j = 0; j < N; j += num_thread_blocks) {
|
||||
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
|
||||
findDistanceToCentroid<<<n, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids,
|
||||
j * num_thread_blocks);
|
||||
}
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
*dirtyBit = 0;
|
||||
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
||||
centroidDistances, clusterMap);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
double end_time = monotonic_seconds();
|
||||
print_time(end_time - start_time);
|
||||
|
||||
#pragma region
|
||||
{
|
||||
FILE *fp = fopen("clusters.txt", "w");
|
||||
for (int i = 0; i < N; ++i)
|
||||
fprintf(fp, "%d\n", clusterMap[i]);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
{
|
||||
FILE *fp = fopen("centroids.txt", "w");
|
||||
fprintf(fp, "%d %d\n", num_clusters, dim);
|
||||
float *line = (float *)malloc(dim * sizeof(float));
|
||||
for (int i = 0; i < num_clusters; ++i) {
|
||||
CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
|
||||
cudaMemcpyDeviceToHost));
|
||||
for (int d = 0; d < dim; ++d)
|
||||
fprintf(fp, "%.3f ", line[d]);
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
free(line);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
#pragma endregion
|
||||
|
||||
return 0;
|
||||
}
|
assignments/04/km_cuda_fixed_n.cu (new file, +281)
@ -0,0 +1,281 @@
|
|||
// #define _POSIX_C_SOURCE 200809L
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
#define CUDACHECK(err) \
|
||||
do { \
|
||||
cuda_check((err), __FILE__, __LINE__); \
|
||||
} while (false)
|
||||
inline void cuda_check(cudaError_t error_code, const char *file, int line) {
|
||||
if (error_code != cudaSuccess) {
|
||||
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code,
|
||||
cudaGetErrorString(error_code), file, line);
|
||||
fflush(stderr);
|
||||
exit(error_code);
|
||||
}
|
||||
}
|
||||
|
||||
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
|
||||
// #define ENSURE_int(i) _Generic((i), int: (i))
|
||||
// #define ENSURE_float(f) _Generic((f), float: (f))
|
||||
|
||||
// #define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
|
||||
// #define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
|
||||
|
||||
/**
|
||||
* @brief Return the number of seconds since an unspecified time (e.g., Unix
|
||||
* epoch). This is accomplished with a high-resolution monotonic timer,
|
||||
* suitable for performance timing.
|
||||
*
|
||||
* @return The number of seconds.
|
||||
*/
|
||||
static inline double monotonic_seconds() {
|
||||
/* Linux systems */
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Output the seconds elapsed while clustering.
|
||||
*
|
||||
* @param seconds Seconds spent on k-means clustering, excluding IO.
|
||||
*/
|
||||
static void print_time(double const seconds) {
|
||||
printf("k-means clustering time: %0.04fs\n", seconds);
|
||||
}
|
||||
|
||||
__global__ void findDistanceToCentroid(int N, int K, int dim,
|
||||
float *centroidDistances, float *data,
|
||||
float *centroids, int tOffset) {
|
||||
int t = blockIdx.x + tOffset; // data index
|
||||
int c = threadIdx.x; // cluster index
|
||||
|
||||
float sum = 0;
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
float delta = data[t * dim + d] - centroids[c * dim + d];
|
||||
sum += delta * delta;
|
||||
}
|
||||
|
||||
centroidDistances[t * K + c] = sqrt(sum);
|
||||
}
|
||||
|
||||
__global__ void assignClosestCentroid(int N, int K, int *dirtyBit,
|
||||
float *centroidDistances,
|
||||
int *clusterMap) {
|
||||
int t = blockIdx.x;
|
||||
int minIdx = 0;
|
||||
float minValue = INFINITY;
|
||||
|
||||
for (int c = 0; c < K; ++c) {
|
||||
float dist = centroidDistances[t * K + c];
|
||||
if (dist < minValue) {
|
||||
minValue = dist;
|
||||
minIdx = c;
|
||||
}
|
||||
}
|
||||
|
||||
// printf("[%d]: minDist %f @ idx %d\n", t, minValue, minIdx);
|
||||
int oldMinIdx = clusterMap[t];
|
||||
clusterMap[t] = minIdx;
|
||||
if (oldMinIdx != minIdx) {
|
||||
atomicOr(dirtyBit, 1);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void recentralizeCentroidSum(int N, int K, int dim, float *data,
|
||||
float *centroids, int *clusterMap,
|
||||
unsigned int *clusterCount) {
|
||||
int t = blockIdx.x; // data index
|
||||
int c = threadIdx.x; // cluster index
|
||||
int assignedCluster = clusterMap[t];
|
||||
|
||||
if (assignedCluster != c)
|
||||
return;
|
||||
|
||||
atomicAdd((unsigned int *)&clusterCount[c], 1);
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
atomicAdd(¢roids[c * dim + d], data[t * dim + d]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void recentralizeCentroidDiv(int dim, float *centroids,
|
||||
unsigned int *clusterCount) {
|
||||
int c = threadIdx.x; // cluster index
|
||||
for (int d = 0; d < dim; ++d) {
|
||||
centroids[c * dim + d] /= clusterCount[c];
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *data_file = argv[1];
|
||||
int num_clusters = atoi(argv[2]);
|
||||
int num_thread_blocks = atoi(argv[3]);
|
||||
int num_threads_per_block = atoi(argv[4]);
|
||||
|
||||
int N, dim;
|
||||
float *centroids, // centroids[cluster][dimension]
|
||||
*data, // data[t][dimension]
|
||||
*centroidDistances; // centroidDistances[t][cluster]
|
||||
int *clusterMap, *dirtyBit;
|
||||
unsigned int *clusterCount;
|
||||
|
||||
#pragma region Read in data
|
||||
{
|
||||
FILE *fp = fopen(data_file, "r");
|
||||
|
||||
// Read first line
|
||||
size_t n;
|
||||
char *line = NULL;
|
||||
if (!getline(&line, &n, fp))
|
||||
return -1;
|
||||
|
||||
sscanf(line, "%d %d", &N, &dim);
|
||||
free(line);
|
||||
line = NULL;
|
||||
|
||||
// Allocate memory on the GPU
|
||||
CUDACHECK(
|
||||
cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&clusterCount,
|
||||
num_clusters * sizeof(unsigned int)));
|
||||
CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
|
||||
CUDACHECK(cudaMalloc((void **)&centroidDistances,
|
||||
N * num_clusters * sizeof(float)));
|
||||
CUDACHECK(cudaMallocManaged((void **)&dirtyBit, sizeof(int)));
|
||||
|
||||
// Initialize all the cluster mappings to -1 so the first iteration is
|
||||
// always fully dirty
|
||||
CUDACHECK(cudaMemset(clusterMap, -1, N * sizeof(int)));
|
||||
|
||||
// Read the rest of the lines
|
||||
{
|
||||
// Buffer for copying
|
||||
float *currentLine = (float *)malloc(dim * sizeof(float));
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (!getline(&line, &n, fp))
|
||||
return -1;
|
||||
|
||||
for (int j = 0; j < dim; ++j)
|
||||
sscanf(line, "%f", &currentLine[j]);
|
||||
|
||||
CUDACHECK(cudaMemcpy(&data[i * dim], currentLine, dim * sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
free(currentLine);
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
double start_time = monotonic_seconds();
|
||||
|
||||
#pragma region Select the initial K centroids
|
||||
{
|
||||
CUDACHECK(cudaMemcpy(centroids, data, num_clusters * dim * sizeof(float),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
#pragma endregion
|
||||
|
||||
#pragma region Assign each data point to the closest centroid, \
|
||||
measured via Euclidean distance.
|
||||
{
|
||||
if (num_thread_blocks == -1) {
|
||||
findDistanceToCentroid<<<N, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids, 0);
|
||||
} else {
|
||||
for (int j = 0; j < N; j += num_thread_blocks) {
|
||||
int n = GENERIC_MIN(num_thread_blocks, N - j * num_thread_blocks);
|
||||
findDistanceToCentroid<<<n, num_clusters>>>(
|
||||
N, num_clusters, dim, centroidDistances, data, centroids,
|
||||
j * num_thread_blocks);
|
||||
}
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
|
||||
*dirtyBit = 0;
|
||||
assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
|
||||
centroidDistances, clusterMap);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
#pragma endregion
|
#pragma region Iteration
  for (int it = 0; it < 20 && *dirtyBit; ++it) {
    // printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);

    // Update each centroid to be the average coordinate of all contained data
    // points
    CUDACHECK(cudaMemset(clusterCount, 0, num_clusters * sizeof(int)));
    CUDACHECK(cudaMemset(centroids, 0, num_clusters * dim * sizeof(float)));
    recentralizeCentroidSum<<<N, num_clusters>>>(
        N, num_clusters, dim, data, centroids, clusterMap, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Print out the cluster compositions
    // for (int i = 0; i < num_clusters; ++i)
    //   printf("%d ", clusterCount[i]);
    // printf("\n");

    recentralizeCentroidDiv<<<1, num_clusters>>>(dim, centroids, clusterCount);
    CUDACHECK(cudaDeviceSynchronize());

    // Assign all data points to the closest centroid (measured via Euclidean
    // distance).
    if (num_thread_blocks == -1) {
      findDistanceToCentroid<<<N, num_clusters>>>(
          N, num_clusters, dim, centroidDistances, data, centroids, 0);
    } else {
      for (int j = 0; j < N; j += num_thread_blocks) {
        int n = GENERIC_MIN(num_thread_blocks, N - j);
        findDistanceToCentroid<<<n, num_clusters>>>(
            N, num_clusters, dim, centroidDistances, data, centroids, j);
      }
    }
    CUDACHECK(cudaDeviceSynchronize());

    *dirtyBit = 0;
    assignClosestCentroid<<<N, 1>>>(N, num_clusters, dirtyBit,
                                    centroidDistances, clusterMap);
    CUDACHECK(cudaDeviceSynchronize());
  }
#pragma endregion

  double end_time = monotonic_seconds();
  print_time(end_time - start_time);

#pragma region Write out the cluster assignments and final centroids
  {
    FILE *fp = fopen("clusters.txt", "w");
    for (int i = 0; i < N; ++i)
      fprintf(fp, "%d\n", clusterMap[i]);
    fclose(fp);
  }

  {
    FILE *fp = fopen("centroids.txt", "w");
    fprintf(fp, "%d %d\n", num_clusters, dim);
    float *line = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < num_clusters; ++i) {
      CUDACHECK(cudaMemcpy(line, &centroids[i * dim], dim * sizeof(float),
                           cudaMemcpyDeviceToHost));
      for (int d = 0; d < dim; ++d)
        fprintf(fp, "%.3f ", line[d]);
      fprintf(fp, "\n");
    }
    free(line);
    fclose(fp);
  }
#pragma endregion

  return 0;
}
10
assignments/04/plot.sh
Executable file
10
assignments/04/plot.sh
Executable file
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash

if [ "$#" -ne 1 ]; then
  echo "usage: ./plot.sh <data file>";
  echo "  NOTE: 'clusters.txt' must be in working directory.";
  exit 1;
fi

octave-cli --eval "plot_clusters2D('$1', 'clusters.txt', 'centroids.txt')"
23
assignments/04/plot_clusters2D.m
Normal file
23
assignments/04/plot_clusters2D.m
Normal file
|
@ -0,0 +1,23 @@
|
|||
function plot_clusters2D(points_file, clusters_file, centroids_file)

  X = load(points_file);
  X(1,:) = []; % remove metadata

  clusters = load(clusters_file);

  centroids = load(centroids_file);
  centroids(1,:) = []; % remove metadata

  f = figure();
  hold on

  nclusters = size(centroids,1);
  for c=1:nclusters
    points = X(clusters == c-1, :);
    scatter(points(:,1), points(:,2));

    scatter(centroids(c,1), centroids(c,2), '+k', 'LineWidth', 5, 'SizeData', 100);
  end

  uiwait(f);
41
assignments/04/report.typ
Normal file
41
assignments/04/report.typ
Normal file
|
@ -0,0 +1,41 @@
|
|||
#set page(margin: 1.5em)

== Homework 4

Michael Zhang \<zhan4854\@umn.edu\>

1. *A short description of how you went about parallelizing the k-means algorithm. You should include how you decomposed the problem and why, i.e., what were the tasks being parallelized.*

My parallelized program included the following procedures:

- `findDistanceToCentroid` - This computes an $N times K$ matrix of distances from each data point to each centroid.

- `assignClosestCentroid` - This reduces the distances to find the minimum for each data point, and records the index of the closest centroid in an $N times 1$ vector.

- `recentralizeCentroidSum` - This computes per-cluster coordinate sums for the purposes of averaging, and also counts the number of data points in each cluster.

- `recentralizeCentroidDiv` - This uses the counts from the previous step and divides every centroid coordinate in parallel.

I tried to make sure every thread computes approximately one for-loop's worth of work, most of the time over the $d$ axis; a sketch of the distance kernel is given below.
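To make the decomposition concrete, here is a minimal sketch of the distance kernel, written against the same launch shape used in `km_cuda.cu` (`<<<points, num_clusters>>>`, i.e. one block per data point and one thread per centroid). The `Sketch` suffix marks it as illustrative: the body is an assumption about the structure, not the exact submitted kernel.

```c
// Illustrative sketch, not the exact submitted kernel: one block per data
// point, one thread per centroid, matching the <<<points, num_clusters>>>
// launches in km_cuda.cu.
__global__ void findDistanceToCentroidSketch(int N, int K, int dim,
                                             float *centroidDistances,
                                             const float *data,
                                             const float *centroids,
                                             int offset) {
  int i = offset + blockIdx.x; // data point handled by this block
  int c = threadIdx.x;         // centroid handled by this thread
  if (i >= N || c >= K)
    return;

  // Each thread walks the d axis once: one for-loop's worth of work.
  float sum = 0.0f;
  for (int d = 0; d < dim; ++d) {
    float diff = data[i * dim + d] - centroids[c * dim + d];
    sum += diff * diff;
  }
  // sqrtf is optional for the argmin, but kept for a true Euclidean distance.
  centroidDistances[i * K + c] = sqrtf(sum);
}
```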

2. *Give details about how many elements and how the computations in your kernels are handled by a thread.*

I used the dynamic thread allocation method, based on the size of the data.

For most of the kernels, the computation is very simple: perform a row-reduction into a different array. Since all the accesses are disjoint, I don't synchronize between threads.

However, for averaging the data points, I unfortunately need to run an $N times K times D$ operation that involves a sum reduction. I tried using a tree-based approach after doing some bitwise math to avoid branching on whether a point belongs to the cluster, but the plain approach is simpler and I did not get the tree-based one to work.
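A rough illustration of that plain accumulation is sketched below, under the assumption that colliding blocks are handled with atomics; the `Sketch` name and body are hypothetical rather than the submitted kernel.

```c
// Illustrative sketch of the N x K x D accumulation: block i is a data point,
// thread c is a centroid, and only the thread whose centroid matches the
// point's assigned cluster contributes. Different blocks can hit the same
// centroid, hence the atomics.
__global__ void recentralizeCentroidSumSketch(int N, int K, int dim,
                                              const float *data,
                                              float *centroids,
                                              const int *clusterMap,
                                              unsigned int *clusterCount) {
  int i = blockIdx.x;
  int c = threadIdx.x;
  if (i >= N || c >= K || clusterMap[i] != c)
    return;

  for (int d = 0; d < dim; ++d)
    atomicAdd(&centroids[c * dim + d], data[i * dim + d]);
  atomicAdd(&clusterCount[c], 1u);
}
```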

3. *Ensure you include details about the thread hierarchy, i.e., whether the threads are organized in a 1D, 2D, or, 3D fashion in a thread-block, and whether the thread-blocks are arranged 1D, 2D, or, 3D grid. NOTE: If you choose to write CUDA kernels where the number of thread blocks is determined dynamically by the program during runtime, then send -1 as the input argument for the number of thread blocks to the invocation. In your program, use -1 as a flag to indicate that the number of thread blocks will need to be computed during runtime.*

I used a 1D arrangement for both the grid and the thread-blocks. All of my accesses already run along the contiguous axis of the row-major arrays, so there is no need to stride across other dimensions.
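For reference, the launch-shape choice can be summarized by a small host-side wrapper like the one below. `launchDistanceKernel` is a hypothetical helper introduced here only for illustration; the mapping it shows mirrors the launches already in `km_cuda.cu`.

```c
// Hypothetical helper showing the 1D grid of 1D blocks and the -1 convention:
// -1 means the block count is derived at runtime (one block per data point),
// otherwise the points are swept in chunks of the requested block count.
void launchDistanceKernel(int num_thread_blocks, int N, int K, int dim,
                          float *centroidDistances, float *data,
                          float *centroids) {
  if (num_thread_blocks == -1) {
    findDistanceToCentroid<<<N, K>>>(N, K, dim, centroidDistances, data,
                                     centroids, 0);
  } else {
    for (int j = 0; j < N; j += num_thread_blocks) {
      int blocks = (N - j < num_thread_blocks) ? (N - j) : num_thread_blocks;
      findDistanceToCentroid<<<blocks, K>>>(N, K, dim, centroidDistances,
                                            data, centroids, j);
    }
  }
}
```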

4. *You need to perform a parameter study in order to determine how the number of elements processed by a thread and the size of a thread-block, i.e., the \# threads in a block, affect the performance of your algorithm. Your writeup should contain some results showing the runtime that you obtained for different choices.*

I tried using a fixed number of thread blocks, but for the 256-cluster case it ended up running almost twice as slow (at 0.4516s). To the best of my knowledge, if more thread blocks are requested than the device has available, CUDA schedules the extra blocks to run sequentially after the earlier ones complete.

5. *You should include results on the 'large_cpd.txt' dataset with 256, 512, and 1024 clusters.*

- 256: 26.8258s
- 512: 62.1212s
- 1024: 163.4022s
12
assignments/04/run.sh
Executable file
12
assignments/04/run.sh
Executable file
|
@ -0,0 +1,12 @@
|
|||
set -euo pipefail
HOST="zhan4854@csel-cuda-02.cselabs.umn.edu"
rsync -azPr --exclude 'large_cpd.txt' . $HOST:~/hwk4

CLUSTERS=${1:-512}
BLOCKS=512
THREADS=-1
DATAFILE="large_cpd.txt"
# DATAFILE="small_gaussian.txt"

ssh $HOST bash -c "set -euo pipefail; module load soft/cuda/local; module initadd soft/cuda/local; cd hwk4; make clean; make; ls; ./km_cuda ./dataset/$DATAFILE $CLUSTERS 64 128"
rsync -qazPr --exclude 'large_cpd.txt' zhan4854@csel-cuda-02.cselabs.umn.edu:~/hwk4/ .
74
flake.lock
Normal file
74
flake.lock
Normal file
|
@ -0,0 +1,74 @@
|
|||
{
  "nodes": {
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
        "lastModified": 1701680307,
        "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "4022d587cbbfd70fe950c1e2083a02621806a725",
        "type": "github"
      },
      "original": {
        "id": "flake-utils",
        "type": "indirect"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1663551060,
        "narHash": "sha256-e2SR4cVx9p7aW/XnVsGsWZBplApA9ZJUjc0fejJhnYo=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "8a5b9ee7b7a2b38267c9481f5c629c015108ab0d",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "nixpkgsUnstable": {
      "locked": {
        "lastModified": 1702237358,
        "narHash": "sha256-PagQSuIdXAueAaAujhtqecP2sjXSYDdYfp2UVwqbkP8=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "7eb0ff576d1bde14a3353ef85f8fba6fd57d32c7",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs",
        "nixpkgsUnstable": "nixpkgsUnstable"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}
23
flake.nix
Normal file
23
flake.nix
Normal file
|
@ -0,0 +1,23 @@
|
|||
{
  inputs.nixpkgsUnstable.url = "github:nixos/nixpkgs";

  outputs = { self, nixpkgs, nixpkgsUnstable, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          config.cudaSupport = true;
        };
        pkgsUnstable = import nixpkgsUnstable {
          inherit system;
          config.cudaSupport = true;
          config.allowUnfreePredicate = pkg:
            builtins.elem (nixpkgs.lib.getName pkg) [ "cudatoolkit" ];
        };
      in {
        devShell = pkgs.mkShell {
          packages = (with pkgs; [ clang-tools gdb octave ])
            ++ (with pkgsUnstable.cudaPackages_12; [ cudatoolkit ]);
        };
      });
}