This commit is contained in:
Michael Zhang 2023-10-31 04:37:41 +00:00
parent a01580087f
commit 06d3e930ec
9 changed files with 967 additions and 214 deletions

View file

@ -14,17 +14,18 @@
"customizations": {
"vscode": {
"extensions": [
"eamodio.gitlens",
"esbenp.prettier-vscode",
"llvm-vs-code-extensions.vscode-clangd",
"ms-azuretools.vscode-docker",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-vscode.cpptools",
"ms-vscode.makefile-tools",
"rust-lang.rust-analyzer",
"tomoki1207.pdf"
]
"eamodio.gitlens",
"esbenp.prettier-vscode",
"llvm-vs-code-extensions.vscode-clangd",
"ms-azuretools.vscode-docker",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-vscode.cpptools",
"ms-vscode.makefile-tools",
"rust-lang.rust-analyzer",
"tomoki1207.pdf",
"nvarner.typst-lsp"
]
}
}

View file

@ -1,4 +1,7 @@
FROM ghcr.io/typst/typst:latest as typst
FROM ubuntu:22.04
# Declared after FROM so it is visible inside this build stage
ARG DEBIAN_FRONTEND=noninteractive
ENV PATH="/root/.cargo/bin:${PATH}"
@ -22,5 +25,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
;
RUN pip install poetry
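# Pull the prebuilt typst binary out of the first build stage instead of building it here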
COPY --from=typst /bin/typst /usr/bin/typst
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc

View file

@ -2,3 +2,6 @@ qs_mpi
*.o
compile_commands.json
.cache
output*.txt
*.tar.gz

View file

@ -1,17 +1,30 @@
.PHONY: all clean run-example
.PHONY: all handin clean run-example
CC := cc
CFLAGS := -g
LDFLAGS := -g
# CFLAGS := -g -O0
# LDFLAGS := -g
CFLAGS := -O3
LDFLAGS :=
CFLAGS += $(shell pkg-config --cflags mpi)
LDFLAGS += $(shell pkg-config --libs mpi)
all: qs_mpi
handin: zhan4854.tar.gz
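# $^ below expands to all of the rule's prerequisites, so the tarball stages
# exactly Makefile, ASSIGNMENT.md, qs_mpi.c, and report.pdf under zhan4854/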
zhan4854.tar.gz: Makefile ASSIGNMENT.md qs_mpi.c report.pdf
mkdir -p zhan4854
cp $^ zhan4854
tar -czvf $@ zhan4854
rm -r zhan4854
run-example: qs_mpi
mpirun --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
mpirun -v --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
report.pdf: report.typ
typst compile $< $@
qs_mpi: qs_mpi.o
$(CC) $^ $(CFLAGS) $(LDFLAGS) -o $@

View file

@ -1,24 +1,25 @@
import sys
import re
pat = re.compile(r"\[(\d+)\] (.*)")
pat = re.compile(r"\[(\d+),(-?\d+)\] (.*)")
outputs = {}
for line in sys.stdin.readlines():
for i, line in enumerate(sys.stdin.readlines()):
m = pat.match(line)
if not m:
# print(line)
continue
p = int(m.group(1))
rest = m.group(2)
n = int(m.group(2))
rest = m.group(3)
if p not in outputs: outputs[p] = []
outputs[p].append(rest)
if (p, n) not in outputs: outputs[p, n] = (i, [])
outputs[p, n][1].append(rest)
for p in sorted(outputs.keys()):
lines = outputs[p]
print(f"---- {p} ----")
for ((p, n), (i, lines)) in sorted(outputs.items(), key=lambda v: (-v[0][1], v[0][0])):
# lines = outputs[p, n]
print(f"---- {p} [{n}] ----")
for line in lines:
print(line)
print()

View file

@ -0,0 +1,655 @@
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// https://stackoverflow.com/a/75458495
#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
void __check_mpi_error(const char *file, const int line, const int n) {
char errbuffer[MPI_MAX_ERROR_STRING];
int errlen;
if (n != MPI_SUCCESS) {
MPI_Error_string(n, errbuffer, &errlen);
printf("MPI-error: %s\n", errbuffer);
printf("Location: %s:%i\n", file, line);
MPI_Abort(MPI_COMM_WORLD, n);
}
}
#define ORDER_FORWARDS 1
#define ORDER_BACKWARDS 2
#define CTL_SIZE 4
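// Each ctl record is CTL_SIZE ints per peer: [0] element count, [1] global
// source start, [2] local destination start, [3] local source start.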
#define ROOT_RANK 0
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
#define ENSURE_int(i) _Generic((i), int : (i))
#define ENSURE_float(f) _Generic((f), float : (f))
#define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
#define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
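// Type-checked MIN/MAX: the ENSURE_* _Generic selections fail to compile when
// an argument does not have the named type.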
void init_ctl(int *ctl, int len);
void local_quicksort(int *arr, int lo, int hi);
char *string_of_list(int *arr, int len);
void recursive_quicksort(int *integers, int n, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm);
int main(int argc, char **argv) {
int rank, p;
MPI_Init(&argc, &argv);
int n = atoi(argv[1]);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &p);
// Generate integers
int n_over_p = n / p;
int integers[n_over_p];
// Minor implementation detail: srand(0) is specially handled by glibc to
// behave as if it was called with srand(1). To get around this, I'm seeding
// with rank + 1
//
// See more: https://stackoverflow.com/a/27386563
srand(rank + 1);
for (int i = 0; i < n_over_p; ++i) {
integers[i] = rand();
// printf(" - %d\n", integers[i]);
}
// printf("[%d,9999999999] GENERATED INTEGERS: %s\n", rank,
// string_of_list(integers, n_over_p));
int new_integers[n_over_p];
recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
MPI_COMM_WORLD);
// sleep(1);
// printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
// The first node is responsible for collecting all the data and then
// printing it out to the file.
int recvbuf[n];
MPI_Gather(new_integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
MPI_COMM_WORLD);
if (rank == 0) {
FILE *fp = fopen(argv[2], "w");
// printf("integers: %s\n", string_of_list(recvbuf, n));
// printf("[%d,-1] ==== FINAL ====\n", rank);
for (int i = 0; i < n; i += 1) {
fprintf(fp, "%d\n", recvbuf[i]);
// printf("[%d,-1] %s\n", rank,
// string_of_list(&recvbuf[i * n_over_p], n_over_p));
}
fclose(fp);
}
MPI_Finalize();
// printf("Done.\n");
return 0;
}
// hi not inclusive
void local_quicksort(int *arr, int lo, int hi) {
int temp;
if (lo >= hi || lo < 0)
return;
int pivot = arr[hi - 1];
int pivot_idx = lo - 1;
for (int j = lo; j < hi; ++j) {
if (arr[j] < pivot) {
pivot_idx += 1;
temp = arr[j];
arr[j] = arr[pivot_idx];
arr[pivot_idx] = temp;
}
}
pivot_idx += 1;
temp = arr[hi - 1];
arr[hi - 1] = arr[pivot_idx];
arr[pivot_idx] = temp;
// Recursive call
local_quicksort(arr, lo, pivot_idx);
local_quicksort(arr, pivot_idx + 1, hi);
}
// char *string_of_list(int *arr, int len) {
// char *buffer = calloc(sizeof(char), 1000);
// int offset = 0; // Keep track of the current position in the buffer
// for (int i = 0; i < len; i++) {
// offset += sprintf(buffer + offset, "%d", arr[i]);
// if (i < len - 1) {
// // Add a separator (e.g., comma or space) if it's not the last element
// offset += sprintf(buffer + offset, " ");
// }
// }
// return buffer;
// }
void recursive_quicksort(int *integers, int total_elems, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm) {
int err, rank, p;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &rank);
// printf(
// "[%d,%d] recursive_quicksort([%s], total=%d, capac=%d, len=%d)
// {p=%d}\n", rank, total_elems, string_of_list(integers, segment_len),
// total_elems, segment_capac, segment_len, p);
if (p <= 1) {
// Recursion base case: just sort it serially
local_quicksort(integers, 0, total_elems);
for (int i = 0; i < total_elems; ++i) {
integers_out[i] = integers[i];
}
// printf("Quicksorted: %s\n", string_of_list(integers, total_elems));
return;
}
// sleep(1);
// printf("\n\n");
// int segment_capac = (total_elems + p - 1) / p;
// int segment_len = total_elems / p;
// if (rank == ROOT_RANK)
// segment_len += total_elems - p * segment_len;
// printf("[%d,%d] capac: %d, len: %d\n", rank, total_elems, segment_capac,
// segment_len);
// printf(
// "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
// n/p=%d)\n", rank, n, n_over_p);
// Locally sort
// printf("[%d] Numbers before: %s\n", rank,
// string_of_list(integers, n_over_p));
local_quicksort(integers, 0, segment_len);
// printf("[%d] Numbers after first sort: %s\n", rank,
// string_of_list(integers, segment_len));
// Select a pivot.
// This pivot is broadcasted to all nodes
int pivot;
{
// First, select a random element
int rand_el = integers[rand() % segment_len];
// Gather it
int rand_els[p];
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);
// Get the median
if (rank == ROOT_RANK) {
// Sort
local_quicksort(rand_els, 0, p);
// printf("[%d,%d] Local quicksort for pivot: %s\n", rank, total_elems,
// string_of_list(rand_els, p));
// Get the middle element
pivot = rand_els[p / 2];
}
MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
}
// printf("[%d,%d] Broadcasted pivot: %d\n", rank, total_elems, pivot);
// Determine where the boundary between S (lower) and L (higher) lies
// If every local element is below the pivot, the whole segment belongs in S.
int boundary = segment_len;
for (int i = 0; i < segment_len; ++i) {
if (integers[i] >= pivot) {
boundary = i;
break;
}
}
// printf("[%d,%d] boundary: %d\n", rank, total_elems, boundary);
int S_lo = 0, S_hi = boundary;
int L_lo = boundary, L_hi = segment_len;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
// printf("[%d,%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, total_elems,
// S_lo, S_hi, S_size, L_lo, L_hi, L_size);
// Perform global arrangement
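// MPI_Scan computes inclusive prefix sums: each rank learns the global end
// offset of its S chunk, of its L chunk counted from the back, and of its
// whole segment.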
int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
int index;
MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
// printf("[%d] bruh %d\n", rank, S_global_end);
// Get the boundary element between S and L
MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
int S_global_start = S_global_end - S_size,
L_reverse_start = L_reverse_end - L_size,
L_global_start = total_elems - L_reverse_end,
L_global_end = total_elems - L_reverse_start;
// printf("[%d,%d] Prefixed S: [%d - %d) (%d), Prefixed L: [%d - %d) (%d)\n",
// rank, total_elems, S_global_start, S_global_end, S_size,
// L_global_start, L_global_end, L_size);
// Determine which process S's and L's destination will start in, respectively
// Initialized to 0 so they stay defined even if the scan below finds no match.
int S_starting_process = 0, L_starting_process = 0;
int p_of_split = 0, split_point = 0;
// int split_point = S_global_max_end % segment_len;
int indexes[p];
{
MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
for (int i = 0; i < p; ++i) {
int lo = i == 0 ? 0 : indexes[i - 1];
int hi = indexes[i];
if (S_global_start >= lo && S_global_start < hi)
S_starting_process = i;
if (L_global_start >= lo && L_global_start < hi)
L_starting_process = i;
if (S_global_max_end >= lo && S_global_max_end < hi) {
p_of_split = i;
split_point = S_global_max_end - lo;
}
}
// err = MPI_Bcast(&S_starting_process, 1, MPI_INT, ROOT_RANK, comm);
// check_mpi_error(err);
// err = MPI_Bcast(&L_starting_process, 1, MPI_INT, ROOT_RANK, comm);
// check_mpi_error(err);
}
// printf("[%d,%d] indexes: %s\n", rank, total_elems,
// string_of_list(indexes, p));
// printf("[%d,%d] S=%d starts at %d , L=%d starts at %d , indexes: %s\n",
// rank,
// total_elems, S_global_start, S_starting_process, L_global_start,
// L_starting_process, string_of_list(indexes, p));
// S_starting_process = S_global_start / segment_len;
// L_starting_process = L_global_start / segment_len;
int S_offset = S_global_start % segment_len,
L_offset = L_global_start % segment_len;
int S_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int ctl_send_counts[p];
int ctl_send_displs[p];
int send_counts[p];
int send_displs[p];
int recv_counts[p];
int recv_displs[p];
init_ctl(S_ctl, p);
init_ctl(L_ctl, p);
init_ctl(S_send_ctl, p);
init_ctl(L_send_ctl, p);
int SPACE = segment_capac;
for (int i = 0; i < p; ++i) {
send_counts[i] = SPACE;
send_displs[i] = i * SPACE;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
recv_counts[i] = CTL_SIZE;
recv_displs[i] = i * CTL_SIZE;
}
// Send S to the correct target
if (S_size) {
for (int i = S_lo, dest_pos = S_global_start,
processor = S_starting_process;
i < S_hi;) {
int next_break =
MIN(int, S_global_end,
MIN(int, dest_pos + (S_hi - S_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
S_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
}
}
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// Send L to the correct target
if (L_size) {
for (int i = L_lo, dest_pos = L_global_start,
processor = L_starting_process;
i < L_hi;) {
int next_break =
MIN(int, L_global_end,
MIN(int, dest_pos + (L_hi - L_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
L_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
}
}
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// After sending S and L information
for (int i = 0; i < p; ++i) {
recv_counts[i] = segment_len;
recv_displs[i] = i * segment_len;
}
// printf("[%d,%d] S CTL INFO\n", rank, total_elems);
// for (int i = 0; i < p; ++i) {
// printf("[%d,%d] [p=%d] (ct=%d)\n", rank, total_elems, i,
// S_send_ctl[i * CTL_SIZE]);
// }
// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
// integers_recv_buf,
// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
// MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
// MPI_INT, comm);
// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
// Scheme for all send
int integers_recv_2[segment_capac];
int integers_recv_3[segment_capac];
for (int i = 0; i < segment_len; ++i) {
integers_recv_2[i] = -1;
integers_recv_3[i] = integers[i];
}
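// Round-robin delivery: in round host_p, that rank receives from every sender
// that owes it data (per the exchanged ctl records), while same-rank chunks
// are copied locally without MPI. This loop moves S; the next one moves L.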
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to whom now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int S_count = S_ctl[sender_p * CTL_SIZE];
if (S_count > 0) {
int to_local_start = S_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = S_ctl[sender_p * CTL_SIZE + 3];
if (sender_p == host_p) {
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
}
continue;
}
// printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + S_count,
// S_count);
err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
sender_p, 124, comm, MPI_STATUS_IGNORE);
check_mpi_error(err);
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
}
}
}
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int S_count = S_send_ctl[dest_p * CTL_SIZE];
if (S_count > 0 && dest_p == host_p) {
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + S_count,
// S_count);
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
comm);
}
}
}
}
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to whom now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int L_count = L_ctl[sender_p * CTL_SIZE];
if (L_count > 0) {
int to_local_start = L_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = L_ctl[sender_p * CTL_SIZE + 3];
if (sender_p == host_p) {
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
}
continue;
}
// printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + L_count,
// L_count);
err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
sender_p, 125, comm, MPI_STATUS_IGNORE);
check_mpi_error(err);
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
}
}
}
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int L_count = L_send_ctl[dest_p * CTL_SIZE];
if (L_count > 0 && dest_p == host_p) {
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + L_count,
// L_count);
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
comm);
}
}
}
}
// printf("[%d,%d] after: %s\n", rank, total_elems,
// string_of_list(integers_recv_3, segment_len));
// printf("[%d,%d] -------------------------------------\n", rank,
// total_elems); for (int i = 0; i < segment_len; ++i) {
// integers[i] = integers_recv_3[i];
// }
// ###################################################################################
// SUBDIVIDING
// Now, determine which processes should be responsible for taking the S and L
// arrays
// Specifically, where the split lands inside a segment, break the tie to
// decide whether that block goes with the lower or upper group
int child_len = segment_len;
int difference = segment_len - split_point;
int transfer[split_point];
// printf("[%d,%d] p_of_split = %d, split_point = %d => (child_len = %d)\n",
// rank, total_elems, p_of_split, split_point, child_len);
int has_split = 0;
if (p_of_split == 0 || p_of_split == p - 1) {
// Super unfortunate, bad pivot
} else if (split_point == 0) {
// Super lucky, it's split evenly!
} else {
has_split = 1;
// If there's any split, the split block itself counts as L, and the rest (its
// S part) is handed to the previous block
if (rank == p_of_split - 1) {
child_len += split_point;
err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
MPI_STATUS_IGNORE);
check_mpi_error(err);
} else if (rank == p_of_split) {
child_len = difference;
err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
check_mpi_error(err);
}
}
// Which group is this child going into?
int color;
if (rank < p_of_split)
color = 100;
else
color = 200;
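// Ranks below p_of_split recurse on S (color 100); the rest recurse on L
// (color 200).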
// printf("[%d,%d] split color = %d, split lenth = %d\n", rank, total_elems,
// color, child_len);
MPI_Comm child_comm;
MPI_Comm_split(comm, color, rank, &child_comm);
// Figure out what the max is
int max_child_buf_len, total_child_elems;
err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
child_comm);
check_mpi_error(err);
err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
child_comm);
check_mpi_error(err);
// printf("[%d] [color=%d] max length = %d, total child elems = %d\n", rank,
// color, max_child_buf_len, total_child_elems);
// Copy into a new buf
int new_buf[max_child_buf_len];
int whichCase = 999;
for (int i = 0; i < max_child_buf_len; ++i) {
if (has_split && rank == p_of_split - 1) {
whichCase = 1001;
if (i < segment_len)
new_buf[i] = integers_recv_3[i];
else if (i < segment_len + split_point)
new_buf[i] = transfer[i - segment_len];
else
new_buf[i] = -1;
} else if (has_split && rank == p_of_split) {
whichCase = 1002;
if (i < difference)
new_buf[i] = integers_recv_3[i + split_point];
else
new_buf[i] = -1;
} else {
whichCase = 1003;
if (i < child_len)
new_buf[i] = integers_recv_3[i];
else
new_buf[i] = -1;
}
}
// printf("[%d,%d] orig integers: %s\n", rank, total_elems,
// string_of_list(integers, segment_len));
// printf("[%d,%d] new buf = %s (has_split = %d, segment_len = %d, case = %d,
// "
// "child_elems = %d)\n",
// rank, total_elems, string_of_list(new_buf, max_child_buf_len),
// has_split, segment_len, whichCase, child_len);
// printf("[%d,%d] \n", rank, total_elems);
int integers_out_buf[total_child_elems];
recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
integers_out_buf, child_comm);
// Ok now copy the new items back
switch (whichCase) {
case 1001:
// In this case, the rank right before the split took extra elements; to
// reverse this, send the sorted extras back to p_of_split
for (int i = 0; i < total_child_elems; ++i) {
if (i < segment_len)
integers_out[i] = integers_out_buf[i];
else
transfer[i - segment_len] = integers_out_buf[i];
}
MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
break;
case 1002:
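// The original array got shortened, so copy the transferred ones back in
// first, then copy the result from the child quicksorting after it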
MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
MPI_STATUS_IGNORE);
for (int i = 0; i < split_point; ++i) {
integers_out[i] = transfer[i];
}
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i + split_point] = integers_out_buf[i];
}
break;
case 1003:
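// This is just the regular case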
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i] = integers_out_buf[i];
}
break;
}
MPI_Comm_free(&child_comm);
}
void init_ctl(int *ctl, int len) {
for (int i = 0; i < len; ++i) {
ctl[i * CTL_SIZE] = 0;
for (int j = 1; j < CTL_SIZE; ++j) {
ctl[i * CTL_SIZE + j] = -1;
}
}
}

View file

@ -3,9 +3,25 @@
#include <stdlib.h>
#include <unistd.h>
// https://stackoverflow.com/a/75458495
#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
void __check_mpi_error(const char *file, const int line, const int n) {
char errbuffer[MPI_MAX_ERROR_STRING];
int errlen;
if (n != MPI_SUCCESS) {
MPI_Error_string(n, errbuffer, &errlen);
printf("MPI-error: %s\n", errbuffer);
printf("Location: %s:%i\n", file, line);
MPI_Abort(MPI_COMM_WORLD, n);
}
}
#define ORDER_FORWARDS 1
#define ORDER_BACKWARDS 2
#define CTL_SIZE 4
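// Each ctl record is CTL_SIZE ints per peer: [0] element count, [1] global
// source start, [2] local destination start, [3] local source start.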
#define ROOT_RANK 0
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
@ -19,7 +35,8 @@
void init_ctl(int *ctl, int len);
void local_quicksort(int *arr, int lo, int hi);
char *string_of_list(int *arr, int len);
void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm);
void recursive_quicksort(int *integers, int n, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm);
int main(int argc, char **argv) {
int rank, p;
@ -42,37 +59,39 @@ int main(int argc, char **argv) {
srand(rank + 1);
for (int i = 0; i < n_over_p; ++i) {
// TODO: For readability during debugging, I'm capping this
integers[i] = rand() % 101;
// printf(" - %d\n", integers[i]);
}
recursive_quicksort(integers, n, 0, MPI_COMM_WORLD);
// sleep(1);
// printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
int new_integers[n_over_p];
recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
MPI_COMM_WORLD);
// The first node is responsible for collecting all the data and then
// printing it out to the file MPI_Gather(const void *sendbuf, int
// sendcount, MPI_INT, void *recvbuf,
// int recvcount, MPI_INT, 0, MPI_COMM_WORLD);
int recvbuf[n];
MPI_Gather(integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
MPI_COMM_WORLD);
// printing it out to the file
if (rank == 0) {
FILE *f = fopen(argv[2], "w");
// printf("integers: %s\n", string_of_list(recvbuf, n));
printf("[%d] ==== FINAL ====\n", rank);
for (int i = 0; i < p; i += 1) {
printf("[%d] %s\n", rank,
string_of_list(&recvbuf[i * n_over_p], n_over_p));
FILE *fp;
if (rank == ROOT_RANK)
fp = fopen(argv[2], "w");
for (int i = 0; i < p; i += 1) {
if (rank == ROOT_RANK) {
if (i != ROOT_RANK) {
MPI_Recv(new_integers, n_over_p, MPI_INT, i, 129, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
for (int j = 0; j < n_over_p; ++j) {
fprintf(fp, "%d\n", new_integers[j]);
}
} else if (rank == i) {
MPI_Send(new_integers, n_over_p, MPI_INT, ROOT_RANK, 129, MPI_COMM_WORLD);
}
fclose(f);
}
if (rank == ROOT_RANK)
fclose(fp);
MPI_Finalize();
printf("Done.\n");
return 0;
}
@ -105,113 +124,112 @@ void local_quicksort(int *arr, int lo, int hi) {
local_quicksort(arr, pivot_idx + 1, hi);
}
char *string_of_list(int *arr, int len) {
char *buffer = calloc(sizeof(char), 1000);
int offset = 0; // Keep track of the current position in the buffer
for (int i = 0; i < len; i++) {
offset += sprintf(buffer + offset, "%d", arr[i]);
if (i < len - 1) {
// Add a separator (e.g., comma or space) if it's not the last element
offset += sprintf(buffer + offset, " ");
}
}
// char *string_of_list(int *arr, int len) {
// char *buffer = calloc(sizeof(char), 1000);
// int offset = 0; // Keep track of the current position in the buffer
// for (int i = 0; i < len; i++) {
// offset += sprintf(buffer + offset, "%d", arr[i]);
// if (i < len - 1) {
// // Add a separator (e.g., comma or space) if it's not the last
// element offset += sprintf(buffer + offset, " ");
// }
// }
return buffer;
}
// return buffer;
// }
void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int rank, p;
void recursive_quicksort(int *integers, int total_elems, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm) {
int err, rank, p;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &rank);
if (p == 1) {
if (p <= 1) {
// Recursion base case: just sort it serially
local_quicksort(integers, 0, n);
printf("Quicksorted: %s\n", string_of_list(integers, n));
local_quicksort(integers, 0, total_elems);
for (int i = 0; i < total_elems; ++i) {
integers_out[i] = integers[i];
}
return;
}
sleep(1);
printf("\n\n");
int n_over_p_max = (n + p - 1) / p;
int n_over_p = n / p;
if (rank == root)
n_over_p += n - p * n_over_p;
// printf(
// "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
// n/p=%d)\n", rank, n, n_over_p);
// Locally sort
// printf("[%d] Numbers before: %s\n", rank,
// string_of_list(integers, n_over_p));
local_quicksort(integers, 0, n_over_p);
printf("[%d] Numbers after first sort: %s\n", rank,
string_of_list(integers, n_over_p));
// Select a pivot.
// This pivot is broadcasted to all nodes
int pivot;
{
// First, select a random element
int rand_el = integers[rand() % n_over_p];
int rand_el = integers[rand() % segment_len];
// Gather it
int rand_els[p];
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, root, comm);
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);
// Get the median
if (rank == root) {
// Sort
if (rank == ROOT_RANK) {
// Get the middle element after sorting
local_quicksort(rand_els, 0, p);
// Get the middle element
pivot = rand_els[p / 2];
}
MPI_Bcast(&pivot, 1, MPI_INT, root, comm);
MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
}
printf("[%d] Broadcasted pivot: %d\n", rank, pivot);
// Determine where the boundary between S (lower) and L (higher) lies
int boundary;
for (int i = 0; i < n_over_p; ++i) {
// If every local element is below the pivot, the whole segment belongs in S.
int boundary = segment_len;
for (int i = 0; i < segment_len; ++i) {
if (integers[i] >= pivot) {
boundary = i;
break;
}
}
int S_lo = 0, S_hi = boundary;
int L_lo = boundary, L_hi = n_over_p;
int L_lo = boundary, L_hi = segment_len;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
// printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
// S_size, L_lo, L_hi, L_size);
// Perform global arrangement
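// MPI_Scan computes inclusive prefix sums: each rank learns the global end
// offset of its S chunk, of its L chunk counted from the back, and of its
// whole segment.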
int S_global_end, L_reverse_end, S_global_max_end;
int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
// printf("[%d] bruh %d\n", rank, S_global_end);
// Get the boundary element between S and L
int index;
MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
int S_global_start = S_global_end - S_size,
L_reverse_start = L_reverse_end - L_size,
L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;
// printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
// S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
L_global_start = total_elems - L_reverse_end,
L_global_end = total_elems - L_reverse_start;
int S_starting_process = S_global_start / n_over_p,
L_starting_process = L_global_start / n_over_p;
int S_offset = S_global_start % n_over_p,
L_offset = L_global_start % n_over_p;
// Determine which process S's and L's destination will start in,
// respectively
// Initialized to 0 so they stay defined even if the scan below finds no match.
int S_starting_process = 0, L_starting_process = 0;
int p_of_split = 0, split_point = 0;
int indexes[p];
{
MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
for (int i = 0; i < p; ++i) {
int lo = i == 0 ? 0 : indexes[i - 1];
int hi = indexes[i];
if (S_global_start >= lo && S_global_start < hi)
S_starting_process = i;
if (L_global_start >= lo && L_global_start < hi)
L_starting_process = i;
if (S_global_max_end >= lo && S_global_max_end < hi) {
p_of_split = i;
split_point = S_global_max_end - lo;
}
}
}
int S_offset = S_global_start % segment_len,
L_offset = L_global_start % segment_len;
int *integers_recv_buf = calloc(sizeof(int), n);
int S_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int recvpart[n_over_p];
int ctl_send_counts[p];
int ctl_send_displs[p];
@ -225,9 +243,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
init_ctl(S_send_ctl, p);
init_ctl(L_send_ctl, p);
int SPACE = segment_capac;
for (int i = 0; i < p; ++i) {
send_counts[i] = n_over_p;
send_displs[i] = i * n_over_p;
send_counts[i] = SPACE;
send_displs[i] = i * SPACE;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
@ -236,28 +256,24 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
}
// Send S to the correct target
{
if (S_size) {
for (int i = S_lo, dest_pos = S_global_start,
processor = S_starting_process;
i < S_hi;) {
int next_break = MIN(int, S_global_end,
MIN(int, dest_pos + (S_hi - S_lo),
(dest_pos / n_over_p) * n_over_p + n_over_p));
int next_break =
MIN(int, S_global_end,
MIN(int, dest_pos + (S_hi - S_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * n_over_p + from_local_start,
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * n_over_p,
to_local_end = to_global_end - processor * n_over_p;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@ -267,34 +283,30 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
dest_pos += count;
processor += 1;
}
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
}
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// Send L to the correct target
{
if (L_size) {
for (int i = L_lo, dest_pos = L_global_start,
processor = L_starting_process;
i < L_hi;) {
int next_break = MIN(int, L_global_end,
MIN(int, dest_pos + (L_hi - L_lo),
(dest_pos / n_over_p) * n_over_p + n_over_p));
int next_break =
MIN(int, L_global_end,
MIN(int, dest_pos + (L_hi - L_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * n_over_p + from_local_start,
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * n_over_p,
to_local_end = to_global_end - processor * n_over_p;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@ -304,29 +316,23 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
dest_pos += count;
processor += 1;
}
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
}
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// After sending S and L information
for (int i = 0; i < p; ++i) {
recv_counts[i] = n_over_p;
recv_displs[i] = i * n_over_p;
recv_counts[i] = segment_len;
recv_displs[i] = i * segment_len;
}
// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
// integers_recv_buf,
// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
// MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
// MPI_INT, comm);
// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
// Algorithm for sending S and L between all processes without O(n)
// Scheme for all send
int integers_recv_2[n_over_p];
int integers_recv_3[n_over_p];
for (int i = 0; i < n_over_p; ++i) {
int integers_recv_2[segment_capac];
int integers_recv_3[segment_capac];
for (int i = 0; i < segment_len; ++i) {
integers_recv_2[i] = -1;
integers_recv_3[i] = integers[i];
}
@ -349,11 +355,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
continue;
}
// printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + S_count,
// S_count);
MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT, sender_p,
124, comm, MPI_STATUS_IGNORE);
err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
sender_p, 124, comm, MPI_STATUS_IGNORE);
check_mpi_error(err);
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
@ -366,9 +370,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int S_count = S_send_ctl[dest_p * CTL_SIZE];
if (S_count > 0 && dest_p == host_p) {
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + S_count,
// S_count);
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
comm);
}
@ -394,11 +395,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
continue;
}
// printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + L_count,
// L_count);
MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT, sender_p,
125, comm, MPI_STATUS_IGNORE);
err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
sender_p, 125, comm, MPI_STATUS_IGNORE);
check_mpi_error(err);
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
@ -411,9 +410,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int L_count = L_send_ctl[dest_p * CTL_SIZE];
if (L_count > 0 && dest_p == host_p) {
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + L_count,
// L_count);
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
comm);
}
@ -421,69 +417,128 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
}
}
printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p));
for (int i = 0; i < n_over_p; ++i) {
integers[i] = integers_recv_3[i];
}
// ###################################################################################
// SUBDIVIDING
// Now, determine which processes should be responsible for taking the S and L
// arrays
// Now, determine which processes should be responsible for taking the S and
// L arrays. Specifically, where the split lands inside a segment, break the
// tie to decide whether that block goes with the lower or upper group
// Specifically, the part where it's split, break the tie to see if it goes
// down or up
int colors[p];
int p_of_split = S_global_max_end / n_over_p;
int split_point = S_global_max_end % n_over_p;
// printf("[%d] p_of_split = %d / %d = %d\n", rank, S_global_max_end,
// n_over_p,
// p_of_split);
int S_split_add = split_point, L_split_sub = n_over_p - split_point;
int child_len = segment_len;
int difference = segment_len - split_point;
int transfer[split_point];
int lo_start = 0, lo_end;
int hi_start, hi_end = p;
if (split_point > n_over_p / 2) {
// Belongs to the lower group
lo_end = hi_start = p_of_split + 1;
int has_split = 0;
if (p_of_split == 0 || p_of_split == p - 1) {
// Super unfortunate, bad pivot
} else if (split_point == 0) {
// Super lucky, it's split evenly!
} else {
// Belongs to the higher group
lo_end = hi_start = p_of_split;
}
int child_root = -1;
for (int i = 0; i < p; ++i) {
if (i < lo_end)
colors[i] = 100;
else {
colors[i] = 200;
if (child_root == -1)
child_root = i;
has_split = 1;
// If there's any split, the split block itself counts as L, and the rest (its
// S part) is handed to the previous block
if (rank == p_of_split - 1) {
child_len += split_point;
err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
MPI_STATUS_IGNORE);
check_mpi_error(err);
} else if (rank == p_of_split) {
child_len = difference;
err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
check_mpi_error(err);
}
}
// MPI_Comm child;
// MPI_Comm_split(comm, colors[rank], rank, &child);
// printf("[%d] Recursing...\n", rank);
// Which group is this child going into?
int color;
if (rank < p_of_split)
color = 100;
else
color = 200;
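// Ranks below p_of_split recurse on S (color 100); the rest recurse on L
// (color 200).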
// int child_size;
// MPI_Comm_size(child, &child_size);
MPI_Comm child_comm;
MPI_Comm_split(comm, color, rank, &child_comm);
// int start_at = 0, new_n = child_size * n_over_p;
// if (colors[rank] == 100) {
// new_n += S_split_add;
// } else {
// new_n -= L_split_sub;
// if (rank == p_of_split)
// start_at = split_point;
// }
// recursive_quicksort(integers, n, child_root, child);
// Figure out what the max is
int max_child_buf_len, total_child_elems;
err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
child_comm);
check_mpi_error(err);
err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
child_comm);
check_mpi_error(err);
// printf("[%d] Done recursing.\n", rank);
// MPI_Comm_free(&child);
// Copy into a new buf
int new_buf[max_child_buf_len];
int whichCase = 999;
for (int i = 0; i < max_child_buf_len; ++i) {
if (has_split && rank == p_of_split - 1) {
whichCase = 1001;
if (i < segment_len)
new_buf[i] = integers_recv_3[i];
else if (i < segment_len + split_point)
new_buf[i] = transfer[i - segment_len];
else
new_buf[i] = -1;
} else if (has_split && rank == p_of_split) {
whichCase = 1002;
if (i < difference)
new_buf[i] = integers_recv_3[i + split_point];
else
new_buf[i] = -1;
} else {
whichCase = 1003;
if (i < child_len)
new_buf[i] = integers_recv_3[i];
else
new_buf[i] = -1;
}
}
int integers_out_buf[total_child_elems];
recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
integers_out_buf, child_comm);
// Ok now copy the new items back
switch (whichCase) {
case 1001:
// In this case, the rank right before the split took extra elements; to
// reverse this, send the sorted extras back to p_of_split
for (int i = 0; i < total_child_elems; ++i) {
if (i < segment_len)
integers_out[i] = integers_out_buf[i];
else
transfer[i - segment_len] = integers_out_buf[i];
}
MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
break;
case 1002:
// The original array got shortened, so copy the transferred ones back in
// first, then copy the result from the child quicksorting after it
MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
MPI_STATUS_IGNORE);
for (int i = 0; i < split_point; ++i) {
integers_out[i] = transfer[i];
}
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i + split_point] = integers_out_buf[i];
}
break;
case 1003:
// This is just the regular case
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i] = integers_out_buf[i];
}
break;
}
MPI_Comm_free(&child_comm);
}
void init_ctl(int *ctl, int len) {
for (int i = 0; i < len; ++i) {
for (int j = 0; j < CTL_SIZE; ++j) {
ctl[i * CTL_SIZE] = 0;
for (int j = 1; j < CTL_SIZE; ++j) {
ctl[i * CTL_SIZE + j] = -1;
}
}

BIN
assignments/02/report.pdf Normal file

Binary file not shown.

20
assignments/02/report.typ Normal file
View file

@ -0,0 +1,20 @@
= Homework 2
My algorithm works like this:
- First I generate $n/p$ integers on each process.
- Then I jump directly into the recursive step:
  - I choose the pivot by having each process pick a random element; the median of those picks becomes the pivot.
  - The way I move the $S$ and $L$ arrays around is (see the sketch after this list):
    1. First, `MPI_Alltoallv` a plan describing _which_ processors will be sent to, including exactly which local indices are copied from and to.
    2. Then, each processor loops through all the processors, and the ones that have something to send send it.
    3. This way all of the senders/receivers are coordinated, and the ones with nothing to send do nothing.
  - For the recursion, I let the segments at each recursive step have different lengths. (*NOTE:* I keep a separate "capacity" and "length" because I opted to make the `displs` entries all the same size, so the shorter segments carry extra padding.)
  - If the boundary between $S$ and $L$ falls inside an $n/p$ segment, I extend the segment before it and shorten the one after.
  - Then, I recursively process all the $S$'s and all the $L$'s separately using `MPI_Comm_split`.
  - Once a level is done processing, I reverse the exact operation that extended/shortened the arrays. This ensures everything is back to $n/p$ at the end.
- Everything is collected back at the end via a `Send`/`Recv` to save on allocations.
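For concreteness, here is a condensed sketch of the two coordination steps described above, with error checking and the split bookkeeping stripped out. The helper names `select_pivot` and `exchange_plan` are invented for this sketch only; the full logic lives in `qs_mpi.c`, which reuses its own `local_quicksort` where the sketch calls `qsort`.

```c
#include <mpi.h>
#include <stdlib.h>

#define CTL_SIZE 4
#define ROOT_RANK 0

static int cmp_int(const void *a, const void *b) {
  int x = *(const int *)a, y = *(const int *)b;
  return (x > y) - (x < y);
}

// Median-of-random-picks pivot: every rank contributes one random local
// element; the root sorts the p picks and broadcasts the middle one.
static int select_pivot(const int *seg, int seg_len, MPI_Comm comm) {
  int rank, p, pivot = 0;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &p);
  int pick = seg[rand() % seg_len];
  int picks[p]; // VLA, as in the main source
  MPI_Gather(&pick, 1, MPI_INT, picks, 1, MPI_INT, ROOT_RANK, comm);
  if (rank == ROOT_RANK) {
    qsort(picks, p, sizeof(int), cmp_int);
    pivot = picks[p / 2];
  }
  MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
  return pivot;
}

// Plan exchange: each rank fills one CTL_SIZE record per destination
// (count, global source start, local destination start, local source
// start); MPI_Alltoallv transposes the table so each rank ends up with
// one record per sender before any element data moves point-to-point.
static void exchange_plan(const int *send_ctl, int *recv_ctl, int p,
                          MPI_Comm comm) {
  int counts[p], displs[p];
  for (int i = 0; i < p; ++i) {
    counts[i] = CTL_SIZE;
    displs[i] = i * CTL_SIZE;
  }
  MPI_Alltoallv(send_ctl, counts, displs, MPI_INT, recv_ctl, counts, displs,
                MPI_INT, comm);
}
```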
Allocations are all on the order of $O(p + n/p)$.
Unfortunately, I didn't finish debugging the segfaults in time, so this report covers the parts of the assignment that I _did_ complete. The program works on small integers (capped at 100), but for some reason it segfaults at address `(nil)` at the end... I spent several hours debugging but have not discovered the cause.