Michael Zhang 2023-10-31 04:37:41 +00:00
parent a01580087f
commit 06d3e930ec
9 changed files with 967 additions and 214 deletions

View file

@@ -14,17 +14,18 @@
   "customizations": {
     "vscode": {
       "extensions": [
         "eamodio.gitlens",
         "esbenp.prettier-vscode",
         "llvm-vs-code-extensions.vscode-clangd",
         "ms-azuretools.vscode-docker",
         "ms-python.python",
         "ms-python.vscode-pylance",
         "ms-vscode.cpptools",
         "ms-vscode.makefile-tools",
         "rust-lang.rust-analyzer",
-        "tomoki1207.pdf"
+        "tomoki1207.pdf",
+        "nvarner.typst-lsp"
       ]
     }
   }

View file

@@ -1,4 +1,7 @@
 ARG DEBIAN_FRONTEND=noninteractive
+FROM ghcr.io/typst/typst:latest as typst
 FROM ubuntu:22.04
 ENV PATH="/root/.cargo/bin:${PATH}"
@@ -22,5 +25,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
   ;
 RUN pip install poetry
+COPY --from=typst /bin/typst /usr/bin/typst
 RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
 RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc

View file

@@ -1,4 +1,7 @@
 qs_mpi
 *.o
 compile_commands.json
 .cache
+output*.txt
+*.tar.gz

View file

@@ -1,17 +1,30 @@
-.PHONY: all clean run-example
+.PHONY: all handin clean run-example
 
 CC := cc
-CFLAGS := -g
-LDFLAGS := -g
+# CFLAGS := -g -O0
+# LDFLAGS := -g
+CFLAGS := -O3
+LDFLAGS :=
 
 CFLAGS += $(shell pkg-config --cflags mpi)
 LDFLAGS += $(shell pkg-config --libs mpi)
 
 all: qs_mpi
 
+handin: zhan4854.tar.gz
+
+zhan4854.tar.gz: Makefile ASSIGNMENT.md qs_mpi.c report.pdf
+	mkdir -p zhan4854
+	cp $^ zhan4854
+	tar -czvf $@ zhan4854
+	rm -r zhan4854
+
 run-example: qs_mpi
-	mpirun --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
+	mpirun -v --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
 
+report.pdf: report.typ
+	typst compile $< $@
+
 qs_mpi: qs_mpi.o
 	$(CC) $^ $(CFLAGS) $(LDFLAGS) -o $@

View file

@@ -1,24 +1,25 @@
 import sys
 import re
 
-pat = re.compile(r"\[(\d+)\] (.*)")
+pat = re.compile(r"\[(\d+),(-?\d+)\] (.*)")
 
 outputs = {}
-for line in sys.stdin.readlines():
+for i, line in enumerate(sys.stdin.readlines()):
     m = pat.match(line)
     if not m:
         # print(line)
         continue
 
     p = int(m.group(1))
-    rest = m.group(2)
+    n = int(m.group(2))
+    rest = m.group(3)
 
-    if p not in outputs: outputs[p] = []
-    outputs[p].append(rest)
+    if (p, n) not in outputs: outputs[p, n] = (i, [])
+    outputs[p, n][1].append(rest)
 
-for p in sorted(outputs.keys()):
-    lines = outputs[p]
-    print(f"---- {p} ----")
+for ((p, n), (i, lines)) in sorted(outputs.items(), key=lambda v: (-v[0][1], v[0][0])):
+    # lines = outputs[p, n]
+    print(f"---- {p} [{n}] ----")
     for line in lines:
         print(line)
     print()

View file

@@ -0,0 +1,655 @@
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// https://stackoverflow.com/a/75458495
#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
void __check_mpi_error(const char *file, const int line, const int n) {
  char errbuffer[MPI_MAX_ERROR_STRING];
  int errlen;
  if (n != MPI_SUCCESS) {
    MPI_Error_string(n, errbuffer, &errlen);
    printf("MPI-error: %s\n", errbuffer);
    printf("Location: %s:%i\n", file, line);
    MPI_Abort(MPI_COMM_WORLD, n);
  }
}

#define ORDER_FORWARDS 1
#define ORDER_BACKWARDS 2
#define CTL_SIZE 4
#define ROOT_RANK 0

#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
#define ENSURE_int(i) _Generic((i), int : (i))
#define ENSURE_float(f) _Generic((f), float : (f))
#define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
#define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))

void init_ctl(int *ctl, int len);
void local_quicksort(int *arr, int lo, int hi);
char *string_of_list(int *arr, int len);
void recursive_quicksort(int *integers, int n, int segment_capac,
                         int segment_len, int *integers_out, MPI_Comm comm);

int main(int argc, char **argv) {
  int rank, p;
  MPI_Init(&argc, &argv);

  int n = atoi(argv[1]);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &p);

  // Generate integers
  int n_over_p = n / p;
  int integers[n_over_p];

  // Minor implementation detail: srand(0) is specially handled by glibc to
  // behave as if it was called with srand(1). To get around this, I'm seeding
  // with rank + 1
  //
  // See more: https://stackoverflow.com/a/27386563
  srand(rank + 1);
  for (int i = 0; i < n_over_p; ++i) {
    integers[i] = rand();
    // printf(" - %d\n", integers[i]);
  }
  // printf("[%d,9999999999] GENERATED INTEGERS: %s\n", rank,
  //        string_of_list(integers, n_over_p));

  int new_integers[n_over_p];
  recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
                      MPI_COMM_WORLD);

  // sleep(1);
  // printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));

  // The first node is responsible for collecting all the data and then
  // printing it out to the file MPI_Gather(const void *sendbuf, int
  // sendcount, MPI_INT, void *recvbuf,
  //           int recvcount, MPI_INT, 0, MPI_COMM_WORLD);
  int recvbuf[n];
  MPI_Gather(new_integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
             MPI_COMM_WORLD);

  if (rank == 0) {
    FILE *fp = fopen(argv[2], "w");
    // printf("integers: %s\n", string_of_list(recvbuf, n));
    // printf("[%d,-1] ==== FINAL ====\n", rank);
    for (int i = 0; i < n; i += 1) {
      fprintf(fp, "%d\n", recvbuf[i]);
      // printf("[%d,-1] %s\n", rank,
      //        string_of_list(&recvbuf[i * n_over_p], n_over_p));
    }
    fclose(fp);
  }

  MPI_Finalize();
  // printf("Done.\n");

  return 0;
}

// hi not inclusive
void local_quicksort(int *arr, int lo, int hi) {
  int temp;
  if (lo >= hi || lo < 0)
    return;

  int pivot = arr[hi - 1];
  int pivot_idx = lo - 1;
  for (int j = lo; j < hi; ++j) {
    if (arr[j] < pivot) {
      pivot_idx += 1;

      temp = arr[j];
      arr[j] = arr[pivot_idx];
      arr[pivot_idx] = temp;
    }
  }
  pivot_idx += 1;

  temp = arr[hi - 1];
  arr[hi - 1] = arr[pivot_idx];
  arr[pivot_idx] = temp;

  // Recursive call
  local_quicksort(arr, lo, pivot_idx);
  local_quicksort(arr, pivot_idx + 1, hi);
}

// char *string_of_list(int *arr, int len) {
//   char *buffer = calloc(sizeof(char), 1000);
//   int offset = 0; // Keep track of the current position in the buffer
//   for (int i = 0; i < len; i++) {
//     offset += sprintf(buffer + offset, "%d", arr[i]);
//     if (i < len - 1) {
//       // Add a separator (e.g., comma or space) if it's not the last element
//       offset += sprintf(buffer + offset, " ");
//     }
//   }
//   return buffer;
// }

void recursive_quicksort(int *integers, int total_elems, int segment_capac,
                         int segment_len, int *integers_out, MPI_Comm comm) {
  int err, rank, p;
  MPI_Comm_size(comm, &p);
  MPI_Comm_rank(comm, &rank);

  // printf(
  //     "[%d,%d] recursive_quicksort([%s], total=%d, capac=%d, len=%d)
  //     {p=%d}\n", rank, total_elems, string_of_list(integers, segment_len),
  //     total_elems, segment_capac, segment_len, p);

  if (p <= 1) {
    // Recursion base case: just sort it serially
    local_quicksort(integers, 0, total_elems);
    for (int i = 0; i < total_elems; ++i) {
      integers_out[i] = integers[i];
    }
    // printf("Quicksorted: %s\n", string_of_list(integers, total_elems));
    return;
  }

  // sleep(1);
  // printf("\n\n");

  // int segment_capac = (total_elems + p - 1) / p;
  // int segment_len = total_elems / p;
  // if (rank == ROOT_RANK)
  //   segment_len += total_elems - p * segment_len;
  // printf("[%d,%d] capac: %d, len: %d\n", rank, total_elems, segment_capac,
  //        segment_len);

  // printf(
  //     "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
  //     n/p=%d)\n", rank, n, n_over_p);

  // Locally sort
  // printf("[%d] Numbers before: %s\n", rank,
  //        string_of_list(integers, n_over_p));
  local_quicksort(integers, 0, segment_len);
  // printf("[%d] Numbers after first sort: %s\n", rank,
  //        string_of_list(integers, segment_len));

  // Select a pivot.
  // This pivot is broadcasted to all nodes
  int pivot;
  {
    // First, select a random element
    int rand_el = integers[rand() % segment_len];

    // Gather it
    int rand_els[p];
    MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);

    // Get the median
    if (rank == ROOT_RANK) {
      // Sort
      local_quicksort(rand_els, 0, p);
      // printf("[%d,%d] Local quicksort for pivot: %s\n", rank, total_elems,
      //        string_of_list(rand_els, p));

      // Get the middle element
      pivot = rand_els[p / 2];
    }
    MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
  }
  // printf("[%d,%d] Broadcasted pivot: %d\n", rank, total_elems, pivot);

  // Determine where the boundary between S (lower) and L (higher) lies
  int boundary = 0;
  for (int i = 0; i < segment_len; ++i) {
    if (integers[i] >= pivot) {
      boundary = i;
      break;
    }
  }
  // printf("[%d,%d] boundary: %d\n", rank, total_elems, boundary);

  int S_lo = 0, S_hi = boundary;
  int L_lo = boundary, L_hi = segment_len;
  int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
  // printf("[%d,%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, total_elems,
  //        S_lo, S_hi, S_size, L_lo, L_hi, L_size);

  // Perform global arrangement
  int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
  MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
  MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
  int index;
  MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
  // printf("[%d] bruh %d\n", rank, S_global_end);

  // Get the boundary element between S and L
  MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);

  int S_global_start = S_global_end - S_size,
      L_reverse_start = L_reverse_end - L_size,
      L_global_start = total_elems - L_reverse_end,
      L_global_end = total_elems - L_reverse_start;
  // printf("[%d,%d] Prefixed S: [%d - %d) (%d), Prefixed L: [%d - %d) (%d)\n",
  //        rank, total_elems, S_global_start, S_global_end, S_size,
  //        L_global_start, L_global_end, L_size);

  // Determine which process S's and L's destination will start in, respectively
  int S_starting_process, L_starting_process;
  int p_of_split, split_point;
  // int split_point = S_global_max_end % segment_len;
  int indexes[p];
  {
    MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
    for (int i = 0; i < p; ++i) {
      int lo = i == 0 ? 0 : indexes[i - 1];
      int hi = indexes[i];
      if (S_global_start >= lo && S_global_start < hi)
        S_starting_process = i;
      if (L_global_start >= lo && L_global_start < hi)
        L_starting_process = i;
      if (S_global_max_end >= lo && S_global_max_end < hi) {
        p_of_split = i;
        split_point = S_global_max_end - lo;
      }
    }
    // err = MPI_Bcast(&S_starting_process, 1, MPI_INT, ROOT_RANK, comm);
    // check_mpi_error(err);
    // err = MPI_Bcast(&L_starting_process, 1, MPI_INT, ROOT_RANK, comm);
    // check_mpi_error(err);
  }
  // printf("[%d,%d] indexes: %s\n", rank, total_elems,
  //        string_of_list(indexes, p));
  // printf("[%d,%d] S=%d starts at %d , L=%d starts at %d , indexes: %s\n",
  //        rank,
  //        total_elems, S_global_start, S_starting_process, L_global_start,
  //        L_starting_process, string_of_list(indexes, p));

  // S_starting_process = S_global_start / segment_len;
  // L_starting_process = L_global_start / segment_len;
  int S_offset = S_global_start % segment_len,
      L_offset = L_global_start % segment_len;

  int S_ctl[p * CTL_SIZE];
  int L_ctl[p * CTL_SIZE];
  int S_send_ctl[p * CTL_SIZE];
  int L_send_ctl[p * CTL_SIZE];

  int ctl_send_counts[p];
  int ctl_send_displs[p];
  int send_counts[p];
  int send_displs[p];
  int recv_counts[p];
  int recv_displs[p];

  init_ctl(S_ctl, p);
  init_ctl(L_ctl, p);
  init_ctl(S_send_ctl, p);
  init_ctl(L_send_ctl, p);

  int SPACE = segment_capac;
  for (int i = 0; i < p; ++i) {
    send_counts[i] = SPACE;
    send_displs[i] = i * SPACE;
    ctl_send_counts[i] = CTL_SIZE;
    ctl_send_displs[i] = i * CTL_SIZE;
    recv_counts[i] = CTL_SIZE;
    recv_displs[i] = i * CTL_SIZE;
  }

  // Send S to the correct target
  if (S_size) {
    for (int i = S_lo, dest_pos = S_global_start,
             processor = S_starting_process;
         i < S_hi;) {
      int next_break =
          MIN(int, S_global_end,
              MIN(int, dest_pos + (S_hi - S_lo),
                  (dest_pos / segment_len) * segment_len + segment_len));
      int count = next_break - dest_pos;

      int from_local_start = i, from_local_end = i + count;
      int from_global_start = rank * segment_len + from_local_start,
          from_global_end = from_global_start + count;
      int to_global_start = dest_pos, to_global_end = dest_pos + count;
      int to_local_start = to_global_start - processor * segment_len,
          to_local_end = to_global_end - processor * segment_len;

      // printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
      //        "p#%d [%d..%d] {%d..%d}\n",
      //        rank, count, from_local_start, from_local_end,
      //        from_global_start, from_global_end, processor, to_local_start,
      //        to_local_end, to_global_start, to_global_end);

      S_send_ctl[processor * CTL_SIZE] = count;
      S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
      S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
      S_send_ctl[processor * CTL_SIZE + 3] = from_local_start;

      i += count;
      dest_pos += count;
      processor += 1;
    }
  }
  MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
                recv_counts, recv_displs, MPI_INT, comm);

  // Send L to the correct target
  if (L_size) {
    for (int i = L_lo, dest_pos = L_global_start,
             processor = L_starting_process;
         i < L_hi;) {
      int next_break =
          MIN(int, L_global_end,
              MIN(int, dest_pos + (L_hi - L_lo),
                  (dest_pos / segment_len) * segment_len + segment_len));
      int count = next_break - dest_pos;

      int from_local_start = i, from_local_end = i + count;
      int from_global_start = rank * segment_len + from_local_start,
          from_global_end = from_global_start + count;
      int to_global_start = dest_pos, to_global_end = dest_pos + count;
      int to_local_start = to_global_start - processor * segment_len,
          to_local_end = to_global_end - processor * segment_len;

      // printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
      //        "p#%d [%d..%d] {%d..%d}\n",
      //        rank, count, from_local_start, from_local_end,
      //        from_global_start, from_global_end, processor, to_local_start,
      //        to_local_end, to_global_start, to_global_end);

      L_send_ctl[processor * CTL_SIZE] = count;
      L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
      L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
      L_send_ctl[processor * CTL_SIZE + 3] = from_local_start;

      i += count;
      dest_pos += count;
      processor += 1;
    }
  }
  MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
                recv_counts, recv_displs, MPI_INT, comm);

  // After sending S and L information
  for (int i = 0; i < p; ++i) {
    recv_counts[i] = segment_len;
    recv_displs[i] = i * segment_len;
  }

  // printf("[%d,%d] S CTL INFO\n", rank, total_elems);
  // for (int i = 0; i < p; ++i) {
  //   printf("[%d,%d] [p=%d] (ct=%d)\n", rank, total_elems, i,
  //          S_send_ctl[i * CTL_SIZE]);
  // }

  // MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
  //               integers_recv_buf,
  //               recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
  // MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
  //               MPI_INT, comm);
  // printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));

  // Scheme for all send
  int integers_recv_2[segment_capac];
  int integers_recv_3[segment_capac];
  for (int i = 0; i < segment_len; ++i) {
    integers_recv_2[i] = -1;
    integers_recv_3[i] = integers[i];
  }

  for (int host_p = 0; host_p < p; ++host_p) {
    if (rank == host_p) {
      // Your {S,L}_ctl is a mapping from source_processor -> ctl
      // Everyone already knows who needs to send to who now
      for (int sender_p = 0; sender_p < p; ++sender_p) {
        int S_count = S_ctl[sender_p * CTL_SIZE];
        if (S_count > 0) {
          int to_local_start = S_ctl[sender_p * CTL_SIZE + 2];
          int from_local_start = S_ctl[sender_p * CTL_SIZE + 3];
          if (sender_p == host_p) {
            for (int k = 0; k < S_count; ++k) {
              integers_recv_3[to_local_start + k] =
                  integers[from_local_start + k];
            }
            continue;
          }

          // printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
          //        sender_p, to_local_start, to_local_start + S_count,
          //        S_count);
          err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
                         sender_p, 124, comm, MPI_STATUS_IGNORE);
          check_mpi_error(err);
          for (int k = 0; k < S_count; ++k) {
            integers_recv_3[to_local_start + k] =
                integers_recv_2[to_local_start + k];
          }
        }
      }
    } else {
      // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
      for (int dest_p = 0; dest_p < p; ++dest_p) {
        int S_count = S_send_ctl[dest_p * CTL_SIZE];
        if (S_count > 0 && dest_p == host_p) {
          int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
          // printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
          //        dest_p, from_local_start, from_local_start + S_count,
          //        S_count);
          MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
                   comm);
        }
      }
    }
  }

  for (int host_p = 0; host_p < p; ++host_p) {
    if (rank == host_p) {
      // Your {S,L}_ctl is a mapping from source_processor -> ctl
      // Everyone already knows who needs to send to who now
      for (int sender_p = 0; sender_p < p; ++sender_p) {
        int L_count = L_ctl[sender_p * CTL_SIZE];
        if (L_count > 0) {
          int to_local_start = L_ctl[sender_p * CTL_SIZE + 2];
          int from_local_start = L_ctl[sender_p * CTL_SIZE + 3];
          if (sender_p == host_p) {
            for (int k = 0; k < L_count; ++k) {
              integers_recv_3[to_local_start + k] =
                  integers[from_local_start + k];
            }
            continue;
          }

          // printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
          //        sender_p, to_local_start, to_local_start + L_count,
          //        L_count);
          err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
                         sender_p, 125, comm, MPI_STATUS_IGNORE);
          check_mpi_error(err);
          for (int k = 0; k < L_count; ++k) {
            integers_recv_3[to_local_start + k] =
                integers_recv_2[to_local_start + k];
          }
        }
      }
    } else {
      // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
      for (int dest_p = 0; dest_p < p; ++dest_p) {
        int L_count = L_send_ctl[dest_p * CTL_SIZE];
        if (L_count > 0 && dest_p == host_p) {
          int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
          // printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
          //        dest_p, from_local_start, from_local_start + L_count,
          //        L_count);
          MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
                   comm);
        }
      }
    }
  }

  // printf("[%d,%d] after: %s\n", rank, total_elems,
  //        string_of_list(integers_recv_3, segment_len));
  // printf("[%d,%d] -------------------------------------\n", rank,
  //        total_elems); for (int i = 0; i < segment_len; ++i) {
  //   integers[i] = integers_recv_3[i];
  // }

  // ###################################################################################
  // SUBDIVIDING

  // Now, determine which processes should be responsible for taking the S and L
  // arrays
  // Specifically, the part where it's split, break the tie to see if it goes
  // down or up
  int child_len = segment_len;
  int difference = segment_len - split_point;
  int transfer[split_point];
  // printf("[%d,%d] p_of_split = %d, split_point = %d => (child_len = %d)\n",
  //        rank, total_elems, p_of_split, split_point, child_len);

  int has_split = 0;
  if (p_of_split == 0 || p_of_split == p - 1) {
    // Super unfortunate, bad pivot
  } else if (split_point == 0) {
    // Super lucky, it's split evenly!
  } else {
    has_split = 1;
    // Let's just say that if there's any split, the block itself counts as L
    // and then add the rest to the previous block
    if (rank == p_of_split - 1) {
      child_len += split_point;
      err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
                     MPI_STATUS_IGNORE);
      check_mpi_error(err);
    } else if (rank == p_of_split) {
      child_len = difference;
      err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
      check_mpi_error(err);
    }
  }

  // Which group is this child going into?
  int color;
  if (rank < p_of_split)
    color = 100;
  else
    color = 200;
  // printf("[%d,%d] split color = %d, split lenth = %d\n", rank, total_elems,
  //        color, child_len);

  MPI_Comm child_comm;
  MPI_Comm_split(comm, color, rank, &child_comm);

  // Figure out what the max is
  int max_child_buf_len, total_child_elems;
  err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
                      child_comm);
  check_mpi_error(err);
  err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
                      child_comm);
  check_mpi_error(err);
  // printf("[%d] [color=%d] max length = %d, total child elems = %d\n", rank,
  //        color, max_child_buf_len, total_child_elems);

  // Copy into a new buf
  int new_buf[max_child_buf_len];
  int whichCase = 999;
  for (int i = 0; i < max_child_buf_len; ++i) {
    if (has_split && rank == p_of_split - 1) {
      whichCase = 1001;
      if (i < segment_len)
        new_buf[i] = integers_recv_3[i];
      else if (i < segment_len + split_point)
        new_buf[i] = transfer[i - segment_len];
      else
        new_buf[i] = -1;
    } else if (has_split && rank == p_of_split) {
      whichCase = 1002;
      if (i < difference)
        new_buf[i] = integers_recv_3[i + split_point];
      else
        new_buf[i] = -1;
    } else {
      whichCase = 1003;
      if (i < child_len)
        new_buf[i] = integers_recv_3[i];
      else
        new_buf[i] = -1;
    }
  }

  // printf("[%d,%d] orig integers: %s\n", rank, total_elems,
  //        string_of_list(integers, segment_len));
  // printf("[%d,%d] new buf = %s (has_split = %d, segment_len = %d, case = %d,
  // "
  //        "child_elems = %d)\n",
  //        rank, total_elems, string_of_list(new_buf, max_child_buf_len),
  //        has_split, segment_len, whichCase, child_len);
  // printf("[%d,%d] \n", rank, total_elems);

  int integers_out_buf[total_child_elems];
  recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
                      integers_out_buf, child_comm);

  // Ok now copy the new items back
  switch (whichCase) {
  case 1001:
    // In this case, p is right before the split, so it got extra elements
    // To reverse this, we can send the elements back to the second
    for (int i = 0; i < total_child_elems; ++i) {
      if (i < segment_len)
        integers_out[i] = integers_out_buf[i];
      else
        transfer[i - segment_len] = integers_out_buf[i];
    }
    MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
    break;
  case 1002:
    MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
             MPI_STATUS_IGNORE);
    for (int i = 0; i < split_point; ++i) {
      integers_out[i] = transfer[i];
    }
    for (int i = 0; i < total_child_elems; ++i) {
      integers_out[i + split_point] = integers_out_buf[i];
    }
    break;
  case 1003:
    for (int i = 0; i < total_child_elems; ++i) {
      integers_out[i] = integers_out_buf[i];
    }
    break;
  }

  MPI_Comm_free(&child_comm);
}

void init_ctl(int *ctl, int len) {
  for (int i = 0; i < len; ++i) {
    ctl[i * CTL_SIZE] = 0;
    for (int j = 1; j < CTL_SIZE; ++j) {
      ctl[i * CTL_SIZE + j] = -1;
    }
  }
}

View file

@@ -3,9 +3,25 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+// https://stackoverflow.com/a/75458495
+#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
+void __check_mpi_error(const char *file, const int line, const int n) {
+  char errbuffer[MPI_MAX_ERROR_STRING];
+  int errlen;
+  if (n != MPI_SUCCESS) {
+    MPI_Error_string(n, errbuffer, &errlen);
+    printf("MPI-error: %s\n", errbuffer);
+    printf("Location: %s:%i\n", file, line);
+    MPI_Abort(MPI_COMM_WORLD, n);
+  }
+}
+
 #define ORDER_FORWARDS 1
 #define ORDER_BACKWARDS 2
 #define CTL_SIZE 4
+#define ROOT_RANK 0
 
 #define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
 #define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
@@ -19,7 +35,8 @@
 void init_ctl(int *ctl, int len);
 void local_quicksort(int *arr, int lo, int hi);
 char *string_of_list(int *arr, int len);
-void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm);
+void recursive_quicksort(int *integers, int n, int segment_capac,
+                         int segment_len, int *integers_out, MPI_Comm comm);
 
 int main(int argc, char **argv) {
   int rank, p;
@@ -42,37 +59,39 @@ int main(int argc, char **argv) {
   srand(rank + 1);
 
   for (int i = 0; i < n_over_p; ++i) {
+    // TODO: For readability during debugging, I'm capping this
     integers[i] = rand() % 101;
+    // printf(" - %d\n", integers[i]);
   }
 
-  recursive_quicksort(integers, n, 0, MPI_COMM_WORLD);
-
-  // sleep(1);
-  // printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
+  int new_integers[n_over_p];
+  recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
+                      MPI_COMM_WORLD);
 
   // The first node is responsible for collecting all the data and then
-  // printing it out to the file MPI_Gather(const void *sendbuf, int
-  // sendcount, MPI_INT, void *recvbuf,
-  //           int recvcount, MPI_INT, 0, MPI_COMM_WORLD);
-  int recvbuf[n];
-  MPI_Gather(integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
-             MPI_COMM_WORLD);
-
-  if (rank == 0) {
-    FILE *f = fopen(argv[2], "w");
-    // printf("integers: %s\n", string_of_list(recvbuf, n));
-    printf("[%d] ==== FINAL ====\n", rank);
-    for (int i = 0; i < p; i += 1) {
-      printf("[%d] %s\n", rank,
-             string_of_list(&recvbuf[i * n_over_p], n_over_p));
+  // printing it out to the file
+  FILE *fp;
+  if (rank == ROOT_RANK)
+    fp = fopen(argv[2], "w");
+
+  for (int i = 0; i < p; i += 1) {
+    if (rank == ROOT_RANK) {
+      if (i != ROOT_RANK) {
+        MPI_Recv(new_integers, n_over_p, MPI_INT, i, 129, MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+      }
+      for (int j = 0; j < n_over_p; ++j) {
+        fprintf(fp, "%d\n", new_integers[j]);
+      }
+    } else if (rank == i) {
+      MPI_Send(new_integers, n_over_p, MPI_INT, ROOT_RANK, 129, MPI_COMM_WORLD);
     }
-    fclose(f);
   }
+  if (rank == ROOT_RANK)
+    fclose(fp);
 
   MPI_Finalize();
-  printf("Done.\n");
 
   return 0;
 }
@@ -105,113 +124,112 @@ void local_quicksort(int *arr, int lo, int hi) {
   local_quicksort(arr, pivot_idx + 1, hi);
 }
 
-char *string_of_list(int *arr, int len) {
-  char *buffer = calloc(sizeof(char), 1000);
-  int offset = 0; // Keep track of the current position in the buffer
-  for (int i = 0; i < len; i++) {
-    offset += sprintf(buffer + offset, "%d", arr[i]);
-    if (i < len - 1) {
-      // Add a separator (e.g., comma or space) if it's not the last element
-      offset += sprintf(buffer + offset, " ");
-    }
-  }
-  return buffer;
-}
+// char *string_of_list(int *arr, int len) {
+//   char *buffer = calloc(sizeof(char), 1000);
+//   int offset = 0; // Keep track of the current position in the buffer
+//   for (int i = 0; i < len; i++) {
+//     offset += sprintf(buffer + offset, "%d", arr[i]);
+//     if (i < len - 1) {
+//       // Add a separator (e.g., comma or space) if it's not the last
+//       element offset += sprintf(buffer + offset, " ");
+//     }
+//   }
+//   return buffer;
+// }
 
-void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
-  int rank, p;
+void recursive_quicksort(int *integers, int total_elems, int segment_capac,
+                         int segment_len, int *integers_out, MPI_Comm comm) {
+  int err, rank, p;
   MPI_Comm_size(comm, &p);
   MPI_Comm_rank(comm, &rank);
 
-  if (p == 1) {
+  if (p <= 1) {
     // Recursion base case: just sort it serially
-    local_quicksort(integers, 0, n);
-    printf("Quicksorted: %s\n", string_of_list(integers, n));
+    local_quicksort(integers, 0, total_elems);
+    for (int i = 0; i < total_elems; ++i) {
+      integers_out[i] = integers[i];
+    }
     return;
   }
 
-  sleep(1);
-  printf("\n\n");
-
-  int n_over_p_max = (n + p - 1) / p;
-  int n_over_p = n / p;
-  if (rank == root)
-    n_over_p += n - p * n_over_p;
-
-  // printf(
-  //     "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
-  //     n/p=%d)\n", rank, n, n_over_p);
-
-  // Locally sort
-  // printf("[%d] Numbers before: %s\n", rank,
-  //        string_of_list(integers, n_over_p));
-  local_quicksort(integers, 0, n_over_p);
-  printf("[%d] Numbers after first sort: %s\n", rank,
-         string_of_list(integers, n_over_p));
-
   // Select a pivot.
   // This pivot is broadcasted to all nodes
   int pivot;
   {
     // First, select a random element
-    int rand_el = integers[rand() % n_over_p];
+    int rand_el = integers[rand() % segment_len];
 
     // Gather it
     int rand_els[p];
-    MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, root, comm);
+    MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);
 
     // Get the median
-    if (rank == root) {
-      // Sort
+    if (rank == ROOT_RANK) {
+      // Get the middle element after sorting
       local_quicksort(rand_els, 0, p);
-      // Get the middle element
       pivot = rand_els[p / 2];
     }
-    MPI_Bcast(&pivot, 1, MPI_INT, root, comm);
+    MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
   }
-  printf("[%d] Broadcasted pivot: %d\n", rank, pivot);
 
   // Determine where the boundary between S (lower) and L (higher) lies
-  int boundary;
-  for (int i = 0; i < n_over_p; ++i) {
+  int boundary = 0;
+  for (int i = 0; i < segment_len; ++i) {
     if (integers[i] >= pivot) {
       boundary = i;
       break;
     }
   }
 
   int S_lo = 0, S_hi = boundary;
-  int L_lo = boundary, L_hi = n_over_p;
+  int L_lo = boundary, L_hi = segment_len;
   int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
-  // printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
-  //        S_size, L_lo, L_hi, L_size);
 
   // Perform global arrangement
-  int S_global_end, L_reverse_end, S_global_max_end;
+  int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
   MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
   MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
-  // printf("[%d] bruh %d\n", rank, S_global_end);
-  // Get the boundary element between S and L
+  int index;
+  MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
   MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
 
   int S_global_start = S_global_end - S_size,
       L_reverse_start = L_reverse_end - L_size,
-      L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;
-  // printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
-  //        S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
+      L_global_start = total_elems - L_reverse_end,
+      L_global_end = total_elems - L_reverse_start;
 
-  int S_starting_process = S_global_start / n_over_p,
-      L_starting_process = L_global_start / n_over_p;
-  int S_offset = S_global_start % n_over_p,
-      L_offset = L_global_start % n_over_p;
+  // Determine which process S's and L's destination will start in,
+  // respectively
+  int S_starting_process, L_starting_process;
+  int p_of_split, split_point;
+  int indexes[p];
+  {
+    MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
+    for (int i = 0; i < p; ++i) {
+      int lo = i == 0 ? 0 : indexes[i - 1];
+      int hi = indexes[i];
+      if (S_global_start >= lo && S_global_start < hi)
+        S_starting_process = i;
+      if (L_global_start >= lo && L_global_start < hi)
+        L_starting_process = i;
+      if (S_global_max_end >= lo && S_global_max_end < hi) {
+        p_of_split = i;
+        split_point = S_global_max_end - lo;
+      }
+    }
+  }
+  int S_offset = S_global_start % segment_len,
+      L_offset = L_global_start % segment_len;
 
-  int *integers_recv_buf = calloc(sizeof(int), n);
   int S_ctl[p * CTL_SIZE];
   int L_ctl[p * CTL_SIZE];
   int S_send_ctl[p * CTL_SIZE];
   int L_send_ctl[p * CTL_SIZE];
-  int recvpart[n_over_p];
 
   int ctl_send_counts[p];
   int ctl_send_displs[p];
@@ -225,9 +243,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   init_ctl(S_send_ctl, p);
   init_ctl(L_send_ctl, p);
 
+  int SPACE = segment_capac;
   for (int i = 0; i < p; ++i) {
-    send_counts[i] = n_over_p;
-    send_displs[i] = i * n_over_p;
+    send_counts[i] = SPACE;
+    send_displs[i] = i * SPACE;
     ctl_send_counts[i] = CTL_SIZE;
     ctl_send_displs[i] = i * CTL_SIZE;
@@ -236,28 +256,24 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   }
 
   // Send S to the correct target
-  {
+  if (S_size) {
     for (int i = S_lo, dest_pos = S_global_start,
              processor = S_starting_process;
          i < S_hi;) {
-      int next_break = MIN(int, S_global_end,
-                           MIN(int, dest_pos + (S_hi - S_lo),
-                               (dest_pos / n_over_p) * n_over_p + n_over_p));
+      int next_break =
+          MIN(int, S_global_end,
+              MIN(int, dest_pos + (S_hi - S_lo),
+                  (dest_pos / segment_len) * segment_len + segment_len));
       int count = next_break - dest_pos;
 
       int from_local_start = i, from_local_end = i + count;
-      int from_global_start = rank * n_over_p + from_local_start,
+      int from_global_start = rank * segment_len + from_local_start,
           from_global_end = from_global_start + count;
       int to_global_start = dest_pos, to_global_end = dest_pos + count;
-      int to_local_start = to_global_start - processor * n_over_p,
-          to_local_end = to_global_end - processor * n_over_p;
-
-      // printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
-      //        "p#%d [%d..%d] {%d..%d}\n",
-      //        rank, count, from_local_start, from_local_end,
-      //        from_global_start, from_global_end, processor, to_local_start,
-      //        to_local_end, to_global_start, to_global_end);
+      int to_local_start = to_global_start - processor * segment_len,
+          to_local_end = to_global_end - processor * segment_len;
 
       S_send_ctl[processor * CTL_SIZE] = count;
       S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
       S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@@ -267,34 +283,30 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
       dest_pos += count;
       processor += 1;
     }
-
-    MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
-                  recv_counts, recv_displs, MPI_INT, comm);
   }
+  MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
+                recv_counts, recv_displs, MPI_INT, comm);
 
   // Send L to the correct target
-  {
+  if (L_size) {
     for (int i = L_lo, dest_pos = L_global_start,
              processor = L_starting_process;
          i < L_hi;) {
-      int next_break = MIN(int, L_global_end,
-                           MIN(int, dest_pos + (L_hi - L_lo),
-                               (dest_pos / n_over_p) * n_over_p + n_over_p));
+      int next_break =
+          MIN(int, L_global_end,
+              MIN(int, dest_pos + (L_hi - L_lo),
+                  (dest_pos / segment_len) * segment_len + segment_len));
       int count = next_break - dest_pos;
 
      int from_local_start = i, from_local_end = i + count;
-      int from_global_start = rank * n_over_p + from_local_start,
+      int from_global_start = rank * segment_len + from_local_start,
           from_global_end = from_global_start + count;
       int to_global_start = dest_pos, to_global_end = dest_pos + count;
-      int to_local_start = to_global_start - processor * n_over_p,
-          to_local_end = to_global_end - processor * n_over_p;
-
-      // printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
-      //        "p#%d [%d..%d] {%d..%d}\n",
-      //        rank, count, from_local_start, from_local_end,
-      //        from_global_start, from_global_end, processor, to_local_start,
-      //        to_local_end, to_global_start, to_global_end);
+      int to_local_start = to_global_start - processor * segment_len,
+          to_local_end = to_global_end - processor * segment_len;
 
       L_send_ctl[processor * CTL_SIZE] = count;
       L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
       L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@@ -304,29 +316,23 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
       dest_pos += count;
       processor += 1;
     }
-
-    MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
-                  recv_counts, recv_displs, MPI_INT, comm);
   }
+  MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
+                recv_counts, recv_displs, MPI_INT, comm);
 
   // After sending S and L information
   for (int i = 0; i < p; ++i) {
-    recv_counts[i] = n_over_p;
-    recv_displs[i] = i * n_over_p;
+    recv_counts[i] = segment_len;
+    recv_displs[i] = i * segment_len;
   }
 
-  // MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
-  //               integers_recv_buf,
-  //               recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
-  // MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
-  //               MPI_INT, comm);
-  // printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
-
-  // Scheme for all send
-  int integers_recv_2[n_over_p];
-  int integers_recv_3[n_over_p];
-  for (int i = 0; i < n_over_p; ++i) {
+  // Algorithm for sending S and L between all processes without O(n)
+  int integers_recv_2[segment_capac];
+  int integers_recv_3[segment_capac];
+  for (int i = 0; i < segment_len; ++i) {
     integers_recv_2[i] = -1;
     integers_recv_3[i] = integers[i];
   }
@@ -349,11 +355,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
             continue;
           }
 
-          // printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
-          //        sender_p, to_local_start, to_local_start + S_count,
-          //        S_count);
-          MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT, sender_p,
-                   124, comm, MPI_STATUS_IGNORE);
+          err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
+                         sender_p, 124, comm, MPI_STATUS_IGNORE);
+          check_mpi_error(err);
           for (int k = 0; k < S_count; ++k) {
             integers_recv_3[to_local_start + k] =
                 integers_recv_2[to_local_start + k];
@@ -366,9 +370,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
         int S_count = S_send_ctl[dest_p * CTL_SIZE];
         if (S_count > 0 && dest_p == host_p) {
           int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
-          // printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
-          //        dest_p, from_local_start, from_local_start + S_count,
-          //        S_count);
           MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
                    comm);
         }
@@ -394,11 +395,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
             continue;
           }
 
-          // printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
-          //        sender_p, to_local_start, to_local_start + L_count,
-          //        L_count);
-          MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT, sender_p,
-                   125, comm, MPI_STATUS_IGNORE);
+          err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
+                         sender_p, 125, comm, MPI_STATUS_IGNORE);
+          check_mpi_error(err);
           for (int k = 0; k < L_count; ++k) {
             integers_recv_3[to_local_start + k] =
                 integers_recv_2[to_local_start + k];
@@ -411,9 +410,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
         int L_count = L_send_ctl[dest_p * CTL_SIZE];
         if (L_count > 0 && dest_p == host_p) {
           int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
-          // printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
-          //        dest_p, from_local_start, from_local_start + L_count,
-          //        L_count);
           MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
                    comm);
         }
@@ -421,69 +417,128 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
     }
   }
 
-  printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p));
-  for (int i = 0; i < n_over_p; ++i) {
-    integers[i] = integers_recv_3[i];
-  }
+  // ###################################################################################
+  // SUBDIVIDING
 
-  // Now, determine which processes should be responsible for taking the S and L
-  // arrays
-  // Specifically, the part where it's split, break the tie to see if it goes
-  // down or up
-  int colors[p];
-  int p_of_split = S_global_max_end / n_over_p;
-  int split_point = S_global_max_end % n_over_p;
-  // printf("[%d] p_of_split = %d / %d = %d\n", rank, S_global_max_end,
-  //        n_over_p,
-  //        p_of_split);
-  int S_split_add = split_point, L_split_sub = n_over_p - split_point;
+  // Now, determine which processes should be responsible for taking the S and
+  // L arrays. Specifically, the part where it's split, break the tie to see
+  // if it goes down or up
+  int child_len = segment_len;
+  int difference = segment_len - split_point;
+  int transfer[split_point];
 
-  int lo_start = 0, lo_end;
-  int hi_start, hi_end = p;
-  if (split_point > n_over_p / 2) {
-    // Belongs to the lower group
-    lo_end = hi_start = p_of_split + 1;
-  } else {
-    // Belongs to the higher group
-    lo_end = hi_start = p_of_split;
-  }
-
-  int child_root = -1;
-  for (int i = 0; i < p; ++i) {
-    if (i < lo_end)
-      colors[i] = 100;
-    else {
-      colors[i] = 200;
-      if (child_root == -1)
-        child_root = i;
+  int has_split = 0;
+  if (p_of_split == 0 || p_of_split == p - 1) {
+    // Super unfortunate, bad pivot
+  } else if (split_point == 0) {
+    // Super lucky, it's split evenly!
+  } else {
+    has_split = 1;
+    // Let's just say that if there's any split, the block itself counts as L
+    // and then add the rest to the previous block
+    if (rank == p_of_split - 1) {
+      child_len += split_point;
+      err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
+                     MPI_STATUS_IGNORE);
+      check_mpi_error(err);
+    } else if (rank == p_of_split) {
+      child_len = difference;
+      err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
+      check_mpi_error(err);
     }
   }
 
-  // MPI_Comm child;
-  // MPI_Comm_split(comm, colors[rank], rank, &child);
-  // printf("[%d] Recursing...\n", rank);
+  // Which group is this child going into?
+  int color;
+  if (rank < p_of_split)
+    color = 100;
+  else
+    color = 200;
 
-  // int child_size;
-  // MPI_Comm_size(child, &child_size);
-  // int start_at = 0, new_n = child_size * n_over_p;
-  // if (colors[rank] == 100) {
-  //   new_n += S_split_add;
-  // } else {
-  //   new_n -= L_split_sub;
-  //   if (rank == p_of_split)
-  //     start_at = split_point;
-  // }
-  // recursive_quicksort(integers, n, child_root, child);
-  // printf("[%d] Done recursing.\n", rank);
-  // MPI_Comm_free(&child);
+  MPI_Comm child_comm;
+  MPI_Comm_split(comm, color, rank, &child_comm);
+
+  // Figure out what the max is
+  int max_child_buf_len, total_child_elems;
+  err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
+                      child_comm);
+  check_mpi_error(err);
+  err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
+                      child_comm);
+  check_mpi_error(err);
+
+  // Copy into a new buf
+  int new_buf[max_child_buf_len];
+  int whichCase = 999;
+  for (int i = 0; i < max_child_buf_len; ++i) {
+    if (has_split && rank == p_of_split - 1) {
+      whichCase = 1001;
+      if (i < segment_len)
+        new_buf[i] = integers_recv_3[i];
+      else if (i < segment_len + split_point)
+        new_buf[i] = transfer[i - segment_len];
+      else
+        new_buf[i] = -1;
+    } else if (has_split && rank == p_of_split) {
+      whichCase = 1002;
+      if (i < difference)
+        new_buf[i] = integers_recv_3[i + split_point];
+      else
+        new_buf[i] = -1;
+    } else {
+      whichCase = 1003;
+      if (i < child_len)
+        new_buf[i] = integers_recv_3[i];
+      else
+        new_buf[i] = -1;
+    }
+  }
+
+  int integers_out_buf[total_child_elems];
+  recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
+                      integers_out_buf, child_comm);
+
+  // Ok now copy the new items back
+  switch (whichCase) {
+  case 1001:
+    // In this case, p is right before the split, so it got extra elements
+    // To reverse this, we can send the elements back to the second
+    for (int i = 0; i < total_child_elems; ++i) {
+      if (i < segment_len)
+        integers_out[i] = integers_out_buf[i];
+      else
+        transfer[i - segment_len] = integers_out_buf[i];
+    }
+    MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
+    break;
+  case 1002:
+    // The original array got shortened, so copy the transferred ones back in
+    // first, then copy the result from the child quicksorting after it
+    MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
+             MPI_STATUS_IGNORE);
+    for (int i = 0; i < split_point; ++i) {
+      integers_out[i] = transfer[i];
+    }
+    for (int i = 0; i < total_child_elems; ++i) {
+      integers_out[i + split_point] = integers_out_buf[i];
+    }
+    break;
+  case 1003:
+    // This is just the regular case
+    for (int i = 0; i < total_child_elems; ++i) {
+      integers_out[i] = integers_out_buf[i];
+    }
+    break;
+  }
+
+  MPI_Comm_free(&child_comm);
 }
 
 void init_ctl(int *ctl, int len) {
   for (int i = 0; i < len; ++i) {
-    for (int j = 0; j < CTL_SIZE; ++j) {
+    ctl[i * CTL_SIZE] = 0;
+    for (int j = 1; j < CTL_SIZE; ++j) {
       ctl[i * CTL_SIZE + j] = -1;
     }
   }

BIN
assignments/02/report.pdf Normal file

Binary file not shown.

20
assignments/02/report.typ Normal file
View file

@@ -0,0 +1,20 @@
= Homework 2
My algorithm works like this:
- First I generate $n/p$ integers on each process.
- Then I jump directly into the recursive step:
- I choose the pivot by having each process pick one random element; the median of those gathered samples becomes the pivot (see the sketch after this list).
- The way I move the $S$ and $L$ arrays around is:
  1. First, `MPI_Alltoallv` exchanges the plan for _which_ processors will be sent to, including exact calculations of which local indexes are being copied from and to (a sketch of this exchange follows below).
  2. Then, each processor loops through all the processors and, if it has something to send, sends it.
  3. This way, all of the senders/receivers are coordinated, and the ones with nothing to send don't do anything.
- For the recursion, I opted to let the recursive step operate on segments of different lengths. (*NOTE:* the reason I keep a separate "capacity" and "length" is that I opted to make all entries of the `displs` array the same size, so the shorter segments carry extra padding.)
- If the boundary between $S$ and $L$ falls inside an $n/p$ segment, I extend the segment before it and shorten the one after (a worked example appears at the end of this report).
- Then, I recursively process all the $S$'s and all the $L$'s separately using `MPI_Comm_split`.
- Once it's done processing, I reverse the exact operation that extends / shortens the arrays. This ensures everything is always back to $n/p$ at the end.
- Everything is collected back at the end via a `Send`/`Recv` to save on allocations.
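To make the pivot step concrete, here is a minimal standalone sketch of the median-of-random-samples selection (my own illustration rather than the exact code in `qs_mpi.c`; the names `choose_pivot` and `cmp_int` are hypothetical):

```c
#include <mpi.h>
#include <stdlib.h>

// Hypothetical helper, not from qs_mpi.c: comparison callback for qsort.
static int cmp_int(const void *a, const void *b) {
  int x = *(const int *)a, y = *(const int *)b;
  return (x > y) - (x < y);
}

// Each process contributes one random sample from its local segment;
// rank 0 sorts the p samples and broadcasts the median as the pivot.
int choose_pivot(const int *local, int local_len, MPI_Comm comm) {
  int rank, p, pivot = 0;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &p);

  int sample = local[rand() % local_len];
  int samples[p];
  MPI_Gather(&sample, 1, MPI_INT, samples, 1, MPI_INT, 0, comm);

  if (rank == 0) {
    qsort(samples, p, sizeof(int), cmp_int);
    pivot = samples[p / 2]; // median of the gathered samples
  }
  MPI_Bcast(&pivot, 1, MPI_INT, 0, comm);
  return pivot;
}
```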
Allocations are all on the order of $O(p + n/p)$.
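The plan exchange itself is what keeps this within that bound: every destination gets one fixed-size record (`CTL_SIZE` ints: count, global source start, local destination start, local source start, as in `qs_mpi.c`), so the counts/displacements are constant. A sketch of that step (the helper name `exchange_plan` is mine):

```c
#include <mpi.h>

#define CTL_SIZE 4 // per-destination record, same layout as qs_mpi.c

// Every process exchanges one CTL_SIZE-int record with every other process,
// so the plan exchange moves O(p) ints per process regardless of n.
void exchange_plan(int *send_plan, int *recv_plan, int p, MPI_Comm comm) {
  int counts[p], displs[p];
  for (int i = 0; i < p; ++i) {
    counts[i] = CTL_SIZE;     // fixed-size records ...
    displs[i] = i * CTL_SIZE; // ... at fixed offsets
  }
  MPI_Alltoallv(send_plan, counts, displs, MPI_INT, recv_plan, counts, displs,
                MPI_INT, comm);
}
```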
Unfortunately I didn't finish debugging the segfaults in time, so this report covers the parts of the assignment that I _did_ get working. The program works on small integers (capped at 100) but for some reason segfaults at address `(nil)` at the end... I spent several hours debugging but have not discovered how this occurs.
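For reference, a worked example of the extend/shorten step above, with made-up numbers: take $n = 32$ and $p = 4$, so every process starts with $n/p = 8$ elements. Suppose the global $S$ region ends at element 19, i.e. inside process 2's segment (elements 16 through 23), with a split point of 3. Process 1 then receives those first 3 elements from process 2 and grows to $8 + 3 = 11$ elements, process 2 shrinks to $8 - 3 = 5$, and `MPI_Comm_split` puts processes 0 and 1 (holding $8 + 11 = 19$ elements) in the $S$ group and processes 2 and 3 (holding $5 + 8 = 13$) in the $L$ group. After the recursive calls return, the 3 borrowed elements are sent back, so every process again holds exactly $n/p = 8$.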