This commit is contained in:
9 changed files with 967 additions and 214 deletions
@ -23,7 +23,8 @@
@ -1,4 +1,7 @@
ARG DEBIAN_FRONTEND=noninteractive
ARG DEBIAN_FRONTEND=noninteractive
FROM ghcr.io/typst/typst:latest as typst
FROM ubuntu:22.04
FROM ubuntu:22.04
ENV PATH="/root/.cargo/bin:${PATH}"
ENV PATH="/root/.cargo/bin:${PATH}"
@ -22,5 +25,7 @@ RUN apt update -y && apt install -y --no-install-recommends \
RUN pip install poetry
RUN pip install poetry
COPY --from=typst /bin/typst /usr/bin/typst
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc
RUN echo 'eval "$(direnv hook bash)"' >> /root/.bashrc
@ -2,3 +2,6 @@ qs_mpi
@ -1,17 +1,30 @@
.PHONY: all clean run-example
.PHONY: all handin clean run-example
CC := cc
CC := cc
CFLAGS := -g
# CFLAGS := -g -O0
# LDFLAGS := -g
CFLAGS += $(shell pkg-config --cflags mpi)
CFLAGS += $(shell pkg-config --cflags mpi)
LDFLAGS += $(shell pkg-config --libs mpi)
LDFLAGS += $(shell pkg-config --libs mpi)
all: qs_mpi
all: qs_mpi
handin: zhan4854.tar.gz
zhan4854.tar.gz: Makefile ASSIGNMENT.md qs_mpi.c report.pdf
mkdir -p zhan4854
cp $^ zhan4854
tar -czvf $@ zhan4854
rm -r zhan4854
run-example: qs_mpi
run-example: qs_mpi
mpirun --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
mpirun -v --allow-run-as-root -np 4 ./qs_mpi 32 output.txt
report.pdf: report.typ
typst compile $< $@
qs_mpi: qs_mpi.o
qs_mpi: qs_mpi.o
$(CC) $^ $(CFLAGS) $(LDFLAGS) -o $@
$(CC) $^ $(CFLAGS) $(LDFLAGS) -o $@
@ -1,24 +1,25 @@
import sys
import sys
import re
import re
pat = re.compile(r"\[(\d+)\] (.*)")
pat = re.compile(r"\[(\d+),(-?\d+)\] (.*)")
outputs = {}
outputs = {}
for line in sys.stdin.readlines():
for i, line in enumerate(sys.stdin.readlines()):
m = pat.match(line)
m = pat.match(line)
if not m:
if not m:
# print(line)
# print(line)
p = int(m.group(1))
p = int(m.group(1))
rest = m.group(2)
n = int(m.group(2))
rest = m.group(3)
if p not in outputs: outputs[p] = []
if (p, n) not in outputs: outputs[p, n] = (i, [])
outputs[p, n][1].append(rest)
for p in sorted(outputs.keys()):
for ((p, n), (i, lines)) in sorted(outputs.items(), key=lambda v: (-v[0][1], v[0][0])):
lines = outputs[p]
# lines = outputs[p, n]
print(f"---- {p} ----")
print(f"---- {p} [{n}] ----")
for line in lines:
for line in lines:
assignments/02/qs_mpi commented.c
Normal file
assignments/02/qs_mpi commented.c
Normal file
@ -0,0 +1,655 @@
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// Adapted from https://stackoverflow.com/a/75458495
// check_mpi_error(n): forwards an MPI return code to __check_mpi_error together
// with the file name and line number of the call site.
#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
// Reports a failed MPI call on stdout.
//   file, line: location to print (filled in by the check_mpi_error macro)
//   n:          return code of an MPI call
// Prints nothing when n == MPI_SUCCESS.
// NOTE(review): no abort/exit call is visible after the two printf calls, so as
// shown the program keeps running after an error is reported -- confirm.
// NOTE(review): identifiers beginning with two underscores are reserved for the
// implementation in C.
void __check_mpi_error(const char *file, const int line, const int n) {
// Buffer large enough for any message produced by MPI_Error_string.
char errbuffer[MPI_MAX_ERROR_STRING];
// Receives the message length from MPI_Error_string; not read afterwards.
int errlen;
if (n != MPI_SUCCESS) {
MPI_Error_string(n, errbuffer, &errlen);
printf("MPI-error: %s\n", errbuffer);
printf("Location: %s:%i\n", file, line);
// Number of ints in one control record. recursive_quicksort fills the slots as
// [0] element count, [1] source global start, [2] destination local start,
// [3] source local start.
#define CTL_SIZE 4
// Rank used as the root of the gather/broadcast calls inside a communicator.
#define ROOT_RANK 0
// Plain max/min macros. Each argument is evaluated twice, so do not pass
// expressions with side effects.
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
// Compile-time type checks: _Generic has a single association, so the
// expression only compiles when the argument has exactly the named type.
#define ENSURE_int(i) _Generic((i), int : (i))
#define ENSURE_float(f) _Generic((f), float : (f))
// Type-checked max/min: MAX(int, a, b) requires both a and b to be int (via
// ENSURE_int) and casts the result to the given type.
#define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
#define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))
// Resets a table of `len` control records (CTL_SIZE ints each).
void init_ctl(int *ctl, int len);
// Sorts arr[lo, hi) in place; hi is not inclusive.
void local_quicksort(int *arr, int lo, int hi);
// Debug helper. NOTE(review): the only definition visible in this view is
// commented out, so any live call would fail to link -- confirm.
char *string_of_list(int *arr, int len);
// Parallel quicksort over `comm`; see the definition for the parameter roles.
void recursive_quicksort(int *integers, int n, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm);
// Entry point. argv[1] is the total element count n, argv[2] is the output
// file path. Each rank generates n / p random ints, the ranks sort them
// together with recursive_quicksort, and rank 0 gathers the result and writes
// one integer per line to the output file.
// NOTE(review): several statement tails and closing braces of this function are
// not visible in this view; the notes below describe only what is shown.
int main(int argc, char **argv) {
int rank, p;
MPI_Init(&argc, &argv);
// NOTE(review): argv[1] is read without checking argc, and atoi reports no
// parse errors.
int n = atoi(argv[1]);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &p);
// Generate integers
// Integer division: when p does not divide n, the n % p leftover elements are
// never generated.
int n_over_p = n / p;
// Variable-length array on the stack, sized by the command-line argument.
int integers[n_over_p];
// Minor implementation detail: srand(0) is specially handled by glibc to
// behave as if it was called with srand(1). To get around this, I'm seeding
// with rank + 1
// See more: https://stackoverflow.com/a/27386563
srand(rank + 1);
for (int i = 0; i < n_over_p; ++i) {
integers[i] = rand();
// printf(" - %d\n", integers[i]);
// printf("[%d,9999999999] GENERATED INTEGERS: %s\n", rank,
// string_of_list(integers, n_over_p));
// Receives this rank's slice of the globally sorted sequence.
int new_integers[n_over_p];
recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
// sleep(1);
// printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
// The first node is responsible for collecting all the data and then
// printing it out to the file MPI_Gather(const void *sendbuf, int
// sendcount, MPI_INT, void *recvbuf,
// int recvcount, MPI_INT, 0, MPI_COMM_WORLD);
// Allocated with n entries on every rank, although only the gather root reads
// it below.
int recvbuf[n];
MPI_Gather(new_integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
if (rank == 0) {
// NOTE(review): the fopen result is used without a NULL check, and no fclose
// is visible in this view.
FILE *fp = fopen(argv[2], "w");
// printf("integers: %s\n", string_of_list(recvbuf, n));
// printf("[%d,-1] ==== FINAL ====\n", rank);
// NOTE(review): the gather delivers p * n_over_p values but this loop prints
// n of them; when n % p != 0 the last entries of recvbuf were never written.
for (int i = 0; i < n; i += 1) {
fprintf(fp, "%d\n", recvbuf[i]);
// printf("[%d,-1] %s\n", rank,
// string_of_list(&recvbuf[i * n_over_p], n_over_p));
// printf("Done.\n");
// NOTE(review): no MPI_Finalize call is visible before this return -- confirm.
return 0;
// Sorts arr[lo, hi) in ascending order, in place.
//   arr: array holding the range to sort
//   lo:  index of the first element of the range (inclusive)
//   hi:  index one past the last element (hi is NOT inclusive)
// Ranges that are empty (lo >= hi) or that start below zero are left untouched.
//
// Lomuto partitioning with two safeguards:
//  - the middle element is swapped into the last slot and used as the pivot, so
//    sorted or nearly sorted input does not degrade to quadratic time;
//  - only the smaller partition is handled recursively while the larger one is
//    handled by the enclosing loop, which bounds the stack depth at O(log n).
void local_quicksort(int *arr, int lo, int hi) {
  int temp;

  if (lo < 0)
    return;

  // A range of zero or one element is already sorted.
  while (hi - lo > 1) {
    // Move the middle element into the pivot slot (the last one).
    int mid = lo + (hi - lo) / 2;
    temp = arr[mid];
    arr[mid] = arr[hi - 1];
    arr[hi - 1] = temp;

    int pivot = arr[hi - 1];
    int pivot_idx = lo - 1;

    // Everything strictly smaller than the pivot is packed to the left.
    for (int j = lo; j < hi - 1; ++j) {
      if (arr[j] < pivot) {
        pivot_idx += 1;
        temp = arr[j];
        arr[j] = arr[pivot_idx];
        arr[pivot_idx] = temp;
      }
    }

    // Drop the pivot into its final position.
    pivot_idx += 1;
    temp = arr[hi - 1];
    arr[hi - 1] = arr[pivot_idx];
    arr[pivot_idx] = temp;

    // Recurse into the smaller side, keep looping on the larger side.
    if (pivot_idx - lo < hi - (pivot_idx + 1)) {
      local_quicksort(arr, lo, pivot_idx);
      lo = pivot_idx + 1;
    } else {
      local_quicksort(arr, pivot_idx + 1, hi);
      hi = pivot_idx;
    }
  }
}
// char *string_of_list(int *arr, int len) {
// char *buffer = calloc(sizeof(char), 1000);
// int offset = 0; // Keep track of the current position in the buffer
// for (int i = 0; i < len; i++) {
// offset += sprintf(buffer + offset, "%d", arr[i]);
// if (i < len - 1) {
// // Add a separator (e.g., comma or space) if it's not the last element
// offset += sprintf(buffer + offset, " ");
// }
// }
// return buffer;
// }
// One level of the parallel quicksort over the processes of `comm`.
//   integers:      this rank's local elements (segment_len of them are read)
//   total_elems:   element count summed over all ranks of `comm`
//   segment_capac: size used for the local scratch buffers
//   segment_len:   number of valid elements in `integers` on this rank
//   integers_out:  receives this rank's part of the sorted result
//   comm:          communicator whose ranks cooperate on this level
// Visible steps: sort locally, agree on a pivot, partition each segment into
// S (< pivot) and L (>= pivot), move S towards the low ranks and L towards the
// high ranks, split `comm` in two, recurse, then copy the results back.
// NOTE(review): several short lines (closing braces, and apparently return /
// else / break statements and call tails) are not visible in this view; the
// comments below describe only what is shown.
void recursive_quicksort(int *integers, int total_elems, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm) {
int err, rank, p;
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &rank);
// printf(
// "[%d,%d] recursive_quicksort([%s], total=%d, capac=%d, len=%d)
// {p=%d}\n", rank, total_elems, string_of_list(integers, segment_len),
// total_elems, segment_capac, segment_len, p);
if (p <= 1) {
// Recursion base case: just sort it serially
local_quicksort(integers, 0, total_elems);
for (int i = 0; i < total_elems; ++i) {
integers_out[i] = integers[i];
// NOTE(review): no return is visible after the base-case copy -- confirm the
// function stops here when p <= 1.
// printf("Quicksorted: %s\n", string_of_list(integers, total_elems));
// sleep(1);
// printf("\n\n");
// int segment_capac = (total_elems + p - 1) / p;
// int segment_len = total_elems / p;
// if (rank == ROOT_RANK)
// segment_len += total_elems - p * segment_len;
// printf("[%d,%d] capac: %d, len: %d\n", rank, total_elems, segment_capac,
// segment_len);
// printf(
// "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
// n/p=%d)\n", rank, n, n_over_p);
// Locally sort
// printf("[%d] Numbers before: %s\n", rank,
// string_of_list(integers, n_over_p));
local_quicksort(integers, 0, segment_len);
// printf("[%d] Numbers after first sort: %s\n", rank,
// string_of_list(integers, segment_len));
// Select a pivot.
// This pivot is broadcasted to all nodes
int pivot;
// First, select a random element
// NOTE(review): the modulo divides by zero when segment_len is 0.
int rand_el = integers[rand() % segment_len];
// Gather it
int rand_els[p];
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);
// Get the median
if (rank == ROOT_RANK) {
// Sort
local_quicksort(rand_els, 0, p);
// printf("[%d,%d] Local quicksort for pivot: %s\n", rank, total_elems,
// string_of_list(rand_els, p));
// Get the middle element
pivot = rand_els[p / 2];
MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
// printf("[%d,%d] Broadcasted pivot: %d\n", rank, total_elems, pivot);
// Determine where the boundary between S (lower) and L (higher) lies
// NOTE(review): boundary stays 0 when no local element is >= pivot, which
// classifies the whole segment as L even though every element is < pivot.
// No break is visible inside the loop either -- confirm.
int boundary = 0;
for (int i = 0; i < segment_len; ++i) {
if (integers[i] >= pivot) {
boundary = i;
// printf("[%d,%d] boundary: %d\n", rank, total_elems, boundary);
// Local index ranges (end not inclusive) of the two partitions.
int S_lo = 0, S_hi = boundary;
int L_lo = boundary, L_hi = segment_len;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
// printf("[%d,%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, total_elems,
// S_lo, S_hi, S_size, L_lo, L_hi, L_size);
// Perform global arrangement
// Inclusive prefix sums over ranks: where this rank's S block ends when
// counted from the front, and where its L block ends when counted from the back.
int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
// Inclusive prefix sum of segment lengths: global end index of this rank's
// segment.
int index;
MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
// printf("[%d] bruh %d\n", rank, S_global_end);
// Get the boundary element between S and L
// The maximum of the prefix sums is the total number of S elements.
MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
// L blocks are laid out backwards from the end of the global range.
int S_global_start = S_global_end - S_size,
L_reverse_start = L_reverse_end - L_size,
L_global_start = total_elems - L_reverse_end,
L_global_end = total_elems - L_reverse_start;
// printf("[%d,%d] Prefixed S: [%d - %d) (%d), Prefixed L: [%d - %d) (%d)\n",
// rank, total_elems, S_global_start, S_global_end, S_size,
// L_global_start, L_global_end, L_size);
// Determine which process S's and L's destination will start in, respectively
// NOTE(review): these four variables are assigned only when a matching range
// is found in the loop below; otherwise they are read uninitialized later.
int S_starting_process, L_starting_process;
int p_of_split, split_point;
// int split_point = S_global_max_end % segment_len;
// indexes[i] is the global end index of rank i's segment.
int indexes[p];
MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
for (int i = 0; i < p; ++i) {
int lo = i == 0 ? 0 : indexes[i - 1];
int hi = indexes[i];
if (S_global_start >= lo && S_global_start < hi)
S_starting_process = i;
if (L_global_start >= lo && L_global_start < hi)
L_starting_process = i;
// The rank whose segment contains the S/L boundary, and the boundary's offset
// inside that segment.
if (S_global_max_end >= lo && S_global_max_end < hi) {
p_of_split = i;
split_point = S_global_max_end - lo;
// err = MPI_Bcast(&S_starting_process, 1, MPI_INT, ROOT_RANK, comm);
// check_mpi_error(err);
// err = MPI_Bcast(&L_starting_process, 1, MPI_INT, ROOT_RANK, comm);
// check_mpi_error(err);
// printf("[%d,%d] indexes: %s\n", rank, total_elems,
// string_of_list(indexes, p));
// printf("[%d,%d] S=%d starts at %d , L=%d starts at %d , indexes: %s\n",
// rank,
// total_elems, S_global_start, S_starting_process, L_global_start,
// L_starting_process, string_of_list(indexes, p));
// S_starting_process = S_global_start / segment_len;
// L_starting_process = L_global_start / segment_len;
// NOTE(review): S_offset and L_offset are not referenced again in the visible
// code.
int S_offset = S_global_start % segment_len,
L_offset = L_global_start % segment_len;
// *_send_ctl: records indexed by destination rank, filled locally.
// *_ctl: records indexed by source rank, filled by the Alltoallv exchanges.
int S_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int ctl_send_counts[p];
int ctl_send_displs[p];
int send_counts[p];
int send_displs[p];
int recv_counts[p];
int recv_displs[p];
init_ctl(S_ctl, p);
init_ctl(L_ctl, p);
init_ctl(S_send_ctl, p);
init_ctl(L_send_ctl, p);
int SPACE = segment_capac;
// NOTE(review): send_counts / send_displs are only consumed by a
// commented-out Alltoallv further down.
for (int i = 0; i < p; ++i) {
send_counts[i] = SPACE;
send_displs[i] = i * SPACE;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
recv_counts[i] = CTL_SIZE;
recv_displs[i] = i * CTL_SIZE;
// Send S to the correct target
// Walks this rank's S block and cuts it at destination segment boundaries,
// writing one control record per destination rank.
// NOTE(review): destination boundaries are computed from this rank's own
// segment_len, which presumes every rank has the same segment length --
// confirm for recursive calls, where child_len can differ between ranks.
if (S_size) {
for (int i = S_lo, dest_pos = S_global_start,
processor = S_starting_process;
i < S_hi;) {
int next_break =
MIN(int, S_global_end,
MIN(int, dest_pos + (S_hi - S_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
S_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
// Exchange the S records: afterwards S_ctl[src] describes what rank src will
// send to this rank.
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// Send L to the correct target
// Same walk as for S, applied to this rank's L block.
if (L_size) {
for (int i = L_lo, dest_pos = L_global_start,
processor = L_starting_process;
i < L_hi;) {
int next_break =
MIN(int, L_global_end,
MIN(int, dest_pos + (L_hi - L_lo),
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
L_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
// Exchange the L records, mirroring the S exchange above.
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
// After sending S and L information
for (int i = 0; i < p; ++i) {
recv_counts[i] = segment_len;
recv_displs[i] = i * segment_len;
// printf("[%d,%d] S CTL INFO\n", rank, total_elems);
// for (int i = 0; i < p; ++i) {
// printf("[%d,%d] [p=%d] (ct=%d)\n", rank, total_elems, i,
// S_send_ctl[i * CTL_SIZE]);
// }
// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
// integers_recv_buf,
// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
// MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
// MPI_INT, comm);
// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
// Scheme for all send
// integers_recv_2 is the landing buffer for MPI_Recv; integers_recv_3 holds
// the rearranged segment and starts as a copy of the local data.
int integers_recv_2[segment_capac];
int integers_recv_3[segment_capac];
for (int i = 0; i < segment_len; ++i) {
integers_recv_2[i] = -1;
integers_recv_3[i] = integers[i];
// S transfer, one receiving rank (host_p) at a time: the host receives its
// pieces with tag 124 while every other rank sends what it owes that host.
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to who now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int S_count = S_ctl[sender_p * CTL_SIZE];
if (S_count > 0) {
int to_local_start = S_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = S_ctl[sender_p * CTL_SIZE + 3];
// Pieces that stay on the same rank are copied locally instead of sent.
if (sender_p == host_p) {
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
// printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + S_count,
// S_count);
err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
sender_p, 124, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int S_count = S_send_ctl[dest_p * CTL_SIZE];
if (S_count > 0 && dest_p == host_p) {
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + S_count,
// S_count);
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
// L transfer: same host-by-host scheme as for S, using tag 125.
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to who now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int L_count = L_ctl[sender_p * CTL_SIZE];
if (L_count > 0) {
int to_local_start = L_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = L_ctl[sender_p * CTL_SIZE + 3];
if (sender_p == host_p) {
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
// printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
// sender_p, to_local_start, to_local_start + L_count,
// L_count);
err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
sender_p, 125, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int L_count = L_send_ctl[dest_p * CTL_SIZE];
if (L_count > 0 && dest_p == host_p) {
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + L_count,
// L_count);
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
// printf("[%d,%d] after: %s\n", rank, total_elems,
// string_of_list(integers_recv_3, segment_len));
// printf("[%d,%d] -------------------------------------\n", rank,
// total_elems); for (int i = 0; i < segment_len; ++i) {
// integers[i] = integers_recv_3[i];
// }
// ###################################################################################
// Now, determine which processes should be responsible for taking the S and L
// arrays
// Specifically, the part where it's split, break the tie to see if it goes
// down or up
// child_len: how many elements this rank carries into the recursive call.
// difference: the L part of the segment that contains the S/L boundary.
int child_len = segment_len;
int difference = segment_len - split_point;
int transfer[split_point];
// printf("[%d,%d] p_of_split = %d, split_point = %d => (child_len = %d)\n",
// rank, total_elems, p_of_split, split_point, child_len);
int has_split = 0;
if (p_of_split == 0 || p_of_split == p - 1) {
// Super unfortunate, bad pivot
} else if (split_point == 0) {
// Super lucky, it's split evenly!
} else {
has_split = 1;
// Let's just say that if there's any split, the block itself counts as L
// and then add the rest to the previous block
// The rank just below the split receives the S prefix of the split segment
// (tag 126) and grows by split_point elements.
if (rank == p_of_split - 1) {
child_len += split_point;
err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
} else if (rank == p_of_split) {
child_len = difference;
// NOTE(review): this sends from `integers`, while the rearranged data was
// written to integers_recv_3 and the copy back into `integers` is commented
// out above -- confirm which buffer is intended.
err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
// Which group is this child going into?
// NOTE(review): no else is visible between the two assignments below; as
// shown the second one would always win -- confirm.
int color;
if (rank < p_of_split)
color = 100;
color = 200;
// printf("[%d,%d] split color = %d, split lenth = %d\n", rank, total_elems,
// color, child_len);
// Ranks with the same color end up in the same child communicator, ordered by
// their rank in `comm`.
// NOTE(review): no MPI_Comm_free for child_comm is visible in this view.
MPI_Comm child_comm;
MPI_Comm_split(comm, color, rank, &child_comm);
// Figure out what the max is
// Largest per-rank length and total element count of the child group.
int max_child_buf_len, total_child_elems;
err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
// printf("[%d] [color=%d] max length = %d, total child elems = %d\n", rank,
// color, max_child_buf_len, total_child_elems);
// Copy into a new buf
// whichCase records how new_buf was assembled so the copy-back below can undo
// it: 1001 = rank below the split (own data plus transferred prefix),
// 1002 = split rank (own data minus the prefix), 1003 = everyone else.
// Unused slots are filled with -1.
int new_buf[max_child_buf_len];
int whichCase = 999;
for (int i = 0; i < max_child_buf_len; ++i) {
if (has_split && rank == p_of_split - 1) {
whichCase = 1001;
if (i < segment_len)
new_buf[i] = integers_recv_3[i];
else if (i < segment_len + split_point)
new_buf[i] = transfer[i - segment_len];
new_buf[i] = -1;
} else if (has_split && rank == p_of_split) {
whichCase = 1002;
if (i < difference)
new_buf[i] = integers_recv_3[i + split_point];
new_buf[i] = -1;
} else {
whichCase = 1003;
if (i < child_len)
new_buf[i] = integers_recv_3[i];
new_buf[i] = -1;
// printf("[%d,%d] orig integers: %s\n", rank, total_elems,
// string_of_list(integers, segment_len));
// printf("[%d,%d] new buf = %s (has_split = %d, segment_len = %d, case = %d,
// "
// "child_elems = %d)\n",
// rank, total_elems, string_of_list(new_buf, max_child_buf_len),
// has_split, segment_len, whichCase, child_len);
// printf("[%d,%d] \n", rank, total_elems);
// Sort the child group recursively on the split communicator.
int integers_out_buf[total_child_elems];
recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
integers_out_buf, child_comm);
// Ok now copy the new items back
// NOTE(review): no break statements are visible between the cases below --
// confirm the cases do not fall through.
switch (whichCase) {
case 1001:
// In this case, p is right before the split, so it got extra elements
// To reverse this, we can send the elements back to the second
for (int i = 0; i < total_child_elems; ++i) {
if (i < segment_len)
integers_out[i] = integers_out_buf[i];
transfer[i - segment_len] = integers_out_buf[i];
// Hand the borrowed elements back to the split rank (tag 127).
MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
case 1002:
// The returned elements go in front, followed by this rank's own sorted
// output.
MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
for (int i = 0; i < split_point; ++i) {
integers_out[i] = transfer[i];
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i + split_point] = integers_out_buf[i];
case 1003:
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i] = integers_out_buf[i];
// Resets a control table to its "nothing to transfer" state.
//   ctl: table of `len` records, each CTL_SIZE consecutive ints
//   len: number of records in the table
// In every record, slot 0 (the element count) becomes 0 and the remaining
// slots become -1. A NULL table or a non-positive len is a no-op.
void init_ctl(int *ctl, int len) {
  if (!ctl || len <= 0)
    return;

  for (int i = 0; i < len; ++i) {
    int *record = &ctl[i * CTL_SIZE];

    record[0] = 0;
    for (int j = 1; j < CTL_SIZE; ++j) {
      record[j] = -1;
    }
  }
}
@ -3,9 +3,25 @@
#include <stdlib.h>
#include <stdlib.h>
#include <unistd.h>
#include <unistd.h>
// https://stackoverflow.com/a/75458495
#define check_mpi_error(n) __check_mpi_error(__FILE__, __LINE__, n)
void __check_mpi_error(const char *file, const int line, const int n) {
char errbuffer[MPI_MAX_ERROR_STRING];
int errlen;
if (n != MPI_SUCCESS) {
MPI_Error_string(n, errbuffer, &errlen);
printf("MPI-error: %s\n", errbuffer);
printf("Location: %s:%i\n", file, line);
#define CTL_SIZE 4
#define CTL_SIZE 4
#define ROOT_RANK 0
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
@ -19,7 +35,8 @@
void init_ctl(int *ctl, int len);
void init_ctl(int *ctl, int len);
void local_quicksort(int *arr, int lo, int hi);
void local_quicksort(int *arr, int lo, int hi);
char *string_of_list(int *arr, int len);
char *string_of_list(int *arr, int len);
void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm);
void recursive_quicksort(int *integers, int n, int segment_capac,
int segment_len, int *integers_out, MPI_Comm comm);
int main(int argc, char **argv) {
int main(int argc, char **argv) {
int rank, p;
int rank, p;
@ -42,37 +59,39 @@ int main(int argc, char **argv) {
srand(rank + 1);
srand(rank + 1);
for (int i = 0; i < n_over_p; ++i) {
for (int i = 0; i < n_over_p; ++i) {
// TODO: For readability during debugging, I'm capping this
integers[i] = rand() % 101;
integers[i] = rand() % 101;
// printf(" - %d\n", integers[i]);
recursive_quicksort(integers, n, 0, MPI_COMM_WORLD);
int new_integers[n_over_p];
recursive_quicksort(integers, n, n_over_p, n_over_p, new_integers,
// sleep(1);
// printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
// The first node is responsible for collecting all the data and then
// printing it out to the file MPI_Gather(const void *sendbuf, int
// sendcount, MPI_INT, void *recvbuf,
// int recvcount, MPI_INT, 0, MPI_COMM_WORLD);
int recvbuf[n];
MPI_Gather(integers, n_over_p, MPI_INT, recvbuf, n_over_p, MPI_INT, 0,
if (rank == 0) {
// The first node is responsible for collecting all the data and then
FILE *f = fopen(argv[2], "w");
// printing it out to the file
// printf("integers: %s\n", string_of_list(recvbuf, n));
printf("[%d] ==== FINAL ====\n", rank);
FILE *fp;
if (rank == ROOT_RANK)
fp = fopen(argv[2], "w");
for (int i = 0; i < p; i += 1) {
for (int i = 0; i < p; i += 1) {
printf("[%d] %s\n", rank,
if (rank == ROOT_RANK) {
string_of_list(&recvbuf[i * n_over_p], n_over_p));
if (i != ROOT_RANK) {
MPI_Recv(new_integers, n_over_p, MPI_INT, i, 129, MPI_COMM_WORLD,
for (int j = 0; j < n_over_p; ++j) {
fprintf(fp, "%d\n", new_integers[j]);
} else if (rank == i) {
MPI_Send(new_integers, n_over_p, MPI_INT, ROOT_RANK, 129, MPI_COMM_WORLD);
if (rank == ROOT_RANK)
return 0;
return 0;
@ -105,113 +124,112 @@ void local_quicksort(int *arr, int lo, int hi) {
local_quicksort(arr, pivot_idx + 1, hi);
local_quicksort(arr, pivot_idx + 1, hi);
char *string_of_list(int *arr, int len) {
// char *string_of_list(int *arr, int len) {
char *buffer = calloc(sizeof(char), 1000);
// char *buffer = calloc(sizeof(char), 1000);
int offset = 0; // Keep track of the current position in the buffer
// int offset = 0; // Keep track of the current position in the buffer
for (int i = 0; i < len; i++) {
// for (int i = 0; i < len; i++) {
offset += sprintf(buffer + offset, "%d", arr[i]);
// offset += sprintf(buffer + offset, "%d", arr[i]);
if (i < len - 1) {
// if (i < len - 1) {
// Add a separator (e.g., comma or space) if it's not the last element
// // Add a separator (e.g., comma or space) if it's not the last
offset += sprintf(buffer + offset, " ");
// element offset += sprintf(buffer + offset, " ");
// }
// }
return buffer;
// return buffer;
// }
void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
void recursive_quicksort(int *integers, int total_elems, int segment_capac,
int rank, p;
int segment_len, int *integers_out, MPI_Comm comm) {
int err, rank, p;
MPI_Comm_size(comm, &p);
MPI_Comm_size(comm, &p);
MPI_Comm_rank(comm, &rank);
MPI_Comm_rank(comm, &rank);
if (p == 1) {
if (p <= 1) {
// Recursion base case: just sort it serially
// Recursion base case: just sort it serially
local_quicksort(integers, 0, n);
local_quicksort(integers, 0, total_elems);
printf("Quicksorted: %s\n", string_of_list(integers, n));
for (int i = 0; i < total_elems; ++i) {
integers_out[i] = integers[i];
int n_over_p_max = (n + p - 1) / p;
int n_over_p = n / p;
if (rank == root)
n_over_p += n - p * n_over_p;
// printf(
// "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
// n/p=%d)\n", rank, n, n_over_p);
// Locally sort
// printf("[%d] Numbers before: %s\n", rank,
// string_of_list(integers, n_over_p));
local_quicksort(integers, 0, n_over_p);
printf("[%d] Numbers after first sort: %s\n", rank,
string_of_list(integers, n_over_p));
// Select a pivot.
// Select a pivot.
// This pivot is broadcasted to all nodes
// This pivot is broadcasted to all nodes
int pivot;
int pivot;
// First, select a random element
// First, select a random element
int rand_el = integers[rand() % n_over_p];
int rand_el = integers[rand() % segment_len];
// Gather it
// Gather it
int rand_els[p];
int rand_els[p];
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, root, comm);
MPI_Gather(&rand_el, 1, MPI_INT, rand_els, 1, MPI_INT, ROOT_RANK, comm);
// Get the median
// Get the median
if (rank == root) {
if (rank == ROOT_RANK) {
// Sort
// Get the middle element after sorting
local_quicksort(rand_els, 0, p);
local_quicksort(rand_els, 0, p);
// Get the middle element
pivot = rand_els[p / 2];
pivot = rand_els[p / 2];
MPI_Bcast(&pivot, 1, MPI_INT, root, comm);
MPI_Bcast(&pivot, 1, MPI_INT, ROOT_RANK, comm);
printf("[%d] Broadcasted pivot: %d\n", rank, pivot);
// Determine where the boundary between S (lower) and L (higher) lies
// Determine where the boundary between S (lower) and L (higher) lies
int boundary;
int boundary = 0;
for (int i = 0; i < n_over_p; ++i) {
for (int i = 0; i < segment_len; ++i) {
if (integers[i] >= pivot) {
if (integers[i] >= pivot) {
boundary = i;
boundary = i;
int S_lo = 0, S_hi = boundary;
int S_lo = 0, S_hi = boundary;
int L_lo = boundary, L_hi = n_over_p;
int L_lo = boundary, L_hi = segment_len;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
// printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
// S_size, L_lo, L_hi, L_size);
// Perform global arrangement
// Perform global arrangement
int S_global_end, L_reverse_end, S_global_max_end;
int S_global_end = -1, L_reverse_end = -1, S_global_max_end = -1;
MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);
// printf("[%d] bruh %d\n", rank, S_global_end);
int index;
// Get the boundary element between S and L
MPI_Scan(&segment_len, &index, 1, MPI_INT, MPI_SUM, comm);
MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
MPI_Allreduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, comm);
int S_global_start = S_global_end - S_size,
int S_global_start = S_global_end - S_size,
L_reverse_start = L_reverse_end - L_size,
L_reverse_start = L_reverse_end - L_size,
L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;
L_global_start = total_elems - L_reverse_end,
// printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
L_global_end = total_elems - L_reverse_start;
// S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
int S_starting_process = S_global_start / n_over_p,
// Determine which process S's and L's destination will start in,
L_starting_process = L_global_start / n_over_p;
// respectively
int S_offset = S_global_start % n_over_p,
int S_starting_process, L_starting_process;
L_offset = L_global_start % n_over_p;
int p_of_split, split_point;
int indexes[p];
MPI_Allgather(&index, 1, MPI_INT, indexes, 1, MPI_INT, comm);
for (int i = 0; i < p; ++i) {
int lo = i == 0 ? 0 : indexes[i - 1];
int hi = indexes[i];
if (S_global_start >= lo && S_global_start < hi)
S_starting_process = i;
if (L_global_start >= lo && L_global_start < hi)
L_starting_process = i;
if (S_global_max_end >= lo && S_global_max_end < hi) {
p_of_split = i;
split_point = S_global_max_end - lo;
int S_offset = S_global_start % segment_len,
L_offset = L_global_start % segment_len;
int *integers_recv_buf = calloc(sizeof(int), n);
int S_ctl[p * CTL_SIZE];
int S_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int recvpart[n_over_p];
int ctl_send_counts[p];
int ctl_send_counts[p];
int ctl_send_displs[p];
int ctl_send_displs[p];
@ -225,9 +243,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
init_ctl(S_send_ctl, p);
init_ctl(S_send_ctl, p);
init_ctl(L_send_ctl, p);
init_ctl(L_send_ctl, p);
int SPACE = segment_capac;
for (int i = 0; i < p; ++i) {
for (int i = 0; i < p; ++i) {
send_counts[i] = n_over_p;
send_counts[i] = SPACE;
send_displs[i] = i * n_over_p;
send_displs[i] = i * SPACE;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
@ -236,28 +256,24 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
// Send S to the correct target
// Send S to the correct target
if (S_size) {
for (int i = S_lo, dest_pos = S_global_start,
for (int i = S_lo, dest_pos = S_global_start,
processor = S_starting_process;
processor = S_starting_process;
i < S_hi;) {
i < S_hi;) {
int next_break = MIN(int, S_global_end,
int next_break =
MIN(int, S_global_end,
MIN(int, dest_pos + (S_hi - S_lo),
MIN(int, dest_pos + (S_hi - S_lo),
(dest_pos / n_over_p) * n_over_p + n_over_p));
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * n_over_p + from_local_start,
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * n_over_p,
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * n_over_p;
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@ -267,34 +283,30 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
dest_pos += count;
dest_pos += count;
processor += 1;
processor += 1;
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
recv_counts, recv_displs, MPI_INT, comm);
// Send L to the correct target
// Send L to the correct target
if (L_size) {
for (int i = L_lo, dest_pos = L_global_start,
for (int i = L_lo, dest_pos = L_global_start,
processor = L_starting_process;
processor = L_starting_process;
i < L_hi;) {
i < L_hi;) {
int next_break = MIN(int, L_global_end,
int next_break =
MIN(int, L_global_end,
MIN(int, dest_pos + (L_hi - L_lo),
MIN(int, dest_pos + (L_hi - L_lo),
(dest_pos / n_over_p) * n_over_p + n_over_p));
(dest_pos / segment_len) * segment_len + segment_len));
int count = next_break - dest_pos;
int count = next_break - dest_pos;
int from_local_start = i, from_local_end = i + count;
int from_local_start = i, from_local_end = i + count;
int from_global_start = rank * n_over_p + from_local_start,
int from_global_start = rank * segment_len + from_local_start,
from_global_end = from_global_start + count;
from_global_end = from_global_start + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_global_start = dest_pos, to_global_end = dest_pos + count;
int to_local_start = to_global_start - processor * n_over_p,
int to_local_start = to_global_start - processor * segment_len,
to_local_end = to_global_end - processor * n_over_p;
to_local_end = to_global_end - processor * segment_len;
// printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
// "p#%d [%d..%d] {%d..%d}\n",
// rank, count, from_local_start, from_local_end,
// from_global_start, from_global_end, processor, to_local_start,
// to_local_end, to_global_start, to_global_end);
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
@ -304,29 +316,23 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
dest_pos += count;
dest_pos += count;
processor += 1;
processor += 1;
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
recv_counts, recv_displs, MPI_INT, comm);
// After sending S and L information
// After sending S and L information
for (int i = 0; i < p; ++i) {
for (int i = 0; i < p; ++i) {
recv_counts[i] = n_over_p;
recv_counts[i] = segment_len;
recv_displs[i] = i * n_over_p;
recv_displs[i] = i * segment_len;
// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
// Algorithm for sending S and L between all processes without O(n)
// integers_recv_buf,
// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
// MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
// MPI_INT, comm);
// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
// Scheme for all send
int integers_recv_2[segment_capac];
int integers_recv_2[n_over_p];
int integers_recv_3[segment_capac];
int integers_recv_3[n_over_p];
for (int i = 0; i < segment_len; ++i) {
for (int i = 0; i < n_over_p; ++i) {
integers_recv_2[i] = -1;
integers_recv_2[i] = -1;
integers_recv_3[i] = integers[i];
integers_recv_3[i] = integers[i];
@ -349,11 +355,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
// printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
err = MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
// sender_p, to_local_start, to_local_start + S_count,
sender_p, 124, comm, MPI_STATUS_IGNORE);
// S_count);
MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT, sender_p,
124, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < S_count; ++k) {
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
integers_recv_2[to_local_start + k];
@ -366,9 +370,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int S_count = S_send_ctl[dest_p * CTL_SIZE];
int S_count = S_send_ctl[dest_p * CTL_SIZE];
if (S_count > 0 && dest_p == host_p) {
if (S_count > 0 && dest_p == host_p) {
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + S_count,
// S_count);
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
@ -394,11 +395,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
// printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
err = MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
// sender_p, to_local_start, to_local_start + L_count,
sender_p, 125, comm, MPI_STATUS_IGNORE);
// L_count);
MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT, sender_p,
125, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < L_count; ++k) {
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
integers_recv_2[to_local_start + k];
@ -411,9 +410,6 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int L_count = L_send_ctl[dest_p * CTL_SIZE];
int L_count = L_send_ctl[dest_p * CTL_SIZE];
if (L_count > 0 && dest_p == host_p) {
if (L_count > 0 && dest_p == host_p) {
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
// printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
// dest_p, from_local_start, from_local_start + L_count,
// L_count);
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
@ -421,69 +417,128 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p));
// ###################################################################################
for (int i = 0; i < n_over_p; ++i) {
integers[i] = integers_recv_3[i];
// Now, determine which processes should be responsible for taking the S and L
// Now, determine which processes should be responsible for taking the S and
// arrays
// L arrays. Specifically, the part where it's split, break the tie to see
// if it goes down or up
// Specifically, the part where it's split, break the tie to see if it goes
int child_len = segment_len;
// down or up
int difference = segment_len - split_point;
int colors[p];
int transfer[split_point];
int p_of_split = S_global_max_end / n_over_p;
int split_point = S_global_max_end % n_over_p;
// printf("[%d] p_of_split = %d / %d = %d\n", rank, S_global_max_end,
// n_over_p,
// p_of_split);
int S_split_add = split_point, L_split_sub = n_over_p - split_point;
int lo_start = 0, lo_end;
int has_split = 0;
int hi_start, hi_end = p;
if (p_of_split == 0 || p_of_split == p - 1) {
if (split_point > n_over_p / 2) {
// Super unfortunate, bad pivot
// Belongs to the lower group
} else if (split_point == 0) {
lo_end = hi_start = p_of_split + 1;
// Super lucky, it's split evenly!
} else {
} else {
// Belongs to the higher group
has_split = 1;
lo_end = hi_start = p_of_split;
// Let's just say that if there's any split, the block itself counts as L
// and then add the rest to the previous block
if (rank == p_of_split - 1) {
int child_root = -1;
child_len += split_point;
for (int i = 0; i < p; ++i) {
err = MPI_Recv(transfer, split_point, MPI_INT, p_of_split, 126, comm,
if (i < lo_end)
colors[i] = 100;
else {
} else if (rank == p_of_split) {
colors[i] = 200;
child_len = difference;
if (child_root == -1)
err = MPI_Send(integers, split_point, MPI_INT, p_of_split - 1, 126, comm);
child_root = i;
// MPI_Comm child;
// Which group is this child going into?
// MPI_Comm_split(comm, colors[rank], rank, &child);
int color;
// printf("[%d] Recursing...\n", rank);
if (rank < p_of_split)
color = 100;
color = 200;
// int child_size;
MPI_Comm child_comm;
// MPI_Comm_size(child, &child_size);
MPI_Comm_split(comm, color, rank, &child_comm);
// int start_at = 0, new_n = child_size * n_over_p;
// Figure out what the max is
// if (colors[rank] == 100) {
int max_child_buf_len, total_child_elems;
// new_n += S_split_add;
err = MPI_Allreduce(&child_len, &max_child_buf_len, 1, MPI_INT, MPI_MAX,
// } else {
// new_n -= L_split_sub;
// if (rank == p_of_split)
err = MPI_Allreduce(&child_len, &total_child_elems, 1, MPI_INT, MPI_SUM,
// start_at = split_point;
// }
// recursive_quicksort(integers, n, child_root, child);
// printf("[%d] Done recursing.\n", rank);
// Copy into a new buf
// MPI_Comm_free(&child);
int new_buf[max_child_buf_len];
int whichCase = 999;
for (int i = 0; i < max_child_buf_len; ++i) {
if (has_split && rank == p_of_split - 1) {
whichCase = 1001;
if (i < segment_len)
new_buf[i] = integers_recv_3[i];
else if (i < segment_len + split_point)
new_buf[i] = transfer[i - segment_len];
new_buf[i] = -1;
} else if (has_split && rank == p_of_split) {
whichCase = 1002;
if (i < difference)
new_buf[i] = integers_recv_3[i + split_point];
new_buf[i] = -1;
} else {
whichCase = 1003;
if (i < child_len)
new_buf[i] = integers_recv_3[i];
new_buf[i] = -1;
int integers_out_buf[total_child_elems];
recursive_quicksort(new_buf, total_child_elems, max_child_buf_len, child_len,
integers_out_buf, child_comm);
// Ok now copy the new items back
switch (whichCase) {
case 1001:
// In this case, p is right before the split, so it got extra elements
// To reverse this, we can send the elements back to the second
for (int i = 0; i < total_child_elems; ++i) {
if (i < segment_len)
integers_out[i] = integers_out_buf[i];
transfer[i - segment_len] = integers_out_buf[i];
MPI_Send(transfer, split_point, MPI_INT, p_of_split, 127, comm);
case 1002:
// The original array got shortened, so copy the transferred ones back in
// first, then copy the result from the child quicksorting after it
MPI_Recv(transfer, split_point, MPI_INT, p_of_split - 1, 127, comm,
for (int i = 0; i < split_point; ++i) {
integers_out[i] = transfer[i];
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i + split_point] = integers_out_buf[i];
case 1003:
// This is just the regular case
for (int i = 0; i < total_child_elems; ++i) {
integers_out[i] = integers_out_buf[i];
/*
 * Reset a control table made of `len` records, each CTL_SIZE ints wide.
 *
 * ctl: buffer holding at least len * CTL_SIZE ints.
 * len: number of records to reset (callers pass the process count).
 *
 * Slot 0 of every record is the element count for that peer; it is set to 0
 * so that an untouched record reads as "nothing to send" in the later
 * `count > 0` checks. The remaining slots are set to -1 as a "not set" marker.
 */
void init_ctl(int *ctl, int len) {
  for (int i = 0; i < len; ++i) {
    ctl[i * CTL_SIZE] = 0;
    for (int j = 1; j < CTL_SIZE; ++j) {
      ctl[i * CTL_SIZE + j] = -1;
    }
  }
}
Normal file
Normal file
Binary file not shown.
Normal file
Normal file
@ -0,0 +1,20 @@
= Homework 2
My algorithm works like this:
- First I generate $n/p$ integers on each process.
- Then I jump directly into the recursive step:
- I choose the pivot by having each process pick a random element; the median of those picks becomes the pivot.
- The way I moved $S$ and $L$ arrays around is:
1. First, I use `MPI_Alltoallv` to exchange the plan for _which_ processors are going to be sent to, including the exact local indices being copied from and to.
2. Then, each processor loops through all the processors and, if it has something to send, sends it.
3. This way, I can coordinate all of the senders/receivers and the ones with nothing to send don't do anything.
- For the recursion, I opted to let the segments in the recursive step have different lengths. (*NOTE:* The reason I have a different "capacity" than "length" is that, for the `displs` array, I opted to give every segment the same size, so there's extra padding on the shorter ones.)
- If the boundary between $S$ and $L$ falls inside an $n/p$ segment, I extend the segment before it and shorten the segment that was split.
- Then, I recursively process all the $S$'s and all the $L$'s separately using `MPI_Comm_split`.
- Once it's done processing, I reverse the exact operation that extends / shortens the arrays. This ensures everything is always back to $n/p$ at the end.
- Everything is collected back at the end via a `Send`/`Recv` to save on allocations.
Allocations are all on the order of $O(p + n/p)$.
Unfortunately, I didn't finish debugging segfaults in time, so this report covers only the parts of the assignment that I _did_ complete. The program works on small integers (capped at 100), but for some reason it segfaults at address `(nil)` at the end. I spent several hours debugging but have not discovered how this occurs.
Add table
Reference in a new issue