csci5451/assignments/02/qs_mpi.c

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Ordering tags; not referenced anywhere in the visible code.
#define ORDER_FORWARDS 1
#define ORDER_BACKWARDS 2
// Number of ints in one control record exchanged between ranks:
// {element count, global start index at the sender, local start index at the
// receiver}.
#define CTL_SIZE 3

// Untyped max/min. NOTE: each argument is expanded (and thus evaluated) twice,
// so do not pass expressions with side effects.
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))

// Compile-time type guards: the _Generic selection has a single association,
// so passing an expression of any other type fails to compile.
#define ENSURE_int(i) _Generic((i), int : (i))
#define ENSURE_float(f) _Generic((f), float : (f))

// Type-checked max/min: `type` must be `int` or `float` (it is pasted onto
// ENSURE_ to pick the guard above), and the result is cast to `type`.
#define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))
#define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))

// Serial in-place quicksort of the half-open range arr[lo, hi).
void local_quicksort(int *arr, int lo, int hi);
// Formats arr[0..len) as a space-separated string in a heap-allocated buffer;
// nothing else in this file releases that buffer, so the caller should free it.
char *string_of_list(int *arr, int len);
// One partition-and-redistribute round of parallel quicksort over the ranks of
// `comm`; `n` is the global element count shared by those ranks.
void recursive_quicksort(int *integers, int n, MPI_Comm comm);

/*
 * Program entry point.
 *
 * Usage: <program> <n> <output-file>
 *
 * Every rank generates n / p pseudo-random integers (seeded by its rank), runs
 * recursive_quicksort() over MPI_COMM_WORLD, and prints its local slice.
 * Rank 0 then creates/truncates the output file.
 *
 * Returns 0 on success and 1 on bad arguments or if the output file cannot be
 * opened or closed.
 */
int main(int argc, char **argv) {
  int rank, p;
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &p);

  // Validate the command line before touching argv[1] / argv[2].
  if (argc < 3) {
    if (rank == 0) {
      fprintf(stderr, "usage: %s <n> <output-file>\n",
              argc > 0 ? argv[0] : "qs_mpi");
    }
    MPI_Finalize();
    return 1;
  }

  // strtol (unlike atoi) lets us reject garbage and out-of-range values. The
  // round-trip through int detects values that do not fit in an int.
  char *end = NULL;
  long parsed = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || parsed <= 0 ||
      (long)(int)parsed != parsed) {
    if (rank == 0) {
      fprintf(stderr, "invalid element count: %s\n", argv[1]);
    }
    MPI_Finalize();
    return 1;
  }
  int n = (int)parsed;

  // Generate integers
  int n_over_p = n / p;
  if (n_over_p <= 0) {
    // Fewer elements than processes: there would be nothing to store locally.
    if (rank == 0) {
      fprintf(stderr, "n (%d) must be at least the number of processes (%d)\n",
              n, p);
    }
    MPI_Finalize();
    return 1;
  }

  // Heap allocation: the slice can be far too large for the stack.
  int *integers = malloc((size_t)n_over_p * sizeof *integers);
  if (integers == NULL) {
    perror("malloc");
    // Returning normally here would leave the other ranks blocked in the
    // collectives inside recursive_quicksort(), so terminate abnormally.
    abort();
  }

  // Important implementation detail: srand(0) is specially handled by glibc to
  // behave as if it was called with srand(1). To get around this, I'm seeding
  // with rank + 1
  //
  // See more: https://stackoverflow.com/a/27386563
  srand(rank + 1);

  for (int i = 0; i < n_over_p; ++i) {
    // TODO: For readability during debugging, I'm capping this
    integers[i] = rand() % 101;
  }

  recursive_quicksort(integers, n, MPI_COMM_WORLD);

  sleep(1);
  char *after = string_of_list(integers, n_over_p);
  printf("[%d] after: %s\n", rank, after != NULL ? after : "(null)");
  free(after);

  // TODO: rank 0 should collect every slice (e.g. with MPI_Gather) and write
  // the sorted data to the output file. For now the file is only
  // created/truncated.
  int status = 0;
  if (rank == 0) {
    FILE *f = fopen(argv[2], "w");
    if (f == NULL) {
      perror(argv[2]);
      status = 1;
    } else if (fclose(f) != 0) {
      perror(argv[2]);
      status = 1;
    }
  }

  free(integers);
  MPI_Finalize();
  printf("Done.\n");
  return status;
}

/*
 * Sort the half-open range arr[lo, hi) in ascending order, in place.
 *
 * `hi` is exclusive. A negative `lo` or an empty range is a no-op.
 * Uses the last element of the range as the pivot.
 */
void local_quicksort(int *arr, int lo, int hi) {
  // The right-hand partition is handled by looping instead of a second
  // recursive call; the left-hand partition is still sorted recursively.
  while (lo >= 0 && lo < hi) {
    const int last = hi - 1;
    const int pivot_value = arr[last];

    // `store` is the slot that receives the next element smaller than the
    // pivot; everything in [lo, store) is already known to be smaller.
    int store = lo;
    for (int scan = lo; scan <= last; scan++) {
      if (pivot_value > arr[scan]) {
        const int held = arr[scan];
        arr[scan] = arr[store];
        arr[store] = held;
        store++;
      }
    }

    // Drop the pivot into its final position.
    const int held = arr[last];
    arr[last] = arr[store];
    arr[store] = held;

    local_quicksort(arr, lo, store);
    lo = store + 1;
  }
}

/*
 * Render arr[0..len) as a space-separated decimal string, e.g. "1 22 -3".
 *
 * The buffer is sized from `len`, so lists of any length are formatted without
 * overrunning it. A non-positive `len` yields an empty string.
 *
 * Returns a heap-allocated, NUL-terminated string that the caller must
 * free(), or NULL if the allocation fails.
 */
char *string_of_list(int *arr, int len) {
  // Upper bound on characters per element: at most 3 decimal digits per byte
  // of int, plus a sign, plus the separator that follows the element.
  const size_t chars_per_item = 3 * sizeof(int) + 2;
  const size_t count = len > 0 ? (size_t)len : 0;
  const size_t capacity = count * chars_per_item + 1; // +1 for the NUL

  char *buffer = calloc(capacity, sizeof(char));
  if (buffer == NULL) {
    return NULL;
  }

  size_t offset = 0; // Current write position in the buffer
  for (size_t i = 0; i < count; i++) {
    const size_t room = capacity - offset;
    int written;
    if (i + 1 < count) {
      // Not the last element: follow it with a separator
      written = snprintf(buffer + offset, room, "%d ", arr[i]);
    } else {
      written = snprintf(buffer + offset, room, "%d", arr[i]);
    }
    if (written < 0 || (size_t)written >= room) {
      // Should be unreachable given the sizing above; stop rather than
      // advance past the end of the buffer.
      break;
    }
    offset += (size_t)written;
  }

  return buffer;
}

/*
 * One partition-and-redistribute round of parallel quicksort over `comm`.
 *
 * integers: this rank's slice of n / p elements; rewritten in place.
 * n:        global element count. Assumes every rank of `comm` passes the
 *           same value (TODO confirm at call sites).
 * comm:     communicator whose ranks take part in this round.
 *
 * Steps:
 *   1. Sort the local slice and broadcast rank 0's middle element as pivot.
 *   2. Split the slice into S (< pivot) and L (>= pivot). Prefix sums place
 *      S contiguously from the front of the global order and L from the back.
 *   3. Each rank tells every destination rank, via CTL_SIZE-int control
 *      records, which piece of the global array it will receive and where it
 *      goes locally.
 *   4. All slices are gathered everywhere and each rank copies in the pieces
 *      named by the records it received.
 *   5. Ranks are split into a "low" and a "high" communicator around the S/L
 *      boundary.
 *
 * NOTE: the child communicator is created and immediately freed; the actual
 * recursive call on it is not made here.
 */
void recursive_quicksort(int *integers, int n, MPI_Comm comm) {
  int rank, p;
  MPI_Comm_size(comm, &p);
  MPI_Comm_rank(comm, &rank);

  if (p == 1) {
    // Recursion base case: just sort it serially
    local_quicksort(integers, 0, n);
    return;
  }

  int n_over_p = n / p;
  if (n_over_p <= 0) {
    // Nothing to distribute, and n_over_p is used as a divisor below.
    return;
  }
  // Number of elements that actually exist across all ranks. Using this rather
  // than n keeps every destination index inside [0, p * n_over_p) even when n
  // is not a multiple of p.
  int total = n_over_p * p;

  // Locally sort
  local_quicksort(integers, 0, n_over_p);
  char *sorted_str = string_of_list(integers, n_over_p);
  printf("[%d] Numbers after first sort: %s\n", rank,
         sorted_str != NULL ? sorted_str : "(null)");
  free(sorted_str);

  // Select a pivot: the middle element of the sorted local slice (see chp.
  // 9.4.4). Not the real median, but it must be an existing element. Rank 0's
  // choice is broadcast and overwrites everyone else's.
  int pivot = integers[n_over_p / 2];
  MPI_Bcast(&pivot, 1, MPI_INT, 0, comm);

  // Determine where the boundary between S (lower) and L (higher) lies.
  // Defaults to n_over_p so that a slice entirely below the pivot is all S.
  int boundary = n_over_p;
  for (int i = 0; i < n_over_p; ++i) {
    if (integers[i] >= pivot) {
      boundary = i;
      break;
    }
  }
  int S_lo = 0, S_hi = boundary;
  int L_lo = boundary, L_hi = n_over_p;
  int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
  printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
         S_size, L_lo, L_hi, L_size);

  // Perform global arrangement: inclusive prefix sums of the S and L sizes.
  int S_global_end, L_reverse_end, S_global_max_end;
  MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
  MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);

  // The largest prefix sum is the total size of S. Every rank needs it to pick
  // its colour below, so the reduced value is broadcast from rank 0.
  MPI_Reduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, 0, comm);
  MPI_Bcast(&S_global_max_end, 1, MPI_INT, 0, comm);

  // S is laid out from the front of the global order, L from the back.
  int S_global_start = S_global_end - S_size,
      L_reverse_start = L_reverse_end - L_size,
      L_global_start = total - L_reverse_end,
      L_global_end = total - L_reverse_start;
  printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
         S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);

  int S_starting_process = S_global_start / n_over_p,
      L_starting_process = L_global_start / n_over_p;

  int *integers_recv_buf = calloc((size_t)total, sizeof *integers_recv_buf);
  if (integers_recv_buf == NULL) {
    perror("calloc");
    // Returning would leave the other ranks blocked in the collectives below.
    abort();
  }

  int S_ctl[p * CTL_SIZE];
  int L_ctl[p * CTL_SIZE];
  int send_ctl[p * CTL_SIZE];
  int ctl_send_counts[p];
  int ctl_send_displs[p];
  int recv_counts[p];
  int recv_displs[p];
  for (int i = 0; i < p; ++i) {
    S_ctl[i * CTL_SIZE] = 0;
    S_ctl[i * CTL_SIZE + 1] = -1;
    S_ctl[i * CTL_SIZE + 2] = -1;
    L_ctl[i * CTL_SIZE] = 0;
    L_ctl[i * CTL_SIZE + 1] = -1;
    L_ctl[i * CTL_SIZE + 2] = -1;

    ctl_send_counts[i] = CTL_SIZE;
    ctl_send_displs[i] = i * CTL_SIZE;
    recv_counts[i] = CTL_SIZE;
    recv_displs[i] = i * CTL_SIZE;
  }

  // Send S to the correct target
  {
    // A record with count 0 means "nothing for this rank".
    for (int i = 0; i < p; ++i) {
      send_ctl[i * CTL_SIZE] = 0;
      send_ctl[i * CTL_SIZE + 1] = -1;
      send_ctl[i * CTL_SIZE + 2] = -1;
    }

    // Walk the local S range, cutting it wherever the destination range
    // crosses from one rank's slice into the next.
    for (int i = S_lo, dest_pos = S_global_start,
             processor = S_starting_process;
         i < S_hi;) {
      int next_break = MIN(int, S_global_end,
                           MIN(int, dest_pos + (S_hi - S_lo),
                               (dest_pos / n_over_p) * n_over_p + n_over_p));
      int count = next_break - dest_pos;

      int from_local_start = i, from_local_end = i + count;
      int from_global_start = rank * n_over_p + from_local_start,
          from_global_end = from_global_start + count;

      int to_global_start = dest_pos, to_global_end = dest_pos + count;
      int to_local_start = to_global_start - processor * n_over_p,
          to_local_end = to_global_end - processor * n_over_p;

      printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
             "p#%d [%d..%d] {%d..%d}\n",
             rank, count, from_local_start, from_local_end, from_global_start,
             from_global_end, processor, to_local_start, to_local_end,
             to_global_start, to_global_end);
      send_ctl[processor * CTL_SIZE] = count;
      send_ctl[processor * CTL_SIZE + 1] = from_global_start;
      send_ctl[processor * CTL_SIZE + 2] = to_local_start;

      i += count;
      dest_pos += count;
      processor += 1;
    }

    MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
                  recv_counts, recv_displs, MPI_INT, comm);
  }

  // Send L to the correct target
  {
    for (int i = 0; i < p; ++i) {
      send_ctl[i * CTL_SIZE] = 0;
      send_ctl[i * CTL_SIZE + 1] = -1;
      send_ctl[i * CTL_SIZE + 2] = -1;
    }

    for (int i = L_lo, dest_pos = L_global_start,
             processor = L_starting_process;
         i < L_hi;) {
      int next_break = MIN(int, L_global_end,
                           MIN(int, dest_pos + (L_hi - L_lo),
                               (dest_pos / n_over_p) * n_over_p + n_over_p));
      int count = next_break - dest_pos;

      int from_local_start = i, from_local_end = i + count;
      int from_global_start = rank * n_over_p + from_local_start,
          from_global_end = from_global_start + count;

      int to_global_start = dest_pos, to_global_end = dest_pos + count;
      int to_local_start = to_global_start - processor * n_over_p,
          to_local_end = to_global_end - processor * n_over_p;

      printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "
             "p#%d [%d..%d] {%d..%d}\n",
             rank, count, from_local_start, from_local_end, from_global_start,
             from_global_end, processor, to_local_start, to_local_end,
             to_global_start, to_global_end);
      send_ctl[processor * CTL_SIZE] = count;
      send_ctl[processor * CTL_SIZE + 1] = from_global_start;
      send_ctl[processor * CTL_SIZE + 2] = to_local_start;

      i += count;
      dest_pos += count;
      processor += 1;
    }

    MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
                  recv_counts, recv_displs, MPI_INT, comm);
  }

  // Every rank receives a copy of all slices, then copies in just the pieces
  // its control records name. Reading from the gathered copy means the
  // in-place writes to `integers` cannot clobber data still to be read.
  MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
                MPI_INT, comm);

  for (int i = 0; i < p; ++i) {
    int count = S_ctl[i * CTL_SIZE];
    int from_global_start = S_ctl[i * CTL_SIZE + 1];
    int to_local_start = S_ctl[i * CTL_SIZE + 2];

    if (count > 0) {
      printf(
          "[%d] <<- S received (%d) from processor %d {%d..%d} to [%d..%d]\n",
          rank, count, i, from_global_start, from_global_start + count,
          to_local_start, to_local_start + count);
      for (int j = 0; j < count; ++j) {
        integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
      }
    }
  }

  for (int i = 0; i < p; ++i) {
    int count = L_ctl[i * CTL_SIZE];
    int from_global_start = L_ctl[i * CTL_SIZE + 1];
    int to_local_start = L_ctl[i * CTL_SIZE + 2];

    if (count > 0) {
      printf(
          "[%d] <<- L received (%d) from processor %d {%d..%d} to [%d..%d]\n",
          rank, count, i, from_global_start, from_global_start + count,
          to_local_start, to_local_start + count);
      for (int j = 0; j < count; ++j) {
        integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
      }
    }
  }

  free(integers_recv_buf);

  // Now, determine which processes should be responsible for taking the S and L
  // arrays. The rank whose slice straddles the S/L boundary joins the lower
  // group only if more than half of its slice is S.
  int p_of_split = S_global_max_end / n_over_p;
  int split_point = S_global_max_end % n_over_p;
  if (rank == 0) {
    printf("[%d] shiet %d\n", rank, p_of_split);
  }

  // First rank that belongs to the higher group.
  int lo_end = (split_point > n_over_p / 2) ? p_of_split + 1 : p_of_split;
  int color = (rank < lo_end) ? 100 : 200;

  MPI_Comm child;
  MPI_Comm_split(comm, color, rank, &child);
  printf("[%d] Recursing...\n", rank);
  MPI_Comm_free(&child);
}
progress 2023-10-23 00:38:42 +00:00			`#include <mpi.h>`
progress 2023-10-29 21:34:22 +00:00			`#include <stdio.h>`
			`#include <stdlib.h>`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`#include <unistd.h>`
progress 2023-10-29 21:34:22 +00:00
L 2023-10-30 03:04:21 +00:00			`#define ORDER_FORWARDS 1`
			`#define ORDER_BACKWARDS 2`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`#define CTL_SIZE 3`
L 2023-10-30 03:04:21 +00:00
			`#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))`
			`#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))`

			`#define ENSURE_int(i) _Generic((i), int : (i))`
			`#define ENSURE_float(f) _Generic((f), float : (f))`

			`#define MAX(type, x, y) (type) GENERIC_MAX(ENSURE_##type(x), ENSURE_##type(y))`
			`#define MIN(type, x, y) (type) GENERIC_MIN(ENSURE_##type(x), ENSURE_##type(y))`

progress 2023-10-29 21:34:22 +00:00			`void local_quicksort(int *arr, int lo, int hi);`
			`char string_of_list(int arr, int len);`
good progress 2023-10-30 09:35:04 +00:00			`void recursive_quicksort(int *integers, int n, MPI_Comm comm);`
progress 2023-10-23 00:38:42 +00:00
			`int main(int argc, char **argv) {`
progress 2023-10-29 21:34:22 +00:00			`int rank, p;`
progress 2023-10-23 00:38:42 +00:00			`MPI_Init(&argc, &argv);`
progress 2023-10-29 21:34:22 +00:00
			`int n = atoi(argv[1]);`

			`MPI_Comm_rank(MPI_COMM_WORLD, &rank);`
			`MPI_Comm_size(MPI_COMM_WORLD, &p);`

			`// Generate integers`
			`int n_over_p = n / p;`
			`int integers[n_over_p];`

			`// Important implementation detail: srand(0) is specially handled by glibc to`
			`// behave as if it was called with srand(1). To get around this, I'm seeding`
			`// with rank + 1`
			`//`
			`// See more: https://stackoverflow.com/a/27386563`
			`srand(rank + 1);`

			`for (int i = 0; i < n_over_p; ++i) {`
			`// TODO: For readability during debugging, I'm capping this`
			`integers[i] = rand() % 101;`
			`// printf(" - %d\n", integers[i]);`
			`}`

good progress 2023-10-30 09:35:04 +00:00			`recursive_quicksort(integers, n, MPI_COMM_WORLD);`

			`sleep(1);`
			`printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));`

			`// The first node is responsible for collecting all the data and then`
			`// printing it out to the file MPI_Gather(const void *sendbuf, int`
			`// sendcount, MPI_INT, void *recvbuf,`
			`// int recvcount, MPI_INT, 0, MPI_COMM_WORLD);`
			`if (rank == 0) {`
			`FILE *f = fopen(argv[2], "w");`
			`fclose(f);`
			`}`

			`MPI_Finalize();`
			`printf("Done.\n");`
			`return 0;`
			`}`

			`// hi not inclusive`
			`void local_quicksort(int *arr, int lo, int hi) {`
			`int temp;`

			`if (lo >= hi \|\| lo < 0)`
			`return;`

			`int pivot = arr[hi - 1];`
			`int pivot_idx = lo - 1;`
			`for (int j = lo; j < hi; ++j) {`
			`if (arr[j] < pivot) {`
			`pivot_idx += 1;`

			`temp = arr[j];`
			`arr[j] = arr[pivot_idx];`
			`arr[pivot_idx] = temp;`
			`}`
			`}`

			`pivot_idx += 1;`
			`temp = arr[hi - 1];`
			`arr[hi - 1] = arr[pivot_idx];`
			`arr[pivot_idx] = temp;`

			`// Recursive call`
			`local_quicksort(arr, lo, pivot_idx);`
			`local_quicksort(arr, pivot_idx + 1, hi);`
			`}`

			`char string_of_list(int arr, int len) {`
			`char *buffer = calloc(sizeof(char), 1000);`
			`int offset = 0; // Keep track of the current position in the buffer`
			`for (int i = 0; i < len; i++) {`
			`offset += sprintf(buffer + offset, "%d", arr[i]);`
			`if (i < len - 1) {`
			`// Add a separator (e.g., comma or space) if it's not the last element`
			`offset += sprintf(buffer + offset, " ");`
			`}`
			`}`

			`return buffer;`
			`}`

			`void recursive_quicksort(int *integers, int n, MPI_Comm comm) {`
			`int rank, p;`
			`MPI_Comm_size(comm, &p);`
			`MPI_Comm_rank(comm, &rank);`

			`if (p == 1) {`
			`// Recursion base case: just sort it serially`
			`local_quicksort(integers, 0, n);`
			`return;`
			`}`

			`int n_over_p = n / p;`
progress 2023-10-29 21:34:22 +00:00
			`// Locally sort`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`// printf("[%d] Numbers before: %s\n", rank,`
			`// string_of_list(integers, n_over_p));`
progress 2023-10-29 21:34:22 +00:00			`local_quicksort(integers, 0, n_over_p);`
			`printf("[%d] Numbers after first sort: %s\n", rank,`
			`string_of_list(integers, n_over_p));`

			`// Select a pivot.`
			`// This pivot is broadcasted to all nodes`
			`int pivot;`

			`// The pivot is selected as the median (see chp. 9.4.4)`
			`// Not the real median though, need an existing element of the array`
			`pivot = integers[n_over_p / 2];`
			`MPI_Bcast(&pivot, 1, MPI_INT, 0, MPI_COMM_WORLD);`
good progress 2023-10-30 09:35:04 +00:00			`// printf("--- Broadcasted pivot: %d ---\n", pivot);`
progress 2023-10-29 21:34:22 +00:00
			`// Determine where the boundary between S (lower) and L (higher) lies`
			`int boundary;`
			`for (int i = 0; i < n_over_p; ++i) {`
			`if (integers[i] >= pivot) {`
			`boundary = i;`
			`break;`
			`}`
			`}`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`int S_lo = 0, S_hi = boundary;`
			`int L_lo = boundary, L_hi = n_over_p;`
			`int S_size = S_hi - S_lo, L_size = L_hi - L_lo;`
L 2023-10-30 03:04:21 +00:00			`printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,`
			`S_size, L_lo, L_hi, L_size);`
progress 2023-10-29 21:34:22 +00:00
			`// Perform global arrangement`
good progress 2023-10-30 09:35:04 +00:00			`int S_global_end, L_reverse_end, S_global_max_end;`
progress 2023-10-29 21:34:22 +00:00			`MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);`
L 2023-10-30 03:04:21 +00:00			`MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);`
progress 2023-10-29 21:34:22 +00:00
good progress 2023-10-30 09:35:04 +00:00			`// printf("[%d] bruh %d\n", rank, S_global_end);`
			`MPI_Reduce(&S_global_end, &S_global_max_end, 1, MPI_INT, MPI_MAX, 0,`
			`MPI_COMM_WORLD);`

progress 2023-10-29 21:34:22 +00:00			`int S_global_start = S_global_end - S_size,`
L 2023-10-30 03:04:21 +00:00			`L_reverse_start = L_reverse_end - L_size,`
			`L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,`
			`S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);`

			`int S_starting_process = S_global_start / n_over_p,`
			`L_starting_process = L_global_start / n_over_p;`
			`int S_offset = S_global_start % n_over_p,`
			`L_offset = L_global_start % n_over_p;`

			`int *integers_recv_buf = calloc(sizeof(int), n);`
			`int S_ctl[p * CTL_SIZE];`
			`int L_ctl[p * CTL_SIZE];`
			`int recvpart[n_over_p];`
			`int send_ctl[p * CTL_SIZE];`
			`int ctl_send_counts[p];`
			`int ctl_send_displs[p];`

			`int send_counts[p];`
			`int send_displs[p];`
			`int recv_counts[p];`
			`int recv_displs[p];`
			`for (int i = 0; i < p; ++i) {`
			`send_counts[i] = n_over_p;`
			`send_displs[i] = i * n_over_p;`

			`S_ctl[i * CTL_SIZE] = 0;`
			`S_ctl[i * CTL_SIZE + 1] = -1;`
			`S_ctl[i * CTL_SIZE + 2] = -1;`
			`L_ctl[i * CTL_SIZE] = 0;`
			`L_ctl[i * CTL_SIZE + 1] = -1;`
			`L_ctl[i * CTL_SIZE + 2] = -1;`

			`ctl_send_counts[i] = CTL_SIZE;`
			`ctl_send_displs[i] = i * CTL_SIZE;`
			`recv_counts[i] = CTL_SIZE;`
			`recv_displs[i] = i * CTL_SIZE;`
			`}`
progress 2023-10-29 21:34:22 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`// Send S to the correct target`
too bad can't do alltoallv 2023-10-30 07:08:35 +00:00			`{`
			`for (int i = 0; i < p; ++i) {`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`send_ctl[i * CTL_SIZE] = 0;`
			`send_ctl[i * CTL_SIZE + 1] = -1;`
			`send_ctl[i * CTL_SIZE + 2] = -1;`
fucked up 2023-10-30 06:58:07 +00:00			`}`
L 2023-10-30 03:04:21 +00:00
fucked up 2023-10-30 06:58:07 +00:00			`for (int i = S_lo, dest_pos = S_global_start,`
			`processor = S_starting_process;`
			`i < S_hi;) {`
holy SHIT it works 2023-10-30 09:09:03 +00:00			`int next_break = MIN(int, S_global_end,`
			`MIN(int, dest_pos + (S_hi - S_lo),`
			`(dest_pos / n_over_p) * n_over_p + n_over_p));`
fucked up 2023-10-30 06:58:07 +00:00			`int count = next_break - dest_pos;`

holy SHIT it works 2023-10-30 09:09:03 +00:00			`int from_local_start = i, from_local_end = i + count;`
			`int from_global_start = rank * n_over_p + from_local_start,`
			`from_global_end = from_global_start + count;`
fucked up 2023-10-30 06:58:07 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`int to_global_start = dest_pos, to_global_end = dest_pos + count;`
			`int to_local_start = to_global_start - processor * n_over_p,`
			`to_local_end = to_global_end - processor * n_over_p;`
fucked up 2023-10-30 06:58:07 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`printf("[%d] S ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "`
			`"p#%d [%d..%d] {%d..%d}\n",`
			`rank, count, from_local_start, from_local_end, from_global_start,`
			`from_global_end, processor, to_local_start, to_local_end,`
			`to_global_start, to_global_end);`
			`send_ctl[processor * CTL_SIZE] = count;`
			`send_ctl[processor * CTL_SIZE + 1] = from_global_start;`
			`send_ctl[processor * CTL_SIZE + 2] = to_local_start;`
fucked up 2023-10-30 06:58:07 +00:00
			`i += count;`
			`dest_pos += count;`
			`processor += 1;`
too bad can't do alltoallv 2023-10-30 07:08:35 +00:00			`}`
fucked up 2023-10-30 06:58:07 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,`
			`recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);`
			`}`

			`// Send L to the correct target`
			`{`
			`for (int i = 0; i < p; ++i) {`
			`send_ctl[i * CTL_SIZE] = 0;`
			`send_ctl[i * CTL_SIZE + 1] = -1;`
			`send_ctl[i * CTL_SIZE + 2] = -1;`
fucked up 2023-10-30 06:58:07 +00:00			`}`
L 2023-10-30 03:04:21 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`for (int i = L_lo, dest_pos = L_global_start,`
			`processor = L_starting_process;`
			`i < L_hi;) {`
			`int next_break = MIN(int, L_global_end,`
			`MIN(int, dest_pos + (L_hi - L_lo),`
			`(dest_pos / n_over_p) * n_over_p + n_over_p));`
			`int count = next_break - dest_pos;`

			`int from_local_start = i, from_local_end = i + count;`
			`int from_global_start = rank * n_over_p + from_local_start,`
			`from_global_end = from_global_start + count;`

			`int to_global_start = dest_pos, to_global_end = dest_pos + count;`
			`int to_local_start = to_global_start - processor * n_over_p,`
			`to_local_end = to_global_end - processor * n_over_p;`
too bad can't do alltoallv 2023-10-30 07:08:35 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`printf("[%d] L ->> (count=%d), from local [%d..%d] {%d..%d} -to-> "`
			`"p#%d [%d..%d] {%d..%d}\n",`
			`rank, count, from_local_start, from_local_end, from_global_start,`
			`from_global_end, processor, to_local_start, to_local_end,`
			`to_global_start, to_global_end);`
			`send_ctl[processor * CTL_SIZE] = count;`
			`send_ctl[processor * CTL_SIZE + 1] = from_global_start;`
			`send_ctl[processor * CTL_SIZE + 2] = to_local_start;`

			`i += count;`
			`dest_pos += count;`
			`processor += 1;`
			`}`

			`MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,`
			`recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);`
L 2023-10-30 03:04:21 +00:00			`}`
progress 2023-10-29 21:34:22 +00:00
holy SHIT it works 2023-10-30 09:09:03 +00:00			`// After sending S and L information`

			`for (int i = 0; i < p; ++i) {`
			`recv_counts[i] = n_over_p;`
			`recv_displs[i] = i * n_over_p;`
			`}`

			`// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,`
			`// integers_recv_buf,`
			`// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);`
			`MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,`
			`MPI_INT, MPI_COMM_WORLD);`
			`// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));`

			`for (int i = 0; i < p; ++i) {`
			`int count = S_ctl[i * CTL_SIZE];`
			`int from_global_start = S_ctl[i * CTL_SIZE + 1];`
			`int to_local_start = S_ctl[i * CTL_SIZE + 2];`

			`if (count > 0) {`
			`printf(`
			`"[%d] <<- S received (%d) from processor %d {%d..%d} to [%d..%d]\n",`
			`rank, count, i, from_global_start, from_global_start + count,`
			`to_local_start, to_local_start + count);`
			`for (int j = 0; j < count; ++j) {`
			`integers[to_local_start + j] = integers_recv_buf[from_global_start + j];`
			`}`
			`}`
			`}`

			`for (int i = 0; i < p; ++i) {`
			`int count = L_ctl[i * CTL_SIZE];`
			`int from_global_start = L_ctl[i * CTL_SIZE + 1];`
			`int to_local_start = L_ctl[i * CTL_SIZE + 2];`

			`if (count > 0) {`
			`printf(`
			`"[%d] <<- S received (%d) from processor %d {%d..%d} to [%d..%d]\n",`
			`rank, count, i, from_global_start, from_global_start + count,`
			`to_local_start, to_local_start + count);`
			`for (int j = 0; j < count; ++j) {`
			`integers[to_local_start + j] = integers_recv_buf[from_global_start + j];`
			`}`
			`}`
			`}`

good progress 2023-10-30 09:35:04 +00:00			`// Now, determine which processes should be responsible for taking the S and L`
			`// arrays`
holy SHIT it works 2023-10-30 09:09:03 +00:00
good progress 2023-10-30 09:35:04 +00:00			`// Specifically, the part where it's split, break the tie to see if it goes`
			`// down or up`
			`int colors[p];`
progress 2023-10-29 21:34:22 +00:00			`if (rank == 0) {`
good progress 2023-10-30 09:35:04 +00:00			`int p_of_split = S_global_max_end / n_over_p;`
			`int split_point = S_global_max_end % n_over_p;`
			`printf("[%d] shiet %d\n", rank, p_of_split);`

			`int lo_start = 0, lo_end;`
			`int hi_start, hi_end = p;`
			`if (split_point > n_over_p / 2) {`
			`// Belongs to the lower group`
			`lo_end = hi_start = p_of_split + 1;`
			`} else {`
			`// Belongs to the higher group`
			`lo_end = hi_start = p_of_split;`
progress 2023-10-29 21:34:22 +00:00			`}`

good progress 2023-10-30 09:35:04 +00:00			`for (int i = 0; i < p; ++i) {`
			`if (i < lo_end)`
			`colors[i] = 100;`
			`else`
			`colors[i] = 200;`
progress 2023-10-29 21:34:22 +00:00			`}`
			`}`

good progress 2023-10-30 09:35:04 +00:00			`MPI_Comm child;`
			`MPI_Comm_split(comm, colors[rank], rank, &child);`
			`printf("[%d] Recursing...\n", rank);`
			`MPI_Comm_free(&child);`
progress 2023-10-29 21:34:22 +00:00			`}`