non-gather broadcast scheme

This commit is contained in:
Michael Zhang 2023-10-31 01:34:37 +00:00
parent 9d064bef0d
commit f700a2dfd8

View file

@ -5,7 +5,7 @@
#define ORDER_FORWARDS 1
#define ORDER_BACKWARDS 2
#define CTL_SIZE 3
#define CTL_SIZE 4
#define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
#define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
@ -135,9 +135,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int n_over_p = n / p;
if (rank == root)
n_over_p += n - p * n_over_p;
printf(
"[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d, n/p=%d)\n",
rank, n, n_over_p);
// printf(
// "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
// n/p=%d)\n", rank, n, n_over_p);
// Locally sort
// printf("[%d] Numbers before: %s\n", rank,
@ -181,8 +181,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int S_lo = 0, S_hi = boundary;
int L_lo = boundary, L_hi = n_over_p;
int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
S_size, L_lo, L_hi, L_size);
// printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
// S_size, L_lo, L_hi, L_size);
// Perform global arrangement
int S_global_end, L_reverse_end, S_global_max_end;
@ -196,8 +196,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int S_global_start = S_global_end - S_size,
L_reverse_start = L_reverse_end - L_size,
L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;
printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
// printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
// S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
int S_starting_process = S_global_start / n_over_p,
L_starting_process = L_global_start / n_over_p;
@ -207,8 +207,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
int *integers_recv_buf = calloc(sizeof(int), n);
int S_ctl[p * CTL_SIZE];
int L_ctl[p * CTL_SIZE];
int S_send_ctl[p * CTL_SIZE];
int L_send_ctl[p * CTL_SIZE];
int recvpart[n_over_p];
int send_ctl[p * CTL_SIZE];
int ctl_send_counts[p];
int ctl_send_displs[p];
@ -223,9 +224,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
S_ctl[i * CTL_SIZE] = 0;
S_ctl[i * CTL_SIZE + 1] = -1;
S_ctl[i * CTL_SIZE + 2] = -1;
S_ctl[i * CTL_SIZE + 3] = -1;
L_ctl[i * CTL_SIZE] = 0;
L_ctl[i * CTL_SIZE + 1] = -1;
L_ctl[i * CTL_SIZE + 2] = -1;
L_ctl[i * CTL_SIZE + 3] = -1;
ctl_send_counts[i] = CTL_SIZE;
ctl_send_displs[i] = i * CTL_SIZE;
@ -236,9 +239,10 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
// Send S to the correct target
{
for (int i = 0; i < p; ++i) {
send_ctl[i * CTL_SIZE] = 0;
send_ctl[i * CTL_SIZE + 1] = -1;
send_ctl[i * CTL_SIZE + 2] = -1;
S_send_ctl[i * CTL_SIZE] = 0;
S_send_ctl[i * CTL_SIZE + 1] = -1;
S_send_ctl[i * CTL_SIZE + 2] = -1;
S_send_ctl[i * CTL_SIZE + 3] = -1;
}
for (int i = S_lo, dest_pos = S_global_start,
@ -262,25 +266,27 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
rank, count, from_local_start, from_local_end, from_global_start,
from_global_end, processor, to_local_start, to_local_end,
to_global_start, to_global_end);
send_ctl[processor * CTL_SIZE] = count;
send_ctl[processor * CTL_SIZE + 1] = from_global_start;
send_ctl[processor * CTL_SIZE + 2] = to_local_start;
S_send_ctl[processor * CTL_SIZE] = count;
S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
S_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
}
MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
recv_counts, recv_displs, MPI_INT, comm);
}
// Send L to the correct target
{
for (int i = 0; i < p; ++i) {
send_ctl[i * CTL_SIZE] = 0;
send_ctl[i * CTL_SIZE + 1] = -1;
send_ctl[i * CTL_SIZE + 2] = -1;
L_send_ctl[i * CTL_SIZE] = 0;
L_send_ctl[i * CTL_SIZE + 1] = -1;
L_send_ctl[i * CTL_SIZE + 2] = -1;
L_send_ctl[i * CTL_SIZE + 3] = -1;
}
for (int i = L_lo, dest_pos = L_global_start,
@ -304,16 +310,17 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
rank, count, from_local_start, from_local_end, from_global_start,
from_global_end, processor, to_local_start, to_local_end,
to_global_start, to_global_end);
send_ctl[processor * CTL_SIZE] = count;
send_ctl[processor * CTL_SIZE + 1] = from_global_start;
send_ctl[processor * CTL_SIZE + 2] = to_local_start;
L_send_ctl[processor * CTL_SIZE] = count;
L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
L_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
i += count;
dest_pos += count;
processor += 1;
}
MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
recv_counts, recv_displs, MPI_INT, comm);
}
@ -327,46 +334,144 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
// MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
// integers_recv_buf,
// recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
MPI_INT, comm);
// MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
// MPI_INT, comm);
// printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
for (int i = 0; i < p; ++i) {
// Scheme for all send
int integers_recv_2[n_over_p];
int integers_recv_3[n_over_p];
for (int i = 0; i < n_over_p; ++i) {
integers_recv_2[i] = -1;
integers_recv_3[i] = integers[i];
}
for (int i = 0; i < p; ++i) {
int count = S_ctl[i * CTL_SIZE];
int from_global_start = S_ctl[i * CTL_SIZE + 1];
int to_local_start = S_ctl[i * CTL_SIZE + 2];
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to who now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int S_count = S_ctl[sender_p * CTL_SIZE];
if (S_count > 0) {
int to_local_start = S_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = S_ctl[sender_p * CTL_SIZE + 3];
if (count > 0) {
printf(
"[%d] <<- S received (%d) from processor %d {%d..%d} to [% d..% d]\n",
rank, count, i, from_global_start, from_global_start + count,
to_local_start, to_local_start + count);
for (int j = 0; j < count; ++j) {
integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
if (sender_p == host_p) {
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
}
continue;
}
printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
sender_p, to_local_start, to_local_start + S_count, S_count);
MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT, sender_p,
124, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < S_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
}
}
}
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int S_count = S_send_ctl[dest_p * CTL_SIZE];
if (S_count > 0 && dest_p == host_p) {
int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
dest_p, from_local_start, from_local_start + S_count, S_count);
MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
comm);
}
}
}
}
printf("[%d] S done\n", rank);
for (int i = 0; i < p; ++i) {
int count = L_ctl[i * CTL_SIZE];
int from_global_start = L_ctl[i * CTL_SIZE + 1];
int to_local_start = L_ctl[i * CTL_SIZE + 2];
for (int host_p = 0; host_p < p; ++host_p) {
if (rank == host_p) {
// Your {S,L}_ctl is a mapping from source_processor -> ctl
// Everyone already knows who needs to send to who now
for (int sender_p = 0; sender_p < p; ++sender_p) {
int L_count = L_ctl[sender_p * CTL_SIZE];
if (L_count > 0) {
int to_local_start = L_ctl[sender_p * CTL_SIZE + 2];
int from_local_start = L_ctl[sender_p * CTL_SIZE + 3];
if (count > 0) {
printf(
"[%d] <<- S received (%d) from processor %d {%d..%d} to [% d..% d]\n",
rank, count, i, from_global_start, from_global_start + count,
to_local_start, to_local_start + count);
for (int j = 0; j < count; ++j) {
integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
if (sender_p == host_p) {
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers[from_local_start + k];
}
continue;
}
printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
sender_p, to_local_start, to_local_start + L_count, L_count);
MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT, sender_p,
125, comm, MPI_STATUS_IGNORE);
for (int k = 0; k < L_count; ++k) {
integers_recv_3[to_local_start + k] =
integers_recv_2[to_local_start + k];
}
}
}
} else {
// Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
for (int dest_p = 0; dest_p < p; ++dest_p) {
int L_count = L_send_ctl[dest_p * CTL_SIZE];
if (L_count > 0 && dest_p == host_p) {
int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
dest_p, from_local_start, from_local_start + L_count, L_count);
MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
comm);
}
}
}
}
printf("[%d] L done\n", rank);
printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
// for (int i = 0; i < p; ++i) {
// int count = S_ctl[i * CTL_SIZE];
// int from_global_start = S_ctl[i * CTL_SIZE + 1];
// int to_local_start = S_ctl[i * CTL_SIZE + 2];
// if (count > 0) {
// printf(
// "[%d] <<- S received (%d) from processor %d {%d..%d} to
// [%d..%d]\n", rank, count, i, from_global_start, from_global_start +
// count, to_local_start, to_local_start + count);
// for (int j = 0; j < count; ++j) {
// integers[to_local_start + j] = integers_recv_buf[from_global_start +
// j];
// }
// }
// }
// for (int i = 0; i < p; ++i) {
// int count = L_ctl[i * CTL_SIZE];
// int from_global_start = L_ctl[i * CTL_SIZE + 1];
// int to_local_start = L_ctl[i * CTL_SIZE + 2];
// if (count > 0) {
// printf(
// "[%d] <<- S received (%d) from processor %d {%d..%d} to
// [%d..%d]\n", rank, count, i, from_global_start, from_global_start +
// count, to_local_start, to_local_start + count);
// for (int j = 0; j < count; ++j) {
// integers[to_local_start + j] = integers_recv_buf[from_global_start +
// j];
// }
// }
// }
printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p));
for (int i = 0; i < n_over_p; ++i) {
integers[i] = integers_recv_3[i];
}
// Now, determine which processes should be responsible for taking the S and L
// arrays