non-gather broadcast scheme

parent 9d064bef0d
commit f700a2dfd8

1 changed file with 154 additions and 49 deletions

@@ -5,7 +5,7 @@
 #define ORDER_FORWARDS 1
 #define ORDER_BACKWARDS 2
-#define CTL_SIZE 3
+#define CTL_SIZE 4
 
 #define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y))
 #define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y))
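
The CTL_SIZE bump from 3 to 4 widens every per-peer control record by one
slot. Judging from the assignments later in this commit, the four ints per
peer are {count, from_global_start, to_local_start, from_local_start}; the
new from_local_start is what lets the point-to-point scheme further down
send directly out of a rank's own block instead of indexing a gathered
copy. A minimal sketch of that layout (the struct and accessor are
illustrative, not part of the commit):

    // Hypothetical named view of one CTL_SIZE = 4 record; the real code
    // uses a flat int array indexed as ctl[peer * CTL_SIZE + slot].
    typedef struct {
      int count;             // slot 0: how many ints move between the pair
      int from_global_start; // slot 1: source offset in the global order
      int to_local_start;    // slot 2: destination offset in the receiver
      int from_local_start;  // slot 3 (new): source offset in the sender
    } ctl_record;

    static inline int ctl_get(const int *ctl, int peer, int slot) {
      return ctl[peer * 4 /* CTL_SIZE */ + slot];
    }
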
@@ -135,9 +135,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   int n_over_p = n / p;
   if (rank == root)
     n_over_p += n - p * n_over_p;
-  printf(
-      "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d, n/p=%d)\n",
-      rank, n, n_over_p);
+  // printf(
+  //     "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d,
+  //     n/p=%d)\n", rank, n, n_over_p);
 
   // Locally sort
   // printf("[%d] Numbers before: %s\n", rank,
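
For context, the n_over_p adjustment in this hunk gives the root every
leftover element when p does not divide n: with n = 10 and p = 4,
n / p = 2, so the root's block is 2 + (10 - 2 * 4) = 4 while every other
rank keeps 2. A standalone check of that arithmetic (illustrative, not
part of the commit):

    #include <stdio.h>

    // Block-size rule used above: every rank gets n / p elements and the
    // root absorbs the remainder n - p * (n / p).
    int main(void) {
      int n = 10, p = 4, root = 0;
      for (int rank = 0; rank < p; ++rank) {
        int n_over_p = n / p;
        if (rank == root)
          n_over_p += n - p * n_over_p;
        printf("rank %d holds %d elements\n", rank, n_over_p);
      }
      return 0; // prints 4, 2, 2, 2, which totals n
    }
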
@@ -181,8 +181,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   int S_lo = 0, S_hi = boundary;
   int L_lo = boundary, L_hi = n_over_p;
   int S_size = S_hi - S_lo, L_size = L_hi - L_lo;
-  printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
-         S_size, L_lo, L_hi, L_size);
+  // printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi,
+  //        S_size, L_lo, L_hi, L_size);
 
   // Perform global arrangement
   int S_global_end, L_reverse_end, S_global_max_end;
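
This hunk only comments out debug output, but it records the invariant the
rest of the commit depends on: after the local sort, S = [S_lo, S_hi)
holds the elements at most the pivot and L = [L_lo, L_hi) the rest, with
boundary as the split point. The pivot and boundary are computed outside
this hunk; a sketch of how such a boundary can be found (assumptions: the
block is already sorted and the pivot is known):

    // Return the first index in a sorted block whose value exceeds the
    // pivot; [0, boundary) is then S and [boundary, n_over_p) is L.
    static int find_boundary(const int *block, int n_over_p, int pivot) {
      int lo = 0, hi = n_over_p;
      while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        if (block[mid] <= pivot)
          lo = mid + 1;
        else
          hi = mid;
      }
      return lo;
    }
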
@@ -196,8 +196,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   int S_global_start = S_global_end - S_size,
       L_reverse_start = L_reverse_end - L_size,
       L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start;
-  printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
-         S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
+  // printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank,
+  //        S_global_start, S_global_end - 1, L_global_start, L_global_end - 1);
 
   int S_starting_process = S_global_start / n_over_p,
       L_starting_process = L_global_start / n_over_p;
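
S_global_end and L_reverse_end are produced above this hunk; given how they
are used, they behave like inclusive prefix sums of S_size and L_size
across ranks, with the L side mirrored from the top of the array (hence
L_global_start = n - L_reverse_end). A sketch of offsets computed that way,
assuming MPI_Scan is the mechanism (the commit's actual scan code is not
shown in this diff):

    #include <mpi.h>

    // Sketch, not the commit's code: inclusive prefix sums give each rank
    // the end of its S run (packed upward from index 0) and of its L run
    // (packed downward from index n, hence the n - ... mirroring).
    static void global_offsets(int S_size, int L_size, int n, int n_over_p,
                               MPI_Comm comm) {
      int S_global_end, L_reverse_end;
      MPI_Scan(&S_size, &S_global_end, 1, MPI_INT, MPI_SUM, comm);
      MPI_Scan(&L_size, &L_reverse_end, 1, MPI_INT, MPI_SUM, comm);

      int S_global_start = S_global_end - S_size;
      int L_global_start = n - L_reverse_end;
      // Owning ranks follow by integer division, as in the hunk above.
      int S_starting_process = S_global_start / n_over_p;
      int L_starting_process = L_global_start / n_over_p;
      (void)S_starting_process;
      (void)L_starting_process;
    }
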
@@ -207,8 +207,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   int *integers_recv_buf = calloc(sizeof(int), n);
   int S_ctl[p * CTL_SIZE];
   int L_ctl[p * CTL_SIZE];
+  int S_send_ctl[p * CTL_SIZE];
+  int L_send_ctl[p * CTL_SIZE];
   int recvpart[n_over_p];
-  int send_ctl[p * CTL_SIZE];
   int ctl_send_counts[p];
   int ctl_send_displs[p];
@@ -223,9 +224,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
     S_ctl[i * CTL_SIZE] = 0;
     S_ctl[i * CTL_SIZE + 1] = -1;
     S_ctl[i * CTL_SIZE + 2] = -1;
+    S_ctl[i * CTL_SIZE + 3] = -1;
     L_ctl[i * CTL_SIZE] = 0;
     L_ctl[i * CTL_SIZE + 1] = -1;
     L_ctl[i * CTL_SIZE + 2] = -1;
+    L_ctl[i * CTL_SIZE + 3] = -1;
 
     ctl_send_counts[i] = CTL_SIZE;
     ctl_send_displs[i] = i * CTL_SIZE;
@@ -236,9 +239,10 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   // Send S to the correct target
   {
     for (int i = 0; i < p; ++i) {
-      send_ctl[i * CTL_SIZE] = 0;
-      send_ctl[i * CTL_SIZE + 1] = -1;
-      send_ctl[i * CTL_SIZE + 2] = -1;
+      S_send_ctl[i * CTL_SIZE] = 0;
+      S_send_ctl[i * CTL_SIZE + 1] = -1;
+      S_send_ctl[i * CTL_SIZE + 2] = -1;
+      S_send_ctl[i * CTL_SIZE + 3] = -1;
     }
 
     for (int i = S_lo, dest_pos = S_global_start,
@@ -262,25 +266,27 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
           rank, count, from_local_start, from_local_end, from_global_start,
           from_global_end, processor, to_local_start, to_local_end,
           to_global_start, to_global_end);
-      send_ctl[processor * CTL_SIZE] = count;
-      send_ctl[processor * CTL_SIZE + 1] = from_global_start;
-      send_ctl[processor * CTL_SIZE + 2] = to_local_start;
+      S_send_ctl[processor * CTL_SIZE] = count;
+      S_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
+      S_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
+      S_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
 
       i += count;
       dest_pos += count;
       processor += 1;
     }
 
-    MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
+    MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl,
                   recv_counts, recv_displs, MPI_INT, comm);
   }
 
   // Send L to the correct target
   {
     for (int i = 0; i < p; ++i) {
-      send_ctl[i * CTL_SIZE] = 0;
-      send_ctl[i * CTL_SIZE + 1] = -1;
-      send_ctl[i * CTL_SIZE + 2] = -1;
+      L_send_ctl[i * CTL_SIZE] = 0;
+      L_send_ctl[i * CTL_SIZE + 1] = -1;
+      L_send_ctl[i * CTL_SIZE + 2] = -1;
+      L_send_ctl[i * CTL_SIZE + 3] = -1;
     }
 
     for (int i = L_lo, dest_pos = L_global_start,
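
The MPI_Alltoallv above moves only control records, never payload: each
rank contributes one CTL_SIZE block per destination (S_send_ctl, indexed
by destination rank) and receives one block per source (S_ctl, indexed by
sender rank). Because every block has the same fixed size, plain
MPI_Alltoall expresses the same exchange; a self-contained sketch of the
shape, with illustrative values:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define CTL_SIZE 4

    // Rank r's block i describes what r will send to rank i; after the
    // exchange, block j on each rank describes what rank j will send to
    // it. Values here are stand-ins, not the commit's real offsets.
    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, p;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &p);

      int *send_ctl = malloc(p * CTL_SIZE * sizeof(int));
      int *recv_ctl = malloc(p * CTL_SIZE * sizeof(int));
      for (int i = 0; i < p; ++i) {
        send_ctl[i * CTL_SIZE] = rank + i; // stand-in for "count"
        send_ctl[i * CTL_SIZE + 1] = -1;   // stand-ins for the offsets
        send_ctl[i * CTL_SIZE + 2] = -1;
        send_ctl[i * CTL_SIZE + 3] = -1;
      }

      MPI_Alltoall(send_ctl, CTL_SIZE, MPI_INT, recv_ctl, CTL_SIZE, MPI_INT,
                   MPI_COMM_WORLD);

      for (int j = 0; j < p; ++j)
        printf("[%d] ctl from sender %d: count=%d\n", rank, j,
               recv_ctl[j * CTL_SIZE]);

      free(send_ctl);
      free(recv_ctl);
      MPI_Finalize();
      return 0;
    }

One consequence the scheme below exploits: after this exchange each
receiver knows every sender's count and offsets before any payload moves,
so the point-to-point phase never has to probe for message sizes.
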
@@ -304,16 +310,17 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
           rank, count, from_local_start, from_local_end, from_global_start,
           from_global_end, processor, to_local_start, to_local_end,
           to_global_start, to_global_end);
-      send_ctl[processor * CTL_SIZE] = count;
-      send_ctl[processor * CTL_SIZE + 1] = from_global_start;
-      send_ctl[processor * CTL_SIZE + 2] = to_local_start;
+      L_send_ctl[processor * CTL_SIZE] = count;
+      L_send_ctl[processor * CTL_SIZE + 1] = from_global_start;
+      L_send_ctl[processor * CTL_SIZE + 2] = to_local_start;
+      L_send_ctl[processor * CTL_SIZE + 3] = from_local_start;
 
       i += count;
       dest_pos += count;
      processor += 1;
     }
 
-    MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
+    MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl,
                   recv_counts, recv_displs, MPI_INT, comm);
   }
 
@@ -327,46 +334,144 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) {
   // MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT,
   //               integers_recv_buf,
   //               recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);
-  MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
-                MPI_INT, comm);
+  // MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p,
+  //               MPI_INT, comm);
   // printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n));
 
-  for (int i = 0; i < p; ++i) {
-    int count = S_ctl[i * CTL_SIZE];
-    int from_global_start = S_ctl[i * CTL_SIZE + 1];
-    int to_local_start = S_ctl[i * CTL_SIZE + 2];
-
-    if (count > 0) {
-      printf(
-          "[%d] <<- S received (%d) from processor %d {%d..%d} to [%d..%d]\n",
-          rank, count, i, from_global_start, from_global_start + count,
-          to_local_start, to_local_start + count);
-      for (int j = 0; j < count; ++j) {
-        integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
-      }
-    }
-  }
-
-  for (int i = 0; i < p; ++i) {
-    int count = L_ctl[i * CTL_SIZE];
-    int from_global_start = L_ctl[i * CTL_SIZE + 1];
-    int to_local_start = L_ctl[i * CTL_SIZE + 2];
-
-    if (count > 0) {
-      printf(
-          "[%d] <<- S received (%d) from processor %d {%d..%d} to [%d..%d]\n",
-          rank, count, i, from_global_start, from_global_start + count,
-          to_local_start, to_local_start + count);
-      for (int j = 0; j < count; ++j) {
-        integers[to_local_start + j] = integers_recv_buf[from_global_start + j];
-      }
-    }
-  }
-
-  printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p));
+  // Scheme for all send
+  int integers_recv_2[n_over_p];
+  int integers_recv_3[n_over_p];
+  for (int i = 0; i < n_over_p; ++i) {
+    integers_recv_2[i] = -1;
+    integers_recv_3[i] = integers[i];
+  }
+
+  for (int host_p = 0; host_p < p; ++host_p) {
+    if (rank == host_p) {
+      // Your {S,L}_ctl is a mapping from source_processor -> ctl
+      // Everyone already knows who needs to send to who now
+      for (int sender_p = 0; sender_p < p; ++sender_p) {
+        int S_count = S_ctl[sender_p * CTL_SIZE];
+        if (S_count > 0) {
+          int to_local_start = S_ctl[sender_p * CTL_SIZE + 2];
+          int from_local_start = S_ctl[sender_p * CTL_SIZE + 3];
+
+          if (sender_p == host_p) {
+            for (int k = 0; k < S_count; ++k) {
+              integers_recv_3[to_local_start + k] =
+                  integers[from_local_start + k];
+            }
+            continue;
+          }
+
+          printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank,
+                 sender_p, to_local_start, to_local_start + S_count, S_count);
+          MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT,
+                   sender_p, 124, comm, MPI_STATUS_IGNORE);
+          for (int k = 0; k < S_count; ++k) {
+            integers_recv_3[to_local_start + k] =
+                integers_recv_2[to_local_start + k];
+          }
+        }
+      }
+    } else {
+      // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
+      for (int dest_p = 0; dest_p < p; ++dest_p) {
+        int S_count = S_send_ctl[dest_p * CTL_SIZE];
+        if (S_count > 0 && dest_p == host_p) {
+          int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3];
+          printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank,
+                 dest_p, from_local_start, from_local_start + S_count,
+                 S_count);
+          MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124,
+                   comm);
+        }
+      }
+    }
+  }
+  printf("[%d] S done\n", rank);
+
+  for (int host_p = 0; host_p < p; ++host_p) {
+    if (rank == host_p) {
+      // Your {S,L}_ctl is a mapping from source_processor -> ctl
+      // Everyone already knows who needs to send to who now
+      for (int sender_p = 0; sender_p < p; ++sender_p) {
+        int L_count = L_ctl[sender_p * CTL_SIZE];
+        if (L_count > 0) {
+          int to_local_start = L_ctl[sender_p * CTL_SIZE + 2];
+          int from_local_start = L_ctl[sender_p * CTL_SIZE + 3];
+
+          if (sender_p == host_p) {
+            for (int k = 0; k < L_count; ++k) {
+              integers_recv_3[to_local_start + k] =
+                  integers[from_local_start + k];
+            }
+            continue;
+          }
+
+          printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank,
+                 sender_p, to_local_start, to_local_start + L_count, L_count);
+          MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT,
+                   sender_p, 125, comm, MPI_STATUS_IGNORE);
+          for (int k = 0; k < L_count; ++k) {
+            integers_recv_3[to_local_start + k] =
+                integers_recv_2[to_local_start + k];
+          }
+        }
+      }
+    } else {
+      // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl
+      for (int dest_p = 0; dest_p < p; ++dest_p) {
+        int L_count = L_send_ctl[dest_p * CTL_SIZE];
+        if (L_count > 0 && dest_p == host_p) {
+          int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3];
+          printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank,
+                 dest_p, from_local_start, from_local_start + L_count,
+                 L_count);
+          MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125,
+                   comm);
+        }
+      }
+    }
+  }
+  printf("[%d] L done\n", rank);
+
+  // for (int i = 0; i < p; ++i) {
+  //   int count = S_ctl[i * CTL_SIZE];
+  //   int from_global_start = S_ctl[i * CTL_SIZE + 1];
+  //   int to_local_start = S_ctl[i * CTL_SIZE + 2];
+
+  //   if (count > 0) {
+  //     printf(
+  //         "[%d] <<- S received (%d) from processor %d {%d..%d} to
+  //         [%d..%d]\n", rank, count, i, from_global_start,
+  //         from_global_start + count, to_local_start, to_local_start +
+  //         count);
+  //     for (int j = 0; j < count; ++j) {
+  //       integers[to_local_start + j] =
+  //           integers_recv_buf[from_global_start + j];
+  //     }
+  //   }
+  // }
+
+  // for (int i = 0; i < p; ++i) {
+  //   int count = L_ctl[i * CTL_SIZE];
+  //   int from_global_start = L_ctl[i * CTL_SIZE + 1];
+  //   int to_local_start = L_ctl[i * CTL_SIZE + 2];
+
+  //   if (count > 0) {
+  //     printf(
+  //         "[%d] <<- S received (%d) from processor %d {%d..%d} to
+  //         [%d..%d]\n", rank, count, i, from_global_start,
+  //         from_global_start + count, to_local_start, to_local_start +
+  //         count);
+  //     for (int j = 0; j < count; ++j) {
+  //       integers[to_local_start + j] =
+  //           integers_recv_buf[from_global_start + j];
+  //     }
+  //   }
+  // }
+
+  printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p));
+  for (int i = 0; i < n_over_p; ++i) {
+    integers[i] = integers_recv_3[i];
+  }
 
   // Now, determine which processes should be responsible for taking the S and L
   // arrays
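
This hunk is the commit's non-gather broadcast scheme in full. Instead of
MPI_Allgather replicating all n integers on every rank and each rank
copying its slices out of the gathered buffer, the ranks walk p rounds: in
round host_p, the host posts an MPI_Recv for every sender whose control
record advertises a positive count, each sender whose send table targets
host_p posts a matching MPI_Send straight out of its own block (using the
new slot-3 from_local_start), and a rank sending to itself short-circuits
to a local copy. Tags 124 and 125 keep the S phase and the L phase from
matching each other's messages. A condensed, self-contained sketch of the
round structure for one phase (the ring traffic pattern is illustrative):

    #include <mpi.h>
    #include <stdio.h>

    // Demo traffic: every rank sends its rank number to its right
    // neighbour, described by the same two tables the commit uses:
    // recv_count[s] = ints sender s sends me, send_count[d] = ints I
    // send to destination d.
    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, p;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &p);

      int send_count[p], recv_count[p];
      for (int i = 0; i < p; ++i)
        send_count[i] = recv_count[i] = 0;
      send_count[(rank + 1) % p] = 1;     // I send to my right neighbour
      recv_count[(rank + p - 1) % p] = 1; // my left neighbour sends to me

      int payload = rank, inbox = -1;

      // One round per host: the host drains its advertised senders while
      // everyone else sends only if this host is its destination.
      // Self-sends short-circuit to a local copy, as in the commit.
      for (int host_p = 0; host_p < p; ++host_p) {
        if (rank == host_p) {
          for (int sender_p = 0; sender_p < p; ++sender_p) {
            if (recv_count[sender_p] == 0)
              continue;
            if (sender_p == host_p) {
              inbox = payload; // local copy instead of self-messaging
              continue;
            }
            MPI_Recv(&inbox, recv_count[sender_p], MPI_INT, sender_p, 124,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
          }
        } else if (send_count[host_p] > 0) {
          MPI_Send(&payload, send_count[host_p], MPI_INT, host_p, 124,
                   MPI_COMM_WORLD);
        }
      }

      printf("[%d] received %d\n", rank, inbox);
      MPI_Finalize();
      return 0;
    }

The trade-off relative to the replaced MPI_Allgather: each rank now touches
only the data it owns or receives, O(n/p) rather than an O(n) replica per
rank, at the price of p serialized rounds of blocking point-to-point
traffic in which each host drains its senders one at a time.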