diff --git a/assignments/02/qs_mpi.c b/assignments/02/qs_mpi.c index 69d6644..057aaf2 100644 --- a/assignments/02/qs_mpi.c +++ b/assignments/02/qs_mpi.c @@ -5,7 +5,7 @@ #define ORDER_FORWARDS 1 #define ORDER_BACKWARDS 2 -#define CTL_SIZE 3 +#define CTL_SIZE 4 #define GENERIC_MAX(x, y) ((x) > (y) ? (x) : (y)) #define GENERIC_MIN(x, y) ((x) < (y) ? (x) : (y)) @@ -135,9 +135,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { int n_over_p = n / p; if (rank == root) n_over_p += n - p * n_over_p; - printf( - "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d, n/p=%d)\n", - rank, n, n_over_p); + // printf( + // "[%d] :::::::::::::::::::::::::::: RECURSIVE QUICKSORT (n=%d, + // n/p=%d)\n", rank, n, n_over_p); // Locally sort // printf("[%d] Numbers before: %s\n", rank, @@ -181,8 +181,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { int S_lo = 0, S_hi = boundary; int L_lo = boundary, L_hi = n_over_p; int S_size = S_hi - S_lo, L_size = L_hi - L_lo; - printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi, - S_size, L_lo, L_hi, L_size); + // printf("[%d] S: [%d - %d] (%d), L: [%d - %d] (%d)\n", rank, S_lo, S_hi, + // S_size, L_lo, L_hi, L_size); // Perform global arrangement int S_global_end, L_reverse_end, S_global_max_end; @@ -196,8 +196,8 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { int S_global_start = S_global_end - S_size, L_reverse_start = L_reverse_end - L_size, L_global_start = n - L_reverse_end, L_global_end = n - L_reverse_start; - printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank, - S_global_start, S_global_end - 1, L_global_start, L_global_end - 1); + // printf("[%d] Prefixed S: [%d - %d], Prefixed L: [%d - %d]\n", rank, + // S_global_start, S_global_end - 1, L_global_start, L_global_end - 1); int S_starting_process = S_global_start / n_over_p, L_starting_process = L_global_start / n_over_p; @@ -207,8 +207,9 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { int *integers_recv_buf = calloc(sizeof(int), n); int S_ctl[p * CTL_SIZE]; int L_ctl[p * CTL_SIZE]; + int S_send_ctl[p * CTL_SIZE]; + int L_send_ctl[p * CTL_SIZE]; int recvpart[n_over_p]; - int send_ctl[p * CTL_SIZE]; int ctl_send_counts[p]; int ctl_send_displs[p]; @@ -223,9 +224,11 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { S_ctl[i * CTL_SIZE] = 0; S_ctl[i * CTL_SIZE + 1] = -1; S_ctl[i * CTL_SIZE + 2] = -1; + S_ctl[i * CTL_SIZE + 3] = -1; L_ctl[i * CTL_SIZE] = 0; L_ctl[i * CTL_SIZE + 1] = -1; L_ctl[i * CTL_SIZE + 2] = -1; + L_ctl[i * CTL_SIZE + 3] = -1; ctl_send_counts[i] = CTL_SIZE; ctl_send_displs[i] = i * CTL_SIZE; @@ -236,9 +239,10 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { // Send S to the correct target { for (int i = 0; i < p; ++i) { - send_ctl[i * CTL_SIZE] = 0; - send_ctl[i * CTL_SIZE + 1] = -1; - send_ctl[i * CTL_SIZE + 2] = -1; + S_send_ctl[i * CTL_SIZE] = 0; + S_send_ctl[i * CTL_SIZE + 1] = -1; + S_send_ctl[i * CTL_SIZE + 2] = -1; + S_send_ctl[i * CTL_SIZE + 3] = -1; } for (int i = S_lo, dest_pos = S_global_start, @@ -262,25 +266,27 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { rank, count, from_local_start, from_local_end, from_global_start, from_global_end, processor, to_local_start, to_local_end, to_global_start, to_global_end); - send_ctl[processor * CTL_SIZE] = count; - send_ctl[processor * CTL_SIZE + 1] = from_global_start; - send_ctl[processor * CTL_SIZE + 2] = to_local_start; + S_send_ctl[processor * CTL_SIZE] = count; + S_send_ctl[processor * CTL_SIZE + 1] = from_global_start; + S_send_ctl[processor * CTL_SIZE + 2] = to_local_start; + S_send_ctl[processor * CTL_SIZE + 3] = from_local_start; i += count; dest_pos += count; processor += 1; } - MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl, + MPI_Alltoallv(S_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, S_ctl, recv_counts, recv_displs, MPI_INT, comm); } // Send L to the correct target { for (int i = 0; i < p; ++i) { - send_ctl[i * CTL_SIZE] = 0; - send_ctl[i * CTL_SIZE + 1] = -1; - send_ctl[i * CTL_SIZE + 2] = -1; + L_send_ctl[i * CTL_SIZE] = 0; + L_send_ctl[i * CTL_SIZE + 1] = -1; + L_send_ctl[i * CTL_SIZE + 2] = -1; + L_send_ctl[i * CTL_SIZE + 3] = -1; } for (int i = L_lo, dest_pos = L_global_start, @@ -304,16 +310,17 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { rank, count, from_local_start, from_local_end, from_global_start, from_global_end, processor, to_local_start, to_local_end, to_global_start, to_global_end); - send_ctl[processor * CTL_SIZE] = count; - send_ctl[processor * CTL_SIZE + 1] = from_global_start; - send_ctl[processor * CTL_SIZE + 2] = to_local_start; + L_send_ctl[processor * CTL_SIZE] = count; + L_send_ctl[processor * CTL_SIZE + 1] = from_global_start; + L_send_ctl[processor * CTL_SIZE + 2] = to_local_start; + L_send_ctl[processor * CTL_SIZE + 3] = from_local_start; i += count; dest_pos += count; processor += 1; } - MPI_Alltoallv(send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl, + MPI_Alltoallv(L_send_ctl, ctl_send_counts, ctl_send_displs, MPI_INT, L_ctl, recv_counts, recv_displs, MPI_INT, comm); } @@ -327,46 +334,144 @@ void recursive_quicksort(int *integers, int n, int root, MPI_Comm comm) { // MPI_Alltoallv(integers, send_counts, send_displs, MPI_INT, // integers_recv_buf, // recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD); - MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p, - MPI_INT, comm); + // MPI_Allgather(integers, n_over_p, MPI_INT, integers_recv_buf, n_over_p, + // MPI_INT, comm); // printf("[%d] ints: %s\n", rank, string_of_list(integers_recv_buf, n)); - for (int i = 0; i < p; ++i) { + // Scheme for all send + int integers_recv_2[n_over_p]; + int integers_recv_3[n_over_p]; + for (int i = 0; i < n_over_p; ++i) { + integers_recv_2[i] = -1; + integers_recv_3[i] = integers[i]; } - for (int i = 0; i < p; ++i) { - int count = S_ctl[i * CTL_SIZE]; - int from_global_start = S_ctl[i * CTL_SIZE + 1]; - int to_local_start = S_ctl[i * CTL_SIZE + 2]; + for (int host_p = 0; host_p < p; ++host_p) { + if (rank == host_p) { + // Your {S,L}_ctl is a mapping from source_processor -> ctl + // Everyone already knows who needs to send to who now + for (int sender_p = 0; sender_p < p; ++sender_p) { + int S_count = S_ctl[sender_p * CTL_SIZE]; + if (S_count > 0) { + int to_local_start = S_ctl[sender_p * CTL_SIZE + 2]; + int from_local_start = S_ctl[sender_p * CTL_SIZE + 3]; - if (count > 0) { - printf( - "[%d] <<- S received (%d) from processor %d {%d..%d} to [% d..% d]\n", - rank, count, i, from_global_start, from_global_start + count, - to_local_start, to_local_start + count); - for (int j = 0; j < count; ++j) { - integers[to_local_start + j] = integers_recv_buf[from_global_start + j]; + if (sender_p == host_p) { + for (int k = 0; k < S_count; ++k) { + integers_recv_3[to_local_start + k] = + integers[from_local_start + k]; + } + continue; + } + + printf("[%d] - S inbound from host %d to [%d..%d] (%d)\n", rank, + sender_p, to_local_start, to_local_start + S_count, S_count); + MPI_Recv(&integers_recv_2[to_local_start], S_count, MPI_INT, sender_p, + 124, comm, MPI_STATUS_IGNORE); + for (int k = 0; k < S_count; ++k) { + integers_recv_3[to_local_start + k] = + integers_recv_2[to_local_start + k]; + } + } + } + } else { + // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl + for (int dest_p = 0; dest_p < p; ++dest_p) { + int S_count = S_send_ctl[dest_p * CTL_SIZE]; + if (S_count > 0 && dest_p == host_p) { + int from_local_start = S_send_ctl[dest_p * CTL_SIZE + 3]; + printf("[%d] - S outbound to host %d from [%d..%d] (%d)\n", rank, + dest_p, from_local_start, from_local_start + S_count, S_count); + MPI_Send(&integers[from_local_start], S_count, MPI_INT, dest_p, 124, + comm); + } } } } + printf("[%d] S done\n", rank); - for (int i = 0; i < p; ++i) { - int count = L_ctl[i * CTL_SIZE]; - int from_global_start = L_ctl[i * CTL_SIZE + 1]; - int to_local_start = L_ctl[i * CTL_SIZE + 2]; + for (int host_p = 0; host_p < p; ++host_p) { + if (rank == host_p) { + // Your {S,L}_ctl is a mapping from source_processor -> ctl + // Everyone already knows who needs to send to who now + for (int sender_p = 0; sender_p < p; ++sender_p) { + int L_count = L_ctl[sender_p * CTL_SIZE]; + if (L_count > 0) { + int to_local_start = L_ctl[sender_p * CTL_SIZE + 2]; + int from_local_start = L_ctl[sender_p * CTL_SIZE + 3]; - if (count > 0) { - printf( - "[%d] <<- S received (%d) from processor %d {%d..%d} to [% d..% d]\n", - rank, count, i, from_global_start, from_global_start + count, - to_local_start, to_local_start + count); - for (int j = 0; j < count; ++j) { - integers[to_local_start + j] = integers_recv_buf[from_global_start + j]; + if (sender_p == host_p) { + for (int k = 0; k < L_count; ++k) { + integers_recv_3[to_local_start + k] = + integers[from_local_start + k]; + } + continue; + } + + printf("[%d] - L inbound from host %d to [%d..%d] (%d)\n", rank, + sender_p, to_local_start, to_local_start + L_count, L_count); + MPI_Recv(&integers_recv_2[to_local_start], L_count, MPI_INT, sender_p, + 125, comm, MPI_STATUS_IGNORE); + for (int k = 0; k < L_count; ++k) { + integers_recv_3[to_local_start + k] = + integers_recv_2[to_local_start + k]; + } + } + } + } else { + // Your {S,L}_send_ctl contains a mapping from dest_processor -> ctl + for (int dest_p = 0; dest_p < p; ++dest_p) { + int L_count = L_send_ctl[dest_p * CTL_SIZE]; + if (L_count > 0 && dest_p == host_p) { + int from_local_start = L_send_ctl[dest_p * CTL_SIZE + 3]; + printf("[%d] - L outbound to host %d from [%d..%d] (%d)\n", rank, + dest_p, from_local_start, from_local_start + L_count, L_count); + MPI_Send(&integers[from_local_start], L_count, MPI_INT, dest_p, 125, + comm); + } } } } + printf("[%d] L done\n", rank); - printf("[%d] after: %s\n", rank, string_of_list(integers, n_over_p)); + // for (int i = 0; i < p; ++i) { + // int count = S_ctl[i * CTL_SIZE]; + // int from_global_start = S_ctl[i * CTL_SIZE + 1]; + // int to_local_start = S_ctl[i * CTL_SIZE + 2]; + + // if (count > 0) { + // printf( + // "[%d] <<- S received (%d) from processor %d {%d..%d} to + // [%d..%d]\n", rank, count, i, from_global_start, from_global_start + + // count, to_local_start, to_local_start + count); + // for (int j = 0; j < count; ++j) { + // integers[to_local_start + j] = integers_recv_buf[from_global_start + + // j]; + // } + // } + // } + + // for (int i = 0; i < p; ++i) { + // int count = L_ctl[i * CTL_SIZE]; + // int from_global_start = L_ctl[i * CTL_SIZE + 1]; + // int to_local_start = L_ctl[i * CTL_SIZE + 2]; + + // if (count > 0) { + // printf( + // "[%d] <<- S received (%d) from processor %d {%d..%d} to + // [%d..%d]\n", rank, count, i, from_global_start, from_global_start + + // count, to_local_start, to_local_start + count); + // for (int j = 0; j < count; ++j) { + // integers[to_local_start + j] = integers_recv_buf[from_global_start + + // j]; + // } + // } + // } + + printf("[%d] after: %s\n", rank, string_of_list(integers_recv_3, n_over_p)); + for (int i = 0; i < n_over_p; ++i) { + integers[i] = integers_recv_3[i]; + } // Now, determine which processes should be responsible for taking the S and L // arrays