diff --git a/assignments/03/.gitignore b/assignments/03/.gitignore
index 971d284..15a3a87 100644
--- a/assignments/03/.gitignore
+++ b/assignments/03/.gitignore
@@ -2,4 +2,5 @@ lpa
 compile_commands.json
 .cache
 report.pdf
-*.tar.gz
\ No newline at end of file
+*.tar.gz
+out.txt
\ No newline at end of file
diff --git a/assignments/03/Makefile b/assignments/03/Makefile
index 203e7a4..836ebdb 100644
--- a/assignments/03/Makefile
+++ b/assignments/03/Makefile
@@ -1,7 +1,7 @@
 .PHONY: run clean
 
-# CFLAGS += -O3
-CFLAGS += -DFMT_HEADER_ONLY -g
+CFLAGS += -O3
+# CFLAGS += -DFMT_HEADER_ONLY -g
 # LDFLAGS += $(shell pkg-config --libs fmt)
 
 lpa: lpa.cpp Makefile
diff --git a/assignments/03/bench.sh b/assignments/03/bench.sh
new file mode 100755
index 0000000..18f2039
--- /dev/null
+++ b/assignments/03/bench.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+for dataset in $(echo "1000.txt" "10000.txt" "100000.txt" "1000000.txt"); do
+  for processors in $(echo 1 2 4 8 16 | tr ' ' '\n'); do
+    # file="dataset/both_$dataset"
+    file="/export/scratch/CSCI5451_F23/assignment-3/dataset/$dataset"
+    echo $processors $file;
+    mpirun -n $processors ./lpa $file >> out.txt
+  done
+done
\ No newline at end of file
diff --git a/assignments/03/lpa.cpp b/assignments/03/lpa.cpp
index d330c57..0f3686e 100644
--- a/assignments/03/lpa.cpp
+++ b/assignments/03/lpa.cpp
@@ -97,6 +97,7 @@ int main(int argc, char **argv) {
   int num_my_edges;
   pair *my_edges;
   int counts[p], displs[p];
+
   if (rank == 0) {
     line = NULL;
     // pair all_edges[total_num_edges];
@@ -140,10 +141,16 @@ int main(int argc, char **argv) {
 
     // We have to send the last one again here, since it didn't get caught in
     // the loop above
-    MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
-             MPI_COMM_WORLD);
-    MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
-             TAG_SEND_EDGES, MPI_COMM_WORLD);
+    if (current_process == 0) {
+      num_my_edges = edge_counter;
+      my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
+      memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
+    } else {
+      MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
+               MPI_COMM_WORLD);
+      MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
+               TAG_SEND_EDGES, MPI_COMM_WORLD);
+    }
 
     free(all_edges.ptr);
   } else {
@@ -161,6 +168,10 @@ int main(int argc, char **argv) {
   }
 #pragma endregion
 
+  if (rank == 0)
+    printf("Params: p=%d, |V|=%d, |E|=%d\n", p, total_num_nodes,
+           total_num_edges);
+
   // STEP 2 TIMER STARTS HERE
   MPI_Barrier(MPI_COMM_WORLD);
   double step_2_start_time;
@@ -228,7 +239,6 @@ int main(int argc, char **argv) {
   double step_5_start_time;
   if (rank == 0) {
     step_5_start_time = MPI_Wtime();
-    printf("STARTING STEP 5: %0.04fs\n", step_5_start_time - step_2_start_time);
   }
 
   // The processes perform the transfers of non-local labels and updates of
@@ -241,36 +251,52 @@
   std::vector<int> send_displs;
   std::vector<int> recv_counts;
   std::vector<int> recv_displs;
+  std::vector<int> recvbuf;
+  std::map<int, int> remote_labels;
 
-  int recv_total;
-  {
-    int offset = 0;
-    for (int i = 0; i < p; ++i) {
-      int count = send_map[i].size();
-      for (auto local_node : send_map[i]) {
-        sendbuf.push_back(
-            node_label_assignment_vec[local_node - my_node_range.fst]);
+  if (p > 1) {
+
+    int recv_total;
+    {
+      int offset = 0;
+      for (int i = 0; i < p; ++i) {
+        int count = send_map[i].size();
+        for (auto local_node : send_map[i]) {
+          sendbuf.push_back(
+              node_label_assignment_vec[local_node - my_node_range.fst]);
+        }
+        send_counts.push_back(count);
+        send_displs.push_back(offset);
+        offset += count;
      }
-      send_counts.push_back(count);
-      send_displs.push_back(offset);
-      offset += count;
+
+      offset = 0;
+      for (int i = 0; i < p; ++i) {
+        int count = recv_map[i].size();
+        recv_counts.push_back(count);
+        recv_displs.push_back(offset);
+        offset += count;
+      }
+      recv_total = offset;
    }
 
-    offset = 0;
+    recvbuf = std::vector<int>(recv_total, 0);
+    MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
+                  MPI_INT, recvbuf.data(), recv_counts.data(),
+                  recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
+
+    // Cache the received remote labels, keyed by global node id
    for (int i = 0; i < p; ++i) {
-      int count = recv_map[i].size();
-      recv_counts.push_back(count);
-      recv_displs.push_back(offset);
-      offset += count;
+      std::vector<int> processor_nodes(recv_map[i].begin(),
+                                       recv_map[i].end());
+      for (int j = 0; j < recv_counts[i]; ++j) {
+        int remote_node = processor_nodes[j];
+        int remote_value = recvbuf[recv_displs[i] + j];
+        remote_labels[remote_node] = remote_value;
+      }
    }
-    recv_total = offset;
  }
 
-  std::vector<int> recvbuf(recv_total, 0);
-  MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
-                MPI_INT, recvbuf.data(), recv_counts.data(),
-                recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
-
  // For each local node, determine the minimum label out of its neighbors
  std::map<int, int> new_labels;
  for (int i = 0; i < num_my_nodes; ++i) {
@@ -281,10 +307,18 @@ int main(int argc, char **argv) {
    int min = current_value;
 
    for (auto neighbor : adj[node]) {
-      int neighbor_value = lookup_assignment(
-          node_label_assignment_vec, my_node_range, recv_map, recvbuf.data(),
-          recv_counts.data(), recv_displs.data(), each_num_nodes, rank,
-          neighbor);
+      int neighbor_value;
+      if (my_node_range.fst <= neighbor && neighbor < my_node_range.snd) {
+        neighbor_value =
+            node_label_assignment_vec[neighbor - my_node_range.fst];
+      } else {
+        neighbor_value = remote_labels[neighbor];
+      }
+
+      // = lookup_assignment(
+      //     node_label_assignment_vec, my_node_range, recv_map,
+      //     recvbuf.data(), recv_counts.data(), recv_displs.data(),
+      //     each_num_nodes, rank, neighbor);
 
      min = MIN(min, neighbor_value);
    }
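The step-5 rewrite above replaces the per-neighbor `lookup_assignment` helper with a single `MPI_Alltoallv` exchange followed by a `std::map` cache of remote labels. The sketch below isolates just the count/displacement bookkeeping that `MPI_Alltoallv` needs; it is a minimal standalone toy (each rank sends `i + 1` copies of its own rank id to rank `i`), not the assignment code itself.

```cpp
// Toy illustration of the MPI_Alltoallv pattern used in step 5: pack one
// contiguous send buffer, describe it with per-destination counts and
// displacements, and exchange everything in a single collective call.
#include <mpi.h>
#include <numeric>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, p;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &p);

  // Each rank sends (i + 1) values to rank i, so every rank receives
  // (rank + 1) values from each of the p sources.
  std::vector<int> send_counts(p), send_displs(p), recv_counts(p), recv_displs(p);
  for (int i = 0; i < p; ++i) send_counts[i] = i + 1;
  for (int i = 0; i < p; ++i) recv_counts[i] = rank + 1;

  // Displacements are exclusive prefix sums of the counts (C++17 <numeric>).
  std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0);
  std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0);

  std::vector<int> sendbuf(send_displs[p - 1] + send_counts[p - 1], rank);
  std::vector<int> recvbuf(recv_displs[p - 1] + recv_counts[p - 1], 0);

  // One collective replaces p point-to-point sends per rank; slice i of
  // recvbuf (starting at recv_displs[i]) holds the values sent by rank i.
  MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(), MPI_INT,
                recvbuf.data(), recv_counts.data(), recv_displs.data(), MPI_INT,
                MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}
```

Reading slice `i` back out of `recvbuf` via `recv_displs[i]` is exactly how the `remote_labels` cache is filled in the diff above.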
diff --git a/assignments/03/report.typ b/assignments/03/report.typ
index e69de29..c37c9e1 100644
--- a/assignments/03/report.typ
+++ b/assignments/03/report.typ
@@ -0,0 +1,17 @@
+== Steps 2-4
+
+For steps 2-4, I computed each process's outgoing nodes, sorted them, and
+used each node's sorted position to identify which nodes are being sent.
+
+This saves an extra communication round and lets me index the same items on
+every loop iteration.
+
+== Step 5
+
+I exchanged data using the unstructured communication approach, doing an
+all-to-all transfer.
+
+To read the result efficiently, I first tried the approach given in the
+slides. However, this took a long time (up to 45 seconds for the 10,000
+case), so I switched to STL's `std::map`, which proved to be orders of
+magnitude faster.
\ No newline at end of file
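A note on the step-5 lookup the report describes: once `remote_labels` is built, resolving a neighbor's label is a branch between a dense local vector and the map. The fragment below mirrors that branch; the names (`range`, `remote_labels`) follow the diff, but it is an illustrative sketch rather than the assignment code.

```cpp
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Labels of locally-owned nodes live in a dense vector indexed by
// (node - range.first); labels fetched from other ranks are staged in a map.
int lookup_label(const std::vector<int> &local_labels,
                 const std::pair<int, int> &range, // [first, second) owned here
                 const std::map<int, int> &remote_labels, int node) {
  if (range.first <= node && node < range.second)
    return local_labels[node - range.first]; // O(1) local hit
  return remote_labels.at(node);             // O(log k) over k remote nodes
}

int main() {
  std::vector<int> local_labels = {7, 3, 9}; // this rank owns nodes [4, 7)
  std::map<int, int> remote_labels = {{0, 1}, {12, 5}};
  std::printf("%d %d\n", lookup_label(local_labels, {4, 7}, remote_labels, 5),
              lookup_label(local_labels, {4, 7}, remote_labels, 12)); // 3 5
  return 0;
}
```

Building the map once per round and paying one O(log k) find per neighbor, instead of re-deriving each remote node's position in the receive buffer, is presumably where the reported order-of-magnitude speedup over the slides' scheme comes from.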