consistent results now

Michael Zhang 2023-11-25 09:18:56 +00:00
parent 0619d79caa
commit 5fe38262c5
5 changed files with 94 additions and 34 deletions

.gitignore

@@ -2,4 +2,5 @@ lpa
 compile_commands.json
 .cache
 report.pdf
 *.tar.gz
+out.txt

Makefile

@@ -1,7 +1,7 @@
 .PHONY: run clean
-# CFLAGS += -O3
-CFLAGS += -DFMT_HEADER_ONLY -g
+CFLAGS += -O3
+# CFLAGS += -DFMT_HEADER_ONLY -g
 # LDFLAGS += $(shell pkg-config --libs fmt)
 lpa: lpa.cpp Makefile

assignments/03/bench.sh Executable file

@@ -0,0 +1,8 @@
for dataset in $(echo "1000.txt" "10000.txt" "1000000.txt" "1000000.txt"); do
  for processors in $(echo 1 2 4 8 16 | tr ' ' '\n'); do
    # file="dataset/both_$dataset"
    file="/export/scratch/CSCI5451_F23/assignment-3/dataset/$dataset"
    echo $processors $file;
    mpirun -n $processors ./lpa $file >> out.txt
  done
done

lpa.cpp

@@ -97,6 +97,7 @@ int main(int argc, char **argv) {
   int num_my_edges;
   pair *my_edges;
   int counts[p], displs[p];
+
   if (rank == 0) {
     line = NULL;
     // pair all_edges[total_num_edges];
@@ -140,10 +141,16 @@ int main(int argc, char **argv) {
     // We have to send the last one again here, since it didn't get caught in
     // the loop above
-    MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
-             MPI_COMM_WORLD);
-    MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
-             TAG_SEND_EDGES, MPI_COMM_WORLD);
+    if (current_process == 0) {
+      num_my_edges = edge_counter;
+      my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
+      memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
+    } else {
+      MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
+               MPI_COMM_WORLD);
+      MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
+               TAG_SEND_EDGES, MPI_COMM_WORLD);
+    }

     free(all_edges.ptr);
   } else {
@@ -161,6 +168,10 @@ int main(int argc, char **argv) {
   }
 #pragma endregion

+  if (rank == 0)
+    printf("Params: p=%d, |E|=%d, |V|=%d\n", p, total_num_edges,
+           total_num_nodes);
+
   // STEP 2 TIMER STARTS HERE
   MPI_Barrier(MPI_COMM_WORLD);
   double step_2_start_time;
@@ -228,7 +239,6 @@ int main(int argc, char **argv) {
   double step_5_start_time;
   if (rank == 0) {
     step_5_start_time = MPI_Wtime();
-    printf("STARTING STEP 5: %0.04fs\n", step_5_start_time - step_2_start_time);
   }

   // The processes perform the transfers of non-local labels and updates of
@@ -241,36 +251,52 @@ int main(int argc, char **argv) {
     std::vector<int> send_displs;
     std::vector<int> recv_counts;
     std::vector<int> recv_displs;
+    std::vector<int> recvbuf;
+    std::map<int, int> remote_labels;

-    int recv_total;
-    {
-      int offset = 0;
-      for (int i = 0; i < p; ++i) {
-        int count = send_map[i].size();
-        for (auto local_node : send_map[i]) {
-          sendbuf.push_back(
-              node_label_assignment_vec[local_node - my_node_range.fst]);
-        }
-        send_counts.push_back(count);
-        send_displs.push_back(offset);
-        offset += count;
-      }
-
-      offset = 0;
-      for (int i = 0; i < p; ++i) {
-        int count = recv_map[i].size();
-        recv_counts.push_back(count);
-        recv_displs.push_back(offset);
-        offset += count;
-      }
-      recv_total = offset;
-    }
-
-    std::vector<int> recvbuf(recv_total, 0);
-    MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
-                  MPI_INT, recvbuf.data(), recv_counts.data(),
-                  recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
+    if (p > 1) {
+      int recv_total;
+      {
+        int offset = 0;
+        for (int i = 0; i < p; ++i) {
+          int count = send_map[i].size();
+          for (auto local_node : send_map[i]) {
+            sendbuf.push_back(
+                node_label_assignment_vec[local_node - my_node_range.fst]);
+          }
+          send_counts.push_back(count);
+          send_displs.push_back(offset);
+          offset += count;
+        }
+
+        offset = 0;
+        for (int i = 0; i < p; ++i) {
+          int count = recv_map[i].size();
+          recv_counts.push_back(count);
+          recv_displs.push_back(offset);
+          offset += count;
+        }
+        recv_total = offset;
+      }
+
+      recvbuf = std::vector<int>(recv_total, 0);
+      MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
+                    MPI_INT, recvbuf.data(), recv_counts.data(),
+                    recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
+
+      // Cache efficiently
+      for (int i = 0; i < p; ++i) {
+        std::vector<int> processor_nodes(recv_map[i].begin(),
+                                         recv_map[i].end());
+        for (int j = 0; j < recv_counts[i]; ++j) {
+          int remote_node = processor_nodes[j];
+          int remote_value = recvbuf[recv_displs[i] + j];
+          remote_labels[remote_node] = remote_value;
+        }
+      }
+    }

     // For each local node, determine the minimum label out of its neighbors
     std::map<int, int> new_labels;
     for (int i = 0; i < num_my_nodes; ++i) {
@@ -281,10 +307,18 @@ int main(int argc, char **argv) {
       int min = current_value;

       for (auto neighbor : adj[node]) {
-        int neighbor_value = lookup_assignment(
-            node_label_assignment_vec, my_node_range, recv_map, recvbuf.data(),
-            recv_counts.data(), recv_displs.data(), each_num_nodes, rank,
-            neighbor);
+        int neighbor_value;
+        if (my_node_range.fst <= neighbor && neighbor < my_node_range.snd) {
+          neighbor_value =
+              node_label_assignment_vec[neighbor - my_node_range.fst];
+        } else {
+          neighbor_value = remote_labels[neighbor];
+        }
+        // = lookup_assignment(
+        //     node_label_assignment_vec, my_node_range, recv_map,
+        //     recvbuf.data(), recv_counts.data(), recv_displs.data(),
+        //     each_num_nodes, rank, neighbor);
+
         min = MIN(min, neighbor_value);
       }

New file (report write-up)

@@ -0,0 +1,17 @@
== Steps 2-4

For steps 2-4, I calculated each process's outgoing nodes, sorted them, and
used each node's sorted position to identify which nodes are being sent.
This saves an extra round of communication and lets me index the same items on
each loop iteration.
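
As a rough illustration of this ordering scheme (not taken verbatim from the commit; the container type, helper name, and parameters below are assumptions), packing labels by iterating a per-peer `std::set<int>` means the k-th value in the buffer always belongs to the k-th smallest shared node ID, so no node IDs have to be transmitted alongside the labels:

#include <set>
#include <vector>

// Hypothetical helper: pack the labels of the nodes destined for one peer,
// in ascending node-ID order. The receiver walks its own sorted set of the
// node IDs it expects from us, so position j on both sides refers to the
// same node.
std::vector<int> pack_labels_for_peer(const std::set<int> &nodes_for_peer,
                                      const std::vector<int> &local_labels,
                                      int my_range_start) {
  std::vector<int> buf;
  for (int node : nodes_for_peer)  // std::set iterates in sorted order
    buf.push_back(local_labels[node - my_range_start]);
  return buf;
}

Because `std::set` keeps its keys sorted, sender and receiver agree on the ordering without exchanging any index information.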
== Step 5

I exchanged data using the unstructured communication approach, doing an
all-to-all transfer.
To read the result efficiently, I first tried the approach given in the
slides. However, that was taking a long time (up to 45 seconds for the 10,000
case), so I tried STL's `std::map` instead, which proved to be orders of
magnitude faster.
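
A minimal sketch of the `std::map` caching described here, mirroring the change to lpa.cpp above (the helper name and the `std::set<int>` receive map are assumptions for illustration): after the `MPI_Alltoallv`, the j-th value received from peer i is the label of the j-th smallest node ID expected from that peer, so the buffer can be unpacked once into a map and then looked up per neighbor instead of being re-derived from the raw receive buffer every time.

#include <cstddef>
#include <map>
#include <set>
#include <vector>

// Hypothetical helper: turn the flat Alltoallv receive buffer into a
// node-ID -> label map, using each peer's sorted set of expected node IDs
// to pair buffer positions with node IDs.
std::map<int, int> cache_remote_labels(const std::vector<std::set<int>> &recv_map,
                                       const std::vector<int> &recvbuf,
                                       const std::vector<int> &recv_counts,
                                       const std::vector<int> &recv_displs) {
  std::map<int, int> remote_labels;
  for (std::size_t i = 0; i < recv_map.size(); ++i) {
    std::vector<int> peer_nodes(recv_map[i].begin(), recv_map[i].end());
    for (int j = 0; j < recv_counts[i]; ++j)
      remote_labels[peer_nodes[j]] = recvbuf[recv_displs[i] + j];
  }
  return remote_labels;
}

With the cache in place, a neighbor lookup is just a range check against the local node range followed by `remote_labels[neighbor]` for anything non-local, as in the updated inner loop above.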