diff --git a/assignments/03/.gitignore b/assignments/03/.gitignore
index 971d284..15a3a87 100644
--- a/assignments/03/.gitignore
+++ b/assignments/03/.gitignore
@@ -2,4 +2,5 @@ lpa
 compile_commands.json
 .cache
 report.pdf
-*.tar.gz
\ No newline at end of file
+*.tar.gz
+out.txt
\ No newline at end of file
diff --git a/assignments/03/Makefile b/assignments/03/Makefile
index 203e7a4..836ebdb 100644
--- a/assignments/03/Makefile
+++ b/assignments/03/Makefile
@@ -1,7 +1,7 @@
 .PHONY: run clean
 
-# CFLAGS += -O3
-CFLAGS += -DFMT_HEADER_ONLY -g
+CFLAGS += -O3
+# CFLAGS += -DFMT_HEADER_ONLY -g
 # LDFLAGS += $(shell pkg-config --libs fmt)
 
 lpa: lpa.cpp Makefile
diff --git a/assignments/03/bench.sh b/assignments/03/bench.sh
new file mode 100755
index 0000000..18f2039
--- /dev/null
+++ b/assignments/03/bench.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+for dataset in $(echo "1000.txt" "10000.txt" "100000.txt" "1000000.txt"); do
+  for processors in $(echo 1 2 4 8 16 | tr ' ' '\n'); do
+    # file="dataset/both_$dataset"
+    file="/export/scratch/CSCI5451_F23/assignment-3/dataset/$dataset"
+    echo $processors $file;
+    mpirun -n $processors ./lpa $file >> out.txt
+  done
+done
\ No newline at end of file
diff --git a/assignments/03/lpa.cpp b/assignments/03/lpa.cpp
index d330c57..0f3686e 100644
--- a/assignments/03/lpa.cpp
+++ b/assignments/03/lpa.cpp
@@ -97,6 +97,7 @@ int main(int argc, char **argv) {
   int num_my_edges;
   pair *my_edges;
   int counts[p], displs[p];
+
   if (rank == 0) {
     line = NULL;
     // pair all_edges[total_num_edges];
@@ -140,10 +141,16 @@ int main(int argc, char **argv) {
 
     // We have to send the last one again here, since it didn't get caught in
     // the loop above
-    MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
-             MPI_COMM_WORLD);
-    MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
-             TAG_SEND_EDGES, MPI_COMM_WORLD);
+    if (current_process == 0) {
+      num_my_edges = edge_counter;
+      my_edges = (pair *)calloc(num_my_edges, sizeof(pair));
+      memcpy(my_edges, all_edges.ptr, edge_counter * sizeof(pair));
+    } else {
+      MPI_Send(&edge_counter, 1, MPI_INT, current_process, TAG_SEND_NUM_EDGES,
+               MPI_COMM_WORLD);
+      MPI_Send(all_edges.ptr, edge_counter, IntPairType, current_process,
+               TAG_SEND_EDGES, MPI_COMM_WORLD);
+    }
 
     free(all_edges.ptr);
   } else {
@@ -161,6 +168,10 @@ int main(int argc, char **argv) {
   }
 #pragma endregion
 
+  if (rank == 0)
+    printf("Params: p=%d, |V|=%d, |E|=%d\n", p, total_num_nodes,
+           total_num_edges);
+
   // STEP 2 TIMER STARTS HERE
   MPI_Barrier(MPI_COMM_WORLD);
   double step_2_start_time;
@@ -228,7 +239,6 @@ int main(int argc, char **argv) {
   double step_5_start_time;
   if (rank == 0) {
     step_5_start_time = MPI_Wtime();
-    printf("STARTING STEP 5: %0.04fs\n", step_5_start_time - step_2_start_time);
   }
 
   // The processes perform the transfers of non-local labels and updates of
@@ -241,36 +251,52 @@
   std::vector<int> send_displs;
   std::vector<int> recv_counts;
   std::vector<int> recv_displs;
+  std::vector<int> recvbuf;
+  std::map<int, int> remote_labels;
 
-  int recv_total;
-  {
-    int offset = 0;
-    for (int i = 0; i < p; ++i) {
-      int count = send_map[i].size();
-      for (auto local_node : send_map[i]) {
-        sendbuf.push_back(
-            node_label_assignment_vec[local_node - my_node_range.fst]);
+  if (p > 1) {
+
+    int recv_total;
+    {
+      int offset = 0;
+      for (int i = 0; i < p; ++i) {
+        int count = send_map[i].size();
+        for (auto local_node : send_map[i]) {
+          sendbuf.push_back(
+              node_label_assignment_vec[local_node - my_node_range.fst]);
+        }
+        send_counts.push_back(count);
+        send_displs.push_back(offset);
+        offset += count;
      }
-      send_counts.push_back(count);
-      send_displs.push_back(offset);
-      offset += count;
+
+      offset = 0;
+      for (int i = 0; i < p; ++i) {
+        int count = recv_map[i].size();
+        recv_counts.push_back(count);
+        recv_displs.push_back(offset);
+        offset += count;
+      }
+      recv_total = offset;
    }
 
-    offset = 0;
+    recvbuf = std::vector<int>(recv_total, 0);
+    MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
+                  MPI_INT, recvbuf.data(), recv_counts.data(),
+                  recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
+
+    // Cache the received remote labels, keyed by global node id
    for (int i = 0; i < p; ++i) {
-      int count = recv_map[i].size();
-      recv_counts.push_back(count);
-      recv_displs.push_back(offset);
-      offset += count;
+      std::vector<int> processor_nodes(recv_map[i].begin(),
+                                       recv_map[i].end());
+      for (int j = 0; j < recv_counts[i]; ++j) {
+        int remote_node = processor_nodes[j];
+        int remote_value = recvbuf[recv_displs[i] + j];
+        remote_labels[remote_node] = remote_value;
+      }
    }
-    recv_total = offset;
  }
 
-  std::vector<int> recvbuf(recv_total, 0);
-  MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(),
-                MPI_INT, recvbuf.data(), recv_counts.data(),
-                recv_displs.data(), MPI_INT, MPI_COMM_WORLD);
-
  // For each local node, determine the minimum label out of its neighbors
  std::map<int, int> new_labels;
  for (int i = 0; i < num_my_nodes; ++i) {
@@ -281,10 +307,18 @@ int main(int argc, char **argv) {
    int min = current_value;
 
    for (auto neighbor : adj[node]) {
-      int neighbor_value = lookup_assignment(
-          node_label_assignment_vec, my_node_range, recv_map, recvbuf.data(),
-          recv_counts.data(), recv_displs.data(), each_num_nodes, rank,
-          neighbor);
+      int neighbor_value;
+      if (my_node_range.fst <= neighbor && neighbor < my_node_range.snd) {
+        neighbor_value =
+            node_label_assignment_vec[neighbor - my_node_range.fst];
+      } else {
+        neighbor_value = remote_labels[neighbor];
+      }
+
+      // = lookup_assignment(
+      //     node_label_assignment_vec, my_node_range, recv_map,
+      //     recvbuf.data(), recv_counts.data(), recv_displs.data(),
+      //     each_num_nodes, rank, neighbor);
 
      min = MIN(min, neighbor_value);
    }
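The step-5 rewrite above replaces the per-neighbor `lookup_assignment` helper with a single `MPI_Alltoallv` exchange followed by a `std::map` cache of remote labels. The sketch below isolates just the count/displacement bookkeeping that `MPI_Alltoallv` needs; it is a minimal standalone toy (each rank sends `i + 1` copies of its own rank id to rank `i`), not the assignment code itself.

```cpp
// Toy illustration of the MPI_Alltoallv pattern used in step 5: pack one
// contiguous send buffer, describe it with per-destination counts and
// displacements, and exchange everything in a single collective call.
#include <mpi.h>
#include <numeric>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, p;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &p);

  // Each rank sends (i + 1) values to rank i, so every rank receives
  // (rank + 1) values from each of the p sources.
  std::vector<int> send_counts(p), send_displs(p), recv_counts(p), recv_displs(p);
  for (int i = 0; i < p; ++i) send_counts[i] = i + 1;
  for (int i = 0; i < p; ++i) recv_counts[i] = rank + 1;

  // Displacements are exclusive prefix sums of the counts (C++17 <numeric>).
  std::exclusive_scan(send_counts.begin(), send_counts.end(), send_displs.begin(), 0);
  std::exclusive_scan(recv_counts.begin(), recv_counts.end(), recv_displs.begin(), 0);

  std::vector<int> sendbuf(send_displs[p - 1] + send_counts[p - 1], rank);
  std::vector<int> recvbuf(recv_displs[p - 1] + recv_counts[p - 1], 0);

  // One collective replaces p point-to-point sends per rank; slice i of
  // recvbuf (starting at recv_displs[i]) holds the values sent by rank i.
  MPI_Alltoallv(sendbuf.data(), send_counts.data(), send_displs.data(), MPI_INT,
                recvbuf.data(), recv_counts.data(), recv_displs.data(), MPI_INT,
                MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}
```

Reading slice `i` back out of `recvbuf` via `recv_displs[i]` is exactly how the `remote_labels` cache is filled in the diff above.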
diff --git a/assignments/03/report.typ b/assignments/03/report.typ
index e69de29..c37c9e1 100644
--- a/assignments/03/report.typ
+++ b/assignments/03/report.typ
@@ -0,0 +1,17 @@
+== Steps 2-4
+
+For steps 2-4, I computed each process's outgoing nodes, sorted them, and
+used each node's sorted position to identify which nodes are being sent.
+
+This saves an extra communication round and lets me index the same items on
+every loop iteration.
+
+== Step 5
+
+I exchanged data using the unstructured communication approach, doing an
+all-to-all transfer.
+
+To read the result efficiently, I first tried the approach given in the
+slides. However, this took a long time (up to 45 seconds for the 10,000
+case), so I switched to STL's `std::map`, which proved to be orders of
+magnitude faster.
\ No newline at end of file
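A note on the step-5 lookup the report describes: once `remote_labels` is built, resolving a neighbor's label is a branch between a dense local vector and the map. The fragment below mirrors that branch; the names (`range`, `remote_labels`) follow the diff, but it is an illustrative sketch rather than the assignment code.

```cpp
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Labels of locally-owned nodes live in a dense vector indexed by
// (node - range.first); labels fetched from other ranks are staged in a map.
int lookup_label(const std::vector<int> &local_labels,
                 const std::pair<int, int> &range, // [first, second) owned here
                 const std::map<int, int> &remote_labels, int node) {
  if (range.first <= node && node < range.second)
    return local_labels[node - range.first]; // O(1) local hit
  return remote_labels.at(node);             // O(log k) over k remote nodes
}

int main() {
  std::vector<int> local_labels = {7, 3, 9}; // this rank owns nodes [4, 7)
  std::map<int, int> remote_labels = {{0, 1}, {12, 5}};
  std::printf("%d %d\n", lookup_label(local_labels, {4, 7}, remote_labels, 5),
              lookup_label(local_labels, {4, 7}, remote_labels, 12)); // 3 5
  return 0;
}
```

Building the map once per round and paying one O(log k) find per neighbor, instead of re-deriving each remote node's position in the receive buffer, is presumably where the reported order-of-magnitude speedup over the slides' scheme comes from.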