diff --git a/assignments/04/km_cuda.cu b/assignments/04/km_cuda.cu
index 938e9f7..2cd0c23 100644
--- a/assignments/04/km_cuda.cu
+++ b/assignments/04/km_cuda.cu
@@ -112,7 +112,7 @@ int main(int argc, char **argv) {
     // Allocate memory on the GPU
     CUDACHECK(
         cudaMalloc((void **)&centroids, num_clusters * dim * sizeof(float)));
-    CUDACHECK(cudaMalloc((void **)&clusterMap, N * sizeof(int)));
+    CUDACHECK(cudaMallocManaged((void **)&clusterMap, N * sizeof(int)));
     CUDACHECK(cudaMallocManaged((void **)&clusterCount,
                                 num_clusters * sizeof(unsigned int)));
     CUDACHECK(cudaMalloc((void **)&data, N * dim * sizeof(float)));
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
   printf("Is dirty: %d\n", *dirtyBit);
 #pragma endregion
 
-#pragma region
+#pragma region Iteration
   int it = 0;
   while (*dirtyBit) {
     printf("Iteration %d (dirty=%d)\n", it, *dirtyBit);
@@ -205,5 +205,58 @@ int main(int argc, char **argv) {
   }
 #pragma endregion
 
+#pragma region Output
+  // Write the final clustering to disk: cluster assignments go to clusters.txt
+  // and centroid coordinates go to centroids.txt.
+
+  // clusterMap is managed memory that the host reads directly below. Wait for
+  // outstanding GPU work first so those reads are valid; this also surfaces any
+  // asynchronous kernel error. Cheap if the loop above already synchronized.
+  CUDACHECK(cudaDeviceSynchronize());
+
+  {
+    // One cluster index per line, in point order.
+    FILE *fp = fopen("clusters.txt", "w");
+    if (fp == NULL) {
+      perror("clusters.txt");
+      return 1;
+    }
+    for (int i = 0; i < N; ++i)
+      fprintf(fp, "%d\n", clusterMap[i]);
+    fclose(fp);
+  }
+
+  {
+    // Fetch every centroid in a single device-to-host transfer rather than one
+    // cudaMemcpy per row; centroids holds num_clusters * dim floats.
+    const size_t count = (size_t)num_clusters * dim;
+    float *host = (float *)malloc(count * sizeof(float));
+    if (host == NULL && count > 0) {
+      perror("malloc");
+      return 1;
+    }
+    CUDACHECK(cudaMemcpy(host, centroids, count * sizeof(float),
+                         cudaMemcpyDeviceToHost));
+
+    // Header line "<num_clusters> <dim>", then one centroid per line.
+    FILE *fp = fopen("centroids.txt", "w");
+    if (fp == NULL) {
+      perror("centroids.txt");
+      free(host);
+      return 1;
+    }
+    fprintf(fp, "%d %d\n", num_clusters, dim);
+    for (int i = 0; i < num_clusters; ++i) {
+      for (int d = 0; d < dim; ++d)
+        fprintf(fp, "%.3f ", host[(size_t)i * dim + d]);
+      fprintf(fp, "\n");
+    }
+    fclose(fp);
+    free(host);
+  }
+
+  printf("Done.\n");
+#pragma endregion
+
   return 0;
 }