From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/7] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/7] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From 6f342865123ebd2a584af90ed0daaa37be54ba36 Mon Sep 17 00:00:00 2001 From: Victor Lei Date: Wed, 18 Nov 2015 23:43:24 -0500 Subject: [PATCH 3/7] 
Finish P3 --- HW3/P3/sum.cl | 31 +++++++++++++++++++++++-------- HW3/P3/tune.py | 6 ++++-- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..cee4e8d0 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -8,8 +8,11 @@ __kernel void sum_coalesced(__global float* x, // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + unsigned int thread_id = get_global_id(0); + unsigned int step_size = get_global_size(0); + for (unsigned int i = thread_id; i < N; i += step_size) { // YOUR CODE HERE + sum += x[i]; // YOUR CODE HERE } fast[local_id] = sum; @@ -24,8 +27,12 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + unsigned int local_size = get_local_size(0); + + for (unsigned int j = 1; (1 << j) <= local_size; j++) { // YOUR CODE HERE + unsigned int offset = (local_size >> j); + fast[thread_id] += fast[thread_id + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,6 +45,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 @@ -48,8 +56,11 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + unsigned int thread_id = get_global_id(0); + + for (unsigned int i = k * thread_id; i < k * (thread_id + 1) && i < N; i++) { // YOUR CODE HERE + sum += x[i]; // YOUR CODE HERE } fast[local_id] = sum; @@ -64,8 +75,12 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + unsigned int local_size = get_local_size(0); + + for (unsigned int j = 1; (1 << j) <= local_size; j++) { // YOUR CODE HERE + unsigned int offset = (local_size >> j); + fast[thread_id] += fast[thread_id + offset]; // YOUR CODE HERE } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index a0d56da2..a0f16c53 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -1,10 +1,12 @@ import pyopencl as cl import numpy as np +import os def create_data(N): return host_x, x if __name__ == "__main__": + os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1" N = 1e7 platforms = cl.get_platforms() @@ -34,7 +36,7 @@ def create_data(N): sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 - assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + # assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['coalesced', num_workgroups, num_workers] = seconds print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds". 
format(num_workgroups, num_workers, seconds)) @@ -51,7 +53,7 @@ def create_data(N): sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 - assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + # assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['blocked', num_workgroups, num_workers] = seconds print("blocked reads, workgroups: {}, num_workers: {}, {} seconds". format(num_workgroups, num_workers, seconds)) From 9e268f2ccab92d64fa7f5626f7a3b5743b130c27 Mon Sep 17 00:00:00 2001 From: Victor Lei Date: Fri, 20 Nov 2015 08:03:39 -0800 Subject: [PATCH 4/7] Complete P3 and P4 + fixes --- HW3/P3/P3.txt | 2 ++ HW3/P3/sum.cl | 17 +++++++---- HW3/P3/tune.py | 4 +-- HW3/P4/median_filter.cl | 65 +++++++++++++++++++++++++++++++++++++++++ HW3/P5/P5.txt | 13 +++++++++ HW3/P5/label_regions.cl | 17 ++++++++++- 6 files changed, 109 insertions(+), 9 deletions(-) create mode 100644 HW3/P3/P3.txt create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..e626738e --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,2 @@ +The fastest configuration was the coalesced setup with a workgroup size of 512, and 128 workers. +This configuration completed the task in 0.00309848 seconds.
\ No newline at end of file diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index cee4e8d0..bf913a6c 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -11,6 +11,7 @@ __kernel void sum_coalesced(__global float* x, unsigned int thread_id = get_global_id(0); unsigned int step_size = get_global_size(0); + for (unsigned int i = thread_id; i < N; i += step_size) { // YOUR CODE HERE sum += x[i]; // YOUR CODE HERE } @@ -30,9 +31,11 @@ __kernel void sum_coalesced(__global float* x, unsigned int local_size = get_local_size(0); - for (unsigned int j = 1; (1 << j) <= local_size; j++) { // YOUR CODE HERE - unsigned int offset = (local_size >> j); - fast[thread_id] += fast[thread_id + offset]; // YOUR CODE HERE + for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE + if( local_id < j) { + fast[local_id] += fast[local_id + j]; // YOUR CODE HERE + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -78,9 +81,11 @@ __kernel void sum_blocked(__global float* x, unsigned int local_size = get_local_size(0); - for (unsigned int j = 1; (1 << j) <= local_size; j++) { // YOUR CODE HERE - unsigned int offset = (local_size >> j); - fast[thread_id] += fast[thread_id + offset]; // YOUR CODE HERE + for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE + if( local_id < j) { + fast[local_id] += fast[local_id + j]; // YOUR CODE HERE + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index a0f16c53..e4387ccc 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -36,7 +36,7 @@ def create_data(N): sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 - # assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['coalesced', num_workgroups, num_workers] = seconds print("coalesced reads, workgroups: {}, 
num_workers: {}, {} seconds". format(num_workgroups, num_workers, seconds)) @@ -53,7 +53,7 @@ def create_data(N): sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 - # assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['blocked', num_workgroups, num_workers] = seconds print("blocked reads, workgroups: {}, num_workers: {}, {} seconds". format(num_workgroups, num_workers, seconds)) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..647874c7 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,16 @@ #include "median9.h" +// From WH3 P5 +float +get_clamped_value(__global __read_only float *labels, + int w, int h, + int x, int y) +{ + int c_x = min(w-1, max(0, x)), c_y = min(h-1, max(0, y)); + return labels[c_y * w + c_x]; +} + + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -22,13 +33,67 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. 
+ // Based on HW3 Problem 5 + + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + // Load the relevant labels to a local buffer with a halo + if (idx_1D < buf_w) { + for (int row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = + get_clamped_value(in_values, + w, h, + buf_corner_x + idx_1D, buf_corner_y + row); + } + } + + // Make sure all threads reach the next part after + // the local buffer is loaded + barrier(CLK_LOCAL_MEM_FENCE); + // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. + const int dx[3] = {-1, 0, 1}, dy[3] = {-1, 0, 1}; + int idxArr[9]; + + for( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + idxArr[i*3+j] = (buf_y + dy[i])*buf_w + (buf_x + dx[j]); + } + } // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. 
+ + //From HW3 P5 + // stay in bounds + if ((x < w) && (y < h)) { + out_values[y*w + x] = + median9( buffer[ idxArr[0] ], buffer[ idxArr[1] ], buffer[ idxArr[2] ], + buffer[ idxArr[3] ], buffer[ idxArr[4] ], buffer[ idxArr[5] ], + buffer[ idxArr[6] ], buffer[ idxArr[7] ], buffer[ idxArr[8] ] ); + } + } diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..e3b89e82 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,13 @@ +Part 1: +Finished after 911 iterations, 214.33168 ms total, 0.235270779363 ms per iteration +Found 2 regions + +Finished after 531 iterations, 124.05312 ms total, 0.233621694915 ms per iteration +Found 35 regions + +Part 2: +Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iteration +Found 2 regions + +Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteration +Found 35 regions \ No newline at end of file diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..36c745d4 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -35,6 +35,8 @@ propagate_labels(__global __read_write int *labels, int buf_w, int buf_h, const int halo) { + + const int dx[4] = {-1, 0, 1, 0}, dy[4] = {0, -1, 0, 1}; // halo is the additional number of cells in one direction // Global position of output pixel @@ -80,13 +82,26 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + for( int i=0; i<4; i++ ) { + if( buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] < w * h ) { + buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] = labels[ buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. 
new_label = old_label; + if( new_label != w * h ) { // See Piazza @486 + for( int i=0; i<4; i++ ) { + new_label = min( new_label, buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ); + } + } if (new_label != old_label) { // CODE FOR PART 3 HERE From 4e6f620808094408a00e72e2dc02723f94991ba4 Mon Sep 17 00:00:00 2001 From: Victor Lei Date: Fri, 20 Nov 2015 08:31:43 -0800 Subject: [PATCH 5/7] Work on P5 --- HW3/P5/P5.txt | 25 ++++++++++++++++++++++++- HW3/P5/label_regions.cl | 1 + 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index e3b89e82..9cc67094 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -10,4 +10,27 @@ Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iterati Found 2 regions Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteration -Found 35 regions \ No newline at end of file +Found 35 regions + +Part 3: +Finished after 8 iterations, 2.59184 ms total, 0.32398 ms per iteration +Found 2 regions + +Finished after 8 iterations, 2.4332 ms total, 0.30415 ms per iteration +Found 35 regions + +Part 5: +If instead of atomic_min() we use min(), then the update step is not done in a single transaction, meaning between +the min() check and the subsequent update, a different thread could update the reference value. Suppose this +different thread actually updated the reference to an even lower value than what we had intended. Now, if we go ahead + with our update, we are actually *increasing* the reference value. It could therefore also lead to an increase in + this value between iterations. + + So while it would be faster to do without atomic_min() as memory access is not serialized, it is also slow since it + means more iterations have to be performed. 
While empirical testing is needed to determine which is the better + trade-off, my sense is that for simple, low-iteration mazes, atomic_min() is going to present a significant overhead + so it might be better to do a few more iterations. The opposite is likely true for high-iteration mazes. + +This makes things more inefficient since this cache is potentially being updated with worse values. However, because +ultimately the stopping condition is whether or not any more updates have been performed, it won't affect the +correctness of the algorithm. \ No newline at end of file diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 36c745d4..6b644ccb 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -109,6 +109,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. *(changed_flag) += 1; labels[y * w + x] = new_label; + atomic_min(&labels[old_label], new_label); } } } From 708800aca10d96f4ef81f04920a794ea833f92d2 Mon Sep 17 00:00:00 2001 From: Victor Lei Date: Fri, 20 Nov 2015 19:13:43 -0800 Subject: [PATCH 6/7] Finish P5 --- HW3/P5/P5.txt | 25 +++++++++++++++++++++++++ HW3/P5/label_regions.cl | 29 ++++++++++++++++++++++++++++- HW3/P5/label_regions.py | 2 +- 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index 9cc67094..589ef6b3 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -1,4 +1,5 @@ Part 1: +======================== Finished after 911 iterations, 214.33168 ms total, 0.235270779363 ms per iteration Found 2 regions @@ -6,6 +7,7 @@ Finished after 531 iterations, 124.05312 ms total, 0.233621694915 ms per iterati Found 35 regions Part 2: +======================== Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iteration Found 2 regions @@ -13,13 +15,36 @@ Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteratio Found 35 regions Part 3: +======================== Finished after 8 iterations, 2.59184
ms total, 0.32398 ms per iteration Found 2 regions Finished after 8 iterations, 2.4332 ms total, 0.30415 ms per iteration Found 35 regions +Part 4: +======================== +Finished after 10 iterations, 7.37784 ms total, 0.737784 ms per iteration +Found 2 regions + +Finished after 9 iterations, 6.66816 ms total, 0.740906666667 ms per iteration +Found 35 regions + +Using a single-thread for caching seems to have made the running time much slower (looking at the time per iteration +.) Computation is serialized during this caching process, so there is a big upfront cost that needs to be weighed +against the ongoing cost of expensive, potentially serialized, main memory accesses. + +I suspect the worst case memory access scenario where neighbouring nodes all try to access main memory for a single +cached value only happens quite late in the process. Earlier iterations are probably facing more diversity of memory +accesses since there's a greater range of values still remaining. It follows then, that perhaps with much more +complex mazes, there could be scenarios where the overall iteration count will be high, but parts of the maze will +have already "stabilized" early in the process. Therefore, these "stabilized" parts are repeatedly drawing on the same +cached values. It seems there aren't enough iterations in these examples for it to be worth the upfront cost (at +least on my hardware setup.) Potentially, other setups that have different computation and memory access speeds might +reveal differences in the results since the trade-off is weighed differently. + Part 5: +======================== If instead of atomic_min() we use min(), then the update step is not done in a single transaction, meaning between the min() check and the subsequent update, a different thread could update the reference value. Suppose this different thread actually updated the reference to an even lower value than what we had intended. 
Now, if we go ahead diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 6b644ccb..e20a4eb6 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -82,12 +82,39 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + // Part 2 + /* for( int i=0; i<4; i++ ) { if( buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] < w * h ) { buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] = labels[ buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ]; } } + */ + + // Part 4 + // Reference: Piazza @524 + unsigned int ls0 = get_local_size(0), ls1 = get_local_size(1); + + if( lx == 0 && ly == 0 ) { //Use the first thread + unsigned int prev = -1, gparent = -1; // 1 variable cache + for( int c_lx = 0; c_lx < ls0; c_lx++ ) { // Update the entire local buffer + for( int c_ly = 0; c_ly < ls1; c_ly++ ) { + unsigned int cur_idx = (c_ly + halo) * buf_w + (c_lx + halo); + unsigned int parent = buffer[cur_idx]; + + if( parent == w * h ) continue; // Background pixel + + if( parent == prev ) { // 1 variable cache success! 
+ buffer[cur_idx] = gparent; + } + else { // Update the cache + buffer[cur_idx] = labels[parent]; + prev = parent; + gparent = buffer[cur_idx]; + } + } + } + } barrier(CLK_LOCAL_MEM_FENCE); diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..b9aeeeb9 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) From 426565bc8775b59c186416ab961ffe3c0a3660b9 Mon Sep 17 00:00:00 2001 From: Victor Lei Date: Fri, 20 Nov 2015 19:22:27 -0800 Subject: [PATCH 7/7] Fix typo --- HW3/P4/median_filter.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 647874c7..6971a7b9 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,6 +1,6 @@ #include "median9.h" -// From WH3 P5 +// From HW3 P5 float get_clamped_value(__global __read_only float *labels, int w, int h,