diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..83c3d697 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -9,11 +9,27 @@ mandelbrot(__global __read_only float *coords_real,
     const int y = get_global_id(1);
 
     float c_real, c_imag;
-    float z_real, z_imag;
-    int iter;
+    float z_real, z_imag, z_imag_sq, z_real_sq;
+    int iter, offset;
 
     if ((x < w) && (y < h)) {
         // YOUR CODE HERE
-        ;
+        offset = y * w + x;
+        // Get complex number c
+        c_real = coords_real[offset];
+        c_imag = coords_imag[offset];
+        // Initialize z
+        z_real = 0.0;
+        z_imag = 0.0;
+        for (iter = 0; iter < max_iter; iter++) {
+            z_real_sq = z_real * z_real;
+            z_imag_sq = z_imag * z_imag;
+            if (z_real_sq + z_imag_sq > 4) {
+                break;
+            }
+            z_imag = 2 * z_real * z_imag + c_imag;
+            z_real = z_real_sq - z_imag_sq + c_real;
+        }
+        out_counts[offset] = iter;
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..7317a18b
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,85 @@
+coalesced reads, workgroups: 8, num_workers: 4, 0.13523816 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.05002128 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.02228568 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.01243688 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.00679152 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.00387344 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.05827352 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.02823304 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.01497992 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.00788968 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00434872 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.0030288 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.02866544 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.01335272 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.007186 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.00406992 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.00277344 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00296696 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.03042336 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.01716112 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.01011488 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00509712 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.0039112 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.00386304 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.03748808 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.01877584 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.01011024 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.00510552 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00361184 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.00353344 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.04078736 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.01956208 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.01072384 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00538184 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.00410528 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.00426576 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.04820448 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.02555352 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.01377392 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00804696 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00533144 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.00534184 seconds
+blocked reads, workgroups: 8, num_workers: 4, 0.18848704 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.0736176 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.04906264 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.02473608 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.01126728 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.02127384 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.06236696 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.03539944 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.02401944 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.01006624 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.01824392 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.0521756 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.02761504 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.01609232 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.01000312 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.01815032 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.04871368 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.06726688 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.02830136 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.01547568 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.0123616 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.02625976 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.08451304 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.08008256 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.02486112 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.0149828 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01152056 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.0220164 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.06941248 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.05325168 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.02504432 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.01262448 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.00845736 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.02512984 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.05071976 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.03916664 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.02529368 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.01537016 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.0116124 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03006704 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04359768 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.0251344 seconds
+configuration ('coalesced', 32, 64): 0.00277344 seconds <-- *** BEST ***
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..66fd3af7 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -1,72 +1,91 @@
-__kernel void sum_coalesced(__global float* x,
-                            __global float* partial,
-                            __local float* fast,
-                            long N)
-{
-    float sum = 0;
-    size_t local_id = get_local_id(0);
-
-    // thread i (i.e., with i = get_global_id()) should add x[i],
-    // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    fast[local_id] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // binary reduction
-    //
-    // thread i should sum fast[i] and fast[i + offset] and store back
-    // in fast[i], for offset = (local_size >> j) for j from 1 to
-    // log_2(local_size)
-    //
-    // You can assume get_local_size(0) is a power of 2.
-    //
-    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
-}
-
-__kernel void sum_blocked(__global float* x,
-                          __global float* partial,
-                          __local float* fast,
-                          long N)
-{
-    float sum = 0;
-    size_t local_id = get_local_id(0);
-    int k = ceil((float)N / get_global_size(0));
-
-    // thread with global_id 0 should add 0..k-1
-    // thread with global_id 1 should add k..2k-1
-    // thread with global_id 2 should add 2k..3k-1
-    // ...
-    // with k = ceil(N / get_global_size()).
-    //
-    // Be careful that each thread stays in bounds, both relative to
-    // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    fast[local_id] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // binary reduction
-    //
-    // thread i should sum fast[i] and fast[i + offset] and store back
-    // in fast[i], for offset = (local_size >> j) for j from 1 to
-    // log_2(local_size)
-    //
-    // You can assume get_local_size(0) is a power of 2.
-    //
-    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
-}
+__kernel void sum_coalesced(__global float* x,
+                            __global float* partial,
+                            __local float* fast,
+                            long N)
+{
+    float sum = 0;
+    int offset;
+    uint j; // unsigned so that j can be compared with, and added to, the size_t local_id
+    int global_size = get_global_size(0);
+    uint local_size = get_local_size(0);
+    size_t local_id = get_local_id(0);
+    int i = get_global_id(0);
+    // thread i (i.e., with i = get_global_id()) should add x[i],
+    // x[i + get_global_size()], ... up to N-1, and store in sum.
+    // get thread id
+    for (offset = i; offset < N; offset += global_size) {
+        sum += x[offset];
+    }
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // binary reduction
+    //
+    // thread i should sum fast[i] and fast[i + offset] and store back
+    // in fast[i], for offset = (local_size >> j) for j from 1 to
+    // log_2(local_size)
+    //
+    // You can assume get_local_size(0) is a power of 2.
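+    //
+    // (Worked example, assuming local_size = 8: the loop below uses offsets 4, 2, 1;
+    //  first fast[0..3] += fast[4..7], then fast[0..1] += fast[2..3], then
+    //  fast[0] += fast[1], leaving the work-group's total in fast[0].)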
+    //
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    for (j = local_size >> 1; j > 0; j >>= 1) {
+        // only threads with local_id < j are active, so the new sum is stored
+        // at the lesser of the two indexes
+        if (local_id < j) {
+            fast[local_id] += fast[local_id + j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
+
+__kernel void sum_blocked(__global float* x,
+                          __global float* partial,
+                          __local float* fast,
+                          long N)
+{
+    float sum = 0;
+    size_t local_id = get_local_id(0);
+    int k = ceil((float)N / get_global_size(0));
+    int offset;
+    uint j; // unsigned so that j can be compared with, and added to, the size_t local_id
+    uint local_size = get_local_size(0);
+    int i = get_global_id(0);
+    // thread with global_id 0 should add 0..k-1
+    // thread with global_id 1 should add k..2k-1
+    // thread with global_id 2 should add 2k..3k-1
+    // ...
+    // with k = ceil(N / get_global_size()).
+    //
+    // Be careful that each thread stays in bounds, both relative to
+    // size of x (i.e., N), and the range it's assigned to sum.
+    offset = k * i;
+    int max_offset = k * i + k; // one past the last offset thread i should read from
+    while ((offset < max_offset) && (offset < N)) {
+        sum += x[offset];
+        offset++;
+    }
+
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // binary reduction
+    //
+    // thread i should sum fast[i] and fast[i + offset] and store back
+    // in fast[i], for offset = (local_size >> j) for j from 1 to
+    // log_2(local_size)
+    //
+    // You can assume get_local_size(0) is a power of 2.
+    //
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    for (j = local_size >> 1; j > 0; j >>= 1) {
+        // only threads with local_id < j are active, so the new sum is stored
+        // at the lesser of the two indexes
+        if (local_id < j) {
+            fast[local_id] += fast[local_id + j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index a0d56da2..79b8e51f 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -1,3 +1,5 @@
+import os
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 import pyopencl as cl
 import numpy as np
 
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..1e7fbd90 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,30 @@
 #include "median9.h"
 
+// ADAPTED FROM HW3 P5 label_regions.cl
+float get_clamped_value(__global __read_only float *in_values, int w, int h, int x, int y) {
+    // if x or y is out of bounds, clamp to the coordinates of the closest valid pixel
+    int corrected_x;
+    int corrected_y;
+    // get correct x coord
+    if (x < 0) {
+        corrected_x = 0;
+    } else if (x >= w) {
+        corrected_x = w - 1;
+    } else {
+        corrected_x = x;
+    }
+    // get correct y coord
+    if (y < 0) {
+        corrected_y = 0;
+    } else if (y >= h) {
+        corrected_y = h - 1;
+    } else {
+        corrected_y = y;
+    }
+    // return pixel value
+    return in_values[corrected_y * w + corrected_x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -31,4 +56,54 @@ median_3x3(__global __read_only float *in_values,
 
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    //ADAPTED FROM https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+    // Read pixel values and store in buffer
+    if (idx_1D < buf_w)
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = \
+                get_clamped_value(in_values,
+                                  w, h,
+                                  buf_corner_x + idx_1D,
+                                  buf_corner_y + row);
+        }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write output
+    if ((y < h) && (x < w)) // stay in bounds
+        // Calculate 3x3 median
+        out_values[y * w + x] = \
+            median9(buffer[(buf_y - 1) * buf_w + buf_x - 1],
+                    buffer[(buf_y - 1) * buf_w + buf_x],
+                    buffer[(buf_y - 1) * buf_w + buf_x + 1],
+                    buffer[buf_y * buf_w + buf_x - 1],
+                    buffer[buf_y * buf_w + buf_x],
+                    buffer[buf_y * buf_w + buf_x + 1],
+                    buffer[(buf_y + 1) * buf_w + buf_x - 1],
+                    buffer[(buf_y + 1) * buf_w + buf_x],
+                    buffer[(buf_y + 1) * buf_w + buf_x + 1]);
+
 }
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..554b5883
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,36 @@
+Part 1:
+    Maze 1:
+        Finished after 915 iterations, 694.465062 ms total, 0.758978209836 ms per iteration
+        Found 2 regions
+    Maze 2:
+        Finished after 532 iterations, 404.654829 ms total, 0.76062937782 ms per iteration
+        Found 35 regions
+Part 2:
+    Maze 1:
+        Finished after 529 iterations, 841.660941 ms total, 1.59104147637 ms per iteration
+        Found 2 regions
+    Maze 2:
+        Finished after 276 iterations, 441.299037 ms total, 1.59890955435 ms per iteration
+        Found 35 regions
+Part 3:
+    Maze 1:
+        Finished after 529 iterations, 843.47568 ms total, 1.59447198488 ms per iteration
+        Found 2 regions
+    Maze 2:
+        Finished after 273 iterations, 431.994906 ms total, 1.58239892308 ms per iteration
+        Found 35 regions
+Part 4:
+    Maze 1:
+        Finished after 529 iterations, 2666.87046 ms total, 5.04134302457 ms per iteration
+        Found 2 regions
+    Maze 2:
+        Finished after 272 iterations, 1375.530648 ms total, 5.05709797059 ms per iteration
+        Found 35 regions
+
+    Explanation: By using a single thread to fetch grandparents, we perform fewer global memory reads, because the single thread can reuse the last value it fetched whenever consecutive buffer entries share the same grandparent instead of issuing a repeated read. However, region labeling takes much longer to run because the global memory reads are no longer performed in parallel. Hence, we trade speed for a smaller number of global memory reads. The single-thread approach would be a good idea when global memory is under heavy use by other processes, since it reduces the load on global memory.
+
+Part 5:
+    If we do not use atomic operations, we will still get a correct answer, but it will take longer. We are guaranteed a correct answer because in a single iteration, we only store new_value if new_value is less than old_value. Even if multiple threads write to labels[old_value], labels[old_value] will still be smaller at the end of the iteration than the original labels[old_value] was at the beginning of the iteration.
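+
+    A minimal sketch of the two update paths discussed above (new_value and old_value are the names used in this discussion; atomic_min is the call used in propagate_labels):
+        // non-atomic: the comparison and the store can interleave with other threads
+        if (new_value < labels[old_value]) labels[old_value] = new_value;
+        // atomic: the minimum is computed and stored as one indivisible operation
+        atomic_min(&labels[old_value], new_value);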
+    We use a single thread to fetch grandparents in Part 4 to reduce the number of repeated fetches, which implies that multiple pixels sometimes share the same grandparent. Multiple threads' pixels can have the same grandparents since grandparents quickly permeate across columns, so overwriting is possible. All atomic operations do is guarantee the min operation (i.e. we always finish an iteration with the minimum possible new_value, for a given old_value, across all threads). We are not guaranteed this without atomic operations.
+
+    Hence, using non-atomic operations can sometimes require more iterations to complete region labeling. Suppose two threads update the same labels[old_value] with new_value1 and new_value2, respectively, where new_value1 > new_value2. Thread 1 runs first and computes min(new_value1, old_value), but is interrupted before storing the result. Thread 2 then computes min(new_value2, old_value) and stores it. When thread 1 resumes, it stores new_value1 in labels[old_value]. Hence, labels[old_value] is not the smallest value it could have been with atomic operations, and it will take additional time and iterations to lower labels[old_value].
\ No newline at end of file
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..1cff2ec9 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -17,6 +17,16 @@ initialize_labels(__global __read_only int *image,
     }
 }
 
+int min5(int x1, int x2, int x3, int x4, int x5) {
+    // Returns the minimum of the five integer parameters
+    int min_value = x1;
+    if (min_value > x2) min_value = x2;
+    if (min_value > x3) min_value = x3;
+    if (min_value > x4) min_value = x4;
+    if (min_value > x5) min_value = x5;
+    return min_value;
+}
+
 int
 get_clamped_value(__global __read_only int *labels,
                   int w, int h,
@@ -44,6 +54,7 @@ propagate_labels(__global __read_write int *labels,
     // Local position relative to (0, 0) in workgroup
     const int lx = get_local_id(0);
     const int ly = get_local_id(1);
+    const int local_size = get_local_size(0);
 
     // coordinates of the upper left corner of the buffer in image
     // space, including halo
@@ -55,15 +66,16 @@ propagate_labels(__global __read_write int *labels,
     const int buf_y = ly + halo;
 
     // 1D index of thread within our work-group
-    const int idx_1D = ly * get_local_size(0) + lx;
+    const int idx_1D = ly * local_size + lx;
 
     int old_label;
     // Will store the output value
     int new_label;
+    int row;
 
     // Load the relevant labels to a local buffer with a halo
     if (idx_1D < buf_w) {
-        for (int row = 0; row < buf_h; row++) {
+        for (row = 0; row < buf_h; row++) {
             buffer[row * buf_w + idx_1D] =
                 get_clamped_value(labels,
                                   w, h,
@@ -73,27 +85,73 @@ propagate_labels(__global __read_write int *labels,
 
     // Make sure all threads reach the next part after
    // the local buffer is loaded
-    barrier(CLK_LOCAL_MEM_FENCE);
+    //barrier(CLK_LOCAL_MEM_FENCE);
 
     // Fetch the value from the buffer the corresponds to
     // the pixel for this thread
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+    /* Fetch grandparents
+       (i.e. store labels[buffer[offset]] in buffer[offset]) */
+    // global offset, global column, global row, and buffer offset to be used to fetch grandparents
+    int g_offset, g_col, g_row, b_offset;
+    int last_g_offset = -1;
+    int last_b_offset = -1;
+
+    // thread 0 performs all reads to populate buffer
+    if (idx_1D == 0) {
+        for (row = 0; row < buf_h; row++) {
+            for (int idx = 0; idx < buf_w; idx++){
+                // offset in buffer
+                b_offset = row * buf_w + idx;
+                // get global offset for grandparent
+                g_offset = buffer[b_offset];
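+                // when consecutive entries share the same grandparent, reuse the
+                // value just fetched instead of issuing another global read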
store labels[buffer[offset]] in buffer[offset]) + */ + // global offset, global column, global row, and buffer offset to be used to fetch grandparents + int g_offset, g_col, g_row, b_offset; + int last_g_offset = -1; + int last_b_offset = -1; + + // thread 0 performs all reads to populate buffer + if (idx_1D == 0) { + for (row = 0; row < buf_h; row++) { + for (int idx = 0; idx < buf_w; idx++){ + // offset in buffer + b_offset = row * buf_w + idx; + // get global offset for grandparent + g_offset = buffer[b_offset]; + if (last_g_offset == g_offset) { + buffer[b_offset] = buffer[last_b_offset]; + } else { + // calculate parameters for get_clamped_value + // row within global labels + g_row = g_offset / w; + // column within global labels + g_col = g_offset % w; + // fetch grandparent + buffer[b_offset] = get_clamped_value(labels, w, h, g_col, g_row); + last_g_offset = g_offset; + last_b_offset = b_offset; + } + } + } + } + // Make sure all threads reach the next part after + // the grandparents are fetched + barrier(CLK_LOCAL_MEM_FENCE); // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE - // We set new_label to the value of old_label, but you will need - // to adjust this for correctness. - new_label = old_label; + // Set new_label as the minimum of old_label and the 4 pixels + // adjacent to x,y + + // check if x,y is foreground pixel + if (old_label < w * h) + new_label = min5(old_label, + buffer[(buf_y - 1) * buf_w + buf_x], + buffer[(buf_y + 1) * buf_w + buf_x], + buffer[buf_y * buf_w + buf_x - 1], + buffer[buf_y * buf_w + buf_x + 1]); - if (new_label != old_label) { - // CODE FOR PART 3 HERE - // indicate there was a change this iteration. - // multiple threads might write this. - *(changed_flag) += 1; - labels[y * w + x] = new_label; - } + if (new_label < old_label) { + // CODE FOR PART 3 HERE + // indicate there was a change this iteration. + // multiple threads might write this. + *(changed_flag) += 1; + atomic_min(&labels[y * w + x], new_label); + if ((0 <= buf_corner_y + buf_y) && (buf_corner_y + buf_y < h) && (buf_corner_x + buf_x >= 0) && (buf_corner_x + buf_x < w)) { + atomic_min(&labels[(buf_corner_y + buf_y) * w + buf_corner_x + buf_x], new_label); + } + + } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..55d9c81c 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -43,6 +43,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') host_image = np.load('maze1.npy') + #host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -79,7 +80,8 @@ def round_up(global_size, group_size): pylab.title(itercount) pylab.show() - show_progress = True + #show_progress = True + show_progress = False total_time = 0 while True: