diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..fdd24519 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -11,9 +11,24 @@ mandelbrot(__global __read_only float *coords_real,
     float c_real, c_imag;
     float z_real, z_imag;
     int iter;
+    float z_real_temp;
 
     if ((x < w) && (y < h)) {
         // YOUR CODE HERE
-        ;
+        c_real = coords_real[y * w + x];
+        c_imag = coords_imag[y * w + x];
+        z_real = 0;
+        z_imag = 0;
+
+        iter = 0;
+
+        while (z_real * z_real + z_imag * z_imag < 4 && iter < max_iter) {
+            z_real_temp = z_real;
+            z_real = z_real * z_real - z_imag * z_imag + c_real;
+            z_imag = 2 * z_imag * z_real_temp + c_imag;
+            iter += 1;
+        }
+
+        out_counts[y * w + x] = iter;
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..d3e495e1
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,4 @@
+#0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple
+#1: Intel(R) Iris(TM) Graphics 6100 on Apple
+
+configuration ('coalesced', 64, 64): 0.0027708 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..310ebf28 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -4,12 +4,17 @@ __kernel void sum_coalesced(__global float* x,
                             long N)
 {
     float sum = 0;
+
+    size_t i = get_global_id(0);
     size_t local_id = get_local_id(0);
+
+    size_t k = get_global_size(0);
+    size_t ls = get_local_size(0);
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint index = i; index < N; index += k) { // YOUR CODE HERE
+        sum += x[index]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -24,8 +29,12 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint s = ls/2; s > 0; s >>= 1) { // YOUR CODE HERE
+        // YOUR CODE HERE
+        if (local_id < s) {
+            fast[local_id] += fast[local_id + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -37,8 +46,12 @@ __kernel void sum_blocked(__global float* x,
                           long N)
 {
     float sum = 0;
+
+    size_t i = get_global_id(0);
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+
+    int k = ceil((float)N / get_global_size(0));
+    int ls = get_local_size(0);
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +61,8 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint index = k * i; index < k * (i + 1) && index < N; index++) { // YOUR CODE HERE
+        sum += x[index]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -64,8 +77,12 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint s = ls/2; s > 0; s >>= 1) { // YOUR CODE HERE
+        // YOUR CODE HERE
+        if (local_id < s) {
+            fast[local_id] += fast[local_id + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
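
The two kernels above differ only in their read pattern: sum_coalesced strides by get_global_size(0) so neighboring threads always touch neighboring floats, while sum_blocked gives each thread its own contiguous k-element chunk. The tree reduction at the end of both is identical, and fast[0] holds the workgroup total after log2(get_local_size(0)) passes. A minimal sketch that replays that reduction serially (plain C, not part of the submission; the 8-element fast[] stands in for a hypothetical workgroup of 8 threads):

    #include <stdio.h>

    int main(void) {
        /* per-thread partial sums, as if 8 threads had filled fast[] */
        float fast[8] = {1, 2, 3, 4, 5, 6, 7, 8};

        /* same tree shape as the kernel loop: each pass halves the
         * active range, mirroring
         * "if (local_id < s) fast[local_id] += fast[local_id + s];" */
        for (unsigned s = 8 / 2; s > 0; s >>= 1)
            for (unsigned local_id = 0; local_id < s; local_id++)
                fast[local_id] += fast[local_id + s];

        printf("reduced sum = %g\n", fast[0]); /* prints 36 */
        return 0;
    }

The barrier(CLK_LOCAL_MEM_FENCE) after each pass is what this serial loop gets for free: no thread may read fast[local_id + s] before the pass that wrote it has finished.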
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
 
     times = {}
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..90755934 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,22 @@
 #include "median9.h"
 
+// Note that globally out-of-bounds pixels should be replaced
+// with the nearest valid pixel's value.
+static float
+fetch(__global __read_only float *in_values,
+      int w, int h,
+      int x, int y)
+{
+    if (x < 0) x = 0;
+    else if (x >= w) x = w-1;
+
+    if (y < 0) y = 0;
+    else if (y >= h) y = h-1;
+
+    return in_values[y * w + x];
+}
+
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -7,7 +24,7 @@ median_3x3(__global __read_only float *in_values,
            __global __write_only float *out_values,
            __local float *buffer,
            int w, int h,
            int buf_w, int buf_h,
-           const int halo)
+           const int halo) // width of halo on one side
 {
     // Note: It may be easier for you to implement median filtering
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have a working implementation.
@@ -18,10 +35,39 @@ median_3x3(__global __read_only float *in_values,
     //
     // It may be helpful to consult HW3 Problem 5, and
     // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
-    //
-    // Note that globally out-of-bounds pixels should be replaced
-    // with the nearest valid pixel's value.
+
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    if (idx_1D < buf_w) // the first buf_w threads each load one column
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] =
+                fetch(in_values, w, h,
+                      buf_corner_x + idx_1D,
+                      buf_corner_y + row);
+        }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
@@ -31,4 +77,20 @@ median_3x3(__global __read_only float *in_values,
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
 
-}
+
+    // Processing code here...
+    //
+    // Should only use buffer, buf_x, buf_y.
+
+    // write output
+    if ((y < h) && (x < w)) // stay in bounds
+        out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x - 1],
+                                        buffer[(buf_y-1) * buf_w + buf_x],
+                                        buffer[(buf_y-1) * buf_w + buf_x + 1],
+                                        buffer[buf_y * buf_w + buf_x - 1],
+                                        buffer[buf_y * buf_w + buf_x],
+                                        buffer[buf_y * buf_w + buf_x + 1],
+                                        buffer[(buf_y+1) * buf_w + buf_x - 1],
+                                        buffer[(buf_y+1) * buf_w + buf_x],
+                                        buffer[(buf_y+1) * buf_w + buf_x + 1]);
+}
\ No newline at end of file
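
The fetch() helper added above implements clamp-to-edge addressing: any halo read that falls outside the image is redirected to the nearest valid pixel, which is what the removed skeleton comment asked for. A standalone check of the same clamping arithmetic (plain C, illustrative only; the 4x3 image is made up):

    #include <stdio.h>

    static int clamp_idx(int v, int lo, int hi) {
        /* same effect as fetch()'s if/else chains */
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void) {
        int w = 4, h = 3;
        float img[12];
        for (int i = 0; i < 12; i++) img[i] = (float)i;

        /* (-1, -1) clamps to corner (0, 0); (5, 7) clamps to (3, 2) */
        int x = clamp_idx(-1, 0, w - 1), y = clamp_idx(-1, 0, h - 1);
        printf("img[%d] = %g\n", y * w + x, img[y * w + x]); /* img[0] = 0   */
        x = clamp_idx(5, 0, w - 1);
        y = clamp_idx(7, 0, h - 1);
        printf("img[%d] = %g\n", y * w + x, img[y * w + x]); /* img[11] = 11 */
        return 0;
    }

Because every (x, y) is clamped before the y * w + x index is formed, the local buffer of buf_w * buf_h values (the workgroup size plus 2 * halo in each dimension) can always be filled, even for workgroups on the image border.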
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                                  properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..aed99677
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,38 @@
+1. Iteration counts and average kernel times after each part
+
+(1) Part 1
+- Maze 1 : Finished after 915 iterations, 194.06176 ms total, 0.212089355191 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 532 iterations, 111.65112 ms total, 0.209870526316 ms per iteration
+Found 35 regions
+
+(2) Part 2
+- Maze 1 : Finished after 529 iterations, 109.62104 ms total, 0.207223137996 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 273 iterations, 56.87648 ms total, 0.208338754579 ms per iteration
+Found 35 regions
+
+(3) Part 3
+- Maze 1 : Finished after 10 iterations, 3.1164 ms total, 0.31164 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 9 iterations, 2.72808 ms total, 0.30312 ms per iteration
+Found 35 regions
+
+(4) Part 4
+- Maze 1 : Finished after 16 iterations, 8.2236 ms total, 0.513975 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 15 iterations, 7.52448 ms total, 0.501632 ms per iteration
+Found 35 regions
+
+
+2. Part 4 discussion
+Based on my empirical results, using a single thread to reduce redundant memory accesses actually made the kernel slower than the part 3 code: the iteration count grew by roughly 1.6x, and the total kernel time more than doubled.
+Even though the part 3 code performs redundant lookups into labels[], having many threads issue those lookups in parallel outweighs the benefit of serializing them. On my machine, the time saved by avoiding redundant lookups is smaller than the time lost by giving up parallelism.
+This could change on other GPUs, however. If global memory access were very slow and the input image mostly had the same label within each workgroup, the serial implementation (the part 4 code) would win.
+On the other hand, when memory access speed is not the bottleneck and labels vary a lot even within a workgroup, the part 3 code is more efficient.
+
+
+3. Part 5 explanation
+atomic_min() guarantees that a given address is read, compared, and written by only one thread at a time, so labels[old_label] always ends up holding the minimum of all the values threads try to store there. With a plain min() instead, one thread's write can be overwritten by another thread's larger value, so an entry of labels can end up larger than it should be for that iteration.
+The final result is still correct, because once the true minimum is read it keeps being propagated on later iterations; only the performance (time and iteration count) changes.
+Each min() update is faster because it avoids serializing the threads, but the number of iterations can grow, since lost updates mean more passes are needed before the labels converge to the actual minimum.
\ No newline at end of file
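
To make the min() vs. atomic_min() point above concrete, here is a serial replay of the lost-update race (plain C; the two "threads" and the proposed values 5 and 2 are hypothetical):

    #include <stdio.h>

    int main(void) {
        /* Thread A wants labels[i] = min(labels[i], 5);
         * thread B wants labels[i] = min(labels[i], 2). */
        int label = 9;

        /* Plain min(): both threads read the old value 9 before either
         * writes, so A's later write clobbers B's smaller result. */
        int a_read = label;                 /* A reads 9            */
        int b_read = label;                 /* B reads 9            */
        label = (2 < b_read) ? 2 : b_read;  /* B writes 2           */
        label = (5 < a_read) ? 5 : a_read;  /* A overwrites it -> 5 */
        printf("racy min:   %d (B's 2 was lost)\n", label);

        /* atomic_min() serializes the two read-modify-writes, so in
         * either order the result is min(9, 5, 2) = 2. */
        label = 9;
        label = (5 < label) ? 5 : label;    /* A's atomic update */
        label = (2 < label) ? 2 : label;    /* B's atomic update */
        printf("atomic_min: %d\n", label);
        return 0;
    }

As P5.txt argues, the lost update only delays convergence: the smaller label is still sitting in some thread's neighborhood and wins on a later iteration, so correctness is preserved at the cost of extra iterations.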
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..66aeeb90 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -80,7 +80,43 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+
+    // PART 2
+    // if (old_label < w * h) { // skip background pixels (labeled w * h)
+    //     buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
+    // }
+
+    // PART 4
+    // only the thread with (lx, ly) == (0, 0) fetches from labels[]
+    if (lx == 0 && ly == 0) {
+        int current_label, current_label_index, last_label = -1, last_fetch = -1;
+        int work_group_size_x = get_local_size(0);
+        int work_group_size_y = get_local_size(1);
+
+        for (int row = 0; row < work_group_size_y; row++) {
+            for (int col = 0; col < work_group_size_x; col++) {
+
+                current_label_index = (row + halo) * buf_w + (col + halo);
+                current_label = buffer[current_label_index];
+
+                if (current_label < w * h) { // not background
+                    if (current_label == last_label) {
+                        buffer[current_label_index] = last_fetch; // reuse the cached read
+                    }
+                    else {
+                        last_label = current_label;
+                        buffer[current_label_index] = last_fetch = labels[current_label];
+                    }
+                }
+
+            }
+        }
+    }
+
+    // Make sure all threads reach the next part after
+    // the local buffer is updated
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
@@ -88,12 +124,24 @@ propagate_labels(__global __read_write int *labels,
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
 
+        if (new_label < w * h) { // skip background pixels (labeled w * h)
+
+            int up_pixel = buffer[(buf_y - 1) * buf_w + buf_x];
+            int left_pixel = buffer[buf_y * buf_w + buf_x - 1];
+            int down_pixel = buffer[(buf_y + 1) * buf_w + buf_x];
+            int right_pixel = buffer[buf_y * buf_w + buf_x + 1];
+
+            new_label = min(min(min(min(old_label, up_pixel), left_pixel), down_pixel), right_pixel);
+        }
+
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label); // also lower the old label's parent entry
+            // labels[y * w + x] = new_label;
+            atomic_min(&labels[y * w + x], new_label);
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..16522113 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -3,6 +3,9 @@ import pyopencl as cl
 import numpy as np
 import pylab
 
+# import os
+
+# os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -42,7 +45,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
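
A final note on the PART 4 change in label_regions.cl: the point of the last_label / last_fetch pair is that a run of identical labels in the workgroup costs only one read of labels[]. A serial replay of that caching (plain C sketch; the labels[] and buffer[] contents are made up for illustration):

    #include <stdio.h>

    int main(void) {
        int labels[6] = {0, 1, 1, 3, 3, 3};       /* parent pointers        */
        int buffer[8] = {3, 3, 3, 3, 1, 1, 2, 2}; /* one workgroup's labels */
        int fetches = 0;

        int last_label = -1, last_fetch = -1;
        for (int i = 0; i < 8; i++) {
            if (buffer[i] != last_label) {        /* cache miss             */
                last_label = buffer[i];
                last_fetch = labels[last_label];  /* one "global" read      */
                fetches++;
            }
            buffer[i] = last_fetch;               /* hit or miss, write back */
        }
        printf("reads of labels[]: %d for 8 pixels\n", fetches); /* prints 3 */
        return 0;
    }

Whether this saving beats the cost of running the loop on a single thread is exactly the trade-off measured in P5.txt: on the Iris 6100 above it did not, but on hardware with slower global memory and more uniform mazes it plausibly could.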