diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..f96965d2 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_real_new, z_imag_new; + float mag2; int iter; if ((x < w) && (y < h)) { - // YOUR CODE HERE - ; + iter = 1; + c_real = coords_real[y*w + x]; + c_imag = coords_imag[y*w + x]; + z_real = c_real; + z_imag = c_imag; + mag2 = z_real*z_real + z_imag*z_imag; + while ((mag2 < 4) && (iter < max_iter)){ + z_real_new = z_real*z_real - z_imag*z_imag + c_real; + z_imag_new = 2*z_real*z_imag + c_imag; + mag2 = z_real_new*z_real_new + z_imag_new*z_imag_new; + z_real = z_real_new; + z_imag = z_imag_new; + iter = iter + 1; + } + out_counts[y*w + x] = iter; } } diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..426a59ad --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,13 @@ + +The best configuration for my machine is: + +configuration ('coalesced', 512, 128): 0.000331392 seconds + +The coalesced read is faster than the blocked read on average +for the same number of work groups and workers because more +threads can do work on the same block of fetched memory. In the +blocked reads, once a thread fetches its block to sum, more +threads may have to wait to fetch their block of memory. However in the +coalesced reads, more threads can sum elements simultaneously +more often since a fetched block of memory will be more likely to +contain elements needed by more threads than in the blocked scheme. 
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..0c5a8493 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,16 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - + int i, j, gID, gSize, temp, lSize, loglSize; + + gID = get_global_id(0); + gSize = get_global_size(0); + lSize = get_local_size(0); + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (i = gID; i < N; i += gSize) { + sum = sum + x[i]; } fast[local_id] = sum; @@ -24,8 +29,17 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,7 +52,8 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); + int j, gID, temp, loglSize, lSize, minS; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +63,16 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + lSize = get_local_size(0); + gID = get_global_id(0); + if (k-1 < N - k*gID){ + minS = k; + } + else{ + minS = N - k*gID; + } + for (j = 0; j < minS; j++) { + sum = sum + x[k*gID + j]; } fast[local_id] = sum; @@ -64,8 +87,17 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..7e134fd1 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,6 
@@ #include "median9.h" + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -12,23 +13,81 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + int gID, lID, x, y, lx, ly, gSizeX, gSizeY, + lSizeX, lSizeY, xTemp, yTemp, xUse, yUse, + buf_corner_x, buf_corner_y, buf_x, buf_y, row; + // the code below is adapted from the lecture code on halos + x = get_global_id(0); + y = get_global_id(1); + lx = get_local_id(0); + ly = get_local_id(1); + gSizeX = get_global_size(0); + gSizeY = get_global_size(1); + lSizeX = get_local_size(0); + lSizeY = get_local_size(1); + + + gID = gSizeX*y + x; + lID = lSizeX*ly + lx; + buf_corner_x = x - lx - halo; + buf_corner_y = y - ly - halo; - // Load into buffer (with 1-pixel halo). - // - // It may be helpful to consult HW3 Problem 5, and - // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl - // - // Note that globally out-of-bounds pixels should be replaced - // with the nearest valid pixel's value. + buf_x = lx + halo; + buf_y = ly + halo; + if ((y < h) && (x < w)){ + if (lID < buf_w){ // only work with buf_w threads + xTemp = buf_corner_x + lID; + xUse = xTemp; + if (xTemp < 0){ // if pixel out of bounds, add compensation steps to find closest in bound pixel + xUse += 1; + } + if (xTemp > w - 1){ + xUse -= 1; + } + for (row = 0; row < buf_h; row++) { + yTemp = buf_corner_y + row; + yUse = yTemp; + if (yTemp < 0){ + yUse += 1; + } + if (yTemp > h - 1){ + yUse -= 1; + } + buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; // assign global memory of pixel or closest in bound pixel to buffer + } + } + } - // Compute 3x3 median for each pixel in core (non-halo) pixels - // - // We've given you median9.h, and included it above, so you can - // use the median9() function. 
+ barrier(CLK_LOCAL_MEM_FENCE); + if ((y < h) && (x < w)){ + out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], // take median of 8 neighbors and current pixel + buffer[(buf_y-1)*buf_w + (buf_x)], + buffer[(buf_y-1)*buf_w + (buf_x+1)], + buffer[(buf_y)*buf_w + (buf_x-1)], + buffer[(buf_y)*buf_w + (buf_x)], + buffer[(buf_y)*buf_w + (buf_x+1)], + buffer[(buf_y+1)*buf_w + (buf_x-1)], + buffer[(buf_y+1)*buf_w + (buf_x)], + buffer[(buf_y+1)*buf_w + (buf_x+1)]); + } + // Load into buffer (with 1-pixel halo). + // + // It may be helpful to consult HW3 Problem 5, and + // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl + // + // Note that globally out-of-bounds pixels should be replaced + // with the nearest valid pixel's value. - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. + + // Compute 3x3 median for each pixel in core (non-halo) pixels + // + // We've given you median9.h, and included it above, so you can + // use the median9() function. + + + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. 
} diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..118fe6bd 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) @@ -59,7 +60,7 @@ def numpy_median(image, iterations=10): gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) gpu_image_b = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) - local_size = (8, 8) # 64 pixels per work group + local_size = (4, 4) # 64 pixels per work group global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)]) width = np.int32(host_image.shape[1]) height = np.int32(host_image.shape[0]) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..26d9ffca --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,86 @@ + + +Explanation: + +Part 1: + +This is the base code. + +Part 2: + +This is optimized over the first part because the buffer values are updated with the +grandparent values, which is guaranteed to be less than or equal to the current +buffer value. + +Part 3: + +This is optimized over the second part because a pixel's parent is updated to the pixel's +value if it is smaller than the pixel's parent using atomic min. 
However, the iteration +time increases due to the atomic (min) operation. + +Part 4: + +Making 1 thread update the buffer regions with grandparent values is not as efficient on average +given the time per iteration is roughly twice as long as Part 3. Even though lots of adjacent pixels +may have equal buffer values after sufficient iterations, the reduced number of memory reads +does not outweigh the loss of parallelism between threads. If more threads are used, for +example due to smaller context sizes, then even more memory calls to the labels array will occur. +So using one thread to remember previous grandparent values may perform better than having +each thread fetch a value from memory simultaneously (resulting in partial serialization since +more threads will have to wait for memory) as the number of threads gets even larger. + +Part 5: + +If a standard min operation were used instead of atomic min, the iteration time would decrease +because the imposed serialized delays from atomic operation will not be applied. The final result +will still be correct because even if a thread overwrites the pixel's parent's value with a greater value +than another thread, the value will still be less than the original parent value. Thus the number of +iterations may increase. As stated, the value in label could increase, but that is during the same iteration. +Between iterations, label values cannot increase because a pixel's previous iteration value is compared +via the minimum operator with a new label. Thus after the current iteration finishes, each label's value +will be less than or equal to that of the previous iteration. 
+ + +Results: + +Maze 1 + +Part1: + +Finished after 915 iterations, 36.084992 ms total, 0.0394371497268 ms per iteration +Found 2 regions + +Part 2: + +Finished after 529 iterations, 20.321376 ms total, 0.0384146994329 ms per iteration +Found 2 regions + +Part 3: + +Finished after 12 iterations, 0.611552 ms total, 0.0509626666667 ms per iteration +Found 2 regions + +Part 4: + +Finished after 11 iterations, 1.224416 ms total, 0.111310545455 ms per iteration +Found 2 regions + +Maze 2 + +Part 1: +Finished after 532 iterations, 20.138752 ms total, 0.0378547969925 ms per iteration +Found 35 regions + +Part 2: +Finished after 276 iterations, 10.62384 ms total, 0.038492173913 ms per iteration +Found 35 regions + +Part 3: +Finished after 11 iterations, 0.539008 ms total, 0.0490007272727 ms per iteration +Found 35 regions + +Part 4: +Finished after 10 iterations, 1.11216 ms total, 0.111216 ms per iteration +Found 35 regions + + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..8fe5c312 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -57,13 +57,14 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - int old_label; + int old_label, old_label0; // Will store the output value int new_label; + int minT, newMin, row, col, prev; // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { - for (int row = 0; row < buf_h; row++) { + for (row = 0; row < buf_h; row++) { buffer[row * buf_w + idx_1D] = get_clamped_value(labels, w, h, @@ -74,19 +75,63 @@ propagate_labels(__global __read_write int *labels, // Make sure all threads reach the next part after // the local buffer is loaded barrier(CLK_LOCAL_MEM_FENCE); - + // Fetch the value from the buffer the corresponds to // the pixel for this thread + /* old_label = buffer[buf_y * buf_w + buf_x]; - + if (old_label < w*h){ + buffer[buf_y * buf_w + buf_x] = 
labels[old_label]; // grandparent update + } + barrier(CLK_LOCAL_MEM_FENCE); + */ + + old_label = buffer[buf_y * buf_w + buf_x]; // use the first thread in the upper left corner to update the buffer values with grandparents + if ((buf_y + buf_x) == 2*halo){ // keeps track of last buffer value read to avoid reading from memory (labels array) more than necessary + old_label0 = buffer[buf_y * buf_w + buf_x]; + if (old_label0 < w*h){ + prev = labels[old_label0]; + } + for (row = halo; row < buf_h-halo; row++){ + for (col = halo; col < buf_w-halo; col++){ + if (buffer[row * buf_w + col] < w*h){ + if (buffer[row * buf_w + col] == old_label0){ + buffer[row * buf_w + col] = prev; + } + else{ + old_label0 = buffer[row * buf_w + col]; + buffer[row * buf_w + col] = labels[old_label0]; + prev = buffer[row * buf_w + col]; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) // stay in bounds - if ((x < w) && (y < h)) { + if ((x < w) && (y < h) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + minT = buffer[buf_y * buf_w + buf_x]; + // Check neighbors to update minimum + if (buffer[(buf_y-1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y-1)*buf_w + buf_x]; + } + if (buffer[(buf_y+1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y+1)*buf_w + buf_x]; + } + if (buffer[(buf_y)*buf_w + buf_x - 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x - 1]; + } + if (buffer[(buf_y)*buf_w + buf_x + 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x + 1]; + } + + new_label = minT; if (new_label != old_label) { // CODE FOR PART 3 HERE @@ -94,6 +139,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. 
*(changed_flag) += 1; labels[y * w + x] = new_label; + atomic_min(&(labels[old_label]),new_label); // Use atomic min to update grandparent value if necessary } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..ba3facd2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -2,7 +2,7 @@ import sys import pyopencl as cl import numpy as np -import pylab +#import pylab def round_up(global_size, group_size): r = global_size % group_size @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -75,9 +75,9 @@ def round_up(global_size, group_size): # Show the initial labels cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() show_progress = True total_time = 0 @@ -105,9 +105,9 @@ def round_up(global_size, group_size): print host_done_flag if itercount % 100 == 0 and show_progress: cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() if itercount % 10000 == 0: print 'Reached maximal number of iterations, aborting' sys.exit(0) @@ -116,6 +116,6 @@ def round_up(global_size, group_size): # Show final result cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) print 'Found {} regions'.format(len(np.unique(host_labels)) - 1) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show()