diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..b042c4a3 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -13,7 +13,21 @@ mandelbrot(__global __read_only float *coords_real,
     int iter;
 
     if ((x < w) && (y < h)) {
-        // YOUR CODE HERE
-        ;
+        c_real = coords_real[x + y * w];
+        c_imag = coords_imag[x + y * w];
+        z_real = 0;
+        z_imag = 0;
+
+        for (iter = 0; iter < max_iter; ++iter) {
+            if (z_real * z_real + z_imag * z_imag > 4.0) { // |z| > 2: diverged
+                break;
+            }
+            float temp = z_real * z_real - z_imag * z_imag + c_real; // Re(z^2 + c)
+
+            z_imag = (z_real * z_imag * 2) + c_imag; // Im(z^2 + c), uses old z_real
+            z_real = temp;
+        }
+
+        out_counts[x + y * w] = iter; // escape iteration count
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..75073736
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,8 @@
+Hardware
+
+Intel(R) Core(TM) i7-3537 CPU @2.00GHz
+
+
+Performance:
+
+configuration ('coalesced', 128, 128): 0.00205134 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..c7ee1399 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    unsigned int global_size = get_global_size(0);
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_global_id(0); i < N; i += global_size) {
+        sum += x[i];
     }
 
     fast[local_id] = sum;
@@ -24,8 +25,11 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
+        if (local_id < i) {
+            fast[local_id] += fast[local_id + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // finish this round before the next reads
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +42,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +52,9 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k &&
+                                       i < N; ++i) {
+        sum += x[i];
     }
 
     fast[local_id] = sum;
@@ -64,8 +69,11 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
+        if (local_id < i) {
+            fast[local_id] += fast[local_id + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // finish this round before the next reads
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
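For intuition on the local tree reduction both kernels now share: each round halves the number of active threads, and after log2(local_size) rounds fast[0] holds the workgroup's sum. A minimal serial C sketch of the same index arithmetic (hypothetical values; the inner loop over local_id stands in for the threads that run concurrently on the device):

    #include <stdio.h>

    int main(void) {
        float fast[8] = {1, 2, 3, 4, 5, 6, 7, 8};  /* stand-in for local memory */
        int local_size = 8;

        /* same halving schedule as the kernel's reduction loop */
        for (int i = local_size / 2; i > 0; i /= 2) {
            for (int local_id = 0; local_id < i; ++local_id) {
                fast[local_id] += fast[local_id + i];
            }
            /* the kernel's barrier(CLK_LOCAL_MEM_FENCE) guarantees all
               adds of one round finish before the next round reads them */
        }

        printf("%g\n", fast[0]);  /* prints 36, the sum of 1..8 */
        return 0;
    }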
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                    format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..9f9b3eda 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,29 @@
 #include "median9.h"
 
+float
+find_closest(__global __read_only float *in_values,
+             int w, int h,
+             int x, int y)
+{
+    // clamp out-of-bounds coordinates to the nearest valid pixel
+    if (x < 0) {
+        x = 0;
+    }
+    else if (x >= w) {
+        x = w - 1;
+    }
+
+    if (y < 0) {
+        y = 0;
+    }
+    else if (y >= h) {
+        y = h - 1;
+    }
+
+    return in_values[x + y * w];
+}
+
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -13,6 +37,21 @@ median_3x3(__global __read_only float *in_values,
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have that working.
 
+    // global coordinates of this thread's pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // position of this thread within its workgroup
+    const int x_rel = get_local_id(0);
+    const int y_rel = get_local_id(1);
+
+    // global coordinates of the local buffer's top-left (halo) corner
+    const int x_buf_corner = x - x_rel - halo;
+    const int y_buf_corner = y - y_rel - halo;
+
+    // coordinates of our pixel within the local buffer
+    const int x_buf = x_rel + halo;
+    const int y_buf = y_rel + halo;
 
     // Load into buffer (with 1-pixel halo).
     //
@@ -22,13 +61,36 @@ median_3x3(__global __read_only float *in_values,
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
 
+    const int t_idx = x_rel + y_rel * get_local_size(0);
+
+    if (t_idx < buf_w) { // the first buf_w threads each load one column
+        for (int i = 0; i < buf_h; ++i) {
+            buffer[i * buf_w + t_idx] = find_closest(in_values, w, h,
+                                                     x_buf_corner + t_idx,
+                                                     y_buf_corner + i);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // buffer must be complete before any reads
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
-
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    // compute and write back only for pixels in the valid region
+    if (x < w && y < h) {
+        out_values[x + y * w] = median9(buffer[(y_buf - 1) * buf_w + x_buf - 1],
+                                        buffer[(y_buf - 1) * buf_w + x_buf],
+                                        buffer[(y_buf - 1) * buf_w + x_buf + 1],
+                                        buffer[y_buf * buf_w + x_buf - 1],
+                                        buffer[y_buf * buf_w + x_buf],
+                                        buffer[y_buf * buf_w + x_buf + 1],
+                                        buffer[(y_buf + 1) * buf_w + x_buf - 1],
+                                        buffer[(y_buf + 1) * buf_w + x_buf],
+                                        buffer[(y_buf + 1) * buf_w + x_buf + 1]);
+    }
 }
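The clamping in find_closest is what implements the "replace with the nearest valid pixel" boundary rule, so the halo loads never read outside the image. A small serial C check of that logic in isolation (a plain array stands in for the image; values are hypothetical):

    #include <stdio.h>

    /* same clamp-to-edge rule as find_closest, minus the OpenCL qualifiers */
    float find_closest(const float *in_values, int w, int h, int x, int y) {
        if (x < 0)       x = 0;
        else if (x >= w) x = w - 1;
        if (y < 0)       y = 0;
        else if (y >= h) y = h - 1;
        return in_values[x + y * w];
    }

    int main(void) {
        float img[2 * 2] = {1, 2,
                            3, 4};  /* hypothetical 2x2 image */
        /* out-of-bounds coordinates snap to the nearest edge pixel */
        printf("%g %g %g\n",
               find_closest(img, 2, 2, -1, -1),   /* 1: top-left corner */
               find_closest(img, 2, 2,  5,  0),   /* 2: right end of row 0 */
               find_closest(img, 2, 2,  0,  9));  /* 3: bottom of column 0 */
        return 0;
    }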
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                                 properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..1d2e277b
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,59 @@
+Part 1: implement updates from neighbors
+
+Maze 1
+Finished after 894 iterations, 197.5328 ms total, 0.222069508197 ms per iteration
+Found 2 regions
+
+Maze 2
+Finished after 528 iterations, 116.40336 ms total, 0.220460909091 ms per iteration
+Found 35 regions
+
+===============================================================================
+
+Part 2: fetch grandparents
+
+Maze 1
+Finished after 529 iterations, 116.8076 ms total, 0.220808317580 ms per iteration
+
+Maze 2
+Finished after 274 iterations, 62.08362 ms total, 0.226582554745 ms per iteration
+
+===============================================================================
+
+Part 3: merge parent regions
+
+Maze 1
+Finished after 10 iterations, 2.4026 ms total, 0.24026 ms per iteration
+
+Maze 2
+Finished after 9 iterations, 2.17255 ms total, 0.241394444444 ms per iteration
+
+===============================================================================
+
+Part 4: efficient grandparents
+
+Maze 1
+Finished after 10 iterations, 4.59674 ms total, 0.459674 ms per iteration
+
+Maze 2
+Finished after 9 iterations, 4.1608 ms total, 0.462311111111 ms per iteration
+
+Using a single thread per workgroup to look up the labels clearly worsened
+performance: the time per iteration roughly doubled relative to Part 3. The
+trade-off here is between the overhead of redundant global memory accesses
+and the speedup we gain from parallelism. Serializing this step in a single
+thread gives up much of the benefit of the parallel approach, even though it
+reduces the number of global memory accesses, so the lost parallelism costs
+more than the saved memory traffic.
+
+===============================================================================
+
+Part 5: no atomic operations
+
+Without atomic operations we introduce a race condition in which old_label
+can be updated redundantly: the final output of the algorithm is still
+correct, since label values only ever decrease, but the number of
+iterations can increase. This is problematic because it happens
+nondeterministically, so in the (very unlikely) worst case it could make the
+algorithm noticeably slower, simply because more work (with no progress
+toward convergence) is being done.
\ No newline at end of file
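The speedups in Parts 2 and 3 come from treating the labels array as a union-find-style parent forest: replacing a label with labels[label] (its grandparent) shortens pointer chains, and atomic_min on the root merges whole regions at once. A toy serial C sketch of the chain-shortening effect on a hypothetical parent array (iterating from the end mimics all threads reading the old values at once):

    #include <stdio.h>

    int main(void) {
        /* hypothetical parent chain: 4 -> 3 -> 2 -> 1 -> 0 */
        int labels[5] = {0, 0, 1, 2, 3};

        /* one "fetch grandparents" pass: labels[i] = labels[labels[i]] */
        for (int i = 4; i >= 0; --i)
            labels[i] = labels[labels[i]];

        /* the chain roughly halves: 4 -> 2 -> 0 after one pass */
        for (int i = 0; i < 5; ++i)
            printf("labels[%d] = %d\n", i, labels[i]);
        return 0;
    }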
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..fc0437da 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -81,19 +81,54 @@ propagate_labels(__global __read_write int *labels,
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
 
+    // Part 2
+    //
+    // if (old_label < w * h) {
+    //     buffer[buf_y * buf_w + buf_x] = labels[old_label];
+    // }
+
+    // Part 4
+
+    // one thread per workgroup replaces each buffered label with its grandparent
+    if (lx == 0 && ly == 0) {
+        int max_iter = buf_w * buf_h;
+        int temp, last;
+        int prev = -1;
+        for (int i = 0; i < max_iter; ++i) {
+            temp = buffer[i];
+            if (temp < w * h) {
+                // only fetch from global memory when the label changes
+                if (prev != temp) {
+                    prev = temp;
+                    last = labels[prev];
+                }
+                buffer[i] = last;
+            }
+        }
+    }
+
    // stay in bounds
    if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
-        // We set new_label to the value of old_label, but you will need
-        // to adjust this for correctness.
+
+        // take the minimum over this pixel's buffer value and its 4 neighbors
        new_label = old_label;
+        if (new_label < w * h) {
+            int row_min = min(buffer[buf_y * buf_w + buf_x - 1],
+                              buffer[buf_y * buf_w + buf_x + 1]);
+            int col_min = min(buffer[(buf_y - 1) * buf_w + buf_x],
+                              buffer[(buf_y + 1) * buf_w + buf_x]);
+            new_label = min(min(row_min, col_min), buffer[buf_y * buf_w + buf_x]);
+        }

        if (new_label != old_label) {
            // CODE FOR PART 3 HERE
            // indicate there was a change this iteration.
            // multiple threads might write this.
            *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            // labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label);
+            atomic_min(&labels[x + y * w], new_label);
        }
    }
}
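As a closing note on the Part 3 merge: atomic_min makes every label update monotone non-increasing, which is why concurrent writers can at worst repeat work (as discussed in P5.txt) but never corrupt the result. A serial sketch of what the two atomic_min calls accomplish for one pixel, with hypothetical values:

    #include <stdio.h>

    /* serial stand-in for OpenCL's atomic_min on an int cell */
    void atomic_min_serial(int *p, int val) {
        if (val < *p) *p = val;
    }

    int main(void) {
        int w = 4, h = 1;
        int labels[4] = {0, 0, 2, 2};  /* two regions with roots 0 and 2 */
        int x = 2, y = 0;              /* pixel 2 borders a pixel labeled 0 */
        int old_label = 2, new_label = 0;

        /* merge the whole region rooted at old_label ... */
        atomic_min_serial(&labels[old_label], new_label);
        /* ... and lower this pixel's own label */
        atomic_min_serial(&labels[x + y * w], new_label);

        for (int i = 0; i < w * h; ++i)
            printf("labels[%d] = %d\n", i, labels[i]);
        /* labels is now {0, 0, 0, 2}; pixel 3 reaches 0 via its
           grandparent labels[2] on the next iteration */
        return 0;
    }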