diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..89e35c78 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -10,10 +10,34 @@ mandelbrot(__global __read_only float *coords_real, float c_real, c_imag; float z_real, z_imag; + float new_z_real; int iter; if ((x < w) && (y < h)) { // YOUR CODE HERE + // implementing mandelbrot here + + // initialize + iter = 0; + z_real =0; + z_imag = 0; + // pixel (x,y) is x*w + y away from (0,0) + // because (x,y) are flipped + c_real = coords_real[x*w + y]; + c_imag = coords_imag[x*w + y]; + while((z_real*z_real+ z_imag*z_imag <=4) \ + &&(iter <= max_iter)){ + // Similar to AVX implemtation + new_z_real = (z_real*z_real - z_imag*z_imag) \ + + c_real; + z_imag = (2 * z_real* z_imag) + c_imag; + z_real = new_z_real; + iter = iter + 1; + } + + + ; + out_counts[x*w + y] = iter; } } diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..7b2371ef --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,22 @@ +The best found configuration is + +configuration ('coalesced', 256, 128): 0.00303864 seconds + +The devices detected on platform Apple are: + +Intel(R) Core(TM) i5-4250U CPU @ 1.30GHz [Type: CPU ] + +Maximum clock Frequency: 1300 MHz + +Maximum allocable memory size: 1073 MB + +## Maximum work group size 1024 + +HD Graphics 5000 [Type: GPU ] + +Maximum clock Frequency: 1000 MHz + +Maximum allocable memory size: 402 MB + +## Maximum work group size 512 + diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..770ba2fe 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + size_t i = get_global_id(0); + int jump = get_global_size(0); + int localsize = get_local_size(0); + int id_x; // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (id_x = 0 ;i + id_x*jump < N; id_x++) { + sum += x[ i + id_x*jump ]; } fast[local_id] = sum; @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -39,6 +46,13 @@ __kernel void sum_blocked(__global float* x, float sum = 0; size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int globalid = get_global_id(0); + int localid = get_local_id(0); + int ini ; + int localsize = get_local_size(0); + + + // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +62,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for ( ini = globalid*k ; ini < (globalid +1)*k ; ini++) { + if (ini < N) { + sum += x[ini]; + } } fast[local_id] = sum; @@ -64,9 +80,11 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - if (local_id == 0) partial[get_group_id(0)] = fast[0]; } diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..a8bc5391 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,16 @@ #include "median9.h" + +inline float get_values(__global float *in_values, \ + int w, int h, int new_x, int new_y){ + // check everything stays in bound + if (new_x < 0) new_x = 0; + if (new_y < 0) new_y = 0; + if (new_x >= w) new_x = w - 1; + if (new_y >= h) new_y = h - 1; + return in_values[new_y * w + new_x]; +} + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -22,13 +33,59 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + // Define variables like in class + + // global position of the pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // local position of the pixel in the workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // corner coordinates of the buffer + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of the pixel in the buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // get 1-index of the pixels + const int idx_1D = ly * get_local_size(0) + lx; // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. 
+ if (idx_1D < buf_w){ + for (int row = 0; row < buf_h; row++){ + int new_x = buf_corner_x + idx_1D; + int new_y = buf_corner_y + row; + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. + buffer[row * buf_w + idx_1D] = \ + get_values(in_values, w, h, new_x, new_y); + } + } + + // now write the output + barrier(CLK_LOCAL_MEM_FENCE); + + if((x < w) && (y < h)){ + out_values[y * w + x] =\ + median9( buffer[ (buf_y-1) * buf_w + buf_x -1],\ + buffer[ (buf_y-1) * buf_w + buf_x],\ + buffer[ (buf_y-1) * buf_w + buf_x +1],\ + buffer[ buf_y * buf_w + buf_x -1], \ + buffer[ buf_y * buf_w + buf_x], \ + buffer[ buf_y * buf_w + buf_x +1],\ + buffer[ (buf_y+1) * buf_w + buf_x -1],\ + buffer[ (buf_y+1) * buf_w + buf_x],\ + buffer[ (buf_y+1) * buf_w + buf_x +1]); + + } + - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. } diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..59554130 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,87 @@ +Results + +******** +Part 1 +******** + +--- Maze 1 ---- + +Finished after 314 iterations, 153.46664 ms total, 0.488747261146 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 243 iterations, 106.70872 ms total, 0.439130534979 ms per iteration +Found 113 regions + +******** +Part 2 +******** + +--- Maze 1 --- + + +Finished after 132 iterations, 65.11392 ms total, 0.493287272727 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 114 iterations, 55.83568 ms total, 0.489786666667 ms per iteration +Found 113 regions + +******** +Part 3 +******** + +--- Maze 1 --- + +Finished after 11 iterations, 5.42896 ms total, 0.493541818182 ms per iteration +Found 60 regions + + +--- Maze 2 --- + +Finished after 11 iterations, 5.3596 ms total, 0.487236363636 ms per iteration +Found 106 regions + +******** +Part 4 +******** + +--- Maze 1 --- + +Finished after 71 iterations, 96.0672 ms total, 
1.35305915493 ms per iteration
+Found 66 regions
+
+--- Maze 2 ---
+
+Finished after 49 iterations, 66.29168 ms total, 1.35289142857 ms per iteration
+Found 106 regions
+
+Looks like using only one thread did not improve anything. Worse, I now see more iterations, and I have not had time to find out why. While serializing decreases the number of fetches from global memory, the total run time is larger.
+
+
+********
+Part 5
+********
+
+What would change if atomic_min() were replaced with a simple min()?
+
+For now we have:
+
+atomic_min(&labels[old_label], new_label);
+atomic_min(&labels[y * w + x], new_label);
+
+and we ask what would happen with
+
+min(labels[old_label], new_label);
+min(labels[y * w + x], new_label);
+
+The atomic operation ensures that only one thread at a time updates labels[old_label] and labels[y * w + x] with the smallest label. If multiple threads performed this read-modify-write concurrently without atomics, updates could be lost, so there is no longer any guarantee that the smallest label value gets stored.
+
+However, a plain min() would be faster per iteration, because it avoids the serialization that atomic operations impose on contending threads.
+
+We may also expect more iterations, because the value stored at the end of one iteration may no longer be the minimum across all the threads.
+ + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..e14031fc 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -21,7 +21,7 @@ int get_clamped_value(__global __read_only int *labels, int w, int h, int x, int y) -{ +{ if ((x < 0) || (x >= w) || (y < 0) || (y >= h)) return w * h; return labels[y * w + x]; @@ -80,20 +80,73 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + + // CODE FOR PART 2 + + // if (old_label < w*h) { + // buffer[ buf_y * buf_w + buf_x ] = labels[old_label]; + // } + + + // CODE FOR PART 4 + + + // when we have the first thread + if ((lx == 0) && (ly == 0)) { + + // initialize variables to use + int last_label = -1 ; + int my_label_new; + + // get grandparent + if (old_label < w*h){ + last_label = labels[old_label]; + } + + // loop over rows and columns of the buffer + for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) { + for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) { + + my_label_new = buffer[(ly+x_i)*buf_w+(lx+y_i)]; + + if (buffer[(ly+x_i)*buf_w+(lx+y_i)] < w*h) { + // avoid having the same value as the previous one + if (my_label_new != last_label) { + // update the buffer + buffer[(ly+x_i)*buf_w+(lx+y_i)] = labels[my_label_new]; + } + } + } + } + } + // stay in bounds - if ((x < w) && (y < h)) { + if (((x < w) && (y < h)) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. 
- new_label = old_label; + + // one pixel becomes the minimum of its 4 neighboring + // pixels and itself + // get the locations in a similar fashion as P4 + int left = buffer[ buf_y * buf_w + buf_x - 1]; + int right = buffer[ buf_y * buf_w + 1]; + int up = buffer[ (buf_y - 1) * buf_w + buf_x ]; + int down = buffer[ (buf_y + 1) * buf_w + buf_x ]; + // find the minimum + new_label = min(old_label, min( min( min(up,down) , right) , left)); + + if (new_label != old_label) { + atomic_min(&labels[old_label], new_label); // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + //labels[y * w + x] = new_label; + atomic_min(&labels[y * w + x], new_label); } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..5ffa14f2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -93,7 +93,7 @@ def round_up(global_size, group_size): width, height, buf_size[0], buf_size[1], halo) - prop_exec.wait() + prop_exec.wait( ) elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start) total_time += elapsed # read back done flag, block until it gets here