From 11c6d3737633fb956cbae522288ef90953430992 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Mon, 16 Nov 2015 22:09:16 -0500
Subject: [PATCH 01/13] finished P2

---
 HW3/P2/mandelbrot.cl | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..241d884a 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -10,10 +10,32 @@ mandelbrot(__global __read_only float *coords_real,
 
     float c_real, c_imag;
     float z_real, z_imag;
+    float new_z_real;
     int iter;
 
     if ((x < w) && (y < h)) {
         // YOUR CODE HERE
+        // implementing mandelbrot here 
+
+        // initialize
+        iter = 0;
+        z_real =0;
+        z_imag = 0;
+        c_real = coords_real[w*x + y];
+        c_imag = coords_imag[w*x + y];
+        while((z_real*z_real+ z_imag*z_imag <=4) \
+        &&(iter <= max_iter)){
+          // Similar to AVX implemtation
+          new_z_real = (z_real*z_real - z_imag*z_imag) \
+                      + c_real;
+          z_imag = (2 * z_real* z_imag) + c_imag;
+          z_real = new_z_real;
+          iter = iter + 1;
+        }
+
+
+
         ;
+    out_counts[x*w + y] = iter;
     }
 }

From ee7c109bb9a016e5511a6c9453a388aa4fba7a8d Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Tue, 17 Nov 2015 10:15:22 -0500
Subject: [PATCH 02/13] commented P2

---
 HW3/P2/mandelbrot.cl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 241d884a..89e35c78 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -21,8 +21,10 @@ mandelbrot(__global __read_only float *coords_real,
         iter = 0;
         z_real =0;
         z_imag = 0;
-        c_real = coords_real[w*x + y];
-        c_imag = coords_imag[w*x + y];
+        // pixel (x,y) is x*w + y away from (0,0)
+        // because (x,y) are flipped
+        c_real = coords_real[x*w + y];
+        c_imag = coords_imag[x*w + y];
         while((z_real*z_real+ z_imag*z_imag <=4) \
         &&(iter <= max_iter)){
           // Similar to AVX implemtation

From 9a19ae2193dc960c47cf15e1536150c471df38b0 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Thu, 19 Nov 2015 15:16:11 -0500
Subject: [PATCH 03/13] done with Prb 3

---
 HW3/P3/P3.txt | 22 ++++++++++++++++++++++
 HW3/P3/sum.cl | 36 +++++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 9 deletions(-)
 create mode 100644 HW3/P3/P3.txt

diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..7b2371ef
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,22 @@
+The best found configuration is
+
+configuration ('coalesced', 256, 128): 0.00303864 seconds
+
+The devices detected on platform Apple are:
+
+Intel(R) Core(TM) i5-4250U CPU @ 1.30GHz [Type: CPU ]
+
+Maximum clock Frequency: 1300 MHz
+
+Maximum allocable memory size: 1073 MB
+
+## Maximum work group size 1024
+
+HD Graphics 5000 [Type: GPU ]
+
+Maximum clock Frequency: 1000 MHz
+
+Maximum allocable memory size: 402 MB
+
+## Maximum work group size 512
+
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..770ba2fe 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t i = get_global_id(0);
+    int jump = get_global_size(0);
+    int localsize = get_local_size(0);
+    int id_x;
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
+    for (id_x = 0 ;i + id_x*jump  < N; id_x++) { 
+        sum += x[ i + id_x*jump ];  
     }
 
     fast[local_id] = sum;
@@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint offset = localsize/2; offset > 0; offset >>= 1) { 
+        if (local_id< offset) {
+            fast[local_id] += fast[local_id + offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -39,6 +46,13 @@ __kernel void sum_blocked(__global float* x,
     float sum = 0;
     size_t local_id = get_local_id(0);
     int k = ceil((float)N / get_global_size(0));
+    int globalid = get_global_id(0);
+    int localid = get_local_id(0);
+    int ini ;
+    int localsize = get_local_size(0);
+
+
+
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +62,10 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for ( ini = globalid*k ; ini < (globalid +1)*k ; ini++) { 
+        if (ini < N) {
+            sum += x[ini];
+        } 
     }
 
     fast[local_id] = sum;
@@ -64,9 +80,11 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint offset = localsize/2; offset > 0; offset >>= 1) { 
+        if (local_id< offset) {
+            fast[local_id] += fast[local_id + offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
-
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
 }

From 463581cca7c5d9ce080eb3375fbe4d4c71a8e9bf Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Thu, 19 Nov 2015 20:17:02 -0500
Subject: [PATCH 04/13] finished P4

---
 HW3/P4/median_filter.cl | 61 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..31f3f221 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,16 @@
 #include "median9.h"
 
+
+inline float get_values(__global float *in_values, \
+        int w, int h, int new_x, int new_y){
+  // clamp new_x to [0, w-1] and new_y to [0, h-1], then read in_values[new_y * w + new_x]
+  if (new_x < 0) new_x = 0;
+  if (new_y < 0) new_y = 0;
+  if (new_x >= w) new_x = w - 1;
+  if (new_y >= h) new_y = h - 1;
+  return in_values[new_y * w + new_x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -22,13 +33,59 @@ median_3x3(__global __read_only float *in_values,
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
 
+    // Define variables like in class
+
+    // global position of the pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // local position of the pixel in the workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // corner coordinates of the buffer
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of the pixel in the buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // flattened (1D) index of this work-item within its workgroup
+    const int idx_1D = ly * get_local_size(0) + lx;
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
 
+    if (idx_1D < buf_w){
+      for (int row = 0; row < buf_h; row++){
+        int new_x = buf_corner_x + idx_1D;
+        int new_y = buf_corner_y + row;
+        // Fill buffer column idx_1D for this row; get_values() clamps
+        // out-of-range coordinates to the nearest valid pixel.
+        buffer[row * buf_w + idx_1D] = \
+            get_values(in_values, w, h, new_x, new_y);
+      }
+    }
+        
+    //# Make sure all threads reach the next part after the local buffer is loaded
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if((x < w) && (y < h)){
+      out_values[y * w + x] =\
+        median9(buffer[(buf_y-1) * buf_w + buf_x -1],\
+        buffer[(buf_y-1) * buf_w + buf_x],\
+        buffer[(buf_y-1) * buf_w + buf_x +1],\
+        buffer[buf_y * buf_w + buf_x -1],  \ 
+        buffer[buf_y * buf_w + buf_x], \    
+        buffer[buf_y * buf_w + buf_x +1],\
+        buffer[(buf_y+1) * buf_w + buf_x -1],\
+        buffer[(buf_y+1) * buf_w + buf_x],\
+        buffer[(buf_y+1) * buf_w + buf_x +1]);
+
+    }
+
 
-    // Each thread in the valid region (x < w, y < h) should write
-    // back its 3x3 neighborhood median.
 }

From 5935042c0ec23581f660eac25062c0dca987995d Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 09:29:50 -0500
Subject: [PATCH 05/13] finished commenting P4

---
 HW3/P4/median_filter.cl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 31f3f221..a8bc5391 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -70,20 +70,20 @@ median_3x3(__global __read_only float *in_values,
       }
     }
         
-    //# Make sure all threads reach the next part after the local buffer is loaded
+    // now write the output
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if((x < w) && (y < h)){
       out_values[y * w + x] =\
-        median9(buffer[(buf_y-1) * buf_w + buf_x -1],\
-        buffer[(buf_y-1) * buf_w + buf_x],\
-        buffer[(buf_y-1) * buf_w + buf_x +1],\
-        buffer[buf_y * buf_w + buf_x -1],  \ 
-        buffer[buf_y * buf_w + buf_x], \    
-        buffer[buf_y * buf_w + buf_x +1],\
-        buffer[(buf_y+1) * buf_w + buf_x -1],\
-        buffer[(buf_y+1) * buf_w + buf_x],\
-        buffer[(buf_y+1) * buf_w + buf_x +1]);
+        median9( buffer[ (buf_y-1) * buf_w + buf_x -1],\
+        buffer[ (buf_y-1) * buf_w + buf_x],\
+        buffer[ (buf_y-1) * buf_w + buf_x +1],\
+        buffer[ buf_y * buf_w + buf_x -1],  \ 
+        buffer[ buf_y * buf_w + buf_x], \    
+        buffer[ buf_y * buf_w + buf_x +1],\
+        buffer[ (buf_y+1) * buf_w + buf_x -1],\
+        buffer[ (buf_y+1) * buf_w + buf_x],\
+        buffer[ (buf_y+1) * buf_w + buf_x +1]);
 
     }
 

From 79cdf52695a220461154c6884ff21ecdc824b7bb Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 16:45:40 -0500
Subject: [PATCH 06/13] done p5 part1

---
 HW3/P5/label_regions.cl | 17 ++++++++++++++---
 HW3/P5/label_regions.py |  2 +-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..7905c490 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -21,7 +21,7 @@ int
 get_clamped_value(__global __read_only int *labels,
                   int w, int h,
                   int x, int y)
-{
+{   
     if ((x < 0) || (x >= w) || (y < 0) || (y >= h))
         return w * h;
     return labels[y * w + x];
@@ -82,11 +82,22 @@ propagate_labels(__global __read_write int *labels,
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
     
     // stay in bounds
-    if ((x < w) && (y < h)) {
+    if (((x < w) && (y < h)) && (old_label < w*h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
-        new_label = old_label;
+
+        // one pixel becomes the minimum of its 4 neighboring 
+        // pixels and itself
+        // get the locations in a similar fashion as P4
+        int left = buffer[ buf_y * buf_w + buf_x - 1];
+        int right = buffer[ buf_y * buf_w + 1];
+        int up = buffer[ (buf_y - 1) * buf_w + buf_x ];
+        int down = buffer[ (buf_y + 1) * buf_w + buf_x ];
+        // find the minimum
+        new_label = min(old_label, min( min( min(up,down) , right) , left));
+        
+
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..13d7c7d5 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -93,7 +93,7 @@ def round_up(global_size, group_size):
                                              width, height,
                                              buf_size[0], buf_size[1],
                                              halo)
-        prop_exec.wait()
+        prop_exec.wait( )
         elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start)
         total_time += elapsed
         # read back done flag, block until it gets here

From bedeb0ae19bf2644cf8e8cd9b4433ed956235551 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 19:40:51 -0500
Subject: [PATCH 07/13] done with part 2, working on part4

---
 HW3/P5/label_regions.cl | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 7905c490..74216a79 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -80,7 +80,39 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-    
+
+
+    // CODE FOR PART 2
+
+    if (old_label < w*h) {
+        buffer[ buf_y * buf_w + buf_x ] = labels[old_label];
+    }
+
+
+    // CODE FOR PART 4
+
+    // when we have the first thread
+    //if ((lx == 0) && (ly == 0)) {
+//
+        //// loop over rows and columns
+        //for (int x_i = halo; x_i < buf_h - halo; x_i++) {
+            //for (int y_i = halo; y_i < buf_w - halo; y_i++) {
+//
+                //// obtain grand parent
+                //if (old_label < w*h) {
+                    //if (this_label != last_label) {
+                        //buffer[x_i + buf_w * y_i] = labels[buffer[x_i + buf_w * y_i]];
+                        //last_label = this_label;
+                    //}
+                    //else {
+                        //buffer[x_i + buf_w * y_i] = buffer[last_index];
+                    //}
+                    //last_index = x_i + buf_w * y_i;
+                //}
+            //}
+        //}
+    //}
+       
     // stay in bounds
     if (((x < w) && (y < h)) && (old_label < w*h)) {
         // CODE FOR PART 1 HERE

From f4c252222aeaca558c86cfe0c00a9c694586ab53 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 19:46:14 -0500
Subject: [PATCH 08/13] finished part 3

---
 HW3/P5/label_regions.cl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 74216a79..079e1888 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -132,11 +132,13 @@ propagate_labels(__global __read_write int *labels,
 
 
         if (new_label != old_label) {
+            atomic_min(&labels[old_label],new_label);
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            //labels[y * w + x] = new_label;
+            atomic_min(&labels[x + y * w], new_label);
         }
     }
 }

From e5c1709b281ca1c06767b561751ad45991c0c3fa Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 21:02:54 -0500
Subject: [PATCH 09/13] got part 4 working

---
 HW3/P5/label_regions.cl | 53 ++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 079e1888..5cb633a2 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -84,34 +84,39 @@ propagate_labels(__global __read_write int *labels,
 
     // CODE FOR PART 2
 
-    if (old_label < w*h) {
-        buffer[ buf_y * buf_w + buf_x ] = labels[old_label];
-    }
+    // if (old_label < w*h) {
+        // buffer[ buf_y * buf_w + buf_x ] = labels[old_label];
+    // }
 
 
     // CODE FOR PART 4
 
+    barrier(CLK_LOCAL_MEM_FENCE);
     // when we have the first thread
-    //if ((lx == 0) && (ly == 0)) {
-//
-        //// loop over rows and columns
-        //for (int x_i = halo; x_i < buf_h - halo; x_i++) {
-            //for (int y_i = halo; y_i < buf_w - halo; y_i++) {
-//
-                //// obtain grand parent
-                //if (old_label < w*h) {
-                    //if (this_label != last_label) {
-                        //buffer[x_i + buf_w * y_i] = labels[buffer[x_i + buf_w * y_i]];
-                        //last_label = this_label;
-                    //}
-                    //else {
-                        //buffer[x_i + buf_w * y_i] = buffer[last_index];
-                    //}
-                    //last_index = x_i + buf_w * y_i;
-                //}
-            //}
-        //}
-    //}
+    if ((lx == 0) && (ly == 0)) {
+
+        // initialize variables to use
+        int last_label;
+        int my_label;
+
+        // loop over rows and columns of the buffer
+        for (int x_i = halo; x_i < buf_h - halo; x_i++) {
+            for (int y_i = halo; y_i < buf_w - halo; y_i++) {
+
+                my_label = buffer[x_i + buf_w * y_i];
+                // obtain grand parent
+                if (old_label < w*h) {
+                    // avoid having the same value as the previous one
+                    if (my_label != last_label) {
+                        // update the buffer
+                        buffer[x_i + buf_w * y_i] = labels[my_label];
+                        // update the last label
+                        last_label = my_label;
+                    }
+                }
+            }
+        }
+    }
        
     // stay in bounds
     if (((x < w) && (y < h)) && (old_label < w*h)) {
@@ -138,7 +143,7 @@ propagate_labels(__global __read_write int *labels,
             // multiple threads might write this.
             *(changed_flag) += 1;
             //labels[y * w + x] = new_label;
-            atomic_min(&labels[x + y * w], new_label);
+            atomic_min(&labels[y * w + x], new_label);
         }
     }
 }

From 8ff034e892d422d2ab7d1ad68539f0560f567f67 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 21:17:32 -0500
Subject: [PATCH 10/13] writing P5.txt

---
 HW3/P5/P5.txt | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..caa4de85
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,52 @@
+Results
+
+********
+Part 1
+********
+
+--- Maze 1 ----
+
+Finished after 314 iterations, 153.46664 ms total, 0.488747261146 ms per iteration
+Found 77 regions
+
+--- Maze 2 ---
+
+Finished after 243 iterations, 106.70872 ms total, 0.439130534979 ms per iteration
+Found 113 regions
+
+********
+Part 2
+********
+
+--- Maze 1 ---
+
+
+Finished after 132 iterations, 65.11392 ms total, 0.493287272727 ms per iteration
+Found 77 regions
+
+--- Maze 2 ---
+
+Finished after 114 iterations, 55.83568 ms total, 0.489786666667 ms per iteration
+Found 113 regions
+
+********
+Part 3
+********
+
+--- Maze 1 ---
+
+Finished after 11 iterations, 5.42896 ms total, 0.493541818182 ms per iteration
+Found 60 regions
+
+
+--- Maze 2 ---
+
+Finished after 11 iterations, 5.3596 ms total, 0.487236363636 ms per iteration
+Found 106 regions
+
+********
+Part 4
+********
+
+--- Maze 1 ---
+

From 1a2cb7f65d5416973d3bd1139f5258ba503b1b1f Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 21:30:29 -0500
Subject: [PATCH 11/13] debugging part 4...

---
 HW3/P5/label_regions.cl | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 5cb633a2..95a4bd69 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -97,23 +97,27 @@ propagate_labels(__global __read_write int *labels,
 
         // initialize variables to use
         int last_label;
-        int my_label;
+        int my_label_new;
+        int my_last_idx;
 
         // loop over rows and columns of the buffer
-        for (int x_i = halo; x_i < buf_h - halo; x_i++) {
-            for (int y_i = halo; y_i < buf_w - halo; y_i++) {
+        for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) {
+            for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) {
 
-                my_label = buffer[x_i + buf_w * y_i];
+                my_label_new = buffer[x_i + buf_w * y_i];
                 // obtain grand parent
                 if (old_label < w*h) {
                     // avoid having the same value as the previous one
-                    if (my_label != last_label) {
+                    if (my_label_new != last_label) {
                         // update the buffer
-                        buffer[x_i + buf_w * y_i] = labels[my_label];
+                        buffer[x_i + buf_w * y_i] = labels[my_label_new];
                         // update the last label
-                        last_label = my_label;
+                        last_label = my_label_new;
+                    } else {
+                        buffer[x_i + buf_w * y_i ] = labels[buffer[my_last_idx]];
                     }
                 }
+                my_last_idx = x_i + buf_w * y_i ;
             }
         }
     }

From 60f5082be6e27743b8454dde38f5cc901bdd79d5 Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 22:40:23 -0500
Subject: [PATCH 12/13] finished part 4, and text for part 4. now working on
 text for part 5

---
 HW3/P5/P5.txt           | 11 +++++++++++
 HW3/P5/label_regions.cl | 25 ++++++++++++-------------
 HW3/P5/label_regions.py |  2 +-
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
index caa4de85..c9f00249 100644
--- a/HW3/P5/P5.txt
+++ b/HW3/P5/P5.txt
@@ -50,3 +50,14 @@ Part 4
 
 --- Maze 1 ---
 
+Finished after 71 iterations, 96.0672 ms total, 1.35305915493 ms per iteration
+Found 66 regions
+
+--- Maze 2 ---
+
+Finished after 49 iterations, 66.29168 ms total, 1.35289142857 ms per iteration
+Found 106 regions
+
+Looks like using only one thread did not improve anything. Worse, I now have more iterations. I don't have time to find out what happened.
+While we have decreased the fetches into the global memory by serializing, the run time is bigger.
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 95a4bd69..e37e5bba 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -91,33 +91,32 @@ propagate_labels(__global __read_write int *labels,
 
     // CODE FOR PART 4
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+
     // when we have the first thread
     if ((lx == 0) && (ly == 0)) {
 
         // initialize variables to use
-        int last_label;
+        int last_label = -1 ;
         int my_label_new;
-        int my_last_idx;
+
+        // get grandparent
+        if (old_label < w*h){
+            last_label = labels[old_label];
+        }
 
         // loop over rows and columns of the buffer
         for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) {
             for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) {
 
-                my_label_new = buffer[x_i + buf_w * y_i];
-                // obtain grand parent
-                if (old_label < w*h) {
+                my_label_new = buffer[(ly+x_i)*buf_w+(lx+y_i)];
+                
+                if (buffer[(ly+x_i)*buf_w+(lx+y_i)] < w*h) {
                     // avoid having the same value as the previous one
                     if (my_label_new != last_label) {
                         // update the buffer
-                        buffer[x_i + buf_w * y_i] = labels[my_label_new];
-                        // update the last label
-                        last_label = my_label_new;
-                    } else {
-                        buffer[x_i + buf_w * y_i ] = labels[buffer[my_last_idx]];
-                    }
+                        buffer[(ly+x_i)*buf_w+(lx+y_i)] = labels[my_label_new];
+                    } 
                 }
-                my_last_idx = x_i + buf_w * y_i ;
             }
         }
     }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index 13d7c7d5..5ffa14f2 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
 

From f53b2184167c2d80f23bae519e9572674b0a9fad Mon Sep 17 00:00:00 2001
From: tperol <tperol@seas.harvard.edu>
Date: Fri, 20 Nov 2015 23:25:22 -0500
Subject: [PATCH 13/13] finished HW3

---
 HW3/P5/P5.txt           | 24 ++++++++++++++++++++++++
 HW3/P5/label_regions.cl |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
index c9f00249..59554130 100644
--- a/HW3/P5/P5.txt
+++ b/HW3/P5/P5.txt
@@ -61,3 +61,27 @@ Found 106 regions
 Looks like using only one thread did not improve anything. Worse, I now have more iterations. I don't have time to find out what happened.
 While we have decreased the fetches into the global memory by serializing, the run time is bigger.
 
+
+********
+Part 5
+********
+
+Changing atomic_min() to a simple min() ?
+
+For now we have :
+
+atomic_min(&labels[old_label],new_label);
+atomic_min(&labels[y * w + x], new_label);
+
+and we are asking ourselves what would happen with
+
+min(labels[old_label], new_label);
+min(labels[y * w + x], new_label); 
+
+The atomic operation ensures that only one thread at a time updates labels[old_label] and labels[y * w + x] with the smallest label. If multiple threads do this at the same time without atomics, there is no longer any guarantee that the smallest label value is the one that ends up stored.
+
+However, each iteration will be faster, because the updates are no longer serialized and can be performed by multiple threads in parallel.
+
+Also, we may expect more iterations, because the minimum value stored at the end of one iteration may not be the minimum value across all the threads.
+
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index e37e5bba..e14031fc 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -140,7 +140,7 @@ propagate_labels(__global __read_write int *labels,
 
 
         if (new_label != old_label) {
-            atomic_min(&labels[old_label],new_label);
+            atomic_min(&labels[old_label], new_label);
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.