From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 01/15] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 02/15] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: 
[PATCH 03/15] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From 3f397aab7cd0f2f29041baa18c9bcdadab578e74 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:23:39 -0500 Subject: [PATCH 04/15] commit --- HW3/P5/label_regions.cl | 57 +++++++++++++++++++++++++++++++++++++---- HW3/P5/label_regions.py | 20 +++++++-------- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..9f5987fe 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -57,13 +57,14 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - int old_label; + int old_label, old_label0; // Will store the output value int new_label; + int minT, newMin, row, col, prev; // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { - for (int row = 0; row < buf_h; row++) { + for (row = 0; row < buf_h; row++) { buffer[row * buf_w + idx_1D] = 
get_clamped_value(labels, w, h, @@ -77,16 +78,61 @@ propagate_labels(__global __read_write int *labels, // Fetch the value from the buffer the corresponds to // the pixel for this thread - old_label = buffer[buf_y * buf_w + buf_x]; + /* + old_label = buffer[buf_y * buf_w + buf_x]; + if (old_label < w*h){ + buffer[buf_y * buf_w + buf_x] = labels[old_label]; + } + barrier(CLK_LOCAL_MEM_FENCE); + */ + + old_label = buffer[buf_y * buf_w + buf_x]; + if ((buf_y*buf_w + buf_x) == 0){ + old_label0 = buffer[buf_y * buf_w + buf_x]; + if (old_label0 < w*h){ + prev = labels[old_label0]; + } + for (row = halo; row < buf_h-halo; row++){ + for (col = halo; col < buf_w-halo; col++){ + if (buffer[row * buf_w + col] < w*h){ + if (buffer[row * buf_w + col] == old_label0){ + buffer[row * buf_w + col] = prev; + } + else{ + old_label0 = buffer[row * buf_w + col]; + buffer[row * buf_w + col] = labels[old_label0]; + prev = buffer[row * buf_w + col]; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) // stay in bounds - if ((x < w) && (y < h)) { + if ((x < w) && (y < h) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + minT = buffer[buf_y * buf_w + buf_x]; + + if (buffer[(buf_y-1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y-1)*buf_w + buf_x]; + } + if (buffer[(buf_y+1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y+1)*buf_w + buf_x]; + } + if (buffer[(buf_y)*buf_w + buf_x - 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x - 1]; + } + if (buffer[(buf_y)*buf_w + buf_x + 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x + 1]; + } + + new_label = minT; if (new_label != old_label) { // CODE FOR PART 3 HERE @@ -94,6 +140,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. 
*(changed_flag) += 1; labels[y * w + x] = new_label; + atomic_min(&(labels[old_label]),new_label); } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..5b1508a5 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -2,7 +2,7 @@ import sys import pyopencl as cl import numpy as np -import pylab +#import pylab def round_up(global_size, group_size): r = global_size % group_size @@ -75,9 +75,9 @@ def round_up(global_size, group_size): # Show the initial labels cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() show_progress = True total_time = 0 @@ -105,9 +105,9 @@ def round_up(global_size, group_size): print host_done_flag if itercount % 100 == 0 and show_progress: cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() if itercount % 10000 == 0: print 'Reached maximal number of iterations, aborting' sys.exit(0) @@ -116,6 +116,6 @@ def round_up(global_size, group_size): # Show final result cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) print 'Found {} regions'.format(len(np.unique(host_labels)) - 1) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() From 2d25b7a5ee7371d2437dec74e2537ee9a0febc70 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:26:12 -0500 Subject: [PATCH 05/15] p4 --- HW3/P4/median_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index a181c05a..118fe6bd 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -60,7 +60,7 @@ def numpy_median(image, iterations=10): gpu_image_a = cl.Buffer(context, 
cl.mem_flags.READ_WRITE, host_image.size * 4) gpu_image_b = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) - local_size = (8, 8) # 64 pixels per work group + local_size = (4, 4) # 64 pixels per work group global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)]) width = np.int32(host_image.shape[1]) height = np.int32(host_image.shape[0]) From 0d4dbeb2be71382ad069da7594995f4790d6836d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:26:34 -0500 Subject: [PATCH 06/15] p4 --- HW3/P4/median_filter.cl | 85 ++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..5a2ef563 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,6 @@ #include "median9.h" + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -12,23 +13,81 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + int gID, lID, x, y, lx, ly, gSizeX, gSizeY, + lSizeX, lSizeY, xTemp, yTemp, xUse, yUse, + buf_corner_x, buf_corner_y, buf_x, buf_y, row; + x = get_global_id(0); + y = get_global_id(1); + lx = get_local_id(0); + ly = get_local_id(1); + gSizeX = get_global_size(0); + gSizeY = get_global_size(1); + lSizeX = get_local_size(0); + lSizeY = get_local_size(1); + + + gID = gSizeX*y + x; + lID = lSizeX*ly + lx; - // Load into buffer (with 1-pixel halo). - // - // It may be helpful to consult HW3 Problem 5, and - // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl - // - // Note that globally out-of-bounds pixels should be replaced - // with the nearest valid pixel's value. 
+ buf_corner_x = x - lx - halo; + buf_corner_y = y - ly - halo; + buf_x = lx + halo; + buf_y = ly + halo; - // Compute 3x3 median for each pixel in core (non-halo) pixels - // - // We've given you median9.h, and included it above, so you can - // use the median9() function. + if ((y < h) && (x < w)){ + if (lID < buf_w){ + xTemp = buf_corner_x + lID; + xUse = xTemp; + if (xTemp < 0){ + xUse += 1; + } + if (xTemp > w - 1){ + xUse -= 1; + } + for (row = 0; row < buf_h; row++) { + yTemp = buf_corner_y + row; + yUse = yTemp; + if (yTemp < 0){ + yUse += 1; + } + if (yTemp > h - 1){ + yUse -= 1; + } + buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if ((y < h) && (x < w)){ + out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], + buffer[(buf_y-1)*buf_w + (buf_x)], + buffer[(buf_y-1)*buf_w + (buf_x+1)], + buffer[(buf_y)*buf_w + (buf_x-1)], + buffer[(buf_y)*buf_w + (buf_x)], + buffer[(buf_y)*buf_w + (buf_x+1)], + buffer[(buf_y+1)*buf_w + (buf_x-1)], + buffer[(buf_y+1)*buf_w + (buf_x)], + buffer[(buf_y+1)*buf_w + (buf_x+1)]); + } - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. + // Load into buffer (with 1-pixel halo). + // + // It may be helpful to consult HW3 Problem 5, and + // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl + // + // Note that globally out-of-bounds pixels should be replaced + // with the nearest valid pixel's value. + + + // Compute 3x3 median for each pixel in core (non-halo) pixels + // + // We've given you median9.h, and included it above, so you can + // use the median9() function. + + + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. 
} From 22082ad3876c955c04852ff20811fa6710f73d2b Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:27:06 -0500 Subject: [PATCH 07/15] p2 --- HW3/P2/mandelbrot.cl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..f96965d2 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_real_new, z_imag_new; + float mag2; int iter; if ((x < w) && (y < h)) { - // YOUR CODE HERE - ; + iter = 1; + c_real = coords_real[y*w + x]; + c_imag = coords_imag[y*w + x]; + z_real = c_real; + z_imag = c_imag; + mag2 = z_real*z_real + z_imag*z_imag; + while ((mag2 < 4) && (iter < max_iter)){ + z_real_new = z_real*z_real - z_imag*z_imag + c_real; + z_imag_new = 2*z_real*z_imag + c_imag; + mag2 = z_real_new*z_real_new + z_imag_new*z_imag_new; + z_real = z_real_new; + z_imag = z_imag_new; + iter = iter + 1; + } + out_counts[y*w + x] = iter; } } From 62ffe6dd321fa009f24babab86dc3374a0bc03bc Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:27:29 -0500 Subject: [PATCH 08/15] p3 --- HW3/P3/sum.cl | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..0c5a8493 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,16 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - + int i, j, gID, gSize, temp, lSize, loglSize; + + gID = get_global_id(0); + gSize = get_global_size(0); + lSize = get_local_size(0); + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (i = gID; i < N; i += gSize) { + sum = sum + x[i]; } fast[local_id] = sum; @@ -24,8 +29,17 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -39,6 +53,7 @@ __kernel void sum_blocked(__global float* x, float sum = 0; size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int j, gID, temp, loglSize, lSize, minS; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +63,16 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + lSize = get_local_size(0); + gID = get_global_id(0); + if (k-1 < N - k*gID){ + minS = k; + } + else{ + minS = N - k*gID; + } + for (j = 0; j < minS; j++) { + sum = sum + x[k*gID + j]; } fast[local_id] = sum; @@ -64,8 +87,17 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; From 3b2416fd7b5b7a823a1a9cb4b0661f20e1be2779 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 20:28:02 -0500 Subject: [PATCH 09/15] p3 --- HW3/P3/P3.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 HW3/P3/P3.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..a0c27375 --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,4 @@ + + +configuration ('coalesced', 512, 128): 0.000331392 seconds + From 4cad4c3218f800ed5c2d211d0b7ecf4e237b2495 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 21:00:37 -0500 Subject: [PATCH 10/15] p3 --- HW3/P3/P3.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt index a0c27375..426a59ad 100644 --- a/HW3/P3/P3.txt +++ b/HW3/P3/P3.txt @@ -1,4 +1,13 @@ +The best configuration for my machine is: configuration ('coalesced', 512, 128): 0.000331392 seconds +The coalesced read is faster than the blocked read on average +for the same number of work groups and workers because more +threads can do work on the same block of fetched memory. In the +blocked reads, once a thread fetches its block to sum, more +threads may have to wait to fetch their block of memory. However in the +coalesced reads, more threads can sum elements simultaneously +more often since a fetched block of memory will be more likely to +contain elements needed by more threads than in the blocked scheme. 
From 3912be1d30e1bda807814d3e7d1563b6a9bd412f Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 21:04:52 -0500 Subject: [PATCH 11/15] p4 --- HW3/P4/median_filter.cl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 5a2ef563..7e134fd1 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -16,7 +16,7 @@ median_3x3(__global __read_only float *in_values, int gID, lID, x, y, lx, ly, gSizeX, gSizeY, lSizeX, lSizeY, xTemp, yTemp, xUse, yUse, buf_corner_x, buf_corner_y, buf_x, buf_y, row; - + // the code below is adapted from the lecture code on halos x = get_global_id(0); y = get_global_id(1); lx = get_local_id(0); @@ -37,12 +37,12 @@ median_3x3(__global __read_only float *in_values, buf_y = ly + halo; if ((y < h) && (x < w)){ - if (lID < buf_w){ + if (lID < buf_w){ // only work with buf_w threads xTemp = buf_corner_x + lID; xUse = xTemp; - if (xTemp < 0){ + if (xTemp < 0){ // if pixel out of bounds, add compensation steps to find closest in bound pixel xUse += 1; - } + } if (xTemp > w - 1){ xUse -= 1; } @@ -55,14 +55,14 @@ median_3x3(__global __read_only float *in_values, if (yTemp > h - 1){ yUse -= 1; } - buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; + buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; // assign global memory of pixel or closest in bound pixel to buffer } } } barrier(CLK_LOCAL_MEM_FENCE); if ((y < h) && (x < w)){ - out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], + out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], // take median of 8 neighbors and current pixel buffer[(buf_y-1)*buf_w + (buf_x)], buffer[(buf_y-1)*buf_w + (buf_x+1)], buffer[(buf_y)*buf_w + (buf_x-1)], From 6e186772a522a750e38df10d0fe8570ef5fa5ad0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 22:31:52 -0500 Subject: [PATCH 12/15] p5 --- HW3/P5/label_regions.cl | 17 ++++++++--------- HW3/P5/label_regions.py | 2 +- 2 
files changed, 9 insertions(+), 10 deletions(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 9f5987fe..8fe5c312 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -75,20 +75,19 @@ propagate_labels(__global __read_write int *labels, // Make sure all threads reach the next part after // the local buffer is loaded barrier(CLK_LOCAL_MEM_FENCE); - + // Fetch the value from the buffer the corresponds to // the pixel for this thread - - /* + /* old_label = buffer[buf_y * buf_w + buf_x]; if (old_label < w*h){ - buffer[buf_y * buf_w + buf_x] = labels[old_label]; + buffer[buf_y * buf_w + buf_x] = labels[old_label]; // grandparent update } barrier(CLK_LOCAL_MEM_FENCE); */ - - old_label = buffer[buf_y * buf_w + buf_x]; - if ((buf_y*buf_w + buf_x) == 0){ + + old_label = buffer[buf_y * buf_w + buf_x]; // use the first thread in the upper left corner to update the buffer values with grandparents + if ((buf_y + buf_x) == 2*halo){ // keeps track of last buffer value read to avoid reading from memory (labels array) more than necessary old_label0 = buffer[buf_y * buf_w + buf_x]; if (old_label0 < w*h){ prev = labels[old_label0]; @@ -118,7 +117,7 @@ propagate_labels(__global __read_write int *labels, // We set new_label to the value of old_label, but you will need // to adjust this for correctness. minT = buffer[buf_y * buf_w + buf_x]; - + // Check neighbors to update minimum if (buffer[(buf_y-1)*buf_w + buf_x] < minT){ minT = buffer[(buf_y-1)*buf_w + buf_x]; } @@ -140,7 +139,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. 
*(changed_flag) += 1; labels[y * w + x] = new_label; - atomic_min(&(labels[old_label]),new_label); + atomic_min(&(labels[old_label]),new_label); // Use atomic min to update grandparent value if necessary } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index 5b1508a5..ba3facd2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) From ea73c1d946ed7d9c1ac0e4fda0d32743a5023cfd Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 22:32:16 -0500 Subject: [PATCH 13/15] p5 --- HW3/P5/P5.txt | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..53ff79ef --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,82 @@ + + +Explanation: + +Part 1: + +This is the least optimal scenario. + +Part 2: + +This is optimized over the first part because the buffer values are updated with the +grandparent values, which is guaranteed to be less than or equal to the current +buffer value. + +Part 3: + +This is optimized over the second part because a pixel's parent is updated to the pixel's +value if it is smaller than the pixel's parent using atomic min. However, the iteration +time increases due to the atomic (min) operation. + +Part 4: + +Making 1 thread update the buffer regions with grandparent values is not as efficient on average +given the time per iteration is roughly twice as long as Part 3. Even though lots of adjacent pixels +may have equal buffer values after sufficient iterations, the reduced number of memory reads +does not outweight the loss of parallelism between threads. 
+ +Part 5: + +If a standard min operation were used instead of atomic min, the iteration time would decrease +because the imposed serialized delays from atomic operation will not be applied. The final result +will still be correct because even if a thread overwrites the pixel's parent's value with a greater value +than another thread, the value will still be less than the original parent value. Thus the number of +iterations may increase. As stated, the value in label could increase, but that is during the same iteration. +Between iterations, label values cannot increase because a pixel's previous iteration value is compared +via the minimum operator with a new label. Thus after the current iteration finishes, each label's value +will be less than or equal to that of the previous iteration. + + +Results: + +Maze 1 + +Part1: + +Finished after 915 iterations, 36.084992 ms total, 0.0394371497268 ms per iteration +Found 2 regions + +Part 2: + +Finished after 529 iterations, 20.321376 ms total, 0.0384146994329 ms per iteration +Found 2 regions + +Part 3: + +Finished after 12 iterations, 0.611552 ms total, 0.0509626666667 ms per iteration +Found 2 regions + +Part 4: + +Finished after 11 iterations, 1.224416 ms total, 0.111310545455 ms per iteration +Found 2 regions + +Maze 2 + +Part 1: +Finished after 532 iterations, 20.138752 ms total, 0.0378547969925 ms per iteration +Found 35 regions + +Part 2: +Finished after 276 iterations, 10.62384 ms total, 0.038492173913 ms per iteration +Found 35 regions + +Part 3: +Finished after 11 iterations, 0.539008 ms total, 0.0490007272727 ms per iteration +Found 35 regions + +Part 4: +Finished after 10 iterations, 1.11216 ms total, 0.111216 ms per iteration +Found 35 regions + + From 23d418a0f740f1ee69cdb3377a9ac888eebcaa4a Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 22:34:38 -0500 Subject: [PATCH 14/15] p5 --- HW3/P5/P5.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt 
index 53ff79ef..082e78bc 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -4,7 +4,7 @@ Explanation: Part 1: -This is the least optimal scenario. +This is the base code. Part 2: From 522f842d7c9c6cff9e89a987df9332bd0f7b6817 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 20 Nov 2015 23:04:09 -0500 Subject: [PATCH 15/15] p5 --- HW3/P5/P5.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index 082e78bc..26d9ffca 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -23,7 +23,11 @@ Part 4: Making 1 thread update the buffer regions with grandparent values is not as efficient on average given the time per iteration is roughly twice as long as Part 3. Even though lots of adjacent pixels may have equal buffer values after sufficient iterations, the reduced number of memory reads -does not outweight the loss of parallelism between threads. +does not outweigh the loss of parallelism between threads. If more threads are used, for +example due to smaller context sizes, then even more memory calls to the labels array will occur. +So using one thread to remember previous grandparent values may perform better than having +each thread fetch a value from memory simultaneously (resulting in partial serialization since +more threads will have to wait for memory) as the number of threads gets even larger. Part 5: