From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@gmail.com>
Date: Mon, 2 Nov 2015 21:49:00 -0500
Subject: [PATCH 1/8] remove extraneous +4

---
 HW3/P3/tune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                   format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)

From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@gmail.com>
Date: Wed, 4 Nov 2015 21:46:08 -0500
Subject: [PATCH 2/8] typo in typecast

---
 HW3/P3/sum.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..ee914740 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1

From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@seas.harvard.edu>
Date: Thu, 5 Nov 2015 09:43:23 -0500
Subject: [PATCH 3/8] unused module, set include path

---
 HW3/P4/median_filter.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)

From 36f5888ce3cd0dfbbfd099a82ad0e9ba864fd1d9 Mon Sep 17 00:00:00 2001
From: vinayps <vinayps89@gmail.com>
Date: Fri, 20 Nov 2015 03:08:33 -0500
Subject: [PATCH 4/8] solutions for P3

---
 HW3/P3/P3.txt  | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++
 HW3/P3/sum.cl  | 35 ++++++++++++++-----
 HW3/P3/tune.py |  2 +-
 3 files changed, 121 insertions(+), 10 deletions(-)
 create mode 100644 HW3/P3/P3.txt

diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..21c79b86
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,94 @@
+# Solutions for HW3 - P3
+
+Hardware - 
+#1: Intel(R) Iris(TM) Graphics 6100 on Apple
+
+Best Configuration - configuration ('coalesced', 512, 64): 0.00286168 seconds (majority winner based on a few runs)
+
+Raw Output:
+
+coalesced reads, workgroups: 8, num_workers: 4, 0.19061496 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.09478136 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.05535224 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.03110032 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.02122016 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.00785568 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.09534232 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.04909168 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.02945256 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.02130744 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00840584 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.00625968 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.05298352 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.02782392 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.0139668 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.011858 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.00589408 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00364424 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.02987728 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.01375704 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.0074668 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00482704 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.00407016 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.00323848 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.03022608 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.0170572 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.01060368 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.00647328 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00367552 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.00308912 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.025532 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.00918576 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.00548232 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00522008 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.00321048 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.00406888 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.02146176 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.01369368 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.0064036 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00533056 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00286168 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.0032056 seconds
+blocked reads, workgroups: 8, num_workers: 4, 0.15275448 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.08203064 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.05936312 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.03968904 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.01966736 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.01259184 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.08024176 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.04805832 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.03266928 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.0196948 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.01304792 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.00887808 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.0454124 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.02621024 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.01601688 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.0123308 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.00911488 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.00661224 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.0253416 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.01482472 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.01003168 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.00830936 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.00668752 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.01031256 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.02511456 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.01601144 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01162648 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.00754224 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.00873912 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.01093456 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.02031408 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.0130572 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.00682696 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.00649288 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.00751592 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.0089632 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.01979328 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.0130584 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.00902016 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.00585392 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.00664848 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.00892288 seconds
+
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..e99a0640 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    uint gs = get_local_size(0);
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
+    for (uint c = global_id; c < N; c = c + get_global_size(0)) { 
+              if( global_id < N ){
+              sum += x[c];
+              }
     }
 
     fast[local_id] = sum;
@@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id < s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +45,10 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
     int k = ceil((float)N / get_global_size(0));
+    uint gs = get_local_size(0);
+
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,12 +58,15 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint c = k*global_id; c < k*global_id + k; c = c + 1) { 
+        if( c < N ){
+            sum += x[c];
+        }
     }
-
+    
     fast[local_id] = sum;
     barrier(CLK_LOCAL_MEM_FENCE);
+    
 
     // binary reduction
     //
@@ -64,9 +77,13 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id < s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
+    
 }
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index a0d56da2..4880c78d 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -54,7 +54,7 @@ def create_data(N):
             assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
             times['blocked', num_workgroups, num_workers] = seconds
             print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
-                  format(num_workgroups, num_workers, seconds))
+                  format(num_workgroups, num_workers, seconds)) 
 
     best_time = min(times.values())
     best_configuration = [config for config in times if times[config] == best_time]

From f80a03c4eb573fdb6bb3bc5305031c4299bc8006 Mon Sep 17 00:00:00 2001
From: vinayps <vinayps89@gmail.com>
Date: Fri, 20 Nov 2015 03:10:17 -0500
Subject: [PATCH 5/8] solutions for warm-up P2

---
 HW3/P2/mandelbrot.cl | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..76276261 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real,
     const int y = get_global_id(1);
 
     float c_real, c_imag;
-    float z_real, z_imag;
+    float z_real, z_imag, z_real_temp;
     int iter;
 
+    c_real = coords_real[x + y * w];
+    c_imag = coords_imag[x + y * w];
+    z_real = 0;
+    z_imag = 0;
+
+
     if ((x < w) && (y < h)) {
-        // YOUR CODE HERE
-        ;
+         // YOUR CODE HERE
+        for(iter = 0; iter < max_iter; iter++)
+        {
+            if((z_real * z_real + z_imag * z_imag) > 4)
+                break;
+            z_real_temp = ((z_real * z_real) - (z_imag * z_imag)) + c_real;
+            z_imag = (2 * z_real * z_imag) + c_imag;
+            z_real = z_real_temp;
+        }        
+        out_counts[x + y * w] = iter;
     }
-}
+}    
\ No newline at end of file

From 43f1ddadec4a6f2e94785035929100a1fc863fa5 Mon Sep 17 00:00:00 2001
From: vinayps <vinayps89@gmail.com>
Date: Fri, 20 Nov 2015 16:50:19 -0500
Subject: [PATCH 6/8] solutions for P4

---
 HW3/P4/median_filter.cl | 57 ++++++++++++++++++++++++++++++++++++++++-
 HW3/P4/median_filter.py |  4 +++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..a5c20737 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,19 @@
 #include "median9.h"
 
+// clamped pixel fetch
+float FETCH(__global __read_only float *in_values, int w, int h, int x, int y) 
+{
+    if(x < 0)
+        x = 0;
+    if(x >= w) 
+        x = w - 1; 
+    if(y < 0)
+        y = 0;
+    if(y >= h) 
+        y = h - 1; 
+    return in_values[y * w + x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -12,7 +26,26 @@ median_3x3(__global __read_only float *in_values,
     // Note: It may be easier for you to implement median filtering
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have that working.
+    
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
 
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
 
     // Load into buffer (with 1-pixel halo).
     //
@@ -21,14 +54,36 @@ median_3x3(__global __read_only float *in_values,
     //
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
+    
+    int row;
 
+    if (idx_1D < buf_w)
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = \
+                FETCH(in_values, w, h,
+                      buf_corner_x + idx_1D,
+                      buf_corner_y + row);
+        }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
+       
+    if ((y < h) && (x < w)) // stay in bounds
+    {
+        buffer[buf_y * buf_w + buf_x] = median9( buffer[(buf_y - 1) * buf_w + buf_x - 1], buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y - 1) * buf_w + buf_x + 1],
+                                                 buffer[buf_y * buf_w + buf_x - 1], buffer[buf_y * buf_w + buf_x], buffer[buf_y * buf_w + buf_x + 1],       
+                                                 buffer[(buf_y + 1) * buf_w + buf_x - 1], buffer[(buf_y + 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + buf_x + 1] );
+    }
 
-
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+    if ((y < h) && (x < w)) // stay in bounds
+        out_values[y * w + x] = \
+            buffer[buf_y * buf_w + buf_x];
 }
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index a181c05a..bc713086 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -89,3 +89,7 @@ def numpy_median(image, iterations=10):
     cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True)
 
     assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters))
+    
+    pylab.imshow(host_image_filtered)
+    
+    pylab.show()

From 43a404fcbc786e711d79f02959ab3ff47e072e33 Mon Sep 17 00:00:00 2001
From: vinayps <vinayps89@gmail.com>
Date: Fri, 20 Nov 2015 23:21:05 -0500
Subject: [PATCH 7/8] solutions for problem 5

---
 HW3/P5/P5.txt           | 64 +++++++++++++++++++++++++++++++++++++++++
 HW3/P5/label_regions.cl | 41 ++++++++++++++++++++++++--
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..9cb52897
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,64 @@
+#solutions for HW3 - P5
+
+Devices used - 
+The platforms detected are:
+---------------------------
+Apple Apple version: OpenCL 1.2 (Sep 21 2015 19:24:11)
+The devices detected on platform Apple are:
+---------------------------
+Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz [Type: CPU ]
+Maximum clock Frequency: 2700 MHz
+Maximum allocable memory size: 4294 MB
+Maximum work group size 1024
+---------------------------
+Intel(R) Iris(TM) Graphics 6100 [Type: GPU ]
+Maximum clock Frequency: 1050 MHz
+Maximum allocable memory size: 402 MB
+Maximum work group size 256
+---------------------------
+This context is associated with  2 devices
+The queue is using the device: Intel(R) Iris(TM) Graphics 6100
+
+Results - 
+
+1) Maze 1 - 
+Finished after 912 iterations, 190.44896 ms total, 0.208825614035 ms per iteration
+Found 2 regions
+Maze 2 - 
+Finished after 532 iterations, 109.73248 ms total, 0.20626406015 ms per iteration
+Found 35 regions
+
+2) Maze 1 - 
+Finished after 529 iterations, 106.09848 ms total, 0.200564234405 ms per iteration
+Found 2 regions
+Maze 2 - 
+Finished after 273 iterations, 54.7588 ms total, 0.200581684982 ms per iteration
+Found 35 regions
+
+3) Maze 1 - 
+Finished after 11 iterations, 3.24648 ms total, 0.295134545455 ms per iteration
+Found 2 regions
+Maze 2 - 
+Finished after 9 iterations, 2.61312 ms total, 0.290346666667 ms per iteration
+Found 35 regions
+
+4) Maze 1 - 
+Finished after 10 iterations, 6.9416 ms total, 0.69416 ms per iteration
+Found 2 regions
+Maze 2 - 
+Finished after 9 iterations, 6.24304 ms total, 0.693671111111 ms per iteration
+Found 35 regions
+
+Explanation - This task is memory-bound and so avoiding duplicate calls to global memory (all of which are serialized) will help improve runtime. When using multiple threads to perform grandparent fetching from global memory, chances are that we are making such duplicate calls and so theoretically, forcing just one single thread per work-group to perform this task for the entire work-group could potentially help. However as can be seen in the runtimes above, for this problem and the GPU used here, the performance actually degrades. 
+I would hypothesize that this may have to do with the nature of our problem itself. Given that each workgroup only has 64 threads and that only a fraction of these (only the foreground pixels) need to make calls to global memory, there isnt much to be gained from explicitly serializing on one thread. In fact, we might be losing time due to the required administrataive overhead.
+
+5) Using min - Maze 1 - 
+Finished after 10 iterations, 6.92656 ms total, 0.692656 ms per iteration
+Found 2 regions
+Using min - Maze 2 - 
+Finished after 9 iterations, 6.20176 ms total, 0.689084444444 ms per iteration
+Found 35 regions
+
+As can be seen from the results above, using "min" instead of "atomic_min" speeds things up just a tiny bit after averaging over a few runs while still producing correct results. 
+The values in labels can increase due to race conditions (including between iterations), however, this will self-correct over time as the algorithm converges towards minimization of all connected foreground pixels. Thus, this is the reason for the final correct results.
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..79a419c4 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -17,6 +17,8 @@ initialize_labels(__global __read_only int *image,
     }
 }
 
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
 int
 get_clamped_value(__global __read_only int *labels,
                   int w, int h,
@@ -60,6 +62,7 @@ propagate_labels(__global __read_write int *labels,
     int old_label;
     // Will store the output value
     int new_label;
+    int last_read;
     
     // Load the relevant labels to a local buffer with a halo 
     if (idx_1D < buf_w) {
@@ -80,20 +83,54 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+    if( lx == 4 && ly == 4 ) // let (4,4) be the one thread that fetches grandparents
+    {
+        // force thread (4,4) to fetch grandparents for every work-group pixel that requires it 
+        // (excluding background and repeat immediate fetches)
+        for( int y_index = 0; y_index < get_local_size(0); y_index++ )
+            for( int x_index = 0; x_index < get_local_size(0); x_index++ )
+                if( buffer[(y_index + halo) * buf_w + halo + x_index] != w*h )
+                    if( buffer[(y_index + halo) * buf_w + halo + x_index] != last_read )
+                    {    
+                        buffer[(y_index + halo) * buf_w + halo + x_index] = labels[buffer[(y_index + halo) * buf_w + 
+                                                                                           halo + x_index]];
+                        last_read = buffer[(y_index + halo) * buf_w + halo + x_index];
+                    }
+    }             
+    // Code for PART 2
+    //if( buffer[buf_y * buf_w + buf_x] != w*h )
+    //        buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
     
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
-        new_label = old_label;
+        if( buffer[buf_y * buf_w + buf_x] != w*h )
+            new_label = MIN( buffer[buf_y * buf_w + buf_x], 
+                             MIN( MIN( buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + buf_x] ), 
+                                  MIN( buffer[buf_y * buf_w + buf_x + 1], buffer[buf_y * buf_w + buf_x - 1] ) ) );
+        else 
+            new_label = old_label;
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
+
+            // Code to test PART 5
+            //int min_value;
+            //min_value = min( labels[old_label], new_label );
+            //labels[old_label] = min_value;
+            atomic_min( &labels[old_label], new_label );
+
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            // Code to test PART 5
+            //min_value = min( labels[old_label], new_label );
+            //labels[y * w + x] = min_value;
+            atomic_min( &labels[y * w + x], new_label );
         }
     }
 }

From c5f7f9505964a07f029ae955615aa5152cffec64 Mon Sep 17 00:00:00 2001
From: vinayps <vinayps89@gmail.com>
Date: Fri, 20 Nov 2015 23:25:07 -0500
Subject: [PATCH 8/8] update to solutions for P5

---
 HW3/P5/label_regions.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 79a419c4..23450dac 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -120,14 +120,14 @@ propagate_labels(__global __read_write int *labels,
             // indicate there was a change this iteration.
             // multiple threads might write this.
 
-            // Code to test PART 5
+            // Commented code to test PART 5
             //int min_value;
             //min_value = min( labels[old_label], new_label );
             //labels[old_label] = min_value;
             atomic_min( &labels[old_label], new_label );
 
             *(changed_flag) += 1;
-            // Code to test PART 5
+            // Commented code to test PART 5
             //min_value = min( labels[old_label], new_label );
             //labels[y * w + x] = min_value;
             atomic_min( &labels[y * w + x], new_label );