From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/8] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/8] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: [PATCH 3/8] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From 5814a34923e7570f9d27286939e52b4441902a66 Mon Sep 17 00:00:00 2001 From: JEEWON HWANG Date: Wed, 18 Nov 2015 23:06:47 -0500 Subject: [PATCH 4/8] done P2 --- HW3/P2/mandelbrot.cl | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..fdd24519 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -11,9 +11,24 @@ mandelbrot(__global __read_only float *coords_real, float c_real, c_imag; float z_real, z_imag; int iter; + float z_real_temp; 
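+    // z_real_temp keeps the previous value of z_real so that the update of
+    // z_imag below uses z_real from before it is overwritten inside the loop.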
if ((x < w) && (y < h)) { // YOUR CODE HERE - ; + c_real = coords_real[y * w + x]; + c_imag = coords_imag[y * w + x]; + z_real = 0; + z_imag = 0; + + iter = 0; + + while(z_real * z_real + z_imag * z_imag < 4 && iter < max_iter) { + z_real_temp = z_real; + z_real = z_real * z_real - z_imag * z_imag + c_real; + z_imag = 2 * z_imag * z_real_temp + c_imag; + iter += 1; + } + + out_counts[y * w + x] = iter; } } From 9f8221f1410d43031e12a99e300f0f7c07824da0 Mon Sep 17 00:00:00 2001 From: JEEWON HWANG Date: Wed, 18 Nov 2015 23:07:33 -0500 Subject: [PATCH 5/8] done until P3 --- HW3/P3/P3.txt | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ HW3/P3/sum.cl | 33 ++++++++++++++----- 2 files changed, 112 insertions(+), 8 deletions(-) create mode 100644 HW3/P3/P3.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..400d14e7 --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,87 @@ +#0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple +#1: Intel(R) Iris(TM) Graphics 6100 on Apple +coalesced reads, workgroups: 8, num_workers: 4, 0.14305576 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.07923752 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.0476232 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.03142232 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01692544 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.0087728 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.08204712 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.04362816 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.02584168 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.01592912 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.0089296 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00482416 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.04397248 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.02506672 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.01428536 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.0090748 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.00475304 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00290496 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02331504 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.01344792 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.00785896 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00498288 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.0027708 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00310968 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.0252644 seconds +coalesced reads, workgroups: 128, num_workers: 8, 0.0132512 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00801888 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.00489016 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00347336 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00338608 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.01951552 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.01132352 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00630696 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.0040716 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.00298152 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.00310384 seconds +coalesced reads, workgroups: 
512, num_workers: 4, 0.02002328 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.0110812 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00702496 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00416688 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.0027932 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.00329216 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.16964528 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.09656184 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.06290712 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.04388848 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.02140192 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.01306816 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.08902432 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.054702 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.03474296 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.02219256 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.01349936 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.00919592 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.04945376 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.03145792 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.01753888 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.01313056 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.00843512 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.00641448 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.02745064 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.016524 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01086632 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.00894552 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.00630192 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.00915584 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.02907888 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.01724728 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01270904 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.00789992 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.00802856 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.00988272 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.02359272 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.01404144 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.00926688 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.00662312 seconds +blocked reads, workgroups: 256, num_workers: 64, 0.0071192 seconds +blocked reads, workgroups: 256, num_workers: 128, 0.00890344 seconds +blocked reads, workgroups: 512, num_workers: 4, 0.02182904 seconds +blocked reads, workgroups: 512, num_workers: 8, 0.01405424 seconds +blocked reads, workgroups: 512, num_workers: 16, 0.01010704 seconds +blocked reads, workgroups: 512, num_workers: 32, 0.00710488 seconds +blocked reads, workgroups: 512, num_workers: 64, 0.00641912 seconds +blocked reads, workgroups: 512, num_workers: 128, 0.00763992 seconds +configuration ('coalesced', 64, 64): 0.0027708 seconds \ No newline at end of file diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..310ebf28 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -4,12 +4,17 @@ __kernel void sum_coalesced(__global float* x, long N) { float sum = 0; + + size_t i = 
get_global_id(0); size_t local_id = get_local_id(0); + + size_t k = get_global_size(0); + size_t ls = get_local_size(0); // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint index = i; index < N; index += k) { // YOUR CODE HERE + sum += x[index]; // YOUR CODE HERE } fast[local_id] = sum; @@ -24,8 +29,12 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint s = ls/2; s > 0; s >>= 1) { // YOUR CODE HERE + // YOUR CODE HERE + if (local_id < s) { + fast[local_id] += fast[local_id + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -37,8 +46,12 @@ __kernel void sum_blocked(__global float* x, long N) { float sum = 0; + + size_t i = get_global_id(0); size_t local_id = get_local_id(0); + int k = ceil((float)N / get_global_size(0)); + int ls = get_local_size(0); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +61,8 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint index = k * i; index < k * (i + 1) && index < N; index++) { // YOUR CODE HERE + sum += x[index]; // YOUR CODE HERE } fast[local_id] = sum; @@ -64,8 +77,12 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint s = ls/2; s > 0; s >>= 1) { // YOUR CODE HERE + // YOUR CODE HERE + if (local_id < s) { + fast[local_id] += fast[local_id + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; From 8718a25bea94bb2accb08e10de230ac386025ff3 Mon Sep 17 00:00:00 2001 From: JEEWON HWANG Date: Wed, 18 Nov 2015 23:07:51 -0500 Subject: [PATCH 6/8] done until P4 --- HW3/P4/median_filter.cl | 72 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..90755934 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,22 @@ #include "median9.h" +// Note that globally out-of-bounds pixels should be replaced +// with the nearest valid pixel's value. 
+static float +fetch(__global __read_only float *in_values, + int w, int h, + int x, int y) +{ + if (x < 0) x = 0; + else if (x >= w) x = w-1; + + if (y < 0) y = 0; + else if (y >= h) y = h-1; + + return in_values[y * w + x]; +} + + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -7,7 +24,7 @@ median_3x3(__global __read_only float *in_values, __local float *buffer, int w, int h, int buf_w, int buf_h, - const int halo) + const int halo) // width of halo on one side { // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to @@ -18,10 +35,39 @@ median_3x3(__global __read_only float *in_values, // // It may be helpful to consult HW3 Problem 5, and // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl - // - // Note that globally out-of-bounds pixels should be replaced - // with the nearest valid pixel's value. + + + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + if (idx_1D < buf_w) + for (row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = \ + fetch(in_values, w, h, + buf_corner_x + idx_1D, + buf_corner_y + row); + } + + barrier(CLK_LOCAL_MEM_FENCE); // Compute 3x3 median for each pixel in core (non-halo) pixels // @@ -31,4 +77,20 @@ median_3x3(__global __read_only float *in_values, // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. -} + + // Processing code here... + // + // Should only use buffer, buf_x, buf_y. 
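+    // The nine buffer reads below form the 3x3 window centered at
+    // (buf_x, buf_y) in the local buffer (rows buf_y-1..buf_y+1, columns
+    // buf_x-1..buf_x+1); median9() from median9.h reduces them to their median.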
+ + // write output + if ((y < h) && (x < w)) // stay in bounds + out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x - 1], + buffer[(buf_y-1) * buf_w + buf_x], + buffer[(buf_y-1) * buf_w + buf_x + 1], + buffer[buf_y * buf_w + buf_x - 1], + buffer[buf_y * buf_w + buf_x], + buffer[buf_y * buf_w + buf_x + 1], + buffer[(buf_y+1) * buf_w + buf_x - 1], + buffer[(buf_y+1) * buf_w + buf_x], + buffer[(buf_y+1) * buf_w + buf_x + 1]); +} \ No newline at end of file From cdb851450ef198e1761e89e6880c31e681e2627f Mon Sep 17 00:00:00 2001 From: JEEWON HWANG Date: Fri, 20 Nov 2015 23:51:14 -0500 Subject: [PATCH 7/8] p3.txt modified --- HW3/P3/P3.txt | 85 +-------------------------------------------------- 1 file changed, 1 insertion(+), 84 deletions(-) diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt index 400d14e7..d3e495e1 100644 --- a/HW3/P3/P3.txt +++ b/HW3/P3/P3.txt @@ -1,87 +1,4 @@ #0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple #1: Intel(R) Iris(TM) Graphics 6100 on Apple -coalesced reads, workgroups: 8, num_workers: 4, 0.14305576 seconds -coalesced reads, workgroups: 8, num_workers: 8, 0.07923752 seconds -coalesced reads, workgroups: 8, num_workers: 16, 0.0476232 seconds -coalesced reads, workgroups: 8, num_workers: 32, 0.03142232 seconds -coalesced reads, workgroups: 8, num_workers: 64, 0.01692544 seconds -coalesced reads, workgroups: 8, num_workers: 128, 0.0087728 seconds -coalesced reads, workgroups: 16, num_workers: 4, 0.08204712 seconds -coalesced reads, workgroups: 16, num_workers: 8, 0.04362816 seconds -coalesced reads, workgroups: 16, num_workers: 16, 0.02584168 seconds -coalesced reads, workgroups: 16, num_workers: 32, 0.01592912 seconds -coalesced reads, workgroups: 16, num_workers: 64, 0.0089296 seconds -coalesced reads, workgroups: 16, num_workers: 128, 0.00482416 seconds -coalesced reads, workgroups: 32, num_workers: 4, 0.04397248 seconds -coalesced reads, workgroups: 32, num_workers: 8, 0.02506672 seconds -coalesced reads, workgroups: 32, num_workers: 16, 0.01428536 seconds -coalesced reads, workgroups: 32, num_workers: 32, 0.0090748 seconds -coalesced reads, workgroups: 32, num_workers: 64, 0.00475304 seconds -coalesced reads, workgroups: 32, num_workers: 128, 0.00290496 seconds -coalesced reads, workgroups: 64, num_workers: 4, 0.02331504 seconds -coalesced reads, workgroups: 64, num_workers: 8, 0.01344792 seconds -coalesced reads, workgroups: 64, num_workers: 16, 0.00785896 seconds -coalesced reads, workgroups: 64, num_workers: 32, 0.00498288 seconds -coalesced reads, workgroups: 64, num_workers: 64, 0.0027708 seconds -coalesced reads, workgroups: 64, num_workers: 128, 0.00310968 seconds -coalesced reads, workgroups: 128, num_workers: 4, 0.0252644 seconds -coalesced reads, workgroups: 128, num_workers: 8, 0.0132512 seconds -coalesced reads, workgroups: 128, num_workers: 16, 0.00801888 seconds -coalesced reads, workgroups: 128, num_workers: 32, 0.00489016 seconds -coalesced reads, workgroups: 128, num_workers: 64, 0.00347336 seconds -coalesced reads, workgroups: 128, num_workers: 128, 0.00338608 seconds -coalesced reads, workgroups: 256, num_workers: 4, 0.01951552 seconds -coalesced reads, workgroups: 256, num_workers: 8, 0.01132352 seconds -coalesced reads, workgroups: 256, num_workers: 16, 0.00630696 seconds -coalesced reads, workgroups: 256, num_workers: 32, 0.0040716 seconds -coalesced reads, workgroups: 256, num_workers: 64, 0.00298152 seconds -coalesced reads, workgroups: 256, num_workers: 128, 0.00310384 seconds -coalesced reads, workgroups: 512, num_workers: 4, 
0.02002328 seconds -coalesced reads, workgroups: 512, num_workers: 8, 0.0110812 seconds -coalesced reads, workgroups: 512, num_workers: 16, 0.00702496 seconds -coalesced reads, workgroups: 512, num_workers: 32, 0.00416688 seconds -coalesced reads, workgroups: 512, num_workers: 64, 0.0027932 seconds -coalesced reads, workgroups: 512, num_workers: 128, 0.00329216 seconds -blocked reads, workgroups: 8, num_workers: 4, 0.16964528 seconds -blocked reads, workgroups: 8, num_workers: 8, 0.09656184 seconds -blocked reads, workgroups: 8, num_workers: 16, 0.06290712 seconds -blocked reads, workgroups: 8, num_workers: 32, 0.04388848 seconds -blocked reads, workgroups: 8, num_workers: 64, 0.02140192 seconds -blocked reads, workgroups: 8, num_workers: 128, 0.01306816 seconds -blocked reads, workgroups: 16, num_workers: 4, 0.08902432 seconds -blocked reads, workgroups: 16, num_workers: 8, 0.054702 seconds -blocked reads, workgroups: 16, num_workers: 16, 0.03474296 seconds -blocked reads, workgroups: 16, num_workers: 32, 0.02219256 seconds -blocked reads, workgroups: 16, num_workers: 64, 0.01349936 seconds -blocked reads, workgroups: 16, num_workers: 128, 0.00919592 seconds -blocked reads, workgroups: 32, num_workers: 4, 0.04945376 seconds -blocked reads, workgroups: 32, num_workers: 8, 0.03145792 seconds -blocked reads, workgroups: 32, num_workers: 16, 0.01753888 seconds -blocked reads, workgroups: 32, num_workers: 32, 0.01313056 seconds -blocked reads, workgroups: 32, num_workers: 64, 0.00843512 seconds -blocked reads, workgroups: 32, num_workers: 128, 0.00641448 seconds -blocked reads, workgroups: 64, num_workers: 4, 0.02745064 seconds -blocked reads, workgroups: 64, num_workers: 8, 0.016524 seconds -blocked reads, workgroups: 64, num_workers: 16, 0.01086632 seconds -blocked reads, workgroups: 64, num_workers: 32, 0.00894552 seconds -blocked reads, workgroups: 64, num_workers: 64, 0.00630192 seconds -blocked reads, workgroups: 64, num_workers: 128, 0.00915584 seconds -blocked reads, workgroups: 128, num_workers: 4, 0.02907888 seconds -blocked reads, workgroups: 128, num_workers: 8, 0.01724728 seconds -blocked reads, workgroups: 128, num_workers: 16, 0.01270904 seconds -blocked reads, workgroups: 128, num_workers: 32, 0.00789992 seconds -blocked reads, workgroups: 128, num_workers: 64, 0.00802856 seconds -blocked reads, workgroups: 128, num_workers: 128, 0.00988272 seconds -blocked reads, workgroups: 256, num_workers: 4, 0.02359272 seconds -blocked reads, workgroups: 256, num_workers: 8, 0.01404144 seconds -blocked reads, workgroups: 256, num_workers: 16, 0.00926688 seconds -blocked reads, workgroups: 256, num_workers: 32, 0.00662312 seconds -blocked reads, workgroups: 256, num_workers: 64, 0.0071192 seconds -blocked reads, workgroups: 256, num_workers: 128, 0.00890344 seconds -blocked reads, workgroups: 512, num_workers: 4, 0.02182904 seconds -blocked reads, workgroups: 512, num_workers: 8, 0.01405424 seconds -blocked reads, workgroups: 512, num_workers: 16, 0.01010704 seconds -blocked reads, workgroups: 512, num_workers: 32, 0.00710488 seconds -blocked reads, workgroups: 512, num_workers: 64, 0.00641912 seconds -blocked reads, workgroups: 512, num_workers: 128, 0.00763992 seconds + configuration ('coalesced', 64, 64): 0.0027708 seconds \ No newline at end of file From 1b8f6bb6df4f29202ff37239c51a952aaab50d76 Mon Sep 17 00:00:00 2001 From: JEEWON HWANG Date: Fri, 20 Nov 2015 23:51:53 -0500 Subject: [PATCH 8/8] done with P5 --- HW3/P5/P5.txt | 38 +++++++++++++++++++++++++++++++ HW3/P5/label_regions.cl 
| 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 HW3/P5/label_regions.py |  5 ++++-
 3 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..aed99677
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,38 @@
+1. Iteration counts and average kernel times after each part
+
+(1) Part 1
+- Maze 1 : Finished after 915 iterations, 194.06176 ms total, 0.212089355191 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 532 iterations, 111.65112 ms total, 0.209870526316 ms per iteration
+Found 35 regions
+
+(2) Part 2
+- Maze 1 : Finished after 529 iterations, 109.62104 ms total, 0.207223137996 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 273 iterations, 56.87648 ms total, 0.208338754579 ms per iteration
+Found 35 regions
+
+(3) Part 3
+- Maze 1 : Finished after 10 iterations, 3.1164 ms total, 0.31164 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 9 iterations, 2.72808 ms total, 0.30312 ms per iteration
+Found 35 regions
+
+(4) Part 4
+- Maze 1 : Finished after 16 iterations, 8.2236 ms total, 0.513975 ms per iteration
+Found 2 regions
+- Maze 2 : Finished after 15 iterations, 7.52448 ms total, 0.501632 ms per iteration
+Found 35 regions
+
+
+2. Part 4 discussion
+Based on my empirical results, using a single thread to reduce redundant memory accesses actually made it slower than the Part 3 code. The number of iterations increased (10 -> 16 and 9 -> 15) and the total kernel time more than doubled.
+Even though the Part 3 code performs redundant memory accesses, which cost some time, the benefit of using many threads outweighs the advantage of serialized access. In other words, on my machine, the time saved by avoiding redundant lookups is smaller than the time saved by the parallel implementation.
+This might change on a different GPU or memory system. For example, if global memory access is very slow and the input image mostly has the same label within a workgroup, then the serial implementation (the Part 4 code) would be better.
+On the other hand, if memory access speed is not the bottleneck and the labels vary a lot even within a single workgroup, the Part 3 code will be more efficient.
+
+
+3. Part 5 explanation
+atomic_min() guarantees that a given memory address is read, compared, and written by only one thread at a time. In our code it therefore ensures that labels[old_label] always ends up holding the true minimum of the values written to it. If we used min() instead, concurrent read-modify-write sequences could overwrite labels[old_label] with a stale, larger value, so an entry of labels could end up larger than it should be (a smaller update would be lost).
+The final result will still be correct, because once the true minimum has been picked up it keeps being propagated in later iterations. However, the performance of the algorithm (time and number of iterations) may vary.
+The min() version of the update is faster per iteration since it avoids serializing threads on the atomic, but the number of iterations will increase because it takes more passes to converge to the actual minimum value.
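For reference, the iteration counts above come from repeatedly replacing every non-background pixel's label with the minimum over itself and its four neighbors until nothing changes. The following host-side NumPy sketch models that loop (an illustration only: it updates all pixels synchronously and skips the Part 2-4 optimizations, so its iteration counts will not match the GPU numbers exactly, but each connected region still converges to a single label, which is what the "Found N regions" lines count).

import numpy as np

def propagate_labels_host(foreground, max_iters=100000):
    # Model of the Part 1 kernel: each foreground pixel takes the minimum
    # label of itself and its 4-neighborhood until no pixel changes.
    h, w = foreground.shape
    sentinel = h * w                      # background marker, like the kernel's >= w*h test
    labels = np.arange(h * w, dtype=np.int64).reshape(h, w)
    labels[~foreground] = sentinel
    for iteration in range(1, max_iters + 1):
        padded = np.pad(labels, 1, mode='constant', constant_values=sentinel)
        neighbor_min = np.minimum.reduce([padded[:-2, 1:-1],   # up
                                          padded[2:, 1:-1],    # down
                                          padded[1:-1, :-2],   # left
                                          padded[1:-1, 2:]])   # right
        new_labels = labels.copy()
        new_labels[foreground] = np.minimum(labels, neighbor_min)[foreground]
        if np.array_equal(new_labels, labels):
            return labels, iteration - 1  # last pass made no change
        labels = new_labels
    return labels, max_iters

# Two separate foreground blobs end up with two distinct surviving labels.
demo = np.zeros((6, 8), dtype=bool)
demo[1:3, 1:4] = True
demo[4:6, 5:8] = True
final, iters = propagate_labels_host(demo)
print('%d iterations, %d regions' % (iters, len(np.unique(final[demo]))))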
\ No newline at end of file
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..66aeeb90 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -80,7 +80,43 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+
+    // PART 2
+    // if (old_label < w * h) { // to check if it's background pixel
+    //     buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
+    // }
 
+    // PART 4
+    // only the thread with (lx, ly) == (0, 0) fetches the data
+    if (lx == 0 && ly == 0) {
+        int current_label, current_label_index, last_label = -1;
+        int work_group_size_x = get_local_size(0);
+        int work_group_size_y = get_local_size(1);
+
+        for (int row = 0; row < work_group_size_y; row++) {
+            for (int col = 0; col < work_group_size_x; col++) {
+
+                current_label_index = (row + halo) * buf_w + (col + halo);
+                current_label = buffer[current_label_index];
+
+                if (current_label < w * h) { // not background
+                    if (current_label == last_label) {
+                        buffer[current_label_index] = last_label;
+                    }
+                    else {
+                        last_label = current_label;
+                        buffer[current_label_index] = labels[current_label];
+                    }
+                }
+
+            }
+        }
+    }
+
+    // Make sure all threads reach the next part after
+    // the local buffer is loaded
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
@@ -88,12 +124,24 @@ propagate_labels(__global __read_write int *labels,
         // to adjust this for correctness.
         new_label = old_label;
 
+        if (new_label < w * h) { // to check if it's background pixel
+
+            int up_pixel = buffer[(buf_y - 1) * buf_w + buf_x];
+            int left_pixel = buffer[buf_y * buf_w + buf_x - 1];
+            int down_pixel = buffer[(buf_y + 1) * buf_w + buf_x];
+            int right_pixel = buffer[buf_y * buf_w + buf_x + 1];
+
+            new_label = min(min(min(min(old_label, up_pixel), left_pixel), down_pixel), right_pixel);
+        }
+
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label);
+            // labels[y * w + x] = new_label;
+            atomic_min(&labels[y * w + x], new_label);
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..16522113 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -3,6 +3,9 @@ import pyopencl as cl
 import numpy as np
 import pylab
 
+# import os
+
+# os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -42,7 +45,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
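As a quick cross-check of the "Found 2 regions" / "Found 35 regions" counts reported in P5.txt, the same mazes can be labeled on the host with SciPy. This is a sketch that assumes maze1.npy and maze2.npy are 2D arrays in which the pixels being labeled are the non-zero ones; adjust the mask if the files use a different convention.

import numpy as np
from scipy import ndimage

def count_regions(npy_path, background_value=0):
    # Label 4-connected regions of non-background pixels. ndimage.label
    # uses 4-connectivity in 2D by default, matching the up/down/left/right
    # neighborhood used in propagate_labels.
    image = np.load(npy_path)
    mask = image != background_value      # assumed convention: 0 == background
    _, num_regions = ndimage.label(mask)
    return num_regions

for path in ('maze1.npy', 'maze2.npy'):
    print('%s: %d regions' % (path, count_regions(path)))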