From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@gmail.com>
Date: Mon, 2 Nov 2015 21:49:00 -0500
Subject: [PATCH 1/9] remove extraneous +4

---
 HW3/P3/tune.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                   format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)

From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@gmail.com>
Date: Wed, 4 Nov 2015 21:46:08 -0500
Subject: [PATCH 2/9] typo in typecast

---
 HW3/P3/sum.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..ee914740 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1

From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001
From: "Thouis (Ray) Jones" <thouis@seas.harvard.edu>
Date: Thu, 5 Nov 2015 09:43:23 -0500
Subject: [PATCH 3/9] unused module, set include path

---
 HW3/P4/median_filter.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)

From bbf967ef55f385f834f7ccc0086fd0c0076a345d Mon Sep 17 00:00:00 2001
From: Vivek Jayaram <vivekjayaram@college.harvard.edu>
Date: Thu, 19 Nov 2015 18:47:13 -0500
Subject: [PATCH 4/9] done with part of 5

---
 HW3/P3/sum.cl           | 51 +++++++++++++++++-------------
 HW3/P3/tune.py          | 11 ++++---
 HW3/P4/median_filter.cl | 69 +++++++++++++++++++++++++++++++----------
 HW3/P5/label_regions.cl | 22 +++++++++++--
 4 files changed, 108 insertions(+), 45 deletions(-)

diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..6cec4006 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -3,32 +3,38 @@ __kernel void sum_coalesced(__global float* x,
                             __local  float* fast,
                             long N)
 {
-    float sum = 0;
-    size_t local_id = get_local_id(0);
+    
+    //float sum = 0;
+    //size_t local_id = get_local_id(0);
+    //int i = get_global_id(0);
+    //int k = get_global_size(0);
 
-    // thread i (i.e., with i = get_global_id()) should add x[i],
-    // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
-    }
+    // // thread i (i.e., with i = get_global_id()) should add x[i],
+    // // x[i + get_global_size()], ... up to N-1, and store in sum.
+    // for (int counter = i; counter < N; counter += k) {
+    //     sum += x[counter];
+    // }
 
-    fast[local_id] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
+    // fast[local_id] = sum;
+    // barrier(CLK_LOCAL_MEM_FENCE);
 
-    // binary reduction
-    //
-    // thread i should sum fast[i] and fast[i + offset] and store back
-    // in fast[i], for offset = (local_size >> j) for j from 1 to
-    // log_2(local_size)
-    //
-    // You can assume get_local_size(0) is a power of 2.
-    //
-    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
+    // // binary reduction
+    // //
+    // // thread i should sum fast[i] and fast[i + offset] and store back
+    // // in fast[i], for offset = (local_size >> j) for j from 1 to
+    // // log_2(local_size)
+    // //
+    // // You can assume get_local_size(0) is a power of 2.
+    // //
+    // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    // for (size_t offset = get_local_size(0); offset > 0; offset = offset >> 1) { 
+    //     if(local_id < offset){
+    //         fast[local_id] = fast[local_id] + fast[local_id+offset];
+    //     }
+    //     barrier(CLK_LOCAL_MEM_FENCE);
+    // }
 
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+    //if (local_id == 0) partial[get_group_id(0)] = fast[0];
 }
 
 __kernel void sum_blocked(__global float* x,
@@ -39,6 +45,7 @@ __kernel void sum_blocked(__global float* x,
     float sum = 0;
     size_t local_id = get_local_id(0);
     int k = ceil((float)N / get_global_size(0));
+    (void) k;
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index a0d56da2..ca1dfad8 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -1,5 +1,7 @@
 import pyopencl as cl
 import numpy as np
+import os
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 
 def create_data(N):
     return host_x, x
@@ -14,9 +16,10 @@ def create_data(N):
     ctx = cl.Context(devices)
 
     queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
-
-    program = cl.Program(ctx, open('sum.cl').read()).build(options='')
-
+    try:
+        program = cl.Program(ctx, open('sum.cl').read()).build(options='')
+    except:
+        print prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG);
     host_x = np.random.rand(N).astype(np.float32)
     x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=host_x)
 
@@ -30,7 +33,7 @@ def create_data(N):
             event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                                           x, partial_sums, local, np.uint64(N))
             cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
-
+            print partial_sums;
             sum_gpu = sum(host_partial)
             sum_host = sum(host_x)
             seconds = (event.profile.end - event.profile.start) / 1e9
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..e0047e30 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,20 @@
 #include "median9.h"
 
+float fetch(__global__read_only float* in_values, int image_w, int image_h, int x, int y){
+  if(x < 0){
+    x = 0;
+  }
+  if(x >= image_w){
+    x = image_w - 1;
+  }
+  if (y < 0){
+    y = 0;
+  }
+  if (y >= image_h){
+    y = image_h - 1;
+  }
+  return in_values[y * w + h];
+}
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -9,26 +24,48 @@ median_3x3(__global __read_only float *in_values,
            int buf_w, int buf_h,
            const int halo)
 {
-    // Note: It may be easier for you to implement median filtering
-    // without using the local buffer, first, then adjust your code to
-    // use such a buffer after you have that working.
 
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
 
-    // Load into buffer (with 1-pixel halo).
-    //
-    // It may be helpful to consult HW3 Problem 5, and
-    // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
-    //
-    // Note that globally out-of-bounds pixels should be replaced
-    // with the nearest valid pixel's value.
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
 
+    int row;
 
-    // Compute 3x3 median for each pixel in core (non-halo) pixels
-    //
-    // We've given you median9.h, and included it above, so you can
-    // use the median9() function.
+    if (idx_1D < buf_w)
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = \
+                fetch(in_values, w, h,
+                      buf_corner_x + idx_1D,
+                      buf_corner_y + row);
+        }
 
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-    // Each thread in the valid region (x < w, y < h) should write
-    // back its 3x3 neighborhood median.
+    out_values[y * w + x] = median9(
+      buffer[(buf_y - 1) * buf_w + (buf_x - 1)],
+      buffer[(buf_y - 1) * buf_w + (buf_x)],
+      buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
+      buffer[(buf_y) * buf_w + (buf_x - 1)],
+      buffer[(buf_y) * buf_w + (buf_x)],
+      buffer[(buf_y) * buf_w + (buf_x + 1)],
+      buffer[(buf_y + 1) * buf_w + (buf_x - 1)],
+      buffer[(buf_y + 1) * buf_w + (buf_x)],
+      buffer[(buf_y + 1) * buf_w + (buf_x + 1)],
+    )
 }
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..1d4ee99b 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -80,20 +80,36 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-    
+    if(old_label < w * h){
+        location = buf_y * buf_w + buf_x;
+        buffer[location] = labels[buffer[location]];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
-
+        if (new_label < w*h){
+            new_label = min(
+                new_label,
+                min(buffer[(buf_y -1) * h + buf_x],
+                    min(buffer[buf_y * h + buf_x - 1],
+                        min(buffer[buf_y * h + buf_x + 1],
+                            min(buffer[(buf_y - 1) * h + buf_x])))));
+            )
+        }
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            // Call atomic mean to put new_label in labels[old_label], and make sure it didn't increase
+            atomic_min(&labels[old_label], new_label);
+            // Call atomic min when writing the non halo portion
+            atomic_min(&labels[y * w + x], new_label
         }
     }
 }

From 6eee4f74e3e4315f798df7048ee681fd343c81b6 Mon Sep 17 00:00:00 2001
From: New Name <name@email.com>
Date: Thu, 19 Nov 2015 22:39:03 -0500
Subject: [PATCH 5/9] Done up to part 4

---
 HW3/P3/sum.cl           | 32 ++++++++++++++++----------------
 HW3/P3/tune.py          |  1 +
 HW3/P4/median_filter.cl | 34 +++++++++++++++++-----------------
 HW3/P4/median_filter.py |  1 +
 HW3/P5/label_regions.cl | 20 ++++++++++----------
 HW3/P5/label_regions.py | 18 +++++++++---------
 6 files changed, 54 insertions(+), 52 deletions(-)

diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 6cec4006..12716219 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -4,19 +4,19 @@ __kernel void sum_coalesced(__global float* x,
                             long N)
 {
     
-    //float sum = 0;
-    //size_t local_id = get_local_id(0);
-    //int i = get_global_id(0);
-    //int k = get_global_size(0);
+    float sum = 0;
+    size_t local_id = get_local_id(0);
+    int i = get_global_id(0);
+    int k = get_global_size(0);
 
     // // thread i (i.e., with i = get_global_id()) should add x[i],
     // // x[i + get_global_size()], ... up to N-1, and store in sum.
-    // for (int counter = i; counter < N; counter += k) {
-    //     sum += x[counter];
-    // }
+    for (int counter = i; counter < N; counter += k) {
+        sum += x[counter];
+    }
 
-    // fast[local_id] = sum;
-    // barrier(CLK_LOCAL_MEM_FENCE);
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     // // binary reduction
     // //
@@ -27,14 +27,14 @@ __kernel void sum_coalesced(__global float* x,
     // // You can assume get_local_size(0) is a power of 2.
     // //
     // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    // for (size_t offset = get_local_size(0); offset > 0; offset = offset >> 1) { 
-    //     if(local_id < offset){
-    //         fast[local_id] = fast[local_id] + fast[local_id+offset];
-    //     }
-    //     barrier(CLK_LOCAL_MEM_FENCE);
-    // }
+    for (size_t offset = get_local_size(0) >> 1; offset > 0; offset = offset >> 1) { 
+        if(local_id < offset){
+            fast[local_id] = fast[local_id] + fast[local_id+offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
 
-    //if (local_id == 0) partial[get_group_id(0)] = fast[0];
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
 }
 
 __kernel void sum_blocked(__global float* x,
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index ca1dfad8..263f53b3 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -37,6 +37,7 @@ def create_data(N):
             sum_gpu = sum(host_partial)
             sum_host = sum(host_x)
             seconds = (event.profile.end - event.profile.start) / 1e9
+            print sum_gpu
             assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
             times['coalesced', num_workgroups, num_workers] = seconds
             print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index e0047e30..6936e4ab 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,10 +1,9 @@
 #include "median9.h"
 
-float fetch(__global__read_only float* in_values, int image_w, int image_h, int x, int y){
+float fetch(__global __read_only float *in_values, int image_w, int image_h, int x, int y){
   if(x < 0){
     x = 0;
-  }
-  if(x >= image_w){
+  }if(x >= image_w){
     x = image_w - 1;
   }
   if (y < 0){
@@ -13,7 +12,7 @@ float fetch(__global__read_only float* in_values, int image_w, int image_h, int
   if (y >= image_h){
     y = image_h - 1;
   }
-  return in_values[y * w + h];
+  return in_values[y * image_w + x];
 }
 // 3x3 median filter
 __kernel void
@@ -47,25 +46,26 @@ median_3x3(__global __read_only float *in_values,
 
     int row;
 
-    if (idx_1D < buf_w)
+    if (idx_1D < buf_w){
         for (row = 0; row < buf_h; row++) {
             buffer[row * buf_w + idx_1D] = \
                 fetch(in_values, w, h,
                       buf_corner_x + idx_1D,
                       buf_corner_y + row);
         }
+    }
 
     barrier(CLK_LOCAL_MEM_FENCE);
-
-    out_values[y * w + x] = median9(
-      buffer[(buf_y - 1) * buf_w + (buf_x - 1)],
-      buffer[(buf_y - 1) * buf_w + (buf_x)],
-      buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
-      buffer[(buf_y) * buf_w + (buf_x - 1)],
-      buffer[(buf_y) * buf_w + (buf_x)],
-      buffer[(buf_y) * buf_w + (buf_x + 1)],
-      buffer[(buf_y + 1) * buf_w + (buf_x - 1)],
-      buffer[(buf_y + 1) * buf_w + (buf_x)],
-      buffer[(buf_y + 1) * buf_w + (buf_x + 1)],
-    )
+    if ((y < h) && (x < w)){
+      out_values[y * w + x] = median9(
+        buffer[(buf_y - 1) * buf_w + (buf_x - 1)],
+        buffer[(buf_y - 1) * buf_w + (buf_x)],
+        buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
+        buffer[(buf_y) * buf_w + (buf_x - 1)],
+        buffer[(buf_y) * buf_w + (buf_x)],
+        buffer[(buf_y) * buf_w + (buf_x + 1)],
+        buffer[(buf_y + 1) * buf_w + (buf_x - 1)],
+        buffer[(buf_y + 1) * buf_w + (buf_x)],
+        buffer[(buf_y + 1) * buf_w + (buf_x + 1)]);
+  }
 }
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index a181c05a..976564c3 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pylab
 import os.path
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 
 def round_up(global_size, group_size):
     r = global_size % group_size
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 1d4ee99b..aac2461f 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -80,10 +80,10 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-    if(old_label < w * h){
-        location = buf_y * buf_w + buf_x;
-        buffer[location] = labels[buffer[location]];
-    }
+    // if(old_label < w * h){
+    //     location = buf_y * buf_w + buf_x;
+    //     buffer[location] = labels[buffer[location]];
+    // }
 
     barrier(CLK_LOCAL_MEM_FENCE);
     // stay in bounds
@@ -97,19 +97,19 @@ propagate_labels(__global __read_write int *labels,
                 new_label,
                 min(buffer[(buf_y -1) * h + buf_x],
                     min(buffer[buf_y * h + buf_x - 1],
-                        min(buffer[buf_y * h + buf_x + 1],
-                            min(buffer[(buf_y - 1) * h + buf_x])))));
-            )
+                        min(buffer[buf_y * h + buf_x + 1], 
+                            buffer[(buf_y - 1) * h + buf_x]))));
         }
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
+            labels[y * w + x] = new_label;
             // Call atomic mean to put new_label in labels[old_label], and make sure it didn't increase
-            atomic_min(&labels[old_label], new_label);
-            // Call atomic min when writing the non halo portion
-            atomic_min(&labels[y * w + x], new_label
+            // atomic_min(&labels[old_label], new_label);
+            // // Call atomic min when writing the non halo portion
+            // atomic_min(&labels[y * w + x], new_label
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..6939b0f0 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -75,9 +75,9 @@ def round_up(global_size, group_size):
 
     # Show the initial labels
     cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-    pylab.imshow(host_labels)
-    pylab.title(itercount)
-    pylab.show()
+    # pylab.imshow(host_labels)
+    # pylab.title(itercount)
+    # pylab.show()
 
     show_progress = True
     total_time = 0
@@ -105,9 +105,9 @@ def round_up(global_size, group_size):
         print host_done_flag
         if itercount % 100 == 0 and show_progress:
             cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-            pylab.imshow(host_labels)
-            pylab.title(itercount)
-            pylab.show()
+            # pylab.imshow(host_labels)
+            # pylab.title(itercount)
+            # pylab.show()
         if itercount % 10000 == 0:
             print 'Reached maximal number of iterations, aborting'
             sys.exit(0)
@@ -116,6 +116,6 @@ def round_up(global_size, group_size):
     # Show final result
     cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
     print 'Found {} regions'.format(len(np.unique(host_labels)) - 1)
-    pylab.imshow(host_labels)
-    pylab.title(itercount)
-    pylab.show()
+    # pylab.imshow(host_labels)
+    # pylab.title(itercount)
+    # pylab.show()

From c4a2a5fa7f1935d23eb1a206707bf28e0ef629be Mon Sep 17 00:00:00 2001
From: Vivek Jayaram <vivekjayaram@college.harvard.edu>
Date: Fri, 20 Nov 2015 00:12:47 -0500
Subject: [PATCH 6/9] About to start running the code

---
 HW3/P3/sum.cl           | 16 +++++++++++-----
 HW3/P5/label_regions.cl | 14 ++++++++++++++
 HW3/P5/label_regions.py | 20 ++++++++++----------
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 12716219..5a59558e 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -45,7 +45,8 @@ __kernel void sum_blocked(__global float* x,
     float sum = 0;
     size_t local_id = get_local_id(0);
     int k = ceil((float)N / get_global_size(0));
-    (void) k;
+    size_t global_id = get_global_id(0);
+
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -55,8 +56,10 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = 0; i <k; i ++) {
+        if(i < N){
+            sum += x[k * global_id + i];
+        }
     }
 
     fast[local_id] = sum;
@@ -71,8 +74,11 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (size_t offset = get_local_size(0) >> 1; offset > 0; offset = offset >> 1) { 
+        if(local_id < offset){
+            fast[local_id] = fast[local_id] + fast[local_id+offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index aac2461f..8f093449 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -85,6 +85,20 @@ propagate_labels(__global __read_write int *labels,
     //     buffer[location] = labels[buffer[location]];
     // }
 
+    // Make sure we only use one thread per work group
+    // Use the first element
+    if (ly + lx == 0){
+        int previous_label = -1;
+        for(int i = 0; i < buf_x * buf_y; i ++){
+            if(buffer[i] < w * h && previous_label != buffer[i]){
+                previous_label == labels[buffer[i]];
+            }
+            buffer[i] = previous_label;
+        }
+
+    }
+
+
     barrier(CLK_LOCAL_MEM_FENCE);
     // stay in bounds
     if ((x < w) && (y < h)) {
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index 6939b0f0..276179fd 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -2,7 +2,7 @@
 import sys
 import pyopencl as cl
 import numpy as np
-import pylab
+from matplotlib import pyplot
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -75,9 +75,9 @@ def round_up(global_size, group_size):
 
     # Show the initial labels
     cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-    # pylab.imshow(host_labels)
-    # pylab.title(itercount)
-    # pylab.show()
+    pyplot.imshow(host_labels)
+    pyplot.title(itercount)
+    pyplot.show()
 
     show_progress = True
     total_time = 0
@@ -105,9 +105,9 @@ def round_up(global_size, group_size):
         print host_done_flag
         if itercount % 100 == 0 and show_progress:
             cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-            # pylab.imshow(host_labels)
-            # pylab.title(itercount)
-            # pylab.show()
+            pyplot.imshow(host_labels)
+            pyplot.title(itercount)
+            pyplot.show()
         if itercount % 10000 == 0:
             print 'Reached maximal number of iterations, aborting'
             sys.exit(0)
@@ -116,6 +116,6 @@ def round_up(global_size, group_size):
     # Show final result
     cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
     print 'Found {} regions'.format(len(np.unique(host_labels)) - 1)
-    # pylab.imshow(host_labels)
-    # pylab.title(itercount)
-    # pylab.show()
+    pyplot.imshow(host_labels)
+    pyplot.title(itercount)
+    pyplot.show()

From 6cb18918ca2e36d8aa377f20d890c535b11d5d43 Mon Sep 17 00:00:00 2001
From: New Name <name@email.com>
Date: Fri, 20 Nov 2015 01:07:36 -0500
Subject: [PATCH 7/9] Almost done

---
 HW3/P3/P3.txt           | 87 +++++++++++++++++++++++++++++++++++++++++
 HW3/P3/sum.cl           |  2 +-
 HW3/P3/tune.py          |  2 -
 HW3/P5/P5.txt           | 28 +++++++++++++
 HW3/P5/label_regions.cl | 24 ++++++------
 HW3/P5/label_regions.py |  5 ++-
 6 files changed, 132 insertions(+), 16 deletions(-)
 create mode 100644 HW3/P3/P3.txt
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..77b350fc
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,87 @@
+The best configuration was of course coalesced reads, and workgroups 256, num workers 64 at .0028
+
+coalesced reads, workgroups: 8, num_workers: 4, 0.17128184 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.0681632 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.05655496 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.02897184 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.0144704 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.0077896 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.09570504 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.05443 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.02877784 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.01502832 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00775696 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.0039332 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.05933192 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.02975696 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.01458912 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.00776816 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.00391616 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00308944 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.02779384 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.01499048 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.00775552 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00399272 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.0032104 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.00299176 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.02973544 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.0151668 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.00765784 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.00411832 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00301736 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.00290664 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.03019944 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.01544704 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.00798584 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00427288 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.00282984 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.00283432 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.03030056 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.01520048 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.00820648 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00448768 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00285128 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.00295096 seconds
+
+blocked reads, workgroups: 8, num_workers: 4, 0.1875092 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.07682712 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.04721432 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.0260356 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.0145952 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.01322928 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.11220432 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.05103584 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.0381004 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.01598448 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.011868 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.03393464 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.07117064 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.03616536 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.02227952 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.01344776 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.0342316 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.08281928 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.03155968 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.02060984 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.01327808 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.03418224 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.0794408 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.0758292 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.0247952 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.0154636 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01440504 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.03554616 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.0905076 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.06857392 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.0332628 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.02187848 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.0133096 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.03665168 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.06601888 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.05091112 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.03760328 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.02227768 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.01377232 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03684656 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04446296 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.036996 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 5a59558e..1bfd6617 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -57,7 +57,7 @@ __kernel void sum_blocked(__global float* x,
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
     for (int i = 0; i <k; i ++) {
-        if(i < N){
+        if(k * global_id + i < N){
             sum += x[k * global_id + i];
         }
     }
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index 263f53b3..382fdf8d 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -33,11 +33,9 @@ def create_data(N):
             event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                                           x, partial_sums, local, np.uint64(N))
             cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
-            print partial_sums;
             sum_gpu = sum(host_partial)
             sum_host = sum(host_x)
             seconds = (event.profile.end - event.profile.start) / 1e9
-            print sum_gpu
             assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
             times['coalesced', num_workgroups, num_workers] = seconds
             print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..a4b3e39b
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,28 @@
+Part 1
+Maze 1: Finished after 885 iterations, 446.19376 ms total, 0.504173740113 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 516 iterations, 254.36296 ms total, 0.492951472868 ms per iteration
+Found 35 regions
+____________________
+Part 2
+Maze 1: Finished after 529 iterations, 266.34816 ms total, 0.5034936862 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 273 iterations, 135.44424 ms total, 0.496132747253 ms per iteration
+Found 35 regions
+_____________________
+Part 3
+Maze 1: Finished after 10 iterations, 5.0064 ms total, 0.50064 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 9 iterations, 4.40704 ms total, 0.489671111111 ms per iteration
+Found 35 regions
+_____________________
+Part 4
+Maze 1: Finished after 891 iterations, 447.6776 ms total, 0.502443995511 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 520 iterations, 257.42656 ms total, 0.495051076923 ms per iteration
+Found 35 regions
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 8f093449..f92409fd 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -81,7 +81,7 @@ propagate_labels(__global __read_write int *labels,
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
     // if(old_label < w * h){
-    //     location = buf_y * buf_w + buf_x;
+    //     int location = buf_y * buf_w + buf_x;
     //     buffer[location] = labels[buffer[location]];
     // }
 
@@ -91,7 +91,7 @@ propagate_labels(__global __read_write int *labels,
         int previous_label = -1;
         for(int i = 0; i < buf_x * buf_y; i ++){
             if(buffer[i] < w * h && previous_label != buffer[i]){
-                previous_label == labels[buffer[i]];
+                previous_label = labels[buffer[i]];
             }
             buffer[i] = previous_label;
         }
@@ -99,20 +99,20 @@ propagate_labels(__global __read_write int *labels,
     }
 
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+    // barrier(CLK_LOCAL_MEM_FENCE);
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
-        if (new_label < w*h){
+        if (old_label < w*h){
             new_label = min(
-                new_label,
-                min(buffer[(buf_y -1) * h + buf_x],
-                    min(buffer[buf_y * h + buf_x - 1],
-                        min(buffer[buf_y * h + buf_x + 1], 
-                            buffer[(buf_y - 1) * h + buf_x]))));
+                old_label,
+                min(buffer[(buf_y -1) * buf_w + buf_x],
+                    min(buffer[buf_y * buf_w + buf_x - 1],
+                        min(buffer[buf_y * buf_w + buf_x + 1], 
+                            buffer[(buf_y + 1) * buf_w + buf_x]))));
         }
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
@@ -121,9 +121,9 @@ propagate_labels(__global __read_write int *labels,
             *(changed_flag) += 1;
             labels[y * w + x] = new_label;
             // Call atomic mean to put new_label in labels[old_label], and make sure it didn't increase
-            // atomic_min(&labels[old_label], new_label);
-            // // Call atomic min when writing the non halo portion
-            // atomic_min(&labels[y * w + x], new_label
+            atomic_min(&labels[old_label], new_label);
+            // Call atomic min when writing the non halo portion
+            atomic_min(&labels[y * w + x], new_label);
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index 276179fd..449611c5 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -3,6 +3,9 @@
 import pyopencl as cl
 import numpy as np
 from matplotlib import pyplot
+import os
+
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -42,7 +45,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
 

From b60428c8455445122cb4097c20fb446cdbf1db2c Mon Sep 17 00:00:00 2001
From: Vivek Jayaram <vivekjayaram@college.harvard.edu>
Date: Fri, 20 Nov 2015 12:12:00 -0500
Subject: [PATCH 8/9] Done with Pset 3

---
 HW3/P5/P5.txt           | 8 ++++++++
 HW3/P5/label_regions.cl | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
index a4b3e39b..62e03c5f 100644
--- a/HW3/P5/P5.txt
+++ b/HW3/P5/P5.txt
@@ -26,3 +26,11 @@ Found 2 regions
 Maze 2: Finished after 520 iterations, 257.42656 ms total, 0.495051076923 ms per iteration
 Found 35 regions
 
+This is clearly much worse. Running it as a single thread is worse than reading multiple times from global memory. Only if the GPU had much slower read times, and the multithreaded version was running with few threads in the workgroup might the result be faster. `s
+_____________________
+Part 5
+Atomic_min is guaranteed to caclulate the minimum and write it into memory simultaneously. This has the effect of avoiding race conditions with min()
+where you might read in the the memory and calculate it's minimum, but before you write it back you were descheduled and another thread modified the same 
+memory. Now when you write your min, it might no longer be the correct min. So labels could increase within an iteration, but not between iterations because
+we have barrier() that guarentees an entire iteration finishes. The final answer will also be correct because if two labels could be written in the wrong order by min() then they are guaranteed to be in the same connected component. The only catch is that you might have an infinite loop where the min always gets scheduled in the wrong order, but that is unlikely. 
+
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index f92409fd..536b9023 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -99,7 +99,7 @@ propagate_labels(__global __read_write int *labels,
     }
 
 
-    // barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE

From d5eced553ee80d09b995c5f4fb0ce2d8b606f233 Mon Sep 17 00:00:00 2001
From: Vivek Jayaram <vivekjayaram@college.harvard.edu>
Date: Fri, 20 Nov 2015 12:23:52 -0500
Subject: [PATCH 9/9] Done

---
 HW3/README.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 HW3/README.txt

diff --git a/HW3/README.txt b/HW3/README.txt
new file mode 100644
index 00000000..33246ded
--- /dev/null
+++ b/HW3/README.txt
@@ -0,0 +1,2 @@
+Notes for the reader:
+My computer had a hardware issue that prevented me from running opencl. As a result, I collaborated with Rohit Ahuja and used his computer to do much of the pset. 
\ No newline at end of file