From 1884d7a1b8dccfde9f5809bdd62c45e30c79c5a1 Mon Sep 17 00:00:00 2001
From: Evan Yao
Date: Fri, 20 Nov 2015 04:09:22 -0500
Subject: [PATCH 1/2] done with HW 3

---
 HW3/P3/P3.txt |  1 +
 HW3/P5/P5.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 HW3/P3/P3.txt
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..aaf12c99
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1 @@
+The best configuration and time for me was ('coalesced', 128, 128): 0.00291864 seconds
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..7922cd98
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,62 @@
+Part 1
+
+Maze 1:
+Finished after 878 iterations, 261.55712 ms total, 0.297901047836 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 517 iterations, 153.9384 ms total, 0.297753191489 ms per iteration
+Found 35 regions
+
+
+Part 2
+
+Maze 1:
+Finished after 529 iterations, 158.00224 ms total, 0.298680982987 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 273 iterations, 81.45792 ms total, 0.298380659341 ms per iteration
+Found 35 regions
+
+
+Part 3
+
+Maze 1:
+Finished after 11 iterations, 3.37152 ms total, 0.306501818182 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 9 iterations, 2.7204 ms total, 0.302266666667 ms per iteration
+Found 35 regions
+
+
+Part 4
+
+Maze 1:
+Finished after 70 iterations, 52.56808 ms total, 0.750972571429 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 103 iterations, 76.77008 ms total, 0.745340582524 ms per iteration
+Found 35 regions
+
+
+In my case, serializing the "finding grandparents" step in a single
+thread per workgroup does not pay off: it causes a roughly 2.5x increase
+in time per iteration (from about 0.30 ms in Part 3 to about 0.75 ms).
+
+
+Part 5
+
+Suppose our current square sees two other labels, both of which are
+smaller than its own. With atomic updates, all three entries end up at
+the minimum of the three labels. If we instead split the update into a
+separate "min" and a separate "reassignment", the order in which the two
+mins and the two reassignments interleave can change the result. For
+example:
+
+Suppose the label at our square is 3, and its two neighbors hold labels
+2 and 1. We would like the square to end up with label 1. Two different
+threads compute min(3, 2) = 2 and min(3, 1) = 1. Now suppose the square
+is first assigned 1, and THEN assigned 2. UH OH! The true minimum is
+lost, and we will have to run for at least another iteration to fix it.
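To make the interleaving in Part 5 concrete, here is a small standalone C
sketch (illustrative only, not part of the patch; the two "threads" are
simulated sequentially, and the labels 3, 2, 1 are the example values from
the writeup above) that plays out both store orders:

    /* Simulates the two possible store orders of the non-atomic
       "min then reassignment" update described in P5.txt, Part 5. */
    #include <stdio.h>

    static int min_int(int a, int b) { return a < b ? a : b; }

    int main(void) {
        int old_label = 3;
        int result_a = min_int(old_label, 2);  /* thread A sees neighbor 2 */
        int result_b = min_int(old_label, 1);  /* thread B sees neighbor 1 */

        /* Order 1: B stores first, A overwrites -> the minimum is lost. */
        int label = old_label;
        label = result_b;
        label = result_a;
        printf("B then A: label = %d (wrong, needs another iteration)\n", label);

        /* Order 2: A stores first, B overwrites -> correct by luck. */
        label = old_label;
        label = result_a;
        label = result_b;
        printf("A then B: label = %d (correct)\n", label);
        return 0;
    }

Only the second ordering lands on the true minimum. atomic_min makes the
compare and the store one indivisible step, so both orderings would
yield 1.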
From 0010692a17f19c6022e9e73e836a7e0f82676fb5 Mon Sep 17 00:00:00 2001
From: Evan Yao
Date: Fri, 20 Nov 2015 04:10:44 -0500
Subject: [PATCH 2/2] all set with HW 3

---
 HW3/P2/mandelbrot.py    |  2 +-
 HW3/P3/sum.cl           | 40 +++++++++++++++++++-----
 HW3/P4/median_filter.cl | 68 +++++++++++++++++++++++++++++++++++++++++
 HW3/P5/label_regions.cl | 57 +++++++++++++++++++++++++++++++++-
 HW3/P5/label_regions.py |  4 +--
 5 files changed, 159 insertions(+), 12 deletions(-)

diff --git a/HW3/P2/mandelbrot.py b/HW3/P2/mandelbrot.py
index 7c197af2..2978cac0 100644
--- a/HW3/P2/mandelbrot.py
+++ b/HW3/P2/mandelbrot.py
@@ -1,7 +1,7 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import pylab
+import matplotlib.pyplot as pylab
 
 def round_up(global_size, group_size):
     r = global_size % group_size
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..82f24dd8 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t i = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t group_size = get_local_size(0);
+    int counter;
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    // Stride by global_size so adjacent threads read adjacent elements
+    // on every pass (coalesced access; see the index-pattern sketch
+    // after the patch series).
+    for (counter = 0; counter * global_size + i < N; counter++) {
+        sum += x[i + counter * global_size];
     }
 
     fast[local_id] = sum;
@@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    int k;
+
+    // Binary tree reduction: the active half of the workgroup folds the
+    // inactive half into itself, halving k each pass; the barrier keeps
+    // the passes in step. (A serial model of this loop also appears
+    // after the patch series.)
+    for (k = group_size / 2; k > 0; k >>= 1) {
+        if (local_id < k) {
+            fast[local_id] += fast[local_id + k];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,6 +48,9 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    size_t group_size = get_local_size(0);
+
     int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
@@ -48,8 +61,13 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    int count;
+
+    // Each thread sums its own contiguous block of k elements, guarding
+    // against running past N in the last block.
+    for (count = global_id * k; count < (global_id + 1) * k; count++) {
+        if (count < N)
+        {
+            sum += x[count];
+        }
     }
 
     fast[local_id] = sum;
@@ -64,8 +82,14 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    uint d;
+
+    // Same binary tree reduction as in sum_coalesced.
+    for (d = group_size / 2; d > 0; d >>= 1) {
+        if (local_id < d) {
+            fast[local_id] += fast[local_id + d];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..12e94e13 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,27 @@
 #include "median9.h"
 
+// Helper function that replaces the version of FETCH
+// that Ray provided.
+// It clamps x and y to the image bounds, so an out-of-range
+// coordinate fetches the closest in-bounds pixel instead.
+float FETCH_new(__global __read_only float *in_values,
+                int width, int height,
+                int x, int y)
+{
+    if (x < 0)
+        x = 0;
+    else if (x > width - 1)
+        x = width - 1;
+
+    if (y < 0)
+        y = 0;
+    else if (y > height - 1)
+        y = height - 1;
+
+    return in_values[y * width + x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -31,4 +53,50 @@ median_3x3(__global __read_only float *in_values,
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
 
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    // Cooperatively load the halo'd region into local memory, one row
+    // per pass; only the first buf_w threads participate.
+    if (idx_1D < buf_w) {
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] =
+                FETCH_new(in_values, w, h,
+                          buf_corner_x + idx_1D,
+                          buf_corner_y + row);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (y < h && x < w)
+    {
+        int top = (buf_y - 1) * buf_w + buf_x;
+        int middle = buf_y * buf_w + buf_x;
+        int bottom = (buf_y + 1) * buf_w + buf_x;
+
+        out_values[y * w + x] = median9(
+            buffer[top - 1],    buffer[top],    buffer[top + 1],
+            buffer[middle - 1], buffer[middle], buffer[middle + 1],
+            buffer[bottom - 1], buffer[bottom], buffer[bottom + 1]
+        );
+    }
+}
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..931258b9 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -75,12 +75,50 @@ propagate_labels(__global __read_write int *labels,
     // the local buffer is loaded
     barrier(CLK_LOCAL_MEM_FENCE);
 
+    int current = buf_y * buf_w + buf_x;
+
     // Fetch the value from the buffer that corresponds to
     // the pixel for this thread
-    old_label = buffer[buf_y * buf_w + buf_x];
+    old_label = buffer[current];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+
+    /*
+    // Part 2: every thread fetches its own grandparent.
+    if (old_label < w * h)
+    {
+        buffer[current] = labels[old_label]; // grab grandparent
+    }
+    */
+
+    // Part 4: one thread per workgroup fetches all the grandparents,
+    // caching the most recent lookup so a run of repeated labels skips
+    // the global memory read.
+    if ((lx == 0) && (ly == 0))
+    {
+        int prev_key = -1000;  // sentinel: no real label is negative
+        int prev_result;
+
+        for (int i = 0; i < buf_w * buf_h; i++)
+        {
+            int this_label = buffer[i];
+
+            if (this_label >= w * h)  // skip walls
+                continue;
+
+            if (prev_key == this_label)
+            {
+                buffer[i] = prev_result;
+            }
+            else
+            {
+                prev_key = this_label;
+                prev_result = labels[prev_key];
+                buffer[i] = prev_result;  // update on a cache miss too
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
+
+        // Part 1: take the minimum over the 4-neighborhood; walls carry
+        // the out-of-range value w * h and are left alone.
+        if (new_label < w * h)
+        {
+            new_label =
+                min(buffer[current + buf_w],
+                min(buffer[current - buf_w],
+                min(buffer[current + 1],
+                min(buffer[current - 1], new_label))));
+        }
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
+
+            // Part 3: atomic_min performs the compare and the store as
+            // one indivisible step, so a concurrent smaller label is
+            // never overwritten (this is the race discussed in P5.txt,
+            // Part 5). Update both the old label's entry and this
+            // pixel's entry. The plain store that used to follow the
+            // flag update is dropped: it could race with and overwrite
+            // a smaller atomic write.
+            atomic_min(&labels[old_label], new_label);
+            atomic_min(&labels[y * w + x], new_label);
+
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
         }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..3e5092ac 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -2,7 +2,7 @@
 import sys
 import pyopencl as cl
 import numpy as np
-import pylab
+import matplotlib.pyplot as pylab
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
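As a footnote to HW3/P3/sum.cl: the only difference between the two
kernels is how indices are assigned to threads. This standalone C sketch
(illustrative only; N = 10 and global_size = 4 are made-up example
values) prints which elements of x each thread reads under each scheme:

    /* Contrasts the coalesced and blocked index assignments of sum.cl. */
    #include <stdio.h>

    int main(void) {
        int N = 10, global_size = 4;
        int k = (N + global_size - 1) / global_size;  /* ceil(N / global_size) */

        for (int i = 0; i < global_size; i++) {
            printf("thread %d  coalesced:", i);
            /* stride by global_size, as in sum_coalesced */
            for (int counter = 0; counter * global_size + i < N; counter++)
                printf(" x[%d]", i + counter * global_size);
            printf("   blocked:");
            /* contiguous chunk of k elements, as in sum_blocked */
            for (int count = i * k; count < (i + 1) * k; count++)
                if (count < N)
                    printf(" x[%d]", count);
            printf("\n");
        }
        return 0;
    }

In the coalesced scheme, threads 0..3 read x[0..3] together on the first
pass, so one memory transaction can serve the whole group; in the
blocked scheme each thread starts k elements apart.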
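The in-workgroup tree reduction shared by both kernels can also be
modeled serially. This sketch (the workgroup size of 8 and the sample
values are assumptions for illustration) mirrors the
fast[local_id] += fast[local_id + k] loop:

    /* Serial model of the tree reduction in sum.cl. On the device each
       inner loop is one parallel step, separated from the next by
       barrier(CLK_LOCAL_MEM_FENCE). */
    #include <stdio.h>

    int main(void) {
        float fast[8] = {1, 2, 3, 4, 5, 6, 7, 8};  /* one slot per thread */
        int group_size = 8;                        /* power of 2, as assumed */

        for (int k = group_size / 2; k > 0; k >>= 1)
            for (int local_id = 0; local_id < k; local_id++)
                fast[local_id] += fast[local_id + k];

        printf("sum = %g\n", fast[0]);  /* prints 36 */
        return 0;
    }

On the device, the barrier between passes is what makes it safe for
thread 0 to read the slot that thread k wrote in the previous pass.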