HW3 #420

Open · wants to merge 5 commits into base: HW3

17 changes: 17 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,17 @@
Using a VM on Windows set up for CS205:

Best configuration: ('blocked', 16, 4), 0.004673046 seconds


The platforms detected are:
---------------------------
AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8)
The devices detected on platform AMD Accelerated Parallel Processing are:
---------------------------
Intel(R) Core(TM) i7-3632QM CPU @ 2.20GHz [Type: CPU ]
Maximum clock Frequency: 2195 MHz
Maximum allocable memory size: 1049 MB
Maximum work group size 1024
---------------------------
This context is associated with 1 devices
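The device report above is the output of a platform-enumeration script that is not included in this PR. For reference, a minimal PyOpenCL sketch of the kind of query that produces it (the loop structure and formatting here are assumptions):

    import pyopencl as cl

    for platform in cl.get_platforms():
        print('The platforms detected are:')
        print('---------------------------')
        print('%s %s version: %s' % (platform.name, platform.vendor, platform.version))
        print('The devices detected on platform %s are:' % platform.name)
        for device in platform.get_devices():
            print('---------------------------')
            print('%s [Type: %s ]' % (device.name, cl.device_type.to_string(device.type)))
            print('Maximum clock Frequency: %d MHz' % device.max_clock_frequency)
            print('Maximum allocable memory size: %d MB' % (device.max_mem_alloc_size // 10**6))
            print('Maximum work group size %d' % device.max_work_group_size)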
38 changes: 29 additions & 9 deletions HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t local_size = get_local_size(0);
+    int n;

    // thread i (i.e., with i = get_global_id()) should add x[i],
    // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (n = 0; global_id + n * global_size < N; n++) { // YOUR CODE HERE
+        sum += x[global_id + n * global_size]; // YOUR CODE HERE
    }

    fast[local_id] = sum;
@@ -24,8 +28,13 @@ __kernel void sum_coalesced(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    for (uint j = local_size / 2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if (local_id < j)
+        {
+            fast[local_id] += fast[local_id+j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +47,11 @@ __kernel void sum_blocked(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
+    size_t global_id = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t local_size = get_local_size(0);
+    int n;

    // thread with global_id 0 should add 0..k-1
    // thread with global_id 1 should add k..2k-1
@@ -48,8 +61,11 @@ __kernel void sum_blocked(__global float* x,
    //
    // Be careful that each thread stays in bounds, both relative to
    // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (n = k*global_id; n < (global_id + 1) * k; n++) { // YOUR CODE HERE
+        if (n < N)
+        {
+            sum += x[n];
+        }
    }

    fast[local_id] = sum;
@@ -64,8 +80,12 @@ __kernel void sum_blocked(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint j = local_size / 2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if (local_id < j)
+        {
+            fast[local_id] += fast[local_id+j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
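Both kernels above share the same two-phase shape: each thread first serially accumulates a private sum (striding by get_global_size(0) in the coalesced version, walking a contiguous block of k elements in the blocked version), then each work-group collapses its private sums with a binary tree in local memory. A NumPy sketch of that logic on the host, with a hypothetical helper device_sum and toy sizes, not the device code itself:

    import numpy as np

    def device_sum(x, global_size, local_size, blocked):
        # Phase 1: each of global_size "threads" accumulates a private sum.
        N = len(x)
        sums = np.zeros(global_size, dtype=np.float32)
        k = int(np.ceil(float(N) / global_size))
        for gid in range(global_size):
            if blocked:
                # sum_blocked: thread gid takes the contiguous run [k*gid, k*(gid+1));
                # numpy clips the slice at N, which the kernel's if (n < N) check does
                sums[gid] = x[k * gid:k * (gid + 1)].sum()
            else:
                # sum_coalesced: thread gid takes x[gid], x[gid + global_size], ...
                sums[gid] = x[gid::global_size].sum()
        # Phase 2: binary-tree reduction within each work-group, halving the
        # number of active threads each step, exactly like the j >>= 1 loops above.
        partial = []
        for g in range(global_size // local_size):
            fast = sums[g * local_size:(g + 1) * local_size].copy()
            j = local_size // 2
            while j > 0:
                fast[:j] += fast[j:2 * j]
                j //= 2
            partial.append(fast[0])   # thread 0 writes the group's partial sum
        return np.float32(np.sum(partial))

    x = np.arange(10000, dtype=np.float32)
    print('%s %s %s' % (device_sum(x, 64, 16, False), device_sum(x, 64, 16, True), x.sum()))

The coalesced pattern is usually faster on GPUs, where adjacent threads touch adjacent addresses on each step; the P3 results above were measured on a CPU device, where the blocked pattern won.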
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
-    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
               format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
-    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
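The buffer-size fix is byte arithmetic: each work-group writes exactly one float32 partial sum, so 4 * num_workgroups bytes suffice and the extra trailing 4 bytes were never used. A self-contained check of that arithmetic (the final reduction of host_partial is presumably a single sum on the host):

    import numpy as np

    num_workgroups = 512
    host_partial = np.empty(num_workgroups, dtype=np.float32)
    # one 4-byte partial sum per work-group; no extra trailing slot is needed
    assert host_partial.nbytes == 4 * num_workgroups
    print('%d bytes for %d work-groups' % (host_partial.nbytes, num_workgroups))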
72 changes: 72 additions & 0 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,27 @@
#include "median9.h"

// clamp (x, y) to the nearest in-bounds pixel and return its value
float get_in(__global __read_only float *in_values, int w, int h, int x, int y)
{
    if (x < 0)
    {
        x = 0;
    }
    else if (x > w - 1)
    {
        x = w - 1;
    }
    if (y < 0)
    {
        y = 0;
    }
    else if (y > h - 1)
    {
        y = h - 1;
    }
    return in_values[y * w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -22,6 +44,43 @@ median_3x3(__global __read_only float *in_values,
    // Note that globally out-of-bounds pixels should be replaced
    // with the nearest valid pixel's value.

    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Local position relative to (0, 0) in workgroup
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // coordinates of the upper left corner of the buffer in image
    // space, including halo
    const int buf_corner_x = x - lx - halo;
    const int buf_corner_y = y - ly - halo;

    // coordinates of our pixel in the local buffer
    const int buf_x = lx + halo;
    const int buf_y = ly + halo;

    // 1D index of thread within our work-group
    const int idx_1D = ly * get_local_size(0) + lx;

    int row;

    // the first buf_w threads each load one column of the buffer,
    // clamping out-of-bounds reads to the nearest valid pixel
    if (idx_1D < buf_w)
    {
        for (row = 0; row < buf_h; row++)
        {
            buffer[row * buf_w + idx_1D] = get_in(in_values, w, h,
                    buf_corner_x + idx_1D, buf_corner_y + row);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Processing code here...
    //
    // Should only use buffer, buf_x, buf_y.

    // Compute 3x3 median for each pixel in core (non-halo) pixels
    //
@@ -31,4 +90,17 @@ median_3x3(__global __read_only float *in_values,

    // Each thread in the valid region (x < w, y < h) should write
    // back its 3x3 neighborhood median.
    if (x < w && y < h)
    {
        out_values[y * w + x] = median9(buffer[(buf_y - 1) * buf_w + buf_x - 1],
                                        buffer[(buf_y - 1) * buf_w + buf_x],
                                        buffer[(buf_y - 1) * buf_w + buf_x + 1],
                                        buffer[(buf_y) * buf_w + buf_x - 1],
                                        buffer[(buf_y) * buf_w + buf_x],
                                        buffer[(buf_y) * buf_w + buf_x + 1],
                                        buffer[(buf_y + 1) * buf_w + buf_x - 1],
                                        buffer[(buf_y + 1) * buf_w + buf_x],
                                        buffer[(buf_y + 1) * buf_w + buf_x + 1]);
    }
}
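The kernel's clamp-to-edge 3x3 median is easy to model on the host for checking small images. A NumPy sketch (the helper name median_3x3_host and the random test image are made up here):

    import numpy as np

    def median_3x3_host(img):
        # reference implementation of the kernel: clamp-to-edge 3x3 median
        h, w = img.shape
        out = np.empty_like(img)
        for y in range(h):
            for x in range(w):
                # clamp neighbor coordinates to the image, mirroring get_in()
                ys = np.clip([y - 1, y, y + 1], 0, h - 1)
                xs = np.clip([x - 1, x, x + 1], 0, w - 1)
                out[y, x] = np.median(img[np.ix_(ys, xs)])
        return out

    img = np.random.rand(8, 8).astype(np.float32)
    print(median_3x3_host(img))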
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
    r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

-program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+curdir = os.path.dirname(os.path.realpath(__file__))
+program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
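Passing options=['-I', curdir] to build() adds the script's own directory to the kernel compiler's include path, so the #include "median9.h" at the top of median_filter.cl resolves no matter which directory the script is launched from; os.path.realpath(__file__) pins down the script's location.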
47 changes: 47 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,47 @@
Part 1:

Maze 1:
Finished after 879 iterations, 261.9836 ms total, 0.298047326507 ms per iteration
Found 2 regions

Maze 2:
Finished after 514 iterations, 152.45928 ms total, 0.296613385214 ms per iteration
Found 35 regions


Part 2:

Maze 1:
Finished after 529 iterations, 157.65824 ms total, 0.298030699433 ms per iteration
Found 2 regions

Maze 2:
Finished after 273 iterations, 81.33136 ms total, 0.297917069597 ms per iteration
Found 35 regions


Part 3:

Maze 1:
Finished after 10 iterations, 3.07216 ms total, 0.307216 ms per iteration
Found 2 regions

Maze 2:
Finished after 10 iterations, 3.03024 ms total, 0.303024 ms per iteration
Found 35 regions

Part 4:

Maze 1:
Finished after 11 iterations, 8.83696 ms total, 0.80336 ms per iteration
Found 2 regions

Maze 2:
Finished after 10 iterations, 8.00088 ms total, 0.800088 ms per iteration
Found 35 regions

For part 4, the "optimization" of reducing redundant global memory reads, by having a single thread per work-group look up each grandparent label once, actually slowed the program down (roughly 0.80 ms per iteration versus roughly 0.30 ms for part 2). The lookups that all threads previously issued in parallel are now serialized in one thread, so the reads and updates happen one at a time, and since global memory access is not the limiting factor on this device, the serialization cost outweighs the saved memory traffic. This implementation would only pay off on hardware where global memory access is much slower relative to compute.


Part 5:
If we replace atomic_min() with min(), the memory updates are no longer serialized, but we lose the guarantee that every update lands in the iteration that issued it. There are two competing effects, increased speed versus lost accuracy within an iteration, and it is hard to say in advance which wins. The parallel speedup is largest when threads are updating different labels; the behavior is simply harder to reason about when the min updates are not atomic. For example, one thread may be lowering a label to 5 while another is lowering it to 2; depending on which write lands last, the label can stall at 5 until the write of 2 finally takes effect. A label can therefore never increase from one iteration to the next, but a decrease can be postponed. Eventually the labels still converge to the same fixed point; only the number of iterations to get there would vary from run to run.
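A toy model of that convergence argument, on a hypothetical 1-D "maze" rather than the real kernel: because every update is a min, labels never increase, so repeated sweeps reach the same fixed point even when individual updates land late.

    import numpy as np

    # walls split the row into regions {0,1}, {3,4,5}, {7}
    walls = np.array([0, 0, 1, 0, 0, 0, 1, 0], dtype=bool)
    labels = np.arange(len(walls))   # every open pixel starts as its own label
    labels[walls] = len(walls)       # walls carry an out-of-range label

    iterations = 0
    changed = True
    while changed:
        old = labels.copy()
        for i in np.where(~walls)[0]:
            # take the min over the pixel and its open neighbors
            neighbors = [labels[j] for j in (i - 1, i + 1)
                         if 0 <= j < len(walls) and not walls[j]]
            labels[i] = min([labels[i]] + neighbors)
        changed = not np.array_equal(old, labels)
        iterations += 1
    print('%d iterations -> %s' % (iterations, labels))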
55 changes: 55 additions & 0 deletions HW3/P5/label_regions.cl
@@ -80,20 +80,75 @@ propagate_labels(__global __read_write int *labels,
    old_label = buffer[buf_y * buf_w + buf_x];

    // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
    // Part 2:
    //if (old_label < w * h)
    //{
    //    buffer[buf_y * buf_w + buf_x] = labels[old_label];
    //}

    // Part 4:
    // once per work-group, a single thread fetches the grandparent labels
    if (lx == 0 && ly == 0)
    {
        int cur_label;
        int prev_label = -25;   // sentinel: no label cached yet
        int prev_grand_label;
        int pixels = buf_w * buf_h;
        int i;
        // iterate over the work-group's pixels
        for (i = 0; i < pixels; i++)
        {
            cur_label = buffer[i];

            // within the picture
            if (cur_label < w * h)
            {
                // if the label equals the previous one, reuse the cached
                // grandparent label
                if (cur_label == prev_label)
                {
                    buffer[i] = prev_grand_label;
                }
                // otherwise fetch the grandparent label, then cache it
                else
                {
                    prev_label = cur_label;
                    prev_grand_label = labels[prev_label];
                    buffer[i] = prev_grand_label;
                }
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // stay in bounds
    new_label = old_label;
    if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
        // We set new_label to the value of old_label, but you will need
        // to adjust this for correctness.
        new_label = old_label;
[Review comment] After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.

        if (new_label < w * h)
        {
            // take the min of the pixel and its four adjacent neighbors
            new_label = min(new_label,
                        min(buffer[(buf_y - 1) * buf_w + buf_x],
                        min(buffer[(buf_y) * buf_w + buf_x - 1],
                        min(buffer[(buf_y) * buf_w + buf_x + 1],
                            buffer[(buf_y + 1) * buf_w + buf_x]))));
        }

        if (new_label != old_label) {
            // CODE FOR PART 3 HERE
            // indicate there was a change this iteration.
            // multiple threads might write this.
            // atomic update
            atomic_min(&labels[old_label], new_label);
            atomic_min(&labels[y * w + x], new_label);

            *(changed_flag) += 1;
            labels[y * w + x] = new_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line needs to be removed (you are doing the atomic_min update in 147).


}
        }
    }
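For reference, the part 4 strategy in propagate_labels can be modeled on the host: one "thread" sweeps the work-group's buffer and memoizes the labels[...] lookup, so a run of equal labels costs a single global read. A NumPy sketch (fetch_grandparents and the toy data are made up here):

    import numpy as np

    def fetch_grandparents(buffer, labels, background):
        # sweep the buffer once, memoizing labels[...] lookups so a run
        # of equal labels costs a single global read (the part 4 strategy)
        prev_label = -25          # sentinel: nothing cached yet
        prev_grand = None
        reads = 0
        out = buffer.copy()
        for i, cur in enumerate(buffer):
            if cur >= background:  # skip wall/out-of-range labels
                continue
            if cur != prev_label:  # only touch "global memory" on a new label
                prev_label = cur
                prev_grand = labels[cur]
                reads += 1
            out[i] = prev_grand
        return out, reads

    labels = np.arange(10)         # grandparent table: identity here for simplicity
    buffer = np.array([3, 3, 3, 5, 5, 3, 10, 10])
    out, reads = fetch_grandparents(buffer, labels, background=10)
    open_pixels = int((buffer < 10).sum())
    print('%s: %d reads instead of %d' % (out, reads, open_pixels))

The saving depends on long runs of equal labels, and it only pays off when the serialized sweep is cheaper than the parallel reads it replaces; as the part 4 timings in P5.txt show, that was not the case on this hardware.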