Hw3 #423

Open
wants to merge 7 commits into base: HW3
2 changes: 2 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,2 @@
The fastest configuration was the coalesced setup with 512 workgroups and 128 workers per workgroup.
This configuration completed the task in 0.00309848 seconds.
38 changes: 29 additions & 9 deletions HW3/P3/sum.cl
@@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int thread_id = get_global_id(0);
unsigned int step_size = get_global_size(0);

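// Adjacent threads read adjacent elements on each pass and then all jump ahead
// by get_global_size(0), so the global reads are coalesced.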
for (unsigned int i = thread_id; i < N; i += step_size) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
@@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int local_size = get_local_size(0);

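// Binary-tree reduction in local memory: each step, the active half of the
// work-group adds the upper half's partials into the lower half, halving j
// until fast[0] holds the work-group total.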
for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
if( local_id < j) {
fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +48,8 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));

int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +59,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int thread_id = get_global_id(0);

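// Each thread sums its own contiguous block of k elements, so neighbouring
// threads read addresses k apart -- these accesses are not coalesced.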
for (unsigned int i = k * thread_id; i < k * (thread_id + 1) && i < N; i++) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
@@ -64,8 +78,14 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int local_size = get_local_size(0);

for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
if( local_id < j) {
fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
6 changes: 4 additions & 2 deletions HW3/P3/tune.py
@@ -1,10 +1,12 @@
import pyopencl as cl
import numpy as np
import os

def create_data(N):
return host_x, x

if __name__ == "__main__":
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
N = 1e7

platforms = cl.get_platforms()
@@ -23,7 +25,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +42,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
65 changes: 65 additions & 0 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,16 @@
#include "median9.h"

// From HW3 P5
float
get_clamped_value(__global __read_only float *labels,
int w, int h,
int x, int y)
{
int c_x = min(w-1, max(0, x)), c_y = min(h-1, max(0, y));
return labels[c_y * w + c_x];
}


// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -22,13 +33,67 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Based on HW3 Problem 5

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Load the relevant labels to a local buffer with a halo
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] =
get_clamped_value(in_values,
w, h,
buf_corner_x + idx_1D, buf_corner_y + row);
}
}

// Make sure all threads reach the next part after
// the local buffer is loaded
barrier(CLK_LOCAL_MEM_FENCE);


// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

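// Precompute the local-buffer indices of this pixel's 3x3 neighbourhood
// (dy[i] selects the row offset, dx[j] the column offset; idxArr is row-major).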
const int dx[3] = {-1, 0, 1}, dy[3] = {-1, 0, 1};
int idxArr[9];

for( int i=0; i<3; i++ ) {
for ( int j=0; j<3; j++ ) {
idxArr[i*3+j] = (buf_y + dy[i])*buf_w + (buf_x + dx[j]);
}
}

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

//From HW3 P5
// stay in bounds
if ((x < w) && (y < h)) {
out_values[y*w + x] =
median9( buffer[ idxArr[0] ], buffer[ idxArr[1] ], buffer[ idxArr[2] ],
buffer[ idxArr[3] ], buffer[ idxArr[4] ], buffer[ idxArr[5] ],
buffer[ idxArr[6] ], buffer[ idxArr[7] ], buffer[ idxArr[8] ] );
}

}
61 changes: 61 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,61 @@
Part 1:
========================
Finished after 911 iterations, 214.33168 ms total, 0.235270779363 ms per iteration
Found 2 regions

Finished after 531 iterations, 124.05312 ms total, 0.233621694915 ms per iteration
Found 35 regions

Part 2:
========================
Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iteration
Found 2 regions

Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteration
Found 35 regions

Part 3:
========================
Finished after 8 iterations, 2.59184 ms total, 0.32398 ms per iteration
Found 2 regions

Finished after 8 iterations, 2.4332 ms total, 0.30415 ms per iteration
Found 35 regions

Part 4:
========================
Finished after 10 iterations, 7.37784 ms total, 0.737784 ms per iteration
Found 2 regions

Finished after 9 iterations, 6.66816 ms total, 0.740906666667 ms per iteration
Found 35 regions

Using a single thread for caching seems to have made the running time much slower (judging by the time per iteration).
Computation is serialized during this caching step, so there is a large upfront cost that has to be weighed against
the ongoing cost of expensive, potentially serialized, main-memory accesses.

I suspect the worst-case memory access scenario, where neighbouring nodes all try to fetch a single cached value from
main memory, only happens quite late in the process. Earlier iterations probably see a greater diversity of memory
accesses, since a wider range of label values still remains. It follows that with much more complex mazes there could
be scenarios where the overall iteration count is high but parts of the maze have already "stabilized" early on, so
those parts repeatedly draw on the same cached values. There do not seem to be enough iterations in these examples for
that to be worth the upfront cost (at least on my hardware setup). Other setups with different computation and
memory-access speeds might weigh the trade-off differently and show different results.

Part 5:
========================
If instead of atomic_min() we use min(), then the update step is not done in a single transaction, meaning between
the min() check and the subsequent update, a different thread could update the reference value. Suppose this
different thread actually updated the reference to an even lower value than what we had intended. Now, if we go ahead
with our update, we are actually *increasing* the reference value. It could therefore also lead to an increase in
this value between iterations.

So while dropping atomic_min() would make each iteration faster, since memory accesses are no longer serialized, it
can also make the run slower overall because more iterations have to be performed. Empirical testing is needed to
determine which trade-off wins; my sense is that for simple, low-iteration mazes atomic_min() presents a significant
overhead, so it might be better to accept a few extra iterations, while the opposite is likely true for high-iteration
mazes.

This makes things less efficient, since the cached labels are potentially being updated with worse values. However,
because the stopping condition is ultimately whether any more updates were performed, it won't affect the correctness
of the algorithm.
45 changes: 44 additions & 1 deletion HW3/P5/label_regions.cl
@@ -35,6 +35,8 @@ propagate_labels(__global __read_write int *labels,
int buf_w, int buf_h,
const int halo)
{

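// Offsets of the four connected neighbours: left, up, right, down.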
const int dx[4] = {-1, 0, 1, 0}, dy[4] = {0, -1, 0, 1};
// halo is the additional number of cells in one direction

// Global position of output pixel
@@ -80,20 +82,61 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Part 2
/*
for( int i=0; i<4; i++ ) {
if( buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] < w * h ) {
buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] = labels[ buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ];
}
}
*/

// Part 4
// Reference: Piazza @524
unsigned int ls0 = get_local_size(0), ls1 = get_local_size(1);

if( lx == 0 && ly == 0 ) { //Use the first thread
unsigned int prev = -1, gparent = -1; // one-entry cache: -1 wraps to UINT_MAX, so it starts empty
for( int c_lx = 0; c_lx < ls0; c_lx++ ) { // Update the entire local buffer
for( int c_ly = 0; c_ly < ls1; c_ly++ ) {
unsigned int cur_idx = (c_ly + halo) * buf_w + (c_lx + halo);
unsigned int parent = buffer[cur_idx];

if( parent == w * h ) continue; // Background pixel

if( parent == prev ) { // 1 variable cache success!
buffer[cur_idx] = gparent;
}
else { // Update the cache
buffer[cur_idx] = labels[parent];
prev = parent;
gparent = buffer[cur_idx];
}
}
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;
if( new_label != w * h ) { // See Piazza @486
for( int i=0; i<4; i++ ) {
new_label = min( new_label, buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] );
}
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;
atomic_min(&labels[old_label], new_label);
}
}
}
2 changes: 1 addition & 1 deletion HW3/P5/label_regions.py
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):

program = cl.Program(context, open('label_regions.cl').read()).build(options='')

host_image = np.load('maze1.npy')
host_image = np.load('maze2.npy')
host_labels = np.empty_like(host_image)
host_done_flag = np.zeros(1).astype(np.int32)
