HW3 #414

Open · wants to merge 2 commits into master

17 changes: 15 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -13,7 +13,20 @@ mandelbrot(__global __read_only float *coords_real,
    int iter;

    if ((x < w) && (y < h)) {
        // Read this pixel's value of c and start the orbit at z = 0.
        c_real = coords_real[y*w+x];
        c_imag = coords_imag[y*w+x];
        z_real = 0;
        z_imag = 0;
        // Escape-time iteration: z <- z^2 + c until |z|^2 > 4 or max_iter is reached.
        for (iter = 0; iter < max_iter; iter++) {
            if (z_real*z_real + z_imag*z_imag > 4.0) {
                break;
            }
            float z_real_temp = z_real*z_real - z_imag*z_imag;
            float z_imag_temp = 2*z_imag*z_real;
            z_real = z_real_temp + c_real;
            z_imag = z_imag_temp + c_imag;
        }
        out_counts[y*w+x] = iter;
    }
}
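For reference, the same escape-time rule applied to a single point c, as a minimal plain-Python sketch (the function name and the default max_iter are illustrative, not taken from the assignment):

def escape_count(c_real, c_imag, max_iter=511):
    # Iterate z <- z^2 + c from z = 0 until |z|^2 exceeds 4 or max_iter is reached.
    z_real = z_imag = 0.0
    for it in range(max_iter):
        if z_real * z_real + z_imag * z_imag > 4.0:
            return it
        z_real, z_imag = (z_real * z_real - z_imag * z_imag + c_real,
                          2.0 * z_real * z_imag + c_imag)
    return max_iter

print(escape_count(0.0, 0.0))   # 511: the origin never escapes
print(escape_count(2.0, 2.0))   # 1: a point far outside the set escapes almost immediately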
13 changes: 13 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,13 @@
My computer has the following specs:
#0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple
#1: Intel(R) Iris(TM) Graphics 6100 on Apple

It appears that the best configuration is not the same across different runs. The best configurations tend to be toward
the largest numbers of work groups and workers, which is intuitive, but the marginal benefit of adding more work groups
and workers is very small. (A more robust way to pick a configuration is sketched after the timings below.)

configuration ('coalesced', 512, 64): 0.00334184 seconds
configuration ('coalesced', 512, 128): 0.00338008 seconds
configuration ('coalesced', 256, 128): 0.00328632 seconds
configuration ('coalesced', 256, 128): 0.00327048 seconds
configuration ('coalesced', 512, 64): 0.00270928 seconds
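Since the fastest configuration varies from run to run, one option is to average each configuration's time over several repetitions before taking the minimum. A plain-Python sketch, assuming runs is a list of times dictionaries like the one tune.py builds (one per repetition):

from collections import defaultdict

def best_config(runs):
    # Average each configuration's time across repetitions, then take the minimum.
    totals = defaultdict(float)
    for run in runs:
        for config, seconds in run.items():
            totals[config] += seconds
    averages = {config: total / len(runs) for config, total in totals.items()}
    best = min(averages, key=averages.get)
    return best, averages[best]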
40 changes: 31 additions & 9 deletions HW3/P3/sum.cl
@@ -5,11 +5,18 @@ __kernel void sum_coalesced(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
    // Thread/group indexing used throughout the kernel.
    size_t global_id = get_global_id(0);
    long global_size = get_global_size(0);
    size_t group_id = get_group_id(0);
    long group_size = get_local_size(0);
    int idx;

    // thread i (i.e., with i = get_global_id()) should add x[i],
    // x[i + get_global_size()], ... up to N-1, and store in sum.
    // Strided accumulation: on each pass, adjacent threads read adjacent
    // elements, so the global reads are coalesced.
    for (idx = 0; global_id + idx * global_size < N; idx++) {
        sum = sum + x[global_id + idx * global_size];
    }

    fast[local_id] = sum;
@@ -24,11 +31,15 @@ __kernel void sum_coalesced(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Binary reduction in local memory: halve the number of active threads
    // each round until fast[0] holds the work-group's total.
    for (uint s = group_size / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[group_id] = fast[0];
}

__kernel void sum_blocked(__global float* x,
@@ -38,7 +49,13 @@ __kernel void sum_blocked(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
    // Thread/group indexing used throughout the kernel.
    size_t global_id = get_global_id(0);
    long global_size = get_global_size(0);
    size_t group_id = get_group_id(0);
    long group_size = get_local_size(0);
    // Each thread sums a contiguous block of k elements.
    int k = ceil((float)N / get_global_size(0));
    int idx;

    // thread with global_id 0 should add 0..k-1
    // thread with global_id 1 should add k..2k-1
@@ -48,8 +65,9 @@ __kernel void sum_blocked(__global float* x,
    //
    // Be careful that each thread stays in bounds, both relative to
    // size of x (i.e., N), and the range it's assigned to sum.
    // Blocked accumulation: thread global_id sums its contiguous chunk
    // x[global_id*k .. (global_id+1)*k - 1], staying within N.
    for (idx = global_id * k; (idx < (global_id + 1) * k) && (idx < N); idx++) {
        sum = sum + x[idx];
    }

    fast[local_id] = sum;
@@ -64,8 +82,12 @@ __kernel void sum_blocked(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Binary reduction in local memory, identical to sum_coalesced.
    for (uint s = group_size / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
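To make the two access patterns and the tree reduction concrete, here is a small NumPy sketch (purely illustrative; none of the names below come from the assignment):

import numpy as np

N, num_threads = 20, 4
x = np.arange(N, dtype=np.float32)

# Coalesced: thread i reads x[i], x[i + num_threads], x[i + 2*num_threads], ...
coalesced = np.array([x[i::num_threads].sum() for i in range(num_threads)])

# Blocked: thread i reads the contiguous chunk x[i*k : (i+1)*k], k = ceil(N / num_threads).
k = -(-N // num_threads)
blocked = np.array([x[i * k:(i + 1) * k].sum() for i in range(num_threads)])

# Binary reduction: halve the number of partial sums each round
# (num_threads is a power of 2, as the kernels assume for the work-group size).
partial = coalesced.copy()
while partial.size > 1:
    half = partial.size // 2
    partial = partial[:half] + partial[half:half * 2]

assert partial[0] == blocked.sum() == x.sum()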
9 changes: 5 additions & 4 deletions HW3/P3/tune.py
@@ -34,10 +34,11 @@ def create_data(N):
sum_gpu = sum(host_partial)
sum_host = sum(host_x)
seconds = (event.profile.end - event.profile.start) / 1e9
# print("sum gpu", sum_gpu, "sum host", sum_host)
assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['coalesced', num_workgroups, num_workers] = seconds
print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
format(num_workgroups, num_workers, seconds))
# print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
# format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
@@ -53,8 +54,8 @@ def create_data(N):
seconds = (event.profile.end - event.profile.start) / 1e9
assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['blocked', num_workgroups, num_workers] = seconds
print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
format(num_workgroups, num_workers, seconds))
# print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
# format(num_workgroups, num_workers, seconds))

best_time = min(times.values())
best_configuration = [config for config in times if times[config] == best_time]
68 changes: 67 additions & 1 deletion HW3/P4/median_filter.cl
@@ -1,5 +1,29 @@
#include "median9.h"

// Helper that clamps (x, y) to the image bounds: if the requested pixel is
// outside the image, return the value of the nearest valid pixel instead.
// Modified based on the P5 code.
static float
check_inbound(__global __read_only float *in_values,
              int w, int h,
              int x, int y) {
    // Clamp the column index.
    if (x < 0) {
        x = 0;
    }
    else if (x >= w) {
        x = w - 1;
    }
    // Clamp the row index.
    if (y < 0) {
        y = 0;
    }
    else if (y >= h) {
        y = h - 1;
    }
    return in_values[y * w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -12,7 +36,22 @@ median_3x3(__global __read_only float *in_values,
    // Note: It may be easier for you to implement median filtering
    // without using the local buffer, first, then adjust your code to
    // use such a buffer after you have that working.

    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Local position relative to (0, 0) in workgroup
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // coordinates of the upper left corner of the buffer in image
    // space, including halo
    const int buf_corner_x = x - lx - halo;
    const int buf_corner_y = y - ly - halo;

    // coordinates of our pixel in the local buffer
    const int buf_x = lx + halo;
    const int buf_y = ly + halo;

    // Load into buffer (with 1-pixel halo).
    //
@@ -22,13 +61,40 @@ median_3x3(__global __read_only float *in_values,
    // Note that globally out-of-bounds pixels should be replaced
    // with the nearest valid pixel's value.

    // 1D index of thread within our work-group
    const int idx_1D = ly * get_local_size(0) + lx;

    // Load the relevant image values into the local buffer, including the
    // halo, clamping out-of-bounds reads via check_inbound().
    if (idx_1D < buf_w) {
        for (int row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                check_inbound(in_values,
                              w, h,
                              buf_corner_x + idx_1D, buf_corner_y + row);
        }
    }

    // Make sure all threads reach the next part after
    // the local buffer is loaded
    barrier(CLK_LOCAL_MEM_FENCE);

    // Compute 3x3 median for each pixel in core (non-halo) pixels
    //
    // We've given you median9.h, and included it above, so you can
    // use the median9() function.

    // Each thread in the valid region (x < w, y < h) should write
    // back its 3x3 neighborhood median.
    if ((x < w) && (y < h)) {
        out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x-1],
                                        buffer[(buf_y-1) * buf_w + buf_x],
                                        buffer[(buf_y-1) * buf_w + buf_x+1],
                                        buffer[(buf_y)   * buf_w + buf_x-1],
                                        buffer[(buf_y)   * buf_w + buf_x],
                                        buffer[(buf_y)   * buf_w + buf_x+1],
                                        buffer[(buf_y+1) * buf_w + buf_x-1],
                                        buffer[(buf_y+1) * buf_w + buf_x],
                                        buffer[(buf_y+1) * buf_w + buf_x+1]);
    }
}
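As a sanity check on what the kernel computes, here is a small NumPy reference (the function name is ours; img is assumed to be a 2-D float32 array):

import numpy as np

def median_3x3_reference(img):
    # Clamp-to-edge padding plays the same role as check_inbound().
    padded = np.pad(img, 1, mode='edge')
    out = np.empty_like(img)
    h, w = img.shape
    for y in range(h):
        for x in range(w):
            out[y, x] = np.median(padded[y:y + 3, x:x + 3])
    return out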
1 change: 1 addition & 0 deletions HW3/P4/median_filter.py
@@ -69,6 +69,7 @@ def numpy_median(image, iterations=10):
# +2 for 1-pixel halo on all sides, 4 bytes for float.
local_memory = cl.LocalMemory(4 * (local_size[0] + 2) * (local_size[1] + 2))
# Each work group will have its own private buffer.
buf_width = np.int32(local_size[0] + 2)
buf_height = np.int32(local_size[1] + 2)
halo = np.int32(1)
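As a worked example of that sizing, assuming a work-group of local_size = (8, 8) (the value here is only an illustration):

local_size = (8, 8)
buf_bytes = 4 * (local_size[0] + 2) * (local_size[1] + 2)   # 4 * 10 * 10 = 400 bytes per work group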
63 changes: 63 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,63 @@
Part 1:

Maze 1:
Finished after 914 iterations, 204.70632 ms total, 0.223967527352 ms per iteration
Found 2 regions
Maze 2:
Finished after 532 iterations, 118.44736 ms total, 0.222645413534 ms per iteration
Found 35 regions


Part 2:

Maze 1:
Finished after 529 iterations, 114.21496 ms total, 0.215907296786 ms per iteration
Found 2 regions
Maze 2:
Finished after 273 iterations, 59.77376 ms total, 0.218951501832 ms per iteration
Found 35 regions


Part 3:

Maze 1:
Finished after 10 iterations, 3.06184 ms total, 0.306184 ms per iteration
Found 2 regions
Maze 2:
Finished after 9 iterations, 2.65912 ms total, 0.295457777778 ms per iteration
Found 35 regions

Part 4:

Maze 1:
Finished after 13 iterations, 10.22584 ms total, 0.786603076923 ms per iteration
Found 2 regions
Maze 2:
Finished after 12 iterations, 9.34432 ms total, 0.778693333333 ms per iteration
Found 35 regions

Discussion: Using the single-thread replacement slows down performance in my case. This drop is expected: we now look up
the values with a single thread, even though we skip some redundant memory reads. GPU memory reads are relatively cheap,
so the savings here do not outweigh the loss from serializing the work onto one thread.
Therefore, it is not a reasonable choice here. If the computation were even more intensive, or if we had more cores, we
should be even more inclined to keep the old, fully parallel version instead of the single-threaded one.
On the other hand, there are scenarios where the serialization would be more beneficial:
1) if labels are more likely to be the same, we would be able to skip many more memory reads;
2) if GPU reads were much slower, then, as in 1), we would get a better speedup by reading far fewer values;
3) if a large number of threads are contending for the same part of local memory.

Part 5:

Similar to a lock, the atomic operations ensure that when multiple threads access the same old label, the label is
updated before it is used as the input for the next new-label calculation. My understanding is that atomic operations
can do this because the comparison and the write happen in one indivisible step, preventing other threads from
intervening in the middle of the read-modify-write.
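A minimal plain-Python sketch of the lost-update interleaving this guards against (the labels 7, 5, and 6 are made up for illustration):

labels = [7]

# Non-atomic min: both "threads" read the old value before either writes.
old_a = labels[0]           # thread A reads 7
old_b = labels[0]           # thread B reads 7
labels[0] = min(old_a, 5)   # thread A writes 5
labels[0] = min(old_b, 6)   # thread B overwrites with 6, so A's update is lost
print(labels[0])            # 6, not the correct minimum

# atomic_min performs the compare and the write as one indivisible step,
# so the stored value would end up as min(7, 5, 6) == 5 regardless of interleaving.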

I think the result would still be correct, because the labels within a region would eventually converge to the same
value, but it may hurt performance: the race condition leads to redundant updates, so the number of iterations should
increase and fluctuate from run to run. I don't think the values in labels would ever increase between iterations.

I tested min() instead of atomic_min() with the code and confirmed that the iteration count is larger. However, the
overall performance did not change (it was even slightly faster), which I find very interesting.