diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..e626738e
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,2 @@
+The fastest configuration was the coalesced setup with 512 workgroups and 128 workers per workgroup.
+This configuration completed the task in 0.00309848 seconds.
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..bf913a6c 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x,
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    unsigned int thread_id = get_global_id(0);
+    unsigned int step_size = get_global_size(0);
+
+    for (unsigned int i = thread_id; i < N; i += step_size) { // YOUR CODE HERE
+        sum += x[i]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    unsigned int local_size = get_local_size(0);
+
+    for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if( local_id < j) {
+            fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +48,8 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +59,11 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    unsigned int thread_id = get_global_id(0);
+
+    for (unsigned int i = k * thread_id; i < k * (thread_id + 1) && i < N; i++) { // YOUR CODE HERE
+        sum += x[i]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -64,8 +78,14 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    unsigned int local_size = get_local_size(0);
+
+    for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if( local_id < j) {
+            fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..e4387ccc 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -1,10 +1,12 @@
 import pyopencl as cl
 import numpy as np
+import os
 
 def create_data(N):
     return host_x, x
 
 if __name__ == "__main__":
+    os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
     N = 1e7
 
     platforms = cl.get_platforms()
@@ -23,7 +25,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +42,7 @@ def create_data(N):
               format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..6971a7b9 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,16 @@
 #include "median9.h"
 
+// From HW3 P5
+float
+get_clamped_value(__global __read_only float *labels,
+                  int w, int h,
+                  int x, int y)
+{
+    int c_x = min(w-1, max(0, x)), c_y = min(h-1, max(0, y));
+    return labels[c_y * w + c_x];
+}
+
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -22,13 +33,67 @@ median_3x3(__global __read_only float *in_values,
 
     // Note that globally out-of-bounds pixels should be replaced
    // with the nearest valid pixel's value.
+    // Based on HW3 Problem 5
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    // Load the relevant labels to a local buffer with a halo
+    if (idx_1D < buf_w) {
+        for (int row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] =
+                get_clamped_value(in_values,
+                                  w, h,
+                                  buf_corner_x + idx_1D, buf_corner_y + row);
+        }
+    }
+
+    // Make sure all threads reach the next part after
+    // the local buffer is loaded
+    barrier(CLK_LOCAL_MEM_FENCE);
+
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
 
+    const int dx[3] = {-1, 0, 1}, dy[3] = {-1, 0, 1};
+    int idxArr[9];
+
+    for( int i=0; i<3; i++ ) {
+        for ( int j=0; j<3; j++ ) {
+            idxArr[i*3+j] = (buf_y + dy[i])*buf_w + (buf_x + dx[j]);
+        }
+    }
 
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    // From HW3 P5
+    // stay in bounds
+    if ((x < w) && (y < h)) {
+        out_values[y*w + x] =
+            median9( buffer[ idxArr[0] ], buffer[ idxArr[1] ], buffer[ idxArr[2] ],
+                     buffer[ idxArr[3] ], buffer[ idxArr[4] ], buffer[ idxArr[5] ],
+                     buffer[ idxArr[6] ], buffer[ idxArr[7] ], buffer[ idxArr[8] ] );
+    }
+
 }
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..589ef6b3
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,61 @@
+Part 1:
+========================
+Finished after 911 iterations, 214.33168 ms total, 0.235270779363 ms per iteration
+Found 2 regions
+
+Finished after 531 iterations, 124.05312 ms total, 0.233621694915 ms per iteration
+Found 35 regions
+
+Part 2:
+========================
+Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iteration
+Found 2 regions
+
+Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteration
+Found 35 regions
+
+Part 3:
+========================
+Finished after 8 iterations, 2.59184 ms total, 0.32398 ms per iteration
+Found 2 regions
+
+Finished after 8 iterations, 2.4332 ms total, 0.30415 ms per iteration
+Found 35 regions
+
+Part 4:
+========================
+Finished after 10 iterations, 7.37784 ms total, 0.737784 ms per iteration
+Found 2 regions
+
+Finished after 9 iterations, 6.66816 ms total, 0.740906666667 ms per iteration
+Found 35 regions
+
+Using a single thread for caching seems to have made the running time much slower (looking at the time per iteration).
+Computation is serialized during this caching process, so there is a big upfront cost that needs to be weighed
+against the ongoing cost of expensive, potentially serialized, main memory accesses.
+
+I suspect the worst-case memory access scenario, where neighbouring nodes all try to access main memory for a single
+cached value, only happens quite late in the process. Earlier iterations probably face a greater diversity of memory
+accesses since there is a greater range of values still remaining. It follows, then, that with much more
+complex mazes there could be scenarios where the overall iteration count is high, but parts of the maze have
+already "stabilized" early in the process. These "stabilized" parts then repeatedly draw on the same
+cached values. It seems there aren't enough iterations in these examples for this to be worth the upfront cost (at
+least on my hardware setup). Other setups with different computation and memory access speeds might
+reveal different results since the trade-off is weighed differently.
+
+Part 5:
+========================
+If instead of atomic_min() we use min(), then the update step is not done in a single transaction, meaning that between
+the min() check and the subsequent write, a different thread could update the reference value. Suppose this
+other thread actually updated the reference to an even lower value than what we had intended. Now, if we go ahead
+with our update, we are actually *increasing* the reference value. It could therefore also lead to an increase in
+this value between iterations.
+
+So while it would be faster per iteration to do without atomic_min(), since memory access is not serialized, it can
+also be slower overall because more iterations have to be performed. While empirical testing is needed to determine
+which is the better trade-off, my sense is that for simple, low-iteration mazes atomic_min() presents a significant
+overhead, so it might be better to do a few extra iterations. The opposite is likely true for high-iteration mazes.
+
+This makes things more inefficient, since the cache is potentially being updated with worse values. However,
+because ultimately the stopping condition is whether or not any more updates have been performed, it won't affect
+the correctness of the algorithm.
\ No newline at end of file
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..e20a4eb6 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -35,6 +35,8 @@ propagate_labels(__global __read_write int *labels,
                  int buf_w, int buf_h,
                  const int halo)
 {
+
+    const int dx[4] = {-1, 0, 1, 0}, dy[4] = {0, -1, 0, 1};
     // halo is the additional number of cells in one direction
 
     // Global position of output pixel
@@ -80,13 +82,53 @@ propagate_labels(__global __read_write int *labels,
 
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-
+    // Part 2
+    /*
+    for( int i=0; i<4; i++ ) {
+        if( buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] < w * h ) {
+            buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] = labels[ buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ];
+        }
+    }
+    */
+
+    // Part 4
+    // Reference: Piazza @524
+    unsigned int ls0 = get_local_size(0), ls1 = get_local_size(1);
+
+    if( lx == 0 && ly == 0 ) { //Use the first thread
+        unsigned int prev = -1, gparent = -1; // 1 variable cache
+        for( int c_lx = 0; c_lx < ls0; c_lx++ ) { // Update the entire local buffer
+            for( int c_ly = 0; c_ly < ls1; c_ly++ ) {
+                unsigned int cur_idx = (c_ly + halo) * buf_w + (c_lx + halo);
+                unsigned int parent = buffer[cur_idx];
+
+                if( parent == w * h ) continue; // Background pixel
+
+                if( parent == prev ) { // 1 variable cache success!
+                    buffer[cur_idx] = gparent;
+                }
+                else { // Update the cache
+                    buffer[cur_idx] = labels[parent];
+                    prev = parent;
+                    gparent = buffer[cur_idx];
+                }
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
+        if( new_label != w * h ) { // See Piazza @486
+            for( int i=0; i<4; i++ ) {
+                new_label = min( new_label, buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] );
+            }
+        }
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
@@ -94,6 +136,7 @@ propagate_labels(__global __read_write int *labels,
             // multiple threads might write this.
             *(changed_flag) += 1;
             labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label);
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..b9aeeeb9 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
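
Supplementary sketch (not part of the diff above): the race described in P5.txt Part 5 can be made concrete with a small kernel. The kernel below is hypothetical; the name race_demo and its arguments are invented for illustration, and only the atomic_min() call mirrors the line actually added to label_regions.cl.

    __kernel void race_demo(__global int *slots, int candidate)
    {
        int idx = get_global_id(0) % 4;   // many work-items share a few slots

        // Non-atomic update: the read, the min(), and the write are three
        // separate steps. Another work-item can store a smaller value into
        // slots[idx] between the read and the write, and this write then
        // raises the value back up.
        int current = slots[idx];
        slots[idx] = min(current, candidate);

        // Atomic update: the compare-and-store happens as one transaction,
        // so slots[idx] can only ever decrease.
        atomic_min(&slots[idx], candidate);
    }

As the P5.txt discussion notes, the non-atomic version still terminates correctly because the stopping condition only checks whether any update happened, but it may need more iterations to converge.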