diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..8d11671f
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,89 @@
+The best configuration was coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds
+
+#0: Intel(R) Core(TM) i7-4650U CPU @ 1.70GHz on Apple
+#1: HD Graphics 5000 on Apple
+coalesced reads, workgroups: 8, num_workers: 4, 0.173174 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.09775128 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.05811304 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.02927688 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.01506872 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.00775552 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.09545584 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.05122536 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.03318448 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.01706696 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00832944 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.00449384 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.06161328 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.02915712 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.01422552 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.00753888 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.00383592 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00296608 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.02890824 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.0142484 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.0075216 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00401608 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.00291696 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.02929296 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.01506728 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.00750416 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.00392192 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00282296 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.00283168 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.02977544 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.01534176 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.00795784 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00426864 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.0028708 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.00280752 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.02997592 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.01532384 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.00771888 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00434744 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00287568 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.00275472 seconds
+blocked reads, workgroups: 8, num_workers: 4, 0.18406568 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.11625232 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.08278504 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.04329048 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.0209084 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.01256128 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.11378736 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.07066096 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.04468824 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.02061624 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.01264984 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.03288288 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.06355624 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.03810152 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.02057768 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.01233816 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.03313528 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.08156096 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.0351416 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.01899656 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.01270664 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.0335048 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.08059792 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.08670848 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.0413724 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.02386552 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01572904 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.04104112 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.08430344 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.06656264 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.03564384 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.02051488 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.01278368 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.03704784 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.07087392 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.04985056 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.0354248 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.02118616 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.01358552 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03686744 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04923632 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.03931944 seconds
+configuration ('coalesced', 64, 128): 0.00275208 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..2354ad58 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,14 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    int group_size = get_local_size(0);
+    int global_size = get_global_size(0);
+    int global_id = get_global_id(0);
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = global_id; i < N; i += global_size) { // YOUR CODE HERE
+        sum += x[i]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -24,8 +27,11 @@
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int offset = group_size >> 1; offset > 0; offset >>= 1) { // YOUR CODE HERE
+        if ((int)local_id < offset) { // YOUR CODE HERE
+            fast[local_id] += fast[local_id + offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,8 +44,9 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
-
+    int global_id = get_global_id(0);
+    int group_size = get_local_size(0);
+    int k = ceil((float)N / get_global_size(0));
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
     // thread with global_id 2 should add 2k..3k-1
@@ -48,8 +55,10 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = global_id*k; i < (global_id+1)*k; i++) { // YOUR CODE HERE
+        if (i < N) {
+            sum += x[i]; // YOUR CODE HERE
+        }
     }
 
     fast[local_id] = sum;
@@ -64,8 +73,11 @@
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int offset = group_size >> 1; offset > 0; offset >>= 1) { // YOUR CODE HERE
+        if ((int)local_id < offset) { // YOUR CODE HERE
+            fast[local_id] += fast[local_id + offset];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
 
     times = {}
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                   format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..eaa9b141 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,21 @@
 #include "median9.h"
 
+float
+FETCH(__global __read_only float *in_values, int w, int h, int x, int y)
+{
+    if (x < 0) {
+        x = 0;
+    } else if (x > w-1) {
+        x = w-1;
+    }
+    if (y < 0) {
+        y = 0;
+    } else if (y > h-1) {
+        y = h-1;
+    }
+    return in_values[y*w+x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -31,4 +47,53 @@ median_3x3(__global __read_only float *in_values,
 
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    // Pulled from http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    if (idx_1D < buf_w) {
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = FETCH(in_values, w, h, \
+                                                 buf_corner_x + idx_1D, \
+                                                 buf_corner_y + row);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Processing code here...
+    //
+    // Should only use buffer, buf_x, buf_y.
+
+    // write output
+    if ((y < h) && (x < w)) // stay in bounds
+        out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x-1],
+                                        buffer[(buf_y-1) * buf_w + buf_x],
+                                        buffer[(buf_y-1) * buf_w + buf_x+1],
+                                        buffer[buf_y * buf_w + buf_x-1],
+                                        buffer[buf_y * buf_w + buf_x],
+                                        buffer[buf_y * buf_w + buf_x+1],
+                                        buffer[(buf_y+1) * buf_w + buf_x-1],
+                                        buffer[(buf_y+1) * buf_w + buf_x],
+                                        buffer[(buf_y+1) * buf_w + buf_x+1]);
 }
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                                          properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..89e6ecc7
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,36 @@
+Part 1:
+Maze 1: Finished after 888 iterations, 450.07784 ms total, 0.506844414414 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 516 iterations, 258.12848 ms total, 0.500248992248 ms per iteration
+Found 35 regions
+
+Part 2:
+Maze 1: Finished after 529 iterations, 262.70208 ms total, 0.496601285444 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 273 iterations, 137.47456 ms total, 0.50356981685 ms per iteration
+Found 35 regions
+
+Part 3:
+Maze 1: Finished after 10 iterations, 4.89648 ms total, 0.489648 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 9 iterations, 4.50248 ms total, 0.500275555556 ms per iteration
+Found 35 regions
+
+Part 4:
+Maze 1: Finished after 19 iterations, 19.07336 ms total, 1.00386105263 ms per iteration
+Found 2 regions
+
+Maze 2: Finished after 17 iterations, 17.0408 ms total, 1.0024 ms per iteration
+Found 35 regions
+
+Discussion:
+Despite avoiding repeated global memory reads in its single-threaded sequential scan, the single-thread workgroup labeling of Part 4 roughly doubled the per-iteration time compared to Part 3 (about 1.00 ms versus 0.49-0.50 ms per iteration), because work that Part 3 performed in parallel is now serialized in one thread per workgroup. Empirically, we are bottlenecked by computation: cutting down on global memory reads buys little speedup, so reads are not the limiting factor. Thus, on my hardware, the single-thread variant is not a good option.
+
+There are two general factors that can change this result. 1) If neighboring pixels tend to share the same label, there would be more speedup in Part 4, since the serialized scan's curr != prev check skips repeated label lookups and fewer updates are needed. 2) If a GPU has more cores and can perform global reads and writes much more quickly, the speedup from Part 4 would also be larger.
+
+Part 5:
+Discussion:
+Using atomic_min is useful because it performs the read-compare-write on a label as a single indivisible operation, so another thread cannot slip in between the read and the write. The cost is that contending reads and writes are serialized. The issue with plain min instead of atomic_min is a race condition: one thread can overwrite a label with a value that is not the minimum after another thread has already written the minimum, because its read happened before that write. These lost updates significantly decrease performance, since it takes longer to label the entire region (a small sketch of the difference follows).
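+
+To make the min vs. atomic_min difference concrete, here is a minimal illustrative sketch. It reuses the names from label_regions.cl below, but the plain-min variant is hypothetical and is not what the submitted kernel does:
+
+    // atomic: the read-compare-write below is one indivisible operation, so a
+    // concurrent thread can never replace an already-written smaller label
+    atomic_min(&labels[y * w + x], new_label);
+
+    // non-atomic: the read and the write are separate steps, so a smaller label
+    // written by another thread in between is silently overwritten (a lost update)
+    labels[y * w + x] = min(labels[y * w + x], new_label);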
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..1f8aa03a 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -45,6 +45,10 @@ propagate_labels(__global __read_write int *labels,
     const int lx = get_local_id(0);
     const int ly = get_local_id(1);
+    // Local workgroup sizes
+    const int wgx = get_local_size(1);
+    const int wgy = get_local_size(0);
+
     // coordinates of the upper left corner of the buffer in image
     // space, including halo
     const int buf_corner_x = x - lx - halo;
     const int buf_corner_y = y - ly - halo;
@@ -80,20 +84,54 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+    // grab the grandparent label
+    // if (old_label < w*h) {
+    //     buffer[buf_y * buf_w + buf_x] = labels[old_label];
+    // }
+    // if we are at the top left corner of the work group, we propagate grandparent
+    // labels across the group in a serialized fashion
+    if (lx == 0 && ly == 0) {
+        int prev = -1, curr;   // -1 is never a valid label, so the first pixel always triggers a lookup
+        for (int i = 0; i < wgy; i++) {
+            for (int j = 0; j < wgx; j++) {
+                int id = (i + halo)*buf_w + (j + halo);
+                curr = buffer[id];
+
+                if (curr < w*h) {
+                    if (curr != prev) {
+                        prev = curr;
+                        buffer[id] = labels[curr];
+                    }
+                }
+            }
+        }
+    }
+
+    // grandparent labels have been propagated
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     // stay in bounds
-    if ((x < w) && (y < h)) {
+    // we must check if old_label is less than w*h since the foreground pixels are bounded by
+    // w*h and we don't want to blend the background pixels with foreground pixels
+    if ((x < w) && (y < h) && (old_label < w*h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
-        new_label = old_label;
+        // grab the smallest label among the current and surrounding pixels
+        new_label = min(old_label,
+                        min(buffer[buf_y*buf_w+buf_x+1],
+                            min(buffer[buf_y*buf_w+buf_x-1],
+                                min(buffer[(buf_y-1)*buf_w+buf_x], buffer[(buf_y+1)*buf_w+buf_x]))));
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
+            // merging old and new parent with same new_label
+            atomic_min(&labels[old_label], new_label);
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            atomic_min(&labels[y * w + x], new_label);
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..276179fd 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -2,7 +2,7 @@
 import sys
 
 import pyopencl as cl
 import numpy as np
-import pylab
+from matplotlib import pyplot
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -75,9 +75,9 @@ def round_up(global_size, group_size):
 
     # Show the initial labels
     cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-    pylab.imshow(host_labels)
-    pylab.title(itercount)
-    pylab.show()
+    pyplot.imshow(host_labels)
+    pyplot.title(itercount)
+    pyplot.show()
 
     show_progress = True
     total_time = 0
@@ -105,9 +105,9 @@ def round_up(global_size, group_size):
         print host_done_flag
         if itercount % 100 == 0 and show_progress:
             cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
-            pylab.imshow(host_labels)
-            pylab.title(itercount)
-            pylab.show()
+            pyplot.imshow(host_labels)
+            pyplot.title(itercount)
+            pyplot.show()
         if itercount % 10000 == 0:
             print 'Reached maximal number of iterations, aborting'
             sys.exit(0)
@@ -116,6 +116,6 @@ def round_up(global_size, group_size):
     # Show final result
    cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
     print 'Found {} regions'.format(len(np.unique(host_labels)) - 1)
-    pylab.imshow(host_labels)
-    pylab.title(itercount)
-    pylab.show()
+    pyplot.imshow(host_labels)
+    pyplot.title(itercount)
+    pyplot.show()