diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..f96965d2 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_real_new, z_imag_new; + float mag2; int iter; if ((x < w) && (y < h)) { - // YOUR CODE HERE - ; + iter = 1; + c_real = coords_real[y*w + x]; + c_imag = coords_imag[y*w + x]; + z_real = c_real; + z_imag = c_imag; + mag2 = z_real*z_real + z_imag*z_imag; + while ((mag2 < 4) && (iter < max_iter)){ + z_real_new = z_real*z_real - z_imag*z_imag + c_real; + z_imag_new = 2*z_real*z_imag + c_imag; + mag2 = z_real_new*z_real_new + z_imag_new*z_imag_new; + z_real = z_real_new; + z_imag = z_imag_new; + iter = iter + 1; + } + out_counts[y*w + x] = iter; } } diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..426a59ad --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,13 @@ + +The best configuration for my machine is: + +configuration ('coalesced', 512, 128): 0.000331392 seconds + +The coalesced read is faster than the blocked read on average +for the same number of work groups and workers because more +threads can do work on the same block of fetched memory. In the +blocked reads, once a thread fetches its block to sum, more +threads may have to wait to fetch their block of memory. However in the +coalesced reads, more threads can sum elements simultaneously +more often since a fetched block of memory will be more likely to +contain elements needed by more threads than in the blocked scheme. 
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..0c5a8493 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,16 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - + int i, j, gID, gSize, temp, lSize, loglSize; + + gID = get_global_id(0); + gSize = get_global_size(0); + lSize = get_local_size(0); + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (i = gID; i < N; i += gSize) { + sum = sum + x[i]; } fast[local_id] = sum; @@ -24,8 +29,17 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,7 +52,8 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); + int j, gID, temp, loglSize, lSize, minS; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +63,16 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + lSize = get_local_size(0); + gID = get_global_id(0); + if (k-1 < N - k*gID){ + minS = k; + } + else{ + minS = N - k*gID; + } + for (j = 0; j < minS; j++) { + sum = sum + x[k*gID + j]; } fast[local_id] = sum; @@ -64,8 +87,17 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + loglSize = 1; + temp = lSize >> 1; + while (temp > 1){ + temp = temp >> 1; + loglSize = loglSize + 1; + } + for (j = 1; j <= loglSize; j++) { + if (local_id < (lSize >> j)) { + fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..7e134fd1 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,6 
@@ #include "median9.h" + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -12,23 +13,81 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + int gID, lID, x, y, lx, ly, gSizeX, gSizeY, + lSizeX, lSizeY, xTemp, yTemp, xUse, yUse, + buf_corner_x, buf_corner_y, buf_x, buf_y, row; + // the code below is adapted from the lecture code on halos + x = get_global_id(0); + y = get_global_id(1); + lx = get_local_id(0); + ly = get_local_id(1); + gSizeX = get_global_size(0); + gSizeY = get_global_size(1); + lSizeX = get_local_size(0); + lSizeY = get_local_size(1); + + + gID = gSizeX*y + x; + lID = lSizeX*ly + lx; + buf_corner_x = x - lx - halo; + buf_corner_y = y - ly - halo; - // Load into buffer (with 1-pixel halo). - // - // It may be helpful to consult HW3 Problem 5, and - // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl - // - // Note that globally out-of-bounds pixels should be replaced - // with the nearest valid pixel's value. + buf_x = lx + halo; + buf_y = ly + halo; + if ((y < h) && (x < w)){ + if (lID < buf_w){ // only work with buf_w threads + xTemp = buf_corner_x + lID; + xUse = xTemp; + if (xTemp < 0){ // if pixel out of bounds, add compensation steps to find closest in bound pixel + xUse += 1; + } + if (xTemp > w - 1){ + xUse -= 1; + } + for (row = 0; row < buf_h; row++) { + yTemp = buf_corner_y + row; + yUse = yTemp; + if (yTemp < 0){ + yUse += 1; + } + if (yTemp > h - 1){ + yUse -= 1; + } + buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; // assign global memory of pixel or closest in bound pixel to buffer + } + } + } - // Compute 3x3 median for each pixel in core (non-halo) pixels - // - // We've given you median9.h, and included it above, so you can - // use the median9() function. 
+ barrier(CLK_LOCAL_MEM_FENCE); + if ((y < h) && (x < w)){ + out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], // take median of 8 neighbors and current pixel + buffer[(buf_y-1)*buf_w + (buf_x)], + buffer[(buf_y-1)*buf_w + (buf_x+1)], + buffer[(buf_y)*buf_w + (buf_x-1)], + buffer[(buf_y)*buf_w + (buf_x)], + buffer[(buf_y)*buf_w + (buf_x+1)], + buffer[(buf_y+1)*buf_w + (buf_x-1)], + buffer[(buf_y+1)*buf_w + (buf_x)], + buffer[(buf_y+1)*buf_w + (buf_x+1)]); + } + // Load into buffer (with 1-pixel halo). + // + // It may be helpful to consult HW3 Problem 5, and + // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl + // + // Note that globally out-of-bounds pixels should be replaced + // with the nearest valid pixel's value. - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. + + // Compute 3x3 median for each pixel in core (non-halo) pixels + // + // We've given you median9.h, and included it above, so you can + // use the median9() function. + + + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. 
} diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..118fe6bd 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) @@ -59,7 +60,7 @@ def numpy_median(image, iterations=10): gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) gpu_image_b = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) - local_size = (8, 8) # 64 pixels per work group + local_size = (4, 4) # 64 pixels per work group global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)]) width = np.int32(host_image.shape[1]) height = np.int32(host_image.shape[0]) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..26d9ffca --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,86 @@ + + +Explanation: + +Part 1: + +This is the base code. + +Part 2: + +This is optimized over the first part because the buffer values are updated with the +grandparent values, which is guaranteed to be less than or equal to the current +buffer value. + +Part 3: + +This is optimized over the second part because a pixel's parent is updated to the pixel's +value if it is smaller than the pixel's parent using atomic min. 
However, the iteration +time increases due to the atomic (min) operation. + +Part 4: + +Making 1 thread update the buffer regions with grandparent values is not as efficient on average +given the time per iteration is roughly twice as long as Part 3. Even though lots of adjacent pixels +may have equal buffer values after sufficient iterations, the reduced number of memory reads +does not outweigh the loss of parallelism between threads. If more threads are used, for +example due to smaller context sizes, then even more memory calls to the labels array will occur. +So using one thread to remember previous grandparent values may perform better than having +each thread fetch a value from memory simultaneously (resulting in partial serialization since +more threads will have to wait for memory) as the number of threads gets even larger. + +Part 5: + +If a standard min operation were used instead of atomic min, the iteration time would decrease +because the imposed serialized delays from atomic operation will not be applied. The final result +will still be correct because even if a thread overwrites the pixel's parent's value with a greater value +than another thread, the value will still be less than the original parent value. Thus the number of +iterations may increase. As stated, the value in label could increase, but that is during the same iteration. +Between iterations, label values cannot increase because a pixel's previous iteration value is compared +via the minimum operator with a new label. Thus after the current iteration finishes, each label's value +will be less than or equal to that of the previous iteration. 
+ + +Results: + +Maze 1 + +Part1: + +Finished after 915 iterations, 36.084992 ms total, 0.0394371497268 ms per iteration +Found 2 regions + +Part 2: + +Finished after 529 iterations, 20.321376 ms total, 0.0384146994329 ms per iteration +Found 2 regions + +Part 3: + +Finished after 12 iterations, 0.611552 ms total, 0.0509626666667 ms per iteration +Found 2 regions + +Part 4: + +Finished after 11 iterations, 1.224416 ms total, 0.111310545455 ms per iteration +Found 2 regions + +Maze 2 + +Part 1: +Finished after 532 iterations, 20.138752 ms total, 0.0378547969925 ms per iteration +Found 35 regions + +Part 2: +Finished after 276 iterations, 10.62384 ms total, 0.038492173913 ms per iteration +Found 35 regions + +Part 3: +Finished after 11 iterations, 0.539008 ms total, 0.0490007272727 ms per iteration +Found 35 regions + +Part 4: +Finished after 10 iterations, 1.11216 ms total, 0.111216 ms per iteration +Found 35 regions + + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..8fe5c312 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -57,13 +57,14 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - int old_label; + int old_label, old_label0; // Will store the output value int new_label; + int minT, newMin, row, col, prev; // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { - for (int row = 0; row < buf_h; row++) { + for (row = 0; row < buf_h; row++) { buffer[row * buf_w + idx_1D] = get_clamped_value(labels, w, h, @@ -74,19 +75,63 @@ propagate_labels(__global __read_write int *labels, // Make sure all threads reach the next part after // the local buffer is loaded barrier(CLK_LOCAL_MEM_FENCE); - + // Fetch the value from the buffer the corresponds to // the pixel for this thread + /* old_label = buffer[buf_y * buf_w + buf_x]; - + if (old_label < w*h){ + buffer[buf_y * buf_w + buf_x] = 
labels[old_label]; // grandparent update + } + barrier(CLK_LOCAL_MEM_FENCE); + */ + + old_label = buffer[buf_y * buf_w + buf_x]; // use the first thread in the upper left corner to update the buffer values with grandparents + if ((buf_y + buf_x) == 2*halo){ // keeps track of last buffer value read to avoid reading from memory (labels array) more than necessary + old_label0 = buffer[buf_y * buf_w + buf_x]; + if (old_label0 < w*h){ + prev = labels[old_label0]; + } + for (row = halo; row < buf_h-halo; row++){ + for (col = halo; col < buf_w-halo; col++){ + if (buffer[row * buf_w + col] < w*h){ + if (buffer[row * buf_w + col] == old_label0){ + buffer[row * buf_w + col] = prev; + } + else{ + old_label0 = buffer[row * buf_w + col]; + buffer[row * buf_w + col] = labels[old_label0]; + prev = buffer[row * buf_w + col]; + } + } + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) // stay in bounds - if ((x < w) && (y < h)) { + if ((x < w) && (y < h) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + minT = buffer[buf_y * buf_w + buf_x]; + // Check neighbors to update minimum + if (buffer[(buf_y-1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y-1)*buf_w + buf_x]; + } + if (buffer[(buf_y+1)*buf_w + buf_x] < minT){ + minT = buffer[(buf_y+1)*buf_w + buf_x]; + } + if (buffer[(buf_y)*buf_w + buf_x - 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x - 1]; + } + if (buffer[(buf_y)*buf_w + buf_x + 1] < minT){ + minT = buffer[(buf_y)*buf_w + buf_x + 1]; + } + + new_label = minT; if (new_label != old_label) { // CODE FOR PART 3 HERE @@ -94,6 +139,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. 
*(changed_flag) += 1; labels[y * w + x] = new_label; + atomic_min(&(labels[old_label]),new_label); // Use atomic min to update grandparent value if necessary } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..ba3facd2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -2,7 +2,7 @@ import sys import pyopencl as cl import numpy as np -import pylab +#import pylab def round_up(global_size, group_size): r = global_size % group_size @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -75,9 +75,9 @@ def round_up(global_size, group_size): # Show the initial labels cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() show_progress = True total_time = 0 @@ -105,9 +105,9 @@ def round_up(global_size, group_size): print host_done_flag if itercount % 100 == 0 and show_progress: cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show() if itercount % 10000 == 0: print 'Reached maximal number of iterations, aborting' sys.exit(0) @@ -116,6 +116,6 @@ def round_up(global_size, group_size): # Show final result cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) print 'Found {} regions'.format(len(np.unique(host_labels)) - 1) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + #pylab.imshow(host_labels) + #pylab.title(itercount) + #pylab.show()