diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..1bed0388 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,24 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_real_temp; int iter; if ((x < w) && (y < h)) { // YOUR CODE HERE - ; + iter=0; + z_real=coords_real[y * w + x]; + z_imag=coords_imag[y * w + x]; + c_real=coords_real[y * w + x]; + c_imag=coords_imag[y * w + x]; + //printf("%f",z_real); + while (((z_real*z_real+z_imag*z_imag)<4) && (iter32 workers resulted in an error. It also seems that when the gloabl size>4096, then the computation does not complete correctly in the blocked reads case (I see an assertion error). + +With that in mind, the best performance was achieved with the greatest number of workgroups and workers +coalesced reads, workgroups: 512, num_workers: 32, 0.013397824 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.03992816 seconds \ No newline at end of file diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..e84d7e6d 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,13 +5,17 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + int global_id=get_global_id(0); + //float local_size=get_local_size(0); + //int lim=(int)log2(local_size); + int i,j,offset; // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (i=global_id;i>j; + while(offset>1) { + //for (j = 1; j <= lim; j++) { // YOUR CODE HERE + offset=get_local_size(0)>>j; + //printf("global_id[%d], offset=%d\n",global_id,offset); + fast[local_id]=fast[local_id]+fast[local_id+offset]; + barrier(CLK_LOCAL_MEM_FENCE); + j++; } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,7 +49,12 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int global_id = get_global_id(0); + //float local_size=get_local_size(0); + int k = ceil((float)(N) / get_global_size(0)); + //int lim=(int)log2(local_size); + int i,j,offset; + // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,13 +64,14 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (i=global_id*k; i <= (k*(global_id+1)-1); i++) { // YOUR CODE HERE + sum=sum+x[i]; } - + + fast[local_id] = sum; barrier(CLK_LOCAL_MEM_FENCE); - + //printf("local size=%f, lim=%d \n",local_size,lim); // binary reduction // // thread i should sum fast[i] and fast[i + offset] and store back @@ -64,9 +81,18 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + j=1; + offset=get_local_size(0)>>j; + while(offset>1) { + //for (j = 1; j <= lim; j++) { // YOUR CODE HERE + offset=get_local_size(0)>>j; + //printf("global_id[%d], offset=%d\n",global_id,offset); + fast[local_id]=fast[local_id]+fast[local_id+offset]; + barrier(CLK_LOCAL_MEM_FENCE); + j++; } - if (local_id == 0) partial[get_group_id(0)] = fast[0]; + if (local_id == 0) { + partial[get_group_id(0)] = fast[0]; + } } diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index a0d56da2..5df8bc50 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -1,6 +1,8 @@ import pyopencl as cl import numpy as np - +import pdb +import os +os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1' def create_data(N): return host_x, x @@ -13,7 +15,7 @@ def create_data(N): print("#{0}: {1} on {2}".format(i, d.name, d.platform.name)) ctx = cl.Context(devices) - queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + queue = cl.CommandQueue(ctx, ctx.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE) program = cl.Program(ctx, open('sum.cl').read()).build(options='') @@ -25,8 +27,9 @@ def create_data(N): for num_workgroups in 2 ** np.arange(3, 10): partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) - for num_workers in 2 ** np.arange(2, 8): + for num_workers in 2 ** np.arange(2, 6): local = cl.LocalMemory(num_workers * 4) + #pdb.set_trace() event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,), x, partial_sums, local, np.uint64(N)) cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True) @@ -34,6 +37,7 @@ def create_data(N): sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 + #pdb.set_trace() assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['coalesced', num_workgroups, num_workers] = seconds print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds". @@ -42,15 +46,18 @@ def create_data(N): for num_workgroups in 2 ** np.arange(3, 10): partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) - for num_workers in 2 ** np.arange(2, 8): + for num_workers in 2 ** np.arange(2, 6): + print("global_size={}".format(num_workers*num_workgroups)) local = cl.LocalMemory(num_workers * 4) event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,), x, partial_sums, local, np.uint64(N)) + #pdb.set_trace() cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True) sum_gpu = sum(host_partial) sum_host = sum(host_x) seconds = (event.profile.end - event.profile.start) / 1e9 + assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 times['blocked', num_workgroups, num_workers] = seconds print("blocked reads, workgroups: {}, num_workers: {}, {} seconds". diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..f126738c 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,4 +1,76 @@ -#include "median9.h" +//#include "median9.h" + + +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#define max(a, b) (((a) < (b)) ? (b) : (a)) + +#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp + +inline float median9(float s0, float s1, float s2, + float s3, float s4, float s5, + float s6, float s7, float s8) +{ + // http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html + float tmp; + + cas(s1, s2); + cas(s4, s5); + cas(s7, s8); + + cas(s0, s1); + cas(s3, s4); + cas(s6, s7); + + cas(s1, s2); + cas(s4, s5); + cas(s7, s8); + + cas(s3, s6); + cas(s4, s7); + cas(s5, s8); + cas(s0, s3); + + cas(s1, s4); + cas(s2, s5); + cas(s3, s6); + + cas(s4, s7); + cas(s1, s3); + + cas(s2, s6); + + cas(s2, s3); + cas(s4, s6); + + cas(s3, s4); + + return s4; +} + +//Return image value at point (x,y), where the indicies are relative to the global image. If the indicies are out of bounds, choose closest in bounds value instead. +inline float fetch_point(__global __read_only float *img, + int w, int h, + int x, int y) +{ + float out_img; + while (x<0) { + x++; + } + while (x>w-1){ + x--; + } + while (y<0) { + y++; + } + while (y>h-1){ + y--; + } + out_img=img[w*y+x]; + + return out_img; + +} + // 3x3 median filter __kernel void @@ -13,7 +85,27 @@ median_3x3(__global __read_only float *in_values, // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + //store calculated median here before transferring + float out_buffer; + + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + if (x < w && y < h){ // Load into buffer (with 1-pixel halo). // // It may be helpful to consult HW3 Problem 5, and @@ -21,14 +113,38 @@ median_3x3(__global __read_only float *in_values, // // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + if (idx_1D < buf_w) { + for (row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = \ + fetch_point(in_values, w, h, + buf_corner_x + idx_1D, + buf_corner_y + row); + } + } + barrier(CLK_LOCAL_MEM_FENCE); - // Compute 3x3 median for each pixel in core (non-halo) pixels - // - // We've given you median9.h, and included it above, so you can - // use the median9() function. + // Compute 3x3 median for each pixel in core (non-halo) pixels + // + // We've given you median9.h, and included it above, so you can + // use the median9() function. + + //core point is buffer[(ly+halo)*buf_w+(lx+halo)]. For example, for (lx,ly)=(0,0), their value in the buffer should be (1,1). In the variables below, bc stands for "buffer core." + int bc_x=lx+halo; + int bc_y=ly+halo; + out_buffer = median9(buffer[(bc_y-1)*buf_w+bc_x-1], + buffer[(bc_y-1)*buf_w+bc_x], + buffer[(bc_y-1)*buf_w+bc_x+1], + buffer[(bc_y)*buf_w+bc_x-1], + buffer[(bc_y)*buf_w+bc_x], + buffer[(bc_y)*buf_w+bc_x+1], + buffer[(bc_y+1)*buf_w+bc_x-1], + buffer[(bc_y+1)*buf_w+bc_x], + buffer[(bc_y+1)*buf_w+bc_x+1]); + out_values[y*w+x]=out_buffer; + } // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. } diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..ad2e0339 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,9 +1,10 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread +#import imread import pylab - +# import os +# os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1' def round_up(global_size, group_size): r = global_size % group_size if r == 0: @@ -47,13 +48,18 @@ def numpy_median(image, iterations=10): # Create a queue for transferring data and launching computations. # Turn on profiling to allow us to check event times. - queue = cl.CommandQueue(context, context.devices[0], + queue = cl.CommandQueue(context, context.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name program = cl.Program(context, open('median_filter.cl').read()).build(options='') host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() + pylab.gray() + + pylab.imshow(host_image) + pylab.title('original image') + host_image_filtered = np.zeros_like(host_image) gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) @@ -74,17 +80,24 @@ def numpy_median(image, iterations=10): # Send image to the device, non-blocking cl.enqueue_copy(queue, gpu_image_a, host_image, is_blocking=False) - + #seconds=[] num_iters = 10 for iter in range(num_iters): - program.median_3x3(queue, global_size, local_size, + event=program.median_3x3(queue, global_size, local_size, gpu_image_a, gpu_image_b, local_memory, width, height, buf_width, buf_height, halo) - + # seconds.append((event.profile.end - event.profile.start) / 1e9) # swap filtering direction gpu_image_a, gpu_image_b = gpu_image_b, gpu_image_a cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True) - + seconds=(event.profile.end - event.profile.start) / 1e9 + pylab.figure() + pylab.imshow(host_image_filtered) + pylab.title('after - zoom') assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters)) + + print("took {} seconds".format(seconds)) + + pylab.show() diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..fb5f963e --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,38 @@ +Part 1: +Maze1: +Finished after 865 iterations, 350.717984 ms total, 0.405454316763 ms per iteration +Found 2 regions +Maze2: +Finished after 492 iterations, 212.373632 ms total, 0.431653723577 ms per iteration +Found 35 regions + +Part 2: +Maze1: +Finished after 520 iterations, 236.204736 ms total, 0.454239876923 ms per iteration +Found 2 regions +Maze2: +Finished after 269 iterations, 150.651488 ms total, 0.56004270632 ms per iteration +Found 35 regions + +Part 3: +Maze1: +Finished after 10 iterations, 6.055488 ms total, 0.6055488 ms per iteration +Found 2 regions +Maze2: +Finished after 9 iterations, 5.425952 ms total, 0.602883555556 ms per iteration +Found 35 regions + +Part 4: +Maze1: +Finished after 10 iterations, 18.7112 ms total, 1.87112 ms per iteration +Found 2 regions +Maze2: +Finished after 9 iterations, 16.863136 ms total, 1.87368177778 ms per iteration +Found 35 regions + + + +I see that by serializing the grandparenting, the computation takes one fewer iteration to complete, but takes > 2x longer per iteration. It appears that the decreased number of reads to global memory does not balance out the increased serialization. Perhaps if my GPU had a greater number of cores the serialization wouldn't be as noticible. Additionally this would potentially be a better idea if there was some way to begin iterating over parts of the workgroup that had already been grandparented -- since the other threads in the workgroup are just waiting. + +Part 5: +In this case, atomic operations prevent race conditions -- access to specific memory locations by multiple threads at once. Using min() instead of atomic_min() still results in the correct answer, and results in a slightly shorter time per iteration, but requires more iterations to complete, since values can get updated redundantly. I found that, on my machine, the difference between using min() and atomic_min() is negligible. \ No newline at end of file diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..f0f99e02 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -27,6 +27,28 @@ get_clamped_value(__global __read_only int *labels, return labels[y * w + x]; } +int +find_min(__local int *buffer, + int buf_x, int buf_y, + int buf_w, + int w, int h){ + int i,j; + //set min_val intially to center value + int min_val=buffer[buf_y * buf_w + buf_x]; + // if not a wall point + if (min_val!=w*h){ + for (i = -1; i <= 1; i++){ + for (j = -1; j <= 1; j++){ + //only check on off-diagonals, not corners or center point, or maze walls (=0). + if (i!=j){ + min_val=min(min_val,buffer[(buf_y+j)*buf_w+(buf_x+i)]); + } + } + } + } + return min_val; +} + __kernel void propagate_labels(__global __read_write int *labels, __global __write_only int *changed_flag, @@ -80,20 +102,60 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + //part 4 + //if current thread is first in workgroup +/* if (lx==0 && ly==0){ + int current,idx, row, col;; + int previous=w*h; + for (row=halo; row< (buf_h-halo); row++){ + for (col=halo; col< (buf_w-halo);col++){ + idx=row*buf_w+col; + + //not a wall + if (buffer[idx]!=h*w){ + //don't look into global memory if not necessary + if (previous == buffer[idx]){ + buffer[idx]=current; + } + //otherwise, do grandparenting + else{ + previous = buffer[idx]; + buffer[idx]=labels[buffer[idx]]; + current=buffer[idx]; + } + } + } + } + }*/ + //Part 2 + //if current pixel is not a wall + if (buffer[buf_y * buf_w + buf_x]!=h*w){ + + buffer[buf_y * buf_w + buf_x]=labels[buffer[buf_y * buf_w + buf_x]]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + new_label = find_min(buffer, + buf_x, buf_y, + buf_w, + w, h); if (new_label != old_label) { // CODE FOR PART 3 HERE + atomic_min(&labels[old_label],new_label); + atomic_min(&labels[y * w + x],new_label); // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + /*labels[old_label]=min(labels[old_label],new_label); + labels[y * w + x] = new_label;*/ } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..9ae071b8 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -3,7 +3,9 @@ import pyopencl as cl import numpy as np import pylab - +import pdb +import os +os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1' def round_up(global_size, group_size): r = global_size % group_size if r == 0: @@ -36,13 +38,13 @@ def round_up(global_size, group_size): # Create a queue for transferring data and launching computations. # Turn on profiling to allow us to check event times. - queue = cl.CommandQueue(context, context.devices[0], + queue = cl.CommandQueue(context, context.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -103,6 +105,7 @@ def round_up(global_size, group_size): break # there were changes, so continue running print host_done_flag + if itercount % 100 == 0 and show_progress: cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) pylab.imshow(host_labels)