diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..b042c4a3 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -13,7 +13,21 @@ mandelbrot(__global __read_only float *coords_real,
     int iter;
 
     if ((x < w) && (y < h)) {
-        // YOUR CODE HERE
-        ;
+        c_real = coords_real[x + y * w];
+        c_imag = coords_imag[x + y * w];
+        z_real = 0;
+        z_imag = 0;
+
+        for (iter = 0; iter < max_iter; ++iter) {
+            if (z_real * z_real + z_imag * z_imag > 4.0) { // |z| > 2: diverged
+                break;
+            }
+            float temp = z_real * z_real - z_imag * z_imag + c_real; // Re(z^2 + c)
+
+            z_imag = (z_real * z_imag * 2) + c_imag; // Im(z^2 + c), uses old z_real
+            z_real = temp;
+        }
+
+        out_counts[x + y * w] = iter; // escape iteration count
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..75073736
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,8 @@
+Hardware
+
+Intel(R) Core(TM) i7-3537 CPU @2.00GHz
+
+
+Performance:
+
+configuration ('coalesced', 128, 128): 0.00205134 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..c7ee1399 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    unsigned int global_size = get_global_size(0);
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_global_id(0); i < N; i += global_size) {
+        sum += x[i];
     }
 
     fast[local_id] = sum;
@@ -24,8 +25,11 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
+        if (local_id < i) {
+            fast[local_id] += fast[local_id + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // finish this round before the next reads
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +42,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +52,9 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k &&
+                                       i < N; ++i) {
+        sum += x[i];
     }
 
     fast[local_id] = sum;
@@ -64,8 +69,11 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
+        if (local_id < i) {
+            fast[local_id] += fast[local_id + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // finish this round before the next reads
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
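For intuition on the local tree reduction both kernels now share: each round halves the number of active threads, and after log2(local_size) rounds fast[0] holds the workgroup's sum. A minimal serial C sketch of the same index arithmetic (hypothetical values; the inner loop over local_id stands in for the threads that run concurrently on the device):

    #include <stdio.h>

    int main(void) {
        float fast[8] = {1, 2, 3, 4, 5, 6, 7, 8};  /* stand-in for local memory */
        int local_size = 8;

        /* same halving schedule as the kernel's reduction loop */
        for (int i = local_size / 2; i > 0; i /= 2) {
            for (int local_id = 0; local_id < i; ++local_id) {
                fast[local_id] += fast[local_id + i];
            }
            /* the kernel's barrier(CLK_LOCAL_MEM_FENCE) guarantees all
               adds of one round finish before the next round reads them */
        }

        printf("%g\n", fast[0]);  /* prints 36, the sum of 1..8 */
        return 0;
    }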
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index c16e9fa6..a0d56da2 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                    format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..9f9b3eda 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,29 @@
 #include "median9.h"
 
+float
+find_closest(__global __read_only float *in_values,
+             int w, int h,
+             int x, int y)
+{
+    // clamp out-of-bounds coordinates to the nearest valid pixel
+    if (x < 0) {
+        x = 0;
+    }
+    else if (x >= w) {
+        x = w - 1;
+    }
+
+    if (y < 0) {
+        y = 0;
+    }
+    else if (y >= h) {
+        y = h - 1;
+    }
+
+    return in_values[x + y * w];
+}
+
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -13,6 +37,21 @@ median_3x3(__global __read_only float *in_values,
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have that working.
 
+    // global coordinates of this thread's pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // position of this thread within its workgroup
+    const int x_rel = get_local_id(0);
+    const int y_rel = get_local_id(1);
+
+    // global coordinates of the local buffer's top-left (halo) corner
+    const int x_buf_corner = x - x_rel - halo;
+    const int y_buf_corner = y - y_rel - halo;
+
+    // coordinates of our pixel within the local buffer
+    const int x_buf = x_rel + halo;
+    const int y_buf = y_rel + halo;
 
     // Load into buffer (with 1-pixel halo).
     //
@@ -22,13 +61,36 @@ median_3x3(__global __read_only float *in_values,
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
 
+    const int t_idx = x_rel + y_rel * get_local_size(0);
+
+    if (t_idx < buf_w) { // the first buf_w threads each load one column
+        for (int i = 0; i < buf_h; ++i) {
+            buffer[i * buf_w + t_idx] = find_closest(in_values, w, h,
+                                                     x_buf_corner + t_idx,
+                                                     y_buf_corner + i);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // buffer must be complete before any reads
 
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
-
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    // compute and write back only for pixels in the valid region
+    if (x < w && y < h) {
+        out_values[x + y * w] = median9(buffer[(y_buf - 1) * buf_w + x_buf - 1],
+                                        buffer[(y_buf - 1) * buf_w + x_buf],
+                                        buffer[(y_buf - 1) * buf_w + x_buf + 1],
+                                        buffer[y_buf * buf_w + x_buf - 1],
+                                        buffer[y_buf * buf_w + x_buf],
+                                        buffer[y_buf * buf_w + x_buf + 1],
+                                        buffer[(y_buf + 1) * buf_w + x_buf - 1],
+                                        buffer[(y_buf + 1) * buf_w + x_buf],
+                                        buffer[(y_buf + 1) * buf_w + x_buf + 1]);
+    }
 }
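The clamping in find_closest is what implements the "replace with the nearest valid pixel" boundary rule, so the halo loads never read outside the image. A small serial C check of that logic in isolation (a plain array stands in for the image; values are hypothetical):

    #include <stdio.h>

    /* same clamp-to-edge rule as find_closest, minus the OpenCL qualifiers */
    float find_closest(const float *in_values, int w, int h, int x, int y) {
        if (x < 0)       x = 0;
        else if (x >= w) x = w - 1;
        if (y < 0)       y = 0;
        else if (y >= h) y = h - 1;
        return in_values[x + y * w];
    }

    int main(void) {
        float img[2 * 2] = {1, 2,
                            3, 4};  /* hypothetical 2x2 image */
        /* out-of-bounds coordinates snap to the nearest edge pixel */
        printf("%g %g %g\n",
               find_closest(img, 2, 2, -1, -1),   /* 1: top-left corner */
               find_closest(img, 2, 2,  5,  0),   /* 2: right end of row 0 */
               find_closest(img, 2, 2,  0,  9));  /* 3: bottom of column 0 */
        return 0;
    }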
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..a181c05a 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                                 properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..1d2e277b
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,59 @@
+Part 1: implement updates from neighbors
+
+Maze 1
+Finished after 894 iterations, 197.5328 ms total, 0.222069508197 ms per iteration
+Found 2 regions
+
+Maze 2
+Finished after 528 iterations, 116.40336 ms total, 0.220460909091 ms per iteration
+Found 35 regions
+
+===============================================================================
+
+Part 2: fetch grandparents
+
+Maze 1
+Finished after 529 iterations, 116.8076 ms total, 0.220808317580 ms per iteration
+
+Maze 2
+Finished after 274 iterations, 62.08362 ms total, 0.226582554745 ms per iteration
+
+===============================================================================
+
+Part 3: merge parent regions
+
+Maze 1
+Finished after 10 iterations, 2.4026 ms total, 0.24026 ms per iteration
+
+Maze 2
+Finished after 9 iterations, 2.17255 ms total, 0.241394444444 ms per iteration
+
+===============================================================================
+
+Part 4: efficient grandparents
+
+Maze 1
+Finished after 10 iterations, 4.59674 ms total, 0.459674 ms per iteration
+
+Maze 2
+Finished after 9 iterations, 4.1608 ms total, 0.462311111111 ms per iteration
+
+Using a single thread per workgroup to look up the labels clearly worsened
+performance: the time per iteration roughly doubled relative to Part 3. The
+trade-off here is between the overhead of redundant global memory accesses
+and the speedup we gain from parallelism. Serializing this step in a single
+thread gives up much of the benefit of the parallel approach, even though it
+reduces the number of global memory accesses, so the lost parallelism costs
+more than the saved memory traffic.
+
+===============================================================================
+
+Part 5: no atomic operations
+
+Without atomic operations we introduce a race condition in which old_label
+can be updated redundantly: the final output of the algorithm is still
+correct, since label values only ever decrease, but the number of
+iterations can increase. This is problematic because it happens
+nondeterministically, so in the (very unlikely) worst case it could make the
+algorithm noticeably slower, simply because more work (with no progress
+toward convergence) is being done.
\ No newline at end of file
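The speedups in Parts 2 and 3 come from treating the labels array as a union-find-style parent forest: replacing a label with labels[label] (its grandparent) shortens pointer chains, and atomic_min on the root merges whole regions at once. A toy serial C sketch of the chain-shortening effect on a hypothetical parent array (iterating from the end mimics all threads reading the old values at once):

    #include <stdio.h>

    int main(void) {
        /* hypothetical parent chain: 4 -> 3 -> 2 -> 1 -> 0 */
        int labels[5] = {0, 0, 1, 2, 3};

        /* one "fetch grandparents" pass: labels[i] = labels[labels[i]] */
        for (int i = 4; i >= 0; --i)
            labels[i] = labels[labels[i]];

        /* the chain roughly halves: 4 -> 2 -> 0 after one pass */
        for (int i = 0; i < 5; ++i)
            printf("labels[%d] = %d\n", i, labels[i]);
        return 0;
    }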
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..fc0437da 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -81,19 +81,54 @@ propagate_labels(__global __read_write int *labels,
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
 
+    // Part 2
+    //
+    // if (old_label < w * h) {
+    //     buffer[buf_y * buf_w + buf_x] = labels[old_label];
+    // }
+
+    // Part 4
+
+    // one thread per workgroup replaces each buffered label with its grandparent
+    if (lx == 0 && ly == 0) {
+        int max_iter = buf_w * buf_h;
+        int temp, last;
+        int prev = -1;
+        for (int i = 0; i < max_iter; ++i) {
+            temp = buffer[i];
+            if (temp < w * h) {
+                // only fetch from global memory when the label changes
+                if (prev != temp) {
+                    prev = temp;
+                    last = labels[prev];
+                }
+                buffer[i] = last;
+            }
+        }
+    }
+
    // stay in bounds
    if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
-        // We set new_label to the value of old_label, but you will need
-        // to adjust this for correctness.
+
+        // take the minimum over this pixel's buffer value and its 4 neighbors
        new_label = old_label;
+        if (new_label < w * h) {
+            int row_min = min(buffer[buf_y * buf_w + buf_x - 1],
+                              buffer[buf_y * buf_w + buf_x + 1]);
+            int col_min = min(buffer[(buf_y - 1) * buf_w + buf_x],
+                              buffer[(buf_y + 1) * buf_w + buf_x]);
+            new_label = min(min(row_min, col_min), buffer[buf_y * buf_w + buf_x]);
+        }

        if (new_label != old_label) {
            // CODE FOR PART 3 HERE
            // indicate there was a change this iteration.
            // multiple threads might write this.
            *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            // labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label);
+            atomic_min(&labels[x + y * w], new_label);
        }
    }
}
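As a closing note on the Part 3 merge: atomic_min makes every label update monotone non-increasing, which is why concurrent writers can at worst repeat work (as discussed in P5.txt) but never corrupt the result. A serial sketch of what the two atomic_min calls accomplish for one pixel, with hypothetical values:

    #include <stdio.h>

    /* serial stand-in for OpenCL's atomic_min on an int cell */
    void atomic_min_serial(int *p, int val) {
        if (val < *p) *p = val;
    }

    int main(void) {
        int w = 4, h = 1;
        int labels[4] = {0, 0, 2, 2};  /* two regions with roots 0 and 2 */
        int x = 2, y = 0;              /* pixel 2 borders a pixel labeled 0 */
        int old_label = 2, new_label = 0;

        /* merge the whole region rooted at old_label ... */
        atomic_min_serial(&labels[old_label], new_label);
        /* ... and lower this pixel's own label */
        atomic_min_serial(&labels[x + y * w], new_label);

        for (int i = 0; i < w * h; ++i)
            printf("labels[%d] = %d\n", i, labels[i]);
        /* labels is now {0, 0, 0, 2}; pixel 3 reaches 0 via its
           grandparent labels[2] on the next iteration */
        return 0;
    }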