From 1884d7a1b8dccfde9f5809bdd62c45e30c79c5a1 Mon Sep 17 00:00:00 2001
From: Evan Yao
Date: Fri, 20 Nov 2015 04:09:22 -0500
Subject: [PATCH 1/2] done with HW 3

---
 HW3/P3/P3.txt |  1 +
 HW3/P5/P5.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 HW3/P3/P3.txt
 create mode 100644 HW3/P5/P5.txt

diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..aaf12c99
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1 @@
+The best configuration and time for me was ('coalesced', 128, 128): 0.00291864 seconds
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..7922cd98
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,62 @@
+Part 1
+
+Maze 1:
+Finished after 878 iterations, 261.55712 ms total, 0.297901047836 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 517 iterations, 153.9384 ms total, 0.297753191489 ms per iteration
+Found 35 regions
+
+
+Part 2
+
+Maze 1:
+Finished after 529 iterations, 158.00224 ms total, 0.298680982987 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 273 iterations, 81.45792 ms total, 0.298380659341 ms per iteration
+Found 35 regions
+
+
+Part 3
+
+Maze 1:
+Finished after 11 iterations, 3.37152 ms total, 0.306501818182 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 9 iterations, 2.7204 ms total, 0.302266666667 ms per iteration
+Found 35 regions
+
+
+Part 4
+
+Maze 1:
+Finished after 70 iterations, 52.56808 ms total, 0.750972571429 ms per iteration
+Found 2 regions
+
+Maze 2:
+Finished after 103 iterations, 76.77008 ms total, 0.745340582524 ms per iteration
+Found 35 regions
+
+
+In my case, serializing the "finding grandparents" step in a single
+thread per workgroup does not pay off: it causes a roughly 2.5x increase
+in time per iteration (from about 0.30 ms in Part 3 to about 0.75 ms).
+
+
+Part 5
+
+Suppose our current square sees two other labels, both of which are
+smaller than its own. With atomic updates, all three entries end up at
+the minimum of the three labels. If we instead split the update into a
+separate "min" and a separate "reassignment", the order in which the two
+mins and the two reassignments interleave can change the result. For
+example:
+
+Suppose the label at our square is 3, and its two neighbors hold labels
+2 and 1. We would like the square to end up with label 1. Two different
+threads compute min(3, 2) = 2 and min(3, 1) = 1. Now suppose the square
+is first assigned 1, and THEN assigned 2. UH OH! The true minimum is
+lost, and we will have to run for at least another iteration to fix it.
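To make the interleaving in Part 5 concrete, here is a small standalone C
sketch (illustrative only, not part of the patch; the two "threads" are
simulated sequentially, and the labels 3, 2, 1 are the example values from
the writeup above) that plays out both store orders:

    /* Simulates the two possible store orders of the non-atomic
       "min then reassignment" update described in P5.txt, Part 5. */
    #include <stdio.h>

    static int min_int(int a, int b) { return a < b ? a : b; }

    int main(void) {
        int old_label = 3;
        int result_a = min_int(old_label, 2);  /* thread A sees neighbor 2 */
        int result_b = min_int(old_label, 1);  /* thread B sees neighbor 1 */

        /* Order 1: B stores first, A overwrites -> the minimum is lost. */
        int label = old_label;
        label = result_b;
        label = result_a;
        printf("B then A: label = %d (wrong, needs another iteration)\n", label);

        /* Order 2: A stores first, B overwrites -> correct by luck. */
        label = old_label;
        label = result_a;
        label = result_b;
        printf("A then B: label = %d (correct)\n", label);
        return 0;
    }

Only the second ordering lands on the true minimum. atomic_min makes the
compare and the store one indivisible step, so both orderings would
yield 1.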
From 0010692a17f19c6022e9e73e836a7e0f82676fb5 Mon Sep 17 00:00:00 2001
From: Evan Yao
Date: Fri, 20 Nov 2015 04:10:44 -0500
Subject: [PATCH 2/2] all set with HW 3

---
 HW3/P2/mandelbrot.py    |  2 +-
 HW3/P3/sum.cl           | 40 +++++++++++++++++++-----
 HW3/P4/median_filter.cl | 68 +++++++++++++++++++++++++++++++++++++++++
 HW3/P5/label_regions.cl | 57 +++++++++++++++++++++++++++++++++-
 HW3/P5/label_regions.py |  4 +--
 5 files changed, 159 insertions(+), 12 deletions(-)

diff --git a/HW3/P2/mandelbrot.py b/HW3/P2/mandelbrot.py
index 7c197af2..2978cac0 100644
--- a/HW3/P2/mandelbrot.py
+++ b/HW3/P2/mandelbrot.py
@@ -1,7 +1,7 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import pylab
+import matplotlib.pyplot as pylab
 
 def round_up(global_size, group_size):
     r = global_size % group_size
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index ee914740..82f24dd8 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t i = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t group_size = get_local_size(0);
+    int counter;
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    // Stride by global_size so adjacent threads read adjacent elements
+    // on every pass (coalesced access; see the index-pattern sketch
+    // after the patch series).
+    for (counter = 0; counter * global_size + i < N; counter++) {
+        sum += x[i + counter * global_size];
     }
 
     fast[local_id] = sum;
@@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    int k;
+
+    // Binary tree reduction: the active half of the workgroup folds the
+    // inactive half into itself, halving k each pass; the barrier keeps
+    // the passes in step. (A serial model of this loop also appears
+    // after the patch series.)
+    for (k = group_size / 2; k > 0; k >>= 1) {
+        if (local_id < k) {
+            fast[local_id] += fast[local_id + k];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,6 +48,9 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    size_t group_size = get_local_size(0);
+
     int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
@@ -48,8 +61,13 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    int count;
+
+    // Each thread sums its own contiguous block of k elements, guarding
+    // against running past N in the last block.
+    for (count = global_id * k; count < (global_id + 1) * k; count++) {
+        if (count < N)
+        {
+            sum += x[count];
+        }
     }
 
     fast[local_id] = sum;
@@ -64,8 +82,14 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    uint d;
+
+    // Same binary tree reduction as in sum_coalesced.
+    for (d = group_size / 2; d > 0; d >>= 1) {
+        if (local_id < d) {
+            fast[local_id] += fast[local_id + d];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..12e94e13 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,5 +1,27 @@
 #include "median9.h"
 
+// Helper function that replaces the version of FETCH
+// that Ray provided.
+// It clamps x and y to the image bounds, so an out-of-range
+// coordinate fetches the closest in-bounds pixel instead.
+float FETCH_new(__global __read_only float *in_values,
+                int width, int height,
+                int x, int y)
+{
+    if (x < 0)
+        x = 0;
+    else if (x > width - 1)
+        x = width - 1;
+
+    if (y < 0)
+        y = 0;
+    else if (y > height - 1)
+        y = height - 1;
+
+    return in_values[y * width + x];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -31,4 +53,50 @@ median_3x3(__global __read_only float *in_values,
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
 
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    // Cooperatively load the halo'd region into local memory, one row
+    // per pass; only the first buf_w threads participate.
+    if (idx_1D < buf_w) {
+        for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] =
+                FETCH_new(in_values, w, h,
+                          buf_corner_x + idx_1D,
+                          buf_corner_y + row);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (y < h && x < w)
+    {
+        int top = (buf_y - 1) * buf_w + buf_x;
+        int middle = buf_y * buf_w + buf_x;
+        int bottom = (buf_y + 1) * buf_w + buf_x;
+
+        out_values[y * w + x] = median9(
+            buffer[top - 1],    buffer[top],    buffer[top + 1],
+            buffer[middle - 1], buffer[middle], buffer[middle + 1],
+            buffer[bottom - 1], buffer[bottom], buffer[bottom + 1]
+        );
+    }
+}
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..931258b9 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -75,12 +75,50 @@ propagate_labels(__global __read_write int *labels,
     // the local buffer is loaded
     barrier(CLK_LOCAL_MEM_FENCE);
 
+    int current = buf_y * buf_w + buf_x;
+
     // Fetch the value from the buffer that corresponds to
     // the pixel for this thread
-    old_label = buffer[buf_y * buf_w + buf_x];
+    old_label = buffer[current];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+
+    /*
+    // Part 2: every thread fetches its own grandparent.
+    if (old_label < w * h)
+    {
+        buffer[current] = labels[old_label]; // grab grandparent
+    }
+    */
+
+    // Part 4: one thread per workgroup fetches all the grandparents,
+    // caching the most recent lookup so a run of repeated labels skips
+    // the global memory read.
+    if ((lx == 0) && (ly == 0))
+    {
+        int prev_key = -1000;  // sentinel: no real label is negative
+        int prev_result;
+
+        for (int i = 0; i < buf_w * buf_h; i++)
+        {
+            int this_label = buffer[i];
+
+            if (this_label >= w * h)  // skip walls
+                continue;
+
+            if (prev_key == this_label)
+            {
+                buffer[i] = prev_result;
+            }
+            else
+            {
+                prev_key = this_label;
+                prev_result = labels[prev_key];
+                buffer[i] = prev_result;  // update on a cache miss too
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
         new_label = old_label;
+
+        // Part 1: take the minimum over the 4-neighborhood; walls carry
+        // the out-of-range value w * h and are left alone.
+        if (new_label < w * h)
+        {
+            new_label =
+                min(buffer[current + buf_w],
+                min(buffer[current - buf_w],
+                min(buffer[current + 1],
+                min(buffer[current - 1], new_label))));
+        }
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
             // indicate there was a change this iteration.
             // multiple threads might write this.
+
+            // Part 3: atomic_min performs the compare and the store as
+            // one indivisible step, so a concurrent smaller label is
+            // never overwritten (this is the race discussed in P5.txt,
+            // Part 5). Update both the old label's entry and this
+            // pixel's entry. The plain store that used to follow the
+            // flag update is dropped: it could race with and overwrite
+            // a smaller atomic write.
+            atomic_min(&labels[old_label], new_label);
+            atomic_min(&labels[y * w + x], new_label);
+
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
         }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..3e5092ac 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -2,7 +2,7 @@
 import sys
 import pyopencl as cl
 import numpy as np
-import pylab
+import matplotlib.pyplot as pylab
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
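As a footnote to HW3/P3/sum.cl: the only difference between the two
kernels is how indices are assigned to threads. This standalone C sketch
(illustrative only; N = 10 and global_size = 4 are made-up example
values) prints which elements of x each thread reads under each scheme:

    /* Contrasts the coalesced and blocked index assignments of sum.cl. */
    #include <stdio.h>

    int main(void) {
        int N = 10, global_size = 4;
        int k = (N + global_size - 1) / global_size;  /* ceil(N / global_size) */

        for (int i = 0; i < global_size; i++) {
            printf("thread %d  coalesced:", i);
            /* stride by global_size, as in sum_coalesced */
            for (int counter = 0; counter * global_size + i < N; counter++)
                printf(" x[%d]", i + counter * global_size);
            printf("   blocked:");
            /* contiguous chunk of k elements, as in sum_blocked */
            for (int count = i * k; count < (i + 1) * k; count++)
                if (count < N)
                    printf(" x[%d]", count);
            printf("\n");
        }
        return 0;
    }

In the coalesced scheme, threads 0..3 read x[0..3] together on the first
pass, so one memory transaction can serve the whole group; in the
blocked scheme each thread starts k elements apart.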
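The in-workgroup tree reduction shared by both kernels can also be
modeled serially. This sketch (the workgroup size of 8 and the sample
values are assumptions for illustration) mirrors the
fast[local_id] += fast[local_id + k] loop:

    /* Serial model of the tree reduction in sum.cl. On the device each
       inner loop is one parallel step, separated from the next by
       barrier(CLK_LOCAL_MEM_FENCE). */
    #include <stdio.h>

    int main(void) {
        float fast[8] = {1, 2, 3, 4, 5, 6, 7, 8};  /* one slot per thread */
        int group_size = 8;                        /* power of 2, as assumed */

        for (int k = group_size / 2; k > 0; k >>= 1)
            for (int local_id = 0; local_id < k; local_id++)
                fast[local_id] += fast[local_id + k];

        printf("sum = %g\n", fast[0]);  /* prints 36 */
        return 0;
    }

On the device, the barrier between passes is what makes it safe for
thread 0 to read the slot that thread k wrote in the previous pass.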