diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..89e35c78 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -10,10 +10,34 @@ mandelbrot(__global __read_only float *coords_real, float c_real, c_imag; float z_real, z_imag; + float new_z_real; int iter; if ((x < w) && (y < h)) { // YOUR CODE HERE + // implementing mandelbrot here + + // initialize + iter = 0; + z_real =0; + z_imag = 0; + // pixel (x,y) is x*w + y away from (0,0) + // because (x,y) are flipped + c_real = coords_real[x*w + y]; + c_imag = coords_imag[x*w + y]; + while((z_real*z_real+ z_imag*z_imag <=4) \ + &&(iter <= max_iter)){ + // Similar to AVX implemtation + new_z_real = (z_real*z_real - z_imag*z_imag) \ + + c_real; + z_imag = (2 * z_real* z_imag) + c_imag; + z_real = new_z_real; + iter = iter + 1; + } + + + ; + out_counts[x*w + y] = iter; } } diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..7b2371ef --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,22 @@ +The best found configuration is + +configuration ('coalesced', 256, 128): 0.00303864 seconds + +The devices detected on platform Apple are: + +Intel(R) Core(TM) i5-4250U CPU @ 1.30GHz [Type: CPU ] + +Maximum clock Frequency: 1300 MHz + +Maximum allocable memory size: 1073 MB + +## Maximum work group size 1024 + +HD Graphics 5000 [Type: GPU ] + +Maximum clock Frequency: 1000 MHz + +Maximum allocable memory size: 402 MB + +## Maximum work group size 512 + diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..770ba2fe 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + size_t i = get_global_id(0); + int jump = get_global_size(0); + int localsize = get_local_size(0); + int id_x; // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. 
- for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (id_x = 0 ;i + id_x*jump < N; id_x++) { + sum += x[ i + id_x*jump ]; } fast[local_id] = sum; @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -39,6 +46,13 @@ __kernel void sum_blocked(__global float* x, float sum = 0; size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int globalid = get_global_id(0); + int localid = get_local_id(0); + int ini ; + int localsize = get_local_size(0); + + + // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +62,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for ( ini = globalid*k ; ini < (globalid +1)*k ; ini++) { + if (ini < N) { + sum += x[ini]; + } } fast[local_id] = sum; @@ -64,9 +80,11 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - if (local_id == 0) partial[get_group_id(0)] = fast[0]; } diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..a8bc5391 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,16 @@ #include "median9.h" + +inline float get_values(__global float *in_values, \ + int w, int h, int new_x, int new_y){ + // check everything stays in bound + if (new_x < 0) new_x = 0; + if (new_y < 0) new_y = 0; + if (new_x >= w) new_x = w - 1; + if (new_y >= h) new_y = h - 1; + return in_values[new_y * w + new_x]; +} + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -22,13 +33,59 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + // Define variables like in class + + // global position of the pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // local position of the pixel in the workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // corner coordinates of the buffer + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of the pixel in the buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // get 1-index of the pixels + const int idx_1D = ly * get_local_size(0) + lx; // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. 
+ if (idx_1D < buf_w){ + for (int row = 0; row < buf_h; row++){ + int new_x = buf_corner_x + idx_1D; + int new_y = buf_corner_y + row; + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. + buffer[row * buf_w + idx_1D] = \ + get_values(in_values, w, h, new_x, new_y); + } + } + + // now write the output + barrier(CLK_LOCAL_MEM_FENCE); + + if((x < w) && (y < h)){ + out_values[y * w + x] =\ + median9( buffer[ (buf_y-1) * buf_w + buf_x -1],\ + buffer[ (buf_y-1) * buf_w + buf_x],\ + buffer[ (buf_y-1) * buf_w + buf_x +1],\ + buffer[ buf_y * buf_w + buf_x -1], \ + buffer[ buf_y * buf_w + buf_x], \ + buffer[ buf_y * buf_w + buf_x +1],\ + buffer[ (buf_y+1) * buf_w + buf_x -1],\ + buffer[ (buf_y+1) * buf_w + buf_x],\ + buffer[ (buf_y+1) * buf_w + buf_x +1]); + + } + - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. } diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..59554130 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,87 @@ +Results + +******** +Part 1 +******** + +--- Maze 1 ---- + +Finished after 314 iterations, 153.46664 ms total, 0.488747261146 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 243 iterations, 106.70872 ms total, 0.439130534979 ms per iteration +Found 113 regions + +******** +Part 2 +******** + +--- Maze 1 --- + + +Finished after 132 iterations, 65.11392 ms total, 0.493287272727 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 114 iterations, 55.83568 ms total, 0.489786666667 ms per iteration +Found 113 regions + +******** +Part 3 +******** + +--- Maze 1 --- + +Finished after 11 iterations, 5.42896 ms total, 0.493541818182 ms per iteration +Found 60 regions + + +--- Maze 2 --- + +Finished after 11 iterations, 5.3596 ms total, 0.487236363636 ms per iteration +Found 106 regions + +******** +Part 4 +******** + +--- Maze 1 --- + +Finished after 71 iterations, 96.0672 ms total, 
1.35305915493 ms per iteration
+Found 66 regions
+
+--- Maze 2 ---
+
+Finished after 49 iterations, 66.29168 ms total, 1.35289142857 ms per iteration
+Found 106 regions
+
+Looks like using only one thread did not improve anything. Worse, I now see more iterations, and I have not had time to find out why. While serializing decreases the number of fetches from global memory, the total run time is larger.
+
+
+********
+Part 5
+********
+
+What would change if atomic_min() were replaced with a simple min()?
+
+For now we have:
+
+atomic_min(&labels[old_label], new_label);
+atomic_min(&labels[y * w + x], new_label);
+
+and we ask what would happen with
+
+min(labels[old_label], new_label);
+min(labels[y * w + x], new_label);
+
+The atomic operation ensures that only one thread at a time updates labels[old_label] and labels[y * w + x] with the smallest label. If multiple threads performed this read-modify-write concurrently without atomics, updates could be lost, so there is no longer any guarantee that the smallest label value gets stored.
+
+However, a plain min() would be faster per iteration, because it avoids the serialization that atomic operations impose on contending threads.
+
+We may also expect more iterations, because the value stored at the end of one iteration may no longer be the minimum across all the threads.
+ + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..e14031fc 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -21,7 +21,7 @@ int get_clamped_value(__global __read_only int *labels, int w, int h, int x, int y) -{ +{ if ((x < 0) || (x >= w) || (y < 0) || (y >= h)) return w * h; return labels[y * w + x]; @@ -80,20 +80,73 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + + // CODE FOR PART 2 + + // if (old_label < w*h) { + // buffer[ buf_y * buf_w + buf_x ] = labels[old_label]; + // } + + + // CODE FOR PART 4 + + + // when we have the first thread + if ((lx == 0) && (ly == 0)) { + + // initialize variables to use + int last_label = -1 ; + int my_label_new; + + // get grandparent + if (old_label < w*h){ + last_label = labels[old_label]; + } + + // loop over rows and columns of the buffer + for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) { + for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) { + + my_label_new = buffer[(ly+x_i)*buf_w+(lx+y_i)]; + + if (buffer[(ly+x_i)*buf_w+(lx+y_i)] < w*h) { + // avoid having the same value as the previous one + if (my_label_new != last_label) { + // update the buffer + buffer[(ly+x_i)*buf_w+(lx+y_i)] = labels[my_label_new]; + } + } + } + } + } + // stay in bounds - if ((x < w) && (y < h)) { + if (((x < w) && (y < h)) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. 
- new_label = old_label; + + // one pixel becomes the minimum of its 4 neighboring + // pixels and itself + // get the locations in a similar fashion as P4 + int left = buffer[ buf_y * buf_w + buf_x - 1]; + int right = buffer[ buf_y * buf_w + 1]; + int up = buffer[ (buf_y - 1) * buf_w + buf_x ]; + int down = buffer[ (buf_y + 1) * buf_w + buf_x ]; + // find the minimum + new_label = min(old_label, min( min( min(up,down) , right) , left)); + + if (new_label != old_label) { + atomic_min(&labels[old_label], new_label); // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. *(changed_flag) += 1; - labels[y * w + x] = new_label; + //labels[y * w + x] = new_label; + atomic_min(&labels[y * w + x], new_label); } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..5ffa14f2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) @@ -93,7 +93,7 @@ def round_up(global_size, group_size): width, height, buf_size[0], buf_size[1], halo) - prop_exec.wait() + prop_exec.wait( ) elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start) total_time += elapsed # read back done flag, block until it gets here