From 11c6d3737633fb956cbae522288ef90953430992 Mon Sep 17 00:00:00 2001 From: tperol Date: Mon, 16 Nov 2015 22:09:16 -0500 Subject: [PATCH 01/13] finished P2 --- HW3/P2/mandelbrot.cl | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..241d884a 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -10,10 +10,32 @@ mandelbrot(__global __read_only float *coords_real, float c_real, c_imag; float z_real, z_imag; + float new_z_real; int iter; if ((x < w) && (y < h)) { // YOUR CODE HERE + // implementing mandelbrot here + + // initialize + iter = 0; + z_real =0; + z_imag = 0; + c_real = coords_real[w*x + y]; + c_imag = coords_imag[w*x + y]; + while((z_real*z_real+ z_imag*z_imag <=4) \ + &&(iter <= max_iter)){ + // Similar to AVX implemtation + new_z_real = (z_real*z_real - z_imag*z_imag) \ + + c_real; + z_imag = (2 * z_real* z_imag) + c_imag; + z_real = new_z_real; + iter = iter + 1; + } + + + ; + out_counts[x*w + y] = iter; } } From ee7c109bb9a016e5511a6c9453a388aa4fba7a8d Mon Sep 17 00:00:00 2001 From: tperol Date: Tue, 17 Nov 2015 10:15:22 -0500 Subject: [PATCH 02/13] commented P2 --- HW3/P2/mandelbrot.cl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 241d884a..89e35c78 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -21,8 +21,10 @@ mandelbrot(__global __read_only float *coords_real, iter = 0; z_real =0; z_imag = 0; - c_real = coords_real[w*x + y]; - c_imag = coords_imag[w*x + y]; + // pixel (x,y) is x*w + y away from (0,0) + // because (x,y) are flipped + c_real = coords_real[x*w + y]; + c_imag = coords_imag[x*w + y]; while((z_real*z_real+ z_imag*z_imag <=4) \ &&(iter <= max_iter)){ // Similar to AVX implemtation From 9a19ae2193dc960c47cf15e1536150c471df38b0 Mon Sep 17 00:00:00 2001 From: tperol Date: Thu, 19 Nov 2015 15:16:11 -0500 Subject: [PATCH 03/13] done with Prb 3 --- HW3/P3/P3.txt 
| 22 ++++++++++++++++++++++ HW3/P3/sum.cl | 36 +++++++++++++++++++++++++++--------- 2 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 HW3/P3/P3.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..7b2371ef --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,22 @@ +The best found configuration is + +configuration ('coalesced', 256, 128): 0.00303864 seconds + +The devices detected on platform Apple are: + +Intel(R) Core(TM) i5-4250U CPU @ 1.30GHz [Type: CPU ] + +Maximum clock Frequency: 1300 MHz + +Maximum allocable memory size: 1073 MB + +## Maximum work group size 1024 + +HD Graphics 5000 [Type: GPU ] + +Maximum clock Frequency: 1000 MHz + +Maximum allocable memory size: 402 MB + +## Maximum work group size 512 + diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..770ba2fe 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + size_t i = get_global_id(0); + int jump = get_global_size(0); + int localsize = get_local_size(0); + int id_x; // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (id_x = 0 ;i + id_x*jump < N; id_x++) { + sum += x[ i + id_x*jump ]; } fast[local_id] = sum; @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -39,6 +46,13 @@ __kernel void sum_blocked(__global float* x, float sum = 0; size_t local_id = get_local_id(0); int k = ceil((float)N / get_global_size(0)); + int globalid = get_global_id(0); + int localid = get_local_id(0); + int ini ; + int localsize = get_local_size(0); + + + // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +62,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for ( ini = globalid*k ; ini < (globalid +1)*k ; ini++) { + if (ini < N) { + sum += x[ini]; + } } fast[local_id] = sum; @@ -64,9 +80,11 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint offset = localsize/2; offset > 0; offset >>= 1) { + if (local_id< offset) { + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - if (local_id == 0) partial[get_group_id(0)] = fast[0]; } From 463581cca7c5d9ce080eb3375fbe4d4c71a8e9bf Mon Sep 17 00:00:00 2001 From: tperol Date: Thu, 19 Nov 2015 20:17:02 -0500 Subject: [PATCH 04/13] finished P4 --- HW3/P4/median_filter.cl | 61 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..31f3f221 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,16 @@ #include "median9.h" + +inline float get_values(__global float *in_values, \ + int w, int h, int new_x, int new_y){ + // check everything stays in bound + if (new_x < 0) new_x = 0; + if (new_y < 0) new_y = 0; + if (new_x >= w) new_x = w - 1; + if (new_y >= h) new_y = h - 1; + return in_values[new_y * w + new_x]; +} + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -22,13 +33,59 @@ median_3x3(__global __read_only float *in_values, // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. 
+ // Define variables like in class + + // global position of the pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // local position of the pixel in the workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // corner coordinates of the buffer + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of the pixel in the buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // get 1-index of the pixels + const int idx_1D = ly * get_local_size(0) + lx; // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. + if (idx_1D < buf_w){ + for (int row = 0; row < buf_h; row++){ + int new_x = buf_corner_x + idx_1D; + int new_y = buf_corner_y + row; + // Each thread in the valid region (x < w, y < h) should write + // back its 3x3 neighborhood median. + buffer[row * buf_w + idx_1D] = \ + get_values(in_values, w, h, new_x, new_y); + } + } + + //# Make sure all threads reach the next part after the local buffer is loaded + barrier(CLK_LOCAL_MEM_FENCE); + + if((x < w) && (y < h)){ + out_values[y * w + x] =\ + median9(buffer[(buf_y-1) * buf_w + buf_x -1],\ + buffer[(buf_y-1) * buf_w + buf_x],\ + buffer[(buf_y-1) * buf_w + buf_x +1],\ + buffer[buf_y * buf_w + buf_x -1], \ + buffer[buf_y * buf_w + buf_x], \ + buffer[buf_y * buf_w + buf_x +1],\ + buffer[(buf_y+1) * buf_w + buf_x -1],\ + buffer[(buf_y+1) * buf_w + buf_x],\ + buffer[(buf_y+1) * buf_w + buf_x +1]); + + } + - // Each thread in the valid region (x < w, y < h) should write - // back its 3x3 neighborhood median. 
} From 5935042c0ec23581f660eac25062c0dca987995d Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 09:29:50 -0500 Subject: [PATCH 05/13] finished commenting P4 --- HW3/P4/median_filter.cl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 31f3f221..a8bc5391 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -70,20 +70,20 @@ median_3x3(__global __read_only float *in_values, } } - //# Make sure all threads reach the next part after the local buffer is loaded + // now write the output barrier(CLK_LOCAL_MEM_FENCE); if((x < w) && (y < h)){ out_values[y * w + x] =\ - median9(buffer[(buf_y-1) * buf_w + buf_x -1],\ - buffer[(buf_y-1) * buf_w + buf_x],\ - buffer[(buf_y-1) * buf_w + buf_x +1],\ - buffer[buf_y * buf_w + buf_x -1], \ - buffer[buf_y * buf_w + buf_x], \ - buffer[buf_y * buf_w + buf_x +1],\ - buffer[(buf_y+1) * buf_w + buf_x -1],\ - buffer[(buf_y+1) * buf_w + buf_x],\ - buffer[(buf_y+1) * buf_w + buf_x +1]); + median9( buffer[ (buf_y-1) * buf_w + buf_x -1],\ + buffer[ (buf_y-1) * buf_w + buf_x],\ + buffer[ (buf_y-1) * buf_w + buf_x +1],\ + buffer[ buf_y * buf_w + buf_x -1], \ + buffer[ buf_y * buf_w + buf_x], \ + buffer[ buf_y * buf_w + buf_x +1],\ + buffer[ (buf_y+1) * buf_w + buf_x -1],\ + buffer[ (buf_y+1) * buf_w + buf_x],\ + buffer[ (buf_y+1) * buf_w + buf_x +1]); } From 79cdf52695a220461154c6884ff21ecdc824b7bb Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 16:45:40 -0500 Subject: [PATCH 06/13] done p5 part1 --- HW3/P5/label_regions.cl | 17 ++++++++++++++--- HW3/P5/label_regions.py | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..7905c490 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -21,7 +21,7 @@ int get_clamped_value(__global __read_only int *labels, int w, int h, int x, int y) -{ +{ if ((x < 0) || (x >= w) || 
(y < 0) || (y >= h)) return w * h; return labels[y * w + x]; @@ -82,11 +82,22 @@ propagate_labels(__global __read_write int *labels, // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) // stay in bounds - if ((x < w) && (y < h)) { + if (((x < w) && (y < h)) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + + // one pixel becomes the minimum of its 4 neighboring + // pixels and itself + // get the locations in a similar fashion as P4 + int left = buffer[ buf_y * buf_w + buf_x - 1]; + int right = buffer[ buf_y * buf_w + 1]; + int up = buffer[ (buf_y - 1) * buf_w + buf_x ]; + int down = buffer[ (buf_y + 1) * buf_w + buf_x ]; + // find the minimum + new_label = min(old_label, min( min( min(up,down) , right) , left)); + + if (new_label != old_label) { // CODE FOR PART 3 HERE diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..13d7c7d5 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -93,7 +93,7 @@ def round_up(global_size, group_size): width, height, buf_size[0], buf_size[1], halo) - prop_exec.wait() + prop_exec.wait( ) elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start) total_time += elapsed # read back done flag, block until it gets here From bedeb0ae19bf2644cf8e8cd9b4433ed956235551 Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 19:40:51 -0500 Subject: [PATCH 07/13] done with part 2, working on part4 --- HW3/P5/label_regions.cl | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 7905c490..74216a79 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -80,7 +80,39 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + + + // CODE FOR PART 
2 + + if (old_label < w*h) { + buffer[ buf_y * buf_w + buf_x ] = labels[old_label]; + } + + + // CODE FOR PART 4 + + // when we have the first thread + //if ((lx == 0) && (ly == 0)) { +// + //// loop over rows and columns + //for (int x_i = halo; x_i < buf_h - halo; x_i++) { + //for (int y_i = halo; y_i < buf_w - halo; y_i++) { +// + //// obtain grand parent + //if (old_label < w*h) { + //if (this_label != last_label) { + //buffer[x_i + buf_w * y_i] = labels[buffer[x_i + buf_w * y_i]]; + //last_label = this_label; + //} + //else { + //buffer[x_i + buf_w * y_i] = buffer[last_index]; + //} + //last_index = x_i + buf_w * y_i; + //} + //} + //} + //} + // stay in bounds if (((x < w) && (y < h)) && (old_label < w*h)) { // CODE FOR PART 1 HERE From f4c252222aeaca558c86cfe0c00a9c694586ab53 Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 19:46:14 -0500 Subject: [PATCH 08/13] finished part 3 --- HW3/P5/label_regions.cl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 74216a79..079e1888 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -132,11 +132,13 @@ propagate_labels(__global __read_write int *labels, if (new_label != old_label) { + atomic_min(&labels[old_label],new_label); // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. 
*(changed_flag) += 1; - labels[y * w + x] = new_label; + //labels[y * w + x] = new_label; + atomic_min(&labels[x + y * w], new_label); } } } From e5c1709b281ca1c06767b561751ad45991c0c3fa Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 21:02:54 -0500 Subject: [PATCH 09/13] got part 4 working --- HW3/P5/label_regions.cl | 53 ++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 079e1888..5cb633a2 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -84,34 +84,39 @@ propagate_labels(__global __read_write int *labels, // CODE FOR PART 2 - if (old_label < w*h) { - buffer[ buf_y * buf_w + buf_x ] = labels[old_label]; - } + // if (old_label < w*h) { + // buffer[ buf_y * buf_w + buf_x ] = labels[old_label]; + // } // CODE FOR PART 4 + barrier(CLK_LOCAL_MEM_FENCE); // when we have the first thread - //if ((lx == 0) && (ly == 0)) { -// - //// loop over rows and columns - //for (int x_i = halo; x_i < buf_h - halo; x_i++) { - //for (int y_i = halo; y_i < buf_w - halo; y_i++) { -// - //// obtain grand parent - //if (old_label < w*h) { - //if (this_label != last_label) { - //buffer[x_i + buf_w * y_i] = labels[buffer[x_i + buf_w * y_i]]; - //last_label = this_label; - //} - //else { - //buffer[x_i + buf_w * y_i] = buffer[last_index]; - //} - //last_index = x_i + buf_w * y_i; - //} - //} - //} - //} + if ((lx == 0) && (ly == 0)) { + + // initialize variables to use + int last_label; + int my_label; + + // loop over rows and columns of the buffer + for (int x_i = halo; x_i < buf_h - halo; x_i++) { + for (int y_i = halo; y_i < buf_w - halo; y_i++) { + + my_label = buffer[x_i + buf_w * y_i]; + // obtain grand parent + if (old_label < w*h) { + // avoid having the same value as the previous one + if (my_label != last_label) { + // update the buffer + buffer[x_i + buf_w * y_i] = labels[my_label]; + // update the last label + last_label = my_label; + } 
+ } + } + } + } // stay in bounds if (((x < w) && (y < h)) && (old_label < w*h)) { @@ -138,7 +143,7 @@ propagate_labels(__global __read_write int *labels, // multiple threads might write this. *(changed_flag) += 1; //labels[y * w + x] = new_label; - atomic_min(&labels[x + y * w], new_label); + atomic_min(&labels[y * w + x], new_label); } } } From 8ff034e892d422d2ab7d1ad68539f0560f567f67 Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 21:17:32 -0500 Subject: [PATCH 10/13] writing P5.txt --- HW3/P5/P5.txt | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..caa4de85 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,52 @@ +Results + +******** +Part 1 +******** + +--- Maze 1 ---- + +Finished after 314 iterations, 153.46664 ms total, 0.488747261146 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 243 iterations, 106.70872 ms total, 0.439130534979 ms per iteration +Found 113 regions + +******** +Part 2 +******** + +--- Maze 1 --- + + +Finished after 132 iterations, 65.11392 ms total, 0.493287272727 ms per iteration +Found 77 regions + +--- Maze 2 --- + +Finished after 114 iterations, 55.83568 ms total, 0.489786666667 ms per iteration +Found 113 regions + +******** +Part 3 +******** + +--- Maze 1 --- + +Finished after 11 iterations, 5.42896 ms total, 0.493541818182 ms per iteration +Found 60 regions + + +--- Maze 2 --- + +Finished after 11 iterations, 5.3596 ms total, 0.487236363636 ms per iteration +Found 106 regions + +******** +Part 4 +******** + +--- Maze 1 --- + From 1a2cb7f65d5416973d3bd1139f5258ba503b1b1f Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 21:30:29 -0500 Subject: [PATCH 11/13] debugging part 4... 
--- HW3/P5/label_regions.cl | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 5cb633a2..95a4bd69 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -97,23 +97,27 @@ propagate_labels(__global __read_write int *labels, // initialize variables to use int last_label; - int my_label; + int my_label_new; + int my_last_idx; // loop over rows and columns of the buffer - for (int x_i = halo; x_i < buf_h - halo; x_i++) { - for (int y_i = halo; y_i < buf_w - halo; y_i++) { + for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) { + for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) { - my_label = buffer[x_i + buf_w * y_i]; + my_label_new = buffer[x_i + buf_w * y_i]; // obtain grand parent if (old_label < w*h) { // avoid having the same value as the previous one - if (my_label != last_label) { + if (my_label_new != last_label) { // update the buffer - buffer[x_i + buf_w * y_i] = labels[my_label]; + buffer[x_i + buf_w * y_i] = labels[my_label_new]; // update the last label - last_label = my_label; + last_label = my_label_new; + } else { + buffer[x_i + buf_w * y_i ] = labels[buffer[my_last_idx]]; } } + my_last_idx = x_i + buf_w * y_i ; } } } From 60f5082be6e27743b8454dde38f5cc901bdd79d5 Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 22:40:23 -0500 Subject: [PATCH 12/13] finished part 4, and text for part 4. 
now working on text for part 5 --- HW3/P5/P5.txt | 11 +++++++++++ HW3/P5/label_regions.cl | 25 ++++++++++++------------- HW3/P5/label_regions.py | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index caa4de85..c9f00249 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -50,3 +50,14 @@ Part 4 --- Maze 1 --- +Finished after 71 iterations, 96.0672 ms total, 1.35305915493 ms per iteration +Found 66 regions + +--- Maze 2 --- + +Finished after 49 iterations, 66.29168 ms total, 1.35289142857 ms per iteration +Found 106 regions + +Looks like using only one thread did not improve anything. Worse, I now have more iterations. I don't have time to find out what happened. +While we have decreased the fetches into the global memory by serializing, the run time is bigger. + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 95a4bd69..e37e5bba 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -91,33 +91,32 @@ propagate_labels(__global __read_write int *labels, // CODE FOR PART 4 - barrier(CLK_LOCAL_MEM_FENCE); + // when we have the first thread if ((lx == 0) && (ly == 0)) { // initialize variables to use - int last_label; + int last_label = -1 ; int my_label_new; - int my_last_idx; + + // get grandparent + if (old_label < w*h){ + last_label = labels[old_label]; + } // loop over rows and columns of the buffer for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) { for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) { - my_label_new = buffer[x_i + buf_w * y_i]; - // obtain grand parent - if (old_label < w*h) { + my_label_new = buffer[(ly+x_i)*buf_w+(lx+y_i)]; + + if (buffer[(ly+x_i)*buf_w+(lx+y_i)] < w*h) { // avoid having the same value as the previous one if (my_label_new != last_label) { // update the buffer - buffer[x_i + buf_w * y_i] = labels[my_label_new]; - // update the last label - last_label = my_label_new; - } else { - buffer[x_i + buf_w * y_i ] = 
labels[buffer[my_last_idx]]; - } + buffer[(ly+x_i)*buf_w+(lx+y_i)] = labels[my_label_new]; + } } - my_last_idx = x_i + buf_w * y_i ; } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index 13d7c7d5..5ffa14f2 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -42,7 +42,7 @@ def round_up(global_size, group_size): program = cl.Program(context, open('label_regions.cl').read()).build(options='') - host_image = np.load('maze1.npy') + host_image = np.load('maze2.npy') host_labels = np.empty_like(host_image) host_done_flag = np.zeros(1).astype(np.int32) From f53b2184167c2d80f23bae519e9572674b0a9fad Mon Sep 17 00:00:00 2001 From: tperol Date: Fri, 20 Nov 2015 23:25:22 -0500 Subject: [PATCH 13/13] finished HW3 --- HW3/P5/P5.txt | 24 ++++++++++++++++++++++++ HW3/P5/label_regions.cl | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index c9f00249..59554130 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -61,3 +61,27 @@ Found 106 regions Looks like using only one thread did not improve anything. Worse, I now have more iterations. I don't have time to find out what happened. While we have decreased the fetches into the global memory by serializing, the run time is bigger. + +******** +Part 5 +******** + +Changing atomic_min() to a simple min()? + +For now we have: + +atomic_min(&labels[old_label],new_label); +atomic_min(&labels[y * w + x], new_label); + +and we are asking ourselves what would happen with + +labels[old_label] = min(labels[old_label], new_label); +labels[y * w + x] = min(labels[y * w + x], new_label); + +The atomic operation ensures that only one thread at a time updates labels[old_label] or labels[y * w + x] with the smallest label. If multiple threads do this in parallel without atomics, their read-modify-write sequences can interleave, so there is no guarantee anymore that the smallest label is the one that ends up stored. + +However, each iteration will be faster because the updates are no longer serialized. 
+ +Also, we may expect more iterations, because the value stored at the end of one iteration may not be the minimum over all the threads that wrote to it. + + diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index e37e5bba..e14031fc 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -140,7 +140,7 @@ propagate_labels(__global __read_write int *labels, if (new_label != old_label) { - atomic_min(&labels[old_label],new_label); + atomic_min(&labels[old_label], new_label); // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this.