From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/5] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/5] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: [PATCH 3/5] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From 5ed7b3c01f7ac1ae19fded5bf3f7c510bd103308 Mon Sep 17 00:00:00 2001 From: Rohit Ahuja Date: Fri, 20 Nov 2015 15:28:26 -0500 Subject: [PATCH 4/5] finished p3, p4, p5 --- HW3/P3/P3.txt | 129 ++++++++++++++++++++++++++++++++++++++++ HW3/P3/sum.cl | 30 +++++++--- HW3/P4/median_filter.cl | 65 ++++++++++++++++++++ HW3/P5/P5.txt | 36 +++++++++++ HW3/P5/label_regions.cl | 44 +++++++++++++- HW3/P5/label_regions.py | 20 +++---- 6 files changed, 302 insertions(+), 22 deletions(-) create mode 100644 HW3/P3/P3.txt create mode 100644 
HW3/P5/P5.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..17610e78 --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,129 @@ +For coalesced reads I got the following data: It seems that for coalesced reads 64 workgroups at 128 workers gives the best time of .0024 seconds + + + +5000153.3125 +coalesced reads, workgroups: 8, num_workers: 4, 0.16036008 seconds + +5000160.3125 +coalesced reads, workgroups: 8, num_workers: 8, 0.08918312 seconds + +5000151.75 +coalesced reads, workgroups: 8, num_workers: 16, 0.05170184 seconds + +5000152.5 +coalesced reads, workgroups: 8, num_workers: 32, 0.02803776 seconds + +5000153.875 +coalesced reads, workgroups: 8, num_workers: 64, 0.0140196 seconds + +5000153.375 +coalesced reads, workgroups: 8, num_workers: 128, 0.00762976 seconds + +5000160.34375 +coalesced reads, workgroups: 16, num_workers: 4, 0.09207176 seconds + +5000151.71875 +coalesced reads, workgroups: 16, num_workers: 8, 0.05598272 seconds + +5000152.65625 +coalesced reads, workgroups: 16, num_workers: 16, 0.0280048 seconds + +5000153.9375 +coalesced reads, workgroups: 16, num_workers: 32, 0.01415656 seconds + +5000153.28125 +coalesced reads, workgroups: 16, num_workers: 64, 0.00719744 seconds + +5000153.28125 +coalesced reads, workgroups: 16, num_workers: 128, 0.00399504 seconds + +5000151.73438 +coalesced reads, workgroups: 32, num_workers: 4, 0.05799328 seconds + +5000152.65625 +coalesced reads, workgroups: 32, num_workers: 8, 0.02760288 seconds + +5000154.0 +coalesced reads, workgroups: 32, num_workers: 16, 0.01413456 seconds + +5000153.34375 +coalesced reads, workgroups: 32, num_workers: 32, 0.00713032 seconds + +5000153.34375 +coalesced reads, workgroups: 32, num_workers: 64, 0.00372464 seconds + +5000153.26562 +coalesced reads, workgroups: 32, num_workers: 128, 0.0026336 seconds + +5000152.61719 +coalesced reads, workgroups: 64, num_workers: 4, 0.02696808 seconds + +5000154.01562 +coalesced reads, workgroups: 64, num_workers: 8, 0.01386544 seconds + +5000153.32031 +coalesced reads, workgroups: 64, num_workers: 16, 0.007178 seconds + +5000153.375 +coalesced reads, workgroups: 64, num_workers: 32, 0.00390184 seconds + +5000153.33594 +coalesced reads, workgroups: 64, num_workers: 64, 0.00275184 seconds + +5000153.36719 +coalesced reads, workgroups: 64, num_workers: 128, 0.00248552 seconds + +5000154.01562 +coalesced reads, workgroups: 128, num_workers: 4, 0.02766432 seconds + +5000153.29297 +coalesced reads, workgroups: 128, num_workers: 8, 0.01399096 seconds + +5000153.39844 +coalesced reads, workgroups: 128, num_workers: 16, 0.00782096 seconds + +5000153.30469 +coalesced reads, workgroups: 128, num_workers: 32, 0.00377976 seconds + +5000153.33984 +coalesced reads, workgroups: 128, num_workers: 64, 0.00254232 seconds + +5000153.30859 +coalesced reads, workgroups: 128, num_workers: 128, 0.00252368 seconds + +5000153.31445 +coalesced reads, workgroups: 256, num_workers: 4, 0.02999856 seconds + +5000153.38086 +coalesced reads, workgroups: 256, num_workers: 8, 0.0142652 seconds + +5000153.29102 +coalesced reads, workgroups: 256, num_workers: 16, 0.0075244 seconds + +5000153.35352 +coalesced reads, workgroups: 256, num_workers: 32, 0.00397056 seconds + +5000153.36523 +coalesced reads, workgroups: 256, num_workers: 64, 0.00257784 seconds + +5000153.31641 +coalesced reads, workgroups: 256, num_workers: 128, 0.00277344 seconds + +5000153.37109 +coalesced reads, workgroups: 512, num_workers: 4, 0.02859184 seconds + +5000153.30176 +coalesced reads, workgroups: 512, 
num_workers: 8, 0.014304 seconds + +5000153.36328 +coalesced reads, workgroups: 512, num_workers: 16, 0.00809184 seconds + +5000153.35547 +coalesced reads, workgroups: 512, num_workers: 32, 0.00424744 seconds + +5000153.32617 +coalesced reads, workgroups: 512, num_workers: 64, 0.00255136 seconds + +5000153.3457 +coalesced reads, workgroups: 512, num_workers: 128, 0.0030888 seconds diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..2354ad58 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,11 +5,14 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + int group_size = get_local_size(0); + int global_size = get_global_size(0); + int global_id = get_global_id(0); // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int i = global_id; i < N; i += global_size) { // YOUR CODE HERE + sum += x[i]; // YOUR CODE HERE } fast[local_id] = sum; @@ -24,8 +27,11 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int offset = group_size >> 1; offset > 0; offset >>= 1) { // YOUR CODE HERE + if ((int)local_id < offset) { // YOUR CODE HERE + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -38,8 +44,9 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); + int global_id = get_global_id(0); + int group_size = get_local_size(0); int k = ceil((float)N / get_global_size(0)); - // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 // thread with global_id 2 should add 2k..3k-1 @@ -48,8 +55,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int i = global_id*k; i < (global_id+1)*k; i++) { // YOUR CODE HERE + if (i < N) { + sum += x[i]; // YOUR CODE HERE + } } fast[local_id] = sum; @@ -64,8 +73,11 @@ __kernel void sum_blocked(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int offset = group_size >> 1; offset > 0; offset >>= 1) { // YOUR CODE HERE + if ((int)local_id < offset) { // YOUR CODE HERE + fast[local_id] += fast[local_id + offset]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..eaa9b141 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -1,5 +1,21 @@ #include "median9.h" +float +FETCH(__global __read_only float *in_values, int w, int h, int x, int y) +{ + if (x < 0) { + x = 0; + } else if (x > w-1) { + x = w-1; + } + if (y < 0) { + y = 0; + } else if (y > h-1) { + y = h-1; + } + return in_values[y*w+x]; +} + // 3x3 median filter __kernel void median_3x3(__global __read_only float *in_values, @@ -31,4 +47,53 @@ median_3x3(__global __read_only float *in_values, // Each thread in the valid region (x < w, y < h) should write // back its 3x3 neighborhood median. + + // Pulled from http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int row; + + if (idx_1D < buf_w) { + for (row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = FETCH(in_values, w, h, \ + buf_corner_x + idx_1D, \ + buf_corner_y + row); + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Processing code here... + // + // Should only use buffer, buf_x, buf_y. 
+ + // write output + if ((y < h) && (x < w)) // stay in bounds + out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x-1], + buffer[(buf_y-1) * buf_w + buf_x], + buffer[(buf_y-1) * buf_w + buf_x+1], + buffer[buf_y * buf_w + buf_x-1], + buffer[buf_y * buf_w + buf_x], + buffer[buf_y * buf_w + buf_x+1], + buffer[(buf_y+1) * buf_w + buf_x-1], + buffer[(buf_y+1) * buf_w + buf_x], + buffer[(buf_y+1) * buf_w + buf_x+1]); } diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..89e6ecc7 --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,36 @@ +Part 1: +Maze 1: Finished after 888 iterations, 450.07784 ms total, 0.506844414414 ms per iteration +Found 2 regions + +Maze 2: Finished after 516 iterations, 258.12848 ms total, 0.500248992248 ms per iteration +Found 35 regions + +Part 2: +Maze 1: Finished after 529 iterations, 262.70208 ms total, 0.496601285444 ms per iteration +Found 2 regions + +Maze 2: Finished after 273 iterations, 137.47456 ms total, 0.50356981685 ms per iteration +Found 35 regions + +Part 3: +Maze 1: Finished after 10 iterations, 4.89648 ms total, 0.489648 ms per iteration +Found 2 regions + +Maze 2: Finished after 9 iterations, 4.50248 ms total, 0.500275555556 ms per iteration +Found 35 regions + +Part 4: +Maze 1: Finished after 19 iterations, 19.07336 ms total, 1.00386105263 ms per iteration +Found 2 regions + +Maze 2: Finished after 17 iterations, 17.0408 ms total, 1.0024 ms per iteration +Found 35 regions + +Discussion: +Despite avoiding repeated global memory reads, the single-thread-per-workgroup sequential scan in Part 4 roughly doubled the per-iteration time to label the maze (from about 0.5 ms to about 1.0 ms per iteration). This is because the grandparent fetch is no longer done in parallel: one work-item walks the whole workgroup's buffer while the rest wait at the barrier. Empirically, we are bottlenecked by computation rather than by memory, so cutting down the number of global reads buys us little. Thus, for my hardware, the single-thread scan is not a good option. + +There are two general factors that can affect the speed. 1) If the pixels are similar, Part 4 gives more speedup since there are fewer label updates to fetch. 2) If a GPU has more cores and can perform reads and writes much more quickly, then the speedup from Part 4 would also be greater. + +Part 5: +Discussion: +Using atomic_min is useful because it performs the read-compare-write as a single indivisible operation, so no other thread can interleave between the read and the write. The cost is that it serializes the reads and writes of threads hitting the same address. The issue with min instead of atomic_min is that we can run into race conditions: one thread can read a label, another thread can write a smaller label in the meantime, and the first thread then overwrites it with a value that is not the minimum. As a result, we would significantly decrease our performance since it would take longer to label the entire region.
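To make the Part 5 race concrete, here is a minimal standalone sketch contrasting the two updates. It is not part of the assignment's label_regions.cl: the kernel names, the candidates array, and the single shared slot labels[0] are made up purely for illustration; only atomic_min itself is the real OpenCL built-in used above.

__kernel void merge_racy(__global int *labels, __global const int *candidates)
{
    int gid = get_global_id(0);
    // RACE: the read of labels[0] and the write back are two separate steps,
    // so another work-item can store a smaller label in between and we then
    // overwrite it with a larger one (a lost update).
    labels[0] = min(labels[0], candidates[gid]);
}

__kernel void merge_atomic(volatile __global int *labels, __global const int *candidates)
{
    int gid = get_global_id(0);
    // atomic_min performs the read-compare-write as one indivisible operation,
    // so the smallest candidate always survives, at the cost of serializing
    // the work-items that touch the same address.
    atomic_min(&labels[0], candidates[gid]);
}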
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..1f8aa03a 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -45,6 +45,10 @@ propagate_labels(__global __read_write int *labels, const int lx = get_local_id(0); const int ly = get_local_id(1); + // Local workgroup sizes + const int wgx = get_local_size(0); + const int wgy = get_local_size(1); + // coordinates of the upper left corner of the buffer in image // space, including halo const int buf_corner_x = x - lx - halo; const int buf_corner_y = y - ly - halo; @@ -80,20 +84,54 @@ propagate_labels(__global __read_write int *labels, old_label = buffer[buf_y * buf_w + buf_x]; // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) + // grab the grandparent label + // if (old_label < w*h) { + // buffer[buf_y * buf_w + buf_x] = labels[old_label]; + // } + // if we are at the top left corner of the work group, we propagate grandparent + // labels across the group in a serialized fashion + if (lx == 0 && ly == 0) { + int prev = -1, curr; // -1 is not a valid label, so the first pixel is always fetched + for (int i = 0; i < wgy; i++) { + for (int j = 0; j < wgx; j++) { + int id = (i + halo)*buf_w + (j + halo); + curr = buffer[id]; + + if (curr < w*h) { + if (curr != prev) { + prev = curr; + buffer[id] = labels[curr]; + } + } + } + } + } + + // grandparent labels have been propagated + barrier(CLK_LOCAL_MEM_FENCE); + // stay in bounds - if ((x < w) && (y < h)) { + // we must check if old_label is less than w*h since the foreground pixels are bounded by + // w*h and we don't want to blend the background pixels with foreground pixels + if ((x < w) && (y < h) && (old_label < w*h)) { // CODE FOR PART 1 HERE // We set new_label to the value of old_label, but you will need // to adjust this for correctness. - new_label = old_label; + // grab the smallest label among the current and surrounding pixels + new_label = min(old_label, + min(buffer[buf_y*buf_w+buf_x+1], + min(buffer[buf_y*buf_w+buf_x-1], + min(buffer[(buf_y-1)*buf_w+buf_x], buffer[(buf_y+1)*buf_w+buf_x])))); if (new_label != old_label) { // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this.
+ // merging old and new parent with same new_label + atomic_min(&labels[old_label], new_label); *(changed_flag) += 1; - labels[y * w + x] = new_label; + atomic_min(&labels[y * w + x], new_label); } } } diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py index c6ce60cb..276179fd 100644 --- a/HW3/P5/label_regions.py +++ b/HW3/P5/label_regions.py @@ -2,7 +2,7 @@ import sys import pyopencl as cl import numpy as np -import pylab +from matplotlib import pyplot def round_up(global_size, group_size): r = global_size % group_size @@ -75,9 +75,9 @@ def round_up(global_size, group_size): # Show the initial labels cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + pyplot.imshow(host_labels) + pyplot.title(itercount) + pyplot.show() show_progress = True total_time = 0 @@ -105,9 +105,9 @@ def round_up(global_size, group_size): print host_done_flag if itercount % 100 == 0 and show_progress: cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + pyplot.imshow(host_labels) + pyplot.title(itercount) + pyplot.show() if itercount % 10000 == 0: print 'Reached maximal number of iterations, aborting' sys.exit(0) @@ -116,6 +116,6 @@ def round_up(global_size, group_size): # Show final result cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) print 'Found {} regions'.format(len(np.unique(host_labels)) - 1) - pylab.imshow(host_labels) - pylab.title(itercount) - pylab.show() + pyplot.imshow(host_labels) + pyplot.title(itercount) + pyplot.show() From 8a0e32e7fa76f86aca0ce8642f943f0392eff053 Mon Sep 17 00:00:00 2001 From: Rohit Ahuja Date: Fri, 20 Nov 2015 15:36:12 -0500 Subject: [PATCH 5/5] Changed P3.txt --- HW3/P3/P3.txt | 216 ++++++++++++++++++++------------------------------ 1 file changed, 88 insertions(+), 128 deletions(-) diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt index 17610e78..8d11671f 100644 --- a/HW3/P3/P3.txt +++ b/HW3/P3/P3.txt @@ -1,129 +1,89 @@ -For coalesced reads I got the following data: It seems that for coalesced reads 64 workgroups at 128 workers gives the best time of .0024 seconds +The best configuration was coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds - - -5000153.3125 -coalesced reads, workgroups: 8, num_workers: 4, 0.16036008 seconds - -5000160.3125 -coalesced reads, workgroups: 8, num_workers: 8, 0.08918312 seconds - -5000151.75 -coalesced reads, workgroups: 8, num_workers: 16, 0.05170184 seconds - -5000152.5 -coalesced reads, workgroups: 8, num_workers: 32, 0.02803776 seconds - -5000153.875 -coalesced reads, workgroups: 8, num_workers: 64, 0.0140196 seconds - -5000153.375 -coalesced reads, workgroups: 8, num_workers: 128, 0.00762976 seconds - -5000160.34375 -coalesced reads, workgroups: 16, num_workers: 4, 0.09207176 seconds - -5000151.71875 -coalesced reads, workgroups: 16, num_workers: 8, 0.05598272 seconds - -5000152.65625 -coalesced reads, workgroups: 16, num_workers: 16, 0.0280048 seconds - -5000153.9375 -coalesced reads, workgroups: 16, num_workers: 32, 0.01415656 seconds - -5000153.28125 -coalesced reads, workgroups: 16, num_workers: 64, 0.00719744 seconds - -5000153.28125 -coalesced reads, workgroups: 16, num_workers: 128, 0.00399504 seconds - -5000151.73438 -coalesced reads, workgroups: 32, num_workers: 4, 0.05799328 seconds - -5000152.65625 -coalesced reads, workgroups: 32, num_workers: 8, 0.02760288 seconds - -5000154.0 -coalesced reads, workgroups: 32, 
num_workers: 16, 0.01413456 seconds - -5000153.34375 -coalesced reads, workgroups: 32, num_workers: 32, 0.00713032 seconds - -5000153.34375 -coalesced reads, workgroups: 32, num_workers: 64, 0.00372464 seconds - -5000153.26562 -coalesced reads, workgroups: 32, num_workers: 128, 0.0026336 seconds - -5000152.61719 -coalesced reads, workgroups: 64, num_workers: 4, 0.02696808 seconds - -5000154.01562 -coalesced reads, workgroups: 64, num_workers: 8, 0.01386544 seconds - -5000153.32031 -coalesced reads, workgroups: 64, num_workers: 16, 0.007178 seconds - -5000153.375 -coalesced reads, workgroups: 64, num_workers: 32, 0.00390184 seconds - -5000153.33594 -coalesced reads, workgroups: 64, num_workers: 64, 0.00275184 seconds - -5000153.36719 -coalesced reads, workgroups: 64, num_workers: 128, 0.00248552 seconds - -5000154.01562 -coalesced reads, workgroups: 128, num_workers: 4, 0.02766432 seconds - -5000153.29297 -coalesced reads, workgroups: 128, num_workers: 8, 0.01399096 seconds - -5000153.39844 -coalesced reads, workgroups: 128, num_workers: 16, 0.00782096 seconds - -5000153.30469 -coalesced reads, workgroups: 128, num_workers: 32, 0.00377976 seconds - -5000153.33984 -coalesced reads, workgroups: 128, num_workers: 64, 0.00254232 seconds - -5000153.30859 -coalesced reads, workgroups: 128, num_workers: 128, 0.00252368 seconds - -5000153.31445 -coalesced reads, workgroups: 256, num_workers: 4, 0.02999856 seconds - -5000153.38086 -coalesced reads, workgroups: 256, num_workers: 8, 0.0142652 seconds - -5000153.29102 -coalesced reads, workgroups: 256, num_workers: 16, 0.0075244 seconds - -5000153.35352 -coalesced reads, workgroups: 256, num_workers: 32, 0.00397056 seconds - -5000153.36523 -coalesced reads, workgroups: 256, num_workers: 64, 0.00257784 seconds - -5000153.31641 -coalesced reads, workgroups: 256, num_workers: 128, 0.00277344 seconds - -5000153.37109 -coalesced reads, workgroups: 512, num_workers: 4, 0.02859184 seconds - -5000153.30176 -coalesced reads, workgroups: 512, num_workers: 8, 0.014304 seconds - -5000153.36328 -coalesced reads, workgroups: 512, num_workers: 16, 0.00809184 seconds - -5000153.35547 -coalesced reads, workgroups: 512, num_workers: 32, 0.00424744 seconds - -5000153.32617 -coalesced reads, workgroups: 512, num_workers: 64, 0.00255136 seconds - -5000153.3457 -coalesced reads, workgroups: 512, num_workers: 128, 0.0030888 seconds +#0: Intel(R) Core(TM) i7-4650U CPU @ 1.70GHz on Apple +#1: HD Graphics 5000 on Apple +coalesced reads, workgroups: 8, num_workers: 4, 0.173174 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.09775128 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.05811304 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.02927688 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01506872 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00775552 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.09545584 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.05122536 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.03318448 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.01706696 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00832944 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00449384 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.06161328 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.02915712 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.01422552 seconds +coalesced reads, workgroups: 32, 
num_workers: 32, 0.00753888 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.00383592 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00296608 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02890824 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.0142484 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.0075216 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00401608 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00291696 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.02929296 seconds +coalesced reads, workgroups: 128, num_workers: 8, 0.01506728 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00750416 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.00392192 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00282296 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00283168 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.02977544 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.01534176 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00795784 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.00426864 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.0028708 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.00280752 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.02997592 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.01532384 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00771888 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00434744 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00287568 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.00275472 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.18406568 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.11625232 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.08278504 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.04329048 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.0209084 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.01256128 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.11378736 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.07066096 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.04468824 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.02061624 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.01264984 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.03288288 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.06355624 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.03810152 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.02057768 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.01233816 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.03313528 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.08156096 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.0351416 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.01899656 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01270664 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.0335048 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.08059792 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.08670848 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.0413724 seconds 
+blocked reads, workgroups: 128, num_workers: 8, 0.02386552 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01572904 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.04104112 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.08430344 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.06656264 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.03564384 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.02051488 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.01278368 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.03704784 seconds +blocked reads, workgroups: 256, num_workers: 64, 0.07087392 seconds +blocked reads, workgroups: 256, num_workers: 128, 0.04985056 seconds +blocked reads, workgroups: 512, num_workers: 4, 0.0354248 seconds +blocked reads, workgroups: 512, num_workers: 8, 0.02118616 seconds +blocked reads, workgroups: 512, num_workers: 16, 0.01358552 seconds +blocked reads, workgroups: 512, num_workers: 32, 0.03686744 seconds +blocked reads, workgroups: 512, num_workers: 64, 0.04923632 seconds +blocked reads, workgroups: 512, num_workers: 128, 0.03931944 seconds +configuration ('coalesced', 64, 128): 0.00275208 seconds \ No newline at end of file
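As a footnote to the timings above, here is a small self-contained C sketch of the address patterns behind the coalesced/blocked comparison, using the winning 64 x 128 configuration (global size 8192). The input size N below is an assumed value for illustration only (it is not taken from tune.py); the index arithmetic mirrors sum_coalesced and sum_blocked in HW3/P3/sum.cl.

#include <stdio.h>

int main(void)
{
    const int global_size = 64 * 128;                  /* workgroups * workers  */
    const int N = 10000000;                            /* assumed input size    */
    const int k = (N + global_size - 1) / global_size; /* ceil(N / global_size) */

    for (int i = 0; i < 3; i++) {
        /* coalesced: stride of global_size, so on each pass neighbouring
         * work-items read adjacent floats and their loads can be combined */
        printf("coalesced work-item %d reads x[%d], x[%d], x[%d], ...\n",
               i, i, i + global_size, i + 2 * global_size);
        /* blocked: each work-item owns a contiguous chunk, so at any instant
         * neighbouring work-items are k elements apart and their loads cannot
         * be combined into a single transaction */
        printf("blocked   work-item %d reads x[%d] .. x[%d]\n",
               i, i * k, (i + 1) * k - 1);
    }
    return 0;
}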