Hw3 #423

Open
wants to merge 7 commits into base: HW3
2 changes: 2 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,2 @@
The fastest configuration was the coalesced setup with 512 workgroups and 128 workers per workgroup.
This configuration completed the task in 0.00309848 seconds.
38 changes: 29 additions & 9 deletions HW3/P3/sum.cl
@@ -8,8 +8,12 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int thread_id = get_global_id(0);
unsigned int step_size = get_global_size(0);

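// Adjacent threads read adjacent elements on each pass and then all jump ahead
// by get_global_size(0), so the global reads are coalesced.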
for (unsigned int i = thread_id; i < N; i += step_size) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
@@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int local_size = get_local_size(0);

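// Binary-tree reduction in local memory: each step, the active half of the
// work-group adds the upper half's partials into the lower half, halving j
// until fast[0] holds the work-group total.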
for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
if( local_id < j) {
fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +48,8 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));

int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +59,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int thread_id = get_global_id(0);

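// Each thread sums its own contiguous block of k elements, so neighbouring
// threads read addresses k apart -- these accesses are not coalesced.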
for (unsigned int i = k * thread_id; i < k * (thread_id + 1) && i < N; i++) { // YOUR CODE HERE
sum += x[i]; // YOUR CODE HERE
}

fast[local_id] = sum;
@@ -64,8 +78,14 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

unsigned int local_size = get_local_size(0);

for (unsigned int j = local_size/2; j > 0; j >>= 1) { // YOUR CODE HERE
if( local_id < j) {
fast[local_id] += fast[local_id + j]; // YOUR CODE HERE
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
6 changes: 4 additions & 2 deletions HW3/P3/tune.py
@@ -1,10 +1,12 @@
import pyopencl as cl
import numpy as np
import os

def create_data(N):
return host_x, x

if __name__ == "__main__":
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
N = 1e7

platforms = cl.get_platforms()
@@ -23,7 +25,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +42,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
65 changes: 65 additions & 0 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,16 @@
#include "median9.h"

// From HW3 P5
float
get_clamped_value(__global __read_only float *labels,
int w, int h,
int x, int y)
{
int c_x = min(w-1, max(0, x)), c_y = min(h-1, max(0, y));
return labels[c_y * w + c_x];
}


// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -22,13 +33,67 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Based on HW3 Problem 5

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Load the relevant labels to a local buffer with a halo
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] =
get_clamped_value(in_values,
w, h,
buf_corner_x + idx_1D, buf_corner_y + row);
}
}

// Make sure all threads reach the next part after
// the local buffer is loaded
barrier(CLK_LOCAL_MEM_FENCE);


// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

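// Precompute the local-buffer indices of this pixel's 3x3 neighbourhood
// (dy[i] selects the row offset, dx[j] the column offset; idxArr is row-major).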
const int dx[3] = {-1, 0, 1}, dy[3] = {-1, 0, 1};
int idxArr[9];

for( int i=0; i<3; i++ ) {
for ( int j=0; j<3; j++ ) {
idxArr[i*3+j] = (buf_y + dy[i])*buf_w + (buf_x + dx[j]);
}
}

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

//From HW3 P5
// stay in bounds
if ((x < w) && (y < h)) {
out_values[y*w + x] =
median9( buffer[ idxArr[0] ], buffer[ idxArr[1] ], buffer[ idxArr[2] ],
buffer[ idxArr[3] ], buffer[ idxArr[4] ], buffer[ idxArr[5] ],
buffer[ idxArr[6] ], buffer[ idxArr[7] ], buffer[ idxArr[8] ] );
}

}
61 changes: 61 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,61 @@
Part 1:
========================
Finished after 911 iterations, 214.33168 ms total, 0.235270779363 ms per iteration
Found 2 regions

Finished after 531 iterations, 124.05312 ms total, 0.233621694915 ms per iteration
Found 35 regions

Part 2:
========================
Finished after 529 iterations, 133.35928 ms total, 0.252096937618 ms per iteration
Found 2 regions

Finished after 269 iterations, 68.04856 ms total, 0.252968624535 ms per iteration
Found 35 regions

Part 3:
========================
Finished after 8 iterations, 2.59184 ms total, 0.32398 ms per iteration
Found 2 regions

Finished after 8 iterations, 2.4332 ms total, 0.30415 ms per iteration
Found 35 regions

Part 4:
========================
Finished after 10 iterations, 7.37784 ms total, 0.737784 ms per iteration
Found 2 regions

Finished after 9 iterations, 6.66816 ms total, 0.740906666667 ms per iteration
Found 35 regions

Using a single thread for caching seems to have made the running time much slower (judging by the time per iteration).
Computation is serialized during this caching step, so there is a large upfront cost that has to be weighed against
the ongoing cost of expensive, potentially serialized, main-memory accesses.

I suspect the worst-case memory access scenario, where neighbouring nodes all try to fetch a single cached value from
main memory, only happens quite late in the process. Earlier iterations probably see a greater diversity of memory
accesses, since a wider range of label values still remains. It follows that with much more complex mazes there could
be scenarios where the overall iteration count is high but parts of the maze have already "stabilized" early on, so
those parts repeatedly draw on the same cached values. There do not seem to be enough iterations in these examples for
that to be worth the upfront cost (at least on my hardware setup). Other setups with different computation and
memory-access speeds might weigh the trade-off differently and show different results.

Part 5:
========================
If instead of atomic_min() we use min(), then the update step is not done in a single transaction, meaning between
the min() check and the subsequent update, a different thread could update the reference value. Suppose this
different thread actually updated the reference to an even lower value than what we had intended. Now, if we go ahead
with our update, we are actually *increasing* the reference value. It could therefore also lead to an increase in
this value between iterations.

So while dropping atomic_min() would make each iteration faster, since memory accesses are no longer serialized, it
can also make the run slower overall because more iterations have to be performed. Empirical testing is needed to
determine which trade-off wins; my sense is that for simple, low-iteration mazes atomic_min() presents a significant
overhead, so it might be better to accept a few extra iterations, while the opposite is likely true for high-iteration
mazes.

This makes things less efficient, since the cached labels are potentially being updated with worse values. However,
because the stopping condition is ultimately whether any more updates were performed, it won't affect the correctness
of the algorithm.
45 changes: 44 additions & 1 deletion HW3/P5/label_regions.cl
@@ -35,6 +35,8 @@ propagate_labels(__global __read_write int *labels,
int buf_w, int buf_h,
const int halo)
{

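// Offsets of the four connected neighbours: left, up, right, down.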
const int dx[4] = {-1, 0, 1, 0}, dy[4] = {0, -1, 0, 1};
// halo is the additional number of cells in one direction

// Global position of output pixel
@@ -80,20 +82,61 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Part 2
/*
for( int i=0; i<4; i++ ) {
if( buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] < w * h ) {
buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] = labels[ buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] ];
}
}
*/

// Part 4
// Reference: Piazza @524
unsigned int ls0 = get_local_size(0), ls1 = get_local_size(1);

if( lx == 0 && ly == 0 ) { //Use the first thread
unsigned int prev = -1, gparent = -1; // one-entry cache: -1 wraps to UINT_MAX, so it starts empty
for( int c_lx = 0; c_lx < ls0; c_lx++ ) { // Update the entire local buffer
for( int c_ly = 0; c_ly < ls1; c_ly++ ) {
unsigned int cur_idx = (c_ly + halo) * buf_w + (c_lx + halo);
unsigned int parent = buffer[cur_idx];

if( parent == w * h ) continue; // Background pixel

if( parent == prev ) { // 1 variable cache success!
buffer[cur_idx] = gparent;
}
else { // Update the cache
buffer[cur_idx] = labels[parent];
prev = parent;
gparent = buffer[cur_idx];
}
}
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;
if( new_label != w * h ) { // See Piazza @486
for( int i=0; i<4; i++ ) {
new_label = min( new_label, buffer[(buf_y+dy[i])*buf_w + (buf_x + dx[i])] );
}
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;
atomic_min(&labels[old_label], new_label);
}
}
}
2 changes: 1 addition & 1 deletion HW3/P5/label_regions.py
@@ -42,7 +42,7 @@ def round_up(global_size, group_size):

program = cl.Program(context, open('label_regions.cl').read()).build(options='')

host_image = np.load('maze1.npy')
host_image = np.load('maze2.npy')
host_labels = np.empty_like(host_image)
host_done_flag = np.zeros(1).astype(np.int32)
