Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hw3 #428

Open
wants to merge 11 commits into
base: HW3
Choose a base branch
from
Open

Hw3 #428

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag;
float z_real, z_imag, z_real_temp;
int iter;

c_real = coords_real[x + y * w];
c_imag = coords_imag[x + y * w];
z_real = 0;
z_imag = 0;


if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
// YOUR CODE HERE
for(iter = 0; iter < max_iter; iter++)
{
if((z_real * z_real + z_imag * z_imag) > 4)
break;
z_real_temp = ((z_real * z_real) - (z_imag * z_imag)) + c_real;
z_imag = (2 * z_real * z_imag) + c_imag;
z_real = z_real_temp;
}
out_counts[x + y * w] = iter;
}
}
}
94 changes: 94 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Solutions for HW3 - P3

Hardware -
#1: Intel(R) Iris(TM) Graphics 6100 on Apple

Best configuration: ('coalesced', 512, 64), i.e. coalesced reads with 512 workgroups and 64 workers per group, at 0.00286168 seconds (the most frequent winner across a few runs)

Raw Output:

coalesced reads, workgroups: 8, num_workers: 4, 0.19061496 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.09478136 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.05535224 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.03110032 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.02122016 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00785568 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.09534232 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.04909168 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.02945256 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.02130744 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00840584 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.00625968 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.05298352 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.02782392 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.0139668 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.011858 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00589408 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00364424 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02987728 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01375704 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.0074668 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00482704 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00407016 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00323848 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.03022608 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.0170572 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.01060368 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00647328 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00367552 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00308912 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.025532 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.00918576 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00548232 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00522008 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00321048 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00406888 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.02146176 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.01369368 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.0064036 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00533056 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00286168 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.0032056 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.15275448 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.08203064 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.05936312 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.03968904 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01966736 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.01259184 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.08024176 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.04805832 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.03266928 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.0196948 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.01304792 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.00887808 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.0454124 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.02621024 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.01601688 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.0123308 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.00911488 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.00661224 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.0253416 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01482472 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.01003168 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.00830936 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.00668752 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.01031256 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02511456 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01601144 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.01162648 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.00754224 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.00873912 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.01093456 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.02031408 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.0130572 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00682696 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.00649288 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.00751592 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.0089632 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.01979328 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.0130584 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.00902016 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.00585392 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.00664848 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.00892288 seconds

37 changes: 27 additions & 10 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t global_id = get_global_id(0);
uint gs = get_local_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint c = global_id; c < N; c = c + get_global_size(0)) {
if( global_id < N ){

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure whether (or why) this check is necessary.
The loop starts with c = global_id and only runs while c < N, so a thread with global_id >= N never enters the loop body in the first place.

sum += x[c];
}
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -38,7 +45,10 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
size_t global_id = get_global_id(0);
int k = ceil((float)N / get_global_size(0));
uint gs = get_local_size(0);


// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,12 +58,15 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint c = k*global_id; c < k*global_id + k; c = c + 1) {
if( c < N ){
sum += x[c];
}
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);


// binary reduction
//
Expand All @@ -64,9 +77,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for(uint s = gs/2; s > 0; s >>= 1) {
if(local_id < s) {
fast[local_id] += fast[local_id+s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];

}
6 changes: 3 additions & 3 deletions HW3/P3/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand All @@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand All @@ -54,7 +54,7 @@ def create_data(N):
assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['blocked', num_workgroups, num_workers] = seconds
print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
format(num_workgroups, num_workers, seconds))
format(num_workgroups, num_workers, seconds))

best_time = min(times.values())
best_configuration = [config for config in times if times[config] == best_time]
Expand Down
57 changes: 56 additions & 1 deletion HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
#include "median9.h"

// Clamped pixel fetch: returns in_values[y * w + x] after pinning (x, y)
// to the nearest in-bounds coordinate.
//
//   in_values  pixel data, indexed as y * w + x
//   w, h       width and height used for the bounds checks
//   x, y       requested coordinates; may lie outside [0, w) x [0, h)
float FETCH(__global __read_only float *in_values, int w, int h, int x, int y)
{
    // Raise negatives to 0 first, then cap at the last column / row --
    // the same order as two sequential range checks per axis.
    const int col = min(max(x, 0), w - 1);
    const int row = min(max(y, 0), h - 1);

    return in_values[row * w + col];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
Expand All @@ -12,7 +26,26 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Load into buffer (with 1-pixel halo).
//
Expand All @@ -21,14 +54,36 @@ median_3x3(__global __read_only float *in_values,
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

int row;

if (idx_1D < buf_w)
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
FETCH(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}

barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

if ((y < h) && (x < w)) // stay in bounds
{
buffer[buf_y * buf_w + buf_x] = median9( buffer[(buf_y - 1) * buf_w + buf_x - 1], buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y - 1) * buf_w + buf_x + 1],

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You write into a location in the local buffer that other threads may still be reading at the same time, which is a data race.
You should either avoid the write or place a barrier between the reads and the write.
In this case you could simply have written the new value directly to the output array.

buffer[buf_y * buf_w + buf_x - 1], buffer[buf_y * buf_w + buf_x], buffer[buf_y * buf_w + buf_x + 1],
buffer[(buf_y + 1) * buf_w + buf_x - 1], buffer[(buf_y + 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + buf_x + 1] );
}


barrier(CLK_LOCAL_MEM_FENCE);

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
if ((y < h) && (x < w)) // stay in bounds
out_values[y * w + x] = \
buffer[buf_y * buf_w + buf_x];
}
9 changes: 7 additions & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down Expand Up @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('median_filter.cl').read()).build(options='')
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down Expand Up @@ -88,3 +89,7 @@ def numpy_median(image, iterations=10):
cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True)

assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters))

pylab.imshow(host_image_filtered)

pylab.show()
Loading