Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hw3 #417

Open
wants to merge 5 commits into
base: HW3
Choose a base branch
from
Open

Hw3 #417

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,21 @@ mandelbrot(__global __read_only float *coords_real,
int iter;

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
c_real = coords_real[x + y * w];
c_imag = coords_imag[x + y * w];
z_real = 0;
z_imag = 0;

for (iter = 0; iter < max_iter; ++iter) {
if (z_real * z_real + z_imag * z_imag > 4.0) {
break;
}
float temp = z_real * z_real - z_imag * z_imag + c_real;

z_imag = (z_real * z_imag * 2) + c_imag;
z_real = temp;
}

out_counts[x + y * w] = iter;
}
}
8 changes: 8 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Hardware

Intel(R) Core(TM) i7-3537 CPU @2.00GHz


Performance:

configuration ('coalesced', 128, 128): 0.00205134 seconds
26 changes: 17 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
unsigned int global_size = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0); i < N; i += global_size) {
sum += x[i];
}

fast[local_id] = sum;
Expand All @@ -24,8 +25,11 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
if (local_id < i) {
fast[local_id] += fast[local_id + i]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing ';' at the end of the line.

}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -38,7 +42,7 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,8 +52,9 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k &&
i < N; ++i) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice use of the loop short-cutting

sum += x[i];
}

fast[local_id] = sum;
Expand All @@ -64,8 +69,11 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_local_size(0) / 2; i > 0; i /= 2) {
if (local_id < i) {
fast[local_id] += fast[local_id + i]
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand All @@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
Expand Down
64 changes: 63 additions & 1 deletion HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,29 @@
#include "median9.h"

// Read the pixel at (x, y) from in_values, which is indexed as x + y * w.
// Coordinates outside the w-by-h image are replaced by the nearest valid
// coordinate along each axis before the read.
//
//   in_values  pixel data, indexed as in_values[x + y * w]
//   w, h       image width and height in pixels
//   x, y       requested column and row; may lie outside the image
//
// Returns the value stored at the clamped coordinate.
float
find_closest(__global __read_only float *in_values,
             int w, int h,
             int x, int y)
{
    // Each axis is clamped independently: negative values snap to 0,
    // values at or past the extent snap to the last valid index.
    const int col = (x < 0) ? 0 : ((x >= w) ? (w - 1) : x);
    const int row = (y < 0) ? 0 : ((y >= h) ? (h - 1) : y);

    return in_values[col + row * w];
}


// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
Expand All @@ -13,6 +37,21 @@ median_3x3(__global __read_only float *in_values,
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

// constant coordinates to remember translations between local and global
const int x = get_global_id(0);
const int y = get_global_id(1);

// absolute position in local buffer
const int x_rel = get_local_id(0);
const int y_rel = get_local_id(1);

// remember offset coordinate for local buffer
const int x_buf_corner = x - x_rel - halo;
const int y_buf_corner = y - y_rel - halo;

// local coordinate of our pixel
const int x_buf = x_rel + halo;
const int y_buf = y_rel + halo;

// Load into buffer (with 1-pixel halo).
//
Expand All @@ -22,13 +61,36 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

const int t_idx = x_rel + y_rel * get_local_size(0);

if (t_idx < buf_w) {
for (int i = 0; i < buf_h; ++i) {
buffer[i * buf_w + t_idx] = find_closest(in_values, w, h,
x_buf_corner + t_idx,
y_buf_corner + i);
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.


// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

// only calculate and write back if in valid region
if (x < w && y < h) {
out_values[x + y * w] = median9(buffer[(y_buf - 1) * buf_w + x_buf - 1],
buffer[(y_buf - 1) * buf_w + x_buf],
buffer[(y_buf - 1) * buf_w + x_buf + 1],
buffer[y_buf * buf_w + x_buf - 1],
buffer[y_buf * buf_w + x_buf],
buffer[y_buf * buf_w + x_buf + 1],
buffer[(y_buf + 1) * buf_w + x_buf - 1],
buffer[(y_buf + 1) * buf_w + x_buf],
buffer[(y_buf + 1) * buf_w + x_buf + 1]);
}
}
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down Expand Up @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('median_filter.cl').read()).build(options='')
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
Expand Down
59 changes: 59 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
Part 1: implement updates from neighbors

Maze 1
Finished after 894 iterations, 197.5328 ms total, 0.222069508197 ms per iteration
Found 2 regions

Maze 2
Finished after 528 iterations, 116.40336 ms total, 0.220460909091 ms per iteration
Found 35 regions

===============================================================================

Part 2: fetch grandparents

Maze 1
Finished after 529 iterations, 116.8076 ms total, 0.220808317580 ms per iteration

Maze 2
Finished after 274 iterations, 62.08362 ms total, 0.226582554745 ms per iteration

===============================================================================

Part 3: merge parent regions

Maze 1
Finished after 10 iterations, 2.4026 ms total, 0.24026 ms per iteration

Maze 2
Finished after 9 iterations, 2.17255 ms total, 0.241394444444 ms per iteration

===============================================================================

Part 4: efficient grandparents

Maze 1
Finished after 10 iterations, 4.59674 ms total, 0.459674 ms per iteration

Maze 2
Finished after 9 iterations, 4.1608 ms total, 0.462311111111 ms per iteration

It is quite apparent that using a single thread to check the labels in its
workgroup caused performance to worsen. I think this comes down to a trade-off
between the global-memory access overhead we are trying to avoid and the
speedup we gain from parallelism. Here, we can see that using only one
thread, i.e. serializing this part of the algorithm, takes away much of the
benefit we gained from the parallel approach, even though it decreased the
number of accesses to global memory.

===============================================================================

Part 5: no atomic operations

Without atomic operations, we introduce the possibility of a race condition
in which our old_label is updated redundantly. The output of the algorithm
is still correct, but the number of iterations can increase. This is
dangerous because it happens essentially nondeterministically, so in the
(very unlikely) worst case it could cause our algorithm to perform relatively
slowly, simply because more work (with no progress) is being done.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There will probably be some progress, but indeed might be slower.

41 changes: 38 additions & 3 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -81,19 +81,54 @@ propagate_labels(__global __read_write int *labels,

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Part 2
//
// if (old_label < w * h) {
// buffer[buf_y * buf_w + buf_x] = labels[old_label];
// }

// Part 4

// Update workgroup labels
if (lx == 0 && ly == 0) {
int max_iter = buf_w * buf_h;
int temp, last;
int prev = -1;
for (int i = 0; i < max_iter; ++i) {
temp = buffer[i];
if (temp < w * h) {
// if current label is not the same as previous, reset
if (prev != temp) {
prev = temp;
last = labels[prev];
}
buffer[i] = last;
}
}
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing barrier after this part.


// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.

// min over all possible values
new_label = old_label;
if (new_label < w * h) {
int row_min = min(buffer[buf_y * buf_w + buf_x - 1],
buffer[buf_y * buf_w + buf_x + 1]);
int col_min = min(buffer[buf_x + (buf_y - 1) * buf_w],
buffer[buf_x + (buf_y + 1) * buf_w]);
new_label = min(row_min, col_min);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should also compare to buffer[buf_y * buf_w + buf_x]

}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;
// labels[y * w + x] = new_label;
atomic_min(&labels[old_label], new_label);
atomic_min(&labels[x + y * w], new_label);
}
}
}