HW3 #420

Open · wants to merge 5 commits into base: HW3

17 changes: 17 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,17 @@
Using a VM on Windows set up for CS205:

Best configuration: ('blocked', 16, 4), 0.004673046 seconds


The platforms detected are:
---------------------------
AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8)
The devices detected on platform AMD Accelerated Parallel Processing are:
---------------------------
Intel(R) Core(TM) i7-3632QM CPU @ 2.20GHz [Type: CPU ]
Maximum clock Frequency: 2195 MHz
Maximum allocable memory size: 1049 MB
Maximum work group size 1024
---------------------------
This context is associated with 1 devices
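The device report above is the output of a platform-enumeration script that is not included in this PR. For reference, a minimal PyOpenCL sketch of the kind of query that produces it (the loop structure and formatting here are assumptions):

    import pyopencl as cl

    for platform in cl.get_platforms():
        print('The platforms detected are:')
        print('---------------------------')
        print('%s %s version: %s' % (platform.name, platform.vendor, platform.version))
        print('The devices detected on platform %s are:' % platform.name)
        for device in platform.get_devices():
            print('---------------------------')
            print('%s [Type: %s ]' % (device.name, cl.device_type.to_string(device.type)))
            print('Maximum clock Frequency: %d MHz' % device.max_clock_frequency)
            print('Maximum allocable memory size: %d MB' % (device.max_mem_alloc_size // 10**6))
            print('Maximum work group size %d' % device.max_work_group_size)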
38 changes: 29 additions & 9 deletions HW3/P3/sum.cl
@@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t local_size = get_local_size(0);
+    int n;

    // thread i (i.e., with i = get_global_id()) should add x[i],
    // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (n = 0; global_id + n * global_size < N; n++) { // YOUR CODE HERE
+        sum += x[global_id + n * global_size]; // YOUR CODE HERE
    }

    fast[local_id] = sum;
@@ -24,8 +28,13 @@ __kernel void sum_coalesced(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    for (uint j = local_size / 2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if (local_id < j)
+        {
+            fast[local_id] += fast[local_id+j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +47,11 @@ __kernel void sum_blocked(__global float* x,
{
    float sum = 0;
    size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
+    size_t global_id = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t local_size = get_local_size(0);
+    int n;

    // thread with global_id 0 should add 0..k-1
    // thread with global_id 1 should add k..2k-1
@@ -48,8 +61,11 @@ __kernel void sum_blocked(__global float* x,
    //
    // Be careful that each thread stays in bounds, both relative to
    // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (n = k*global_id; n < (global_id + 1) * k; n++) { // YOUR CODE HERE
+        if (n < N)
+        {
+            sum += x[n];
+        }
    }

    fast[local_id] = sum;
@@ -64,8 +80,12 @@ __kernel void sum_blocked(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint j = local_size / 2; j > 0; j >>= 1) { // YOUR CODE HERE
+        if (local_id < j)
+        {
+            fast[local_id] += fast[local_id+j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
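Both kernels above share the same two-phase shape: each thread first serially accumulates a private sum (striding by get_global_size(0) in the coalesced version, walking a contiguous block of k elements in the blocked version), then each work-group collapses its private sums with a binary tree in local memory. A NumPy sketch of that logic on the host, with a hypothetical helper device_sum and toy sizes, not the device code itself:

    import numpy as np

    def device_sum(x, global_size, local_size, blocked):
        # Phase 1: each of global_size "threads" accumulates a private sum.
        N = len(x)
        sums = np.zeros(global_size, dtype=np.float32)
        k = int(np.ceil(float(N) / global_size))
        for gid in range(global_size):
            if blocked:
                # sum_blocked: thread gid takes the contiguous run [k*gid, k*(gid+1));
                # numpy clips the slice at N, which the kernel's if (n < N) check does
                sums[gid] = x[k * gid:k * (gid + 1)].sum()
            else:
                # sum_coalesced: thread gid takes x[gid], x[gid + global_size], ...
                sums[gid] = x[gid::global_size].sum()
        # Phase 2: binary-tree reduction within each work-group, halving the
        # number of active threads each step, exactly like the j >>= 1 loops above.
        partial = []
        for g in range(global_size // local_size):
            fast = sums[g * local_size:(g + 1) * local_size].copy()
            j = local_size // 2
            while j > 0:
                fast[:j] += fast[j:2 * j]
                j //= 2
            partial.append(fast[0])   # thread 0 writes the group's partial sum
        return np.float32(np.sum(partial))

    x = np.arange(10000, dtype=np.float32)
    print('%s %s %s' % (device_sum(x, 64, 16, False), device_sum(x, 64, 16, True), x.sum()))

The coalesced pattern is usually faster on GPUs, where adjacent threads touch adjacent addresses on each step; the P3 results above were measured on a CPU device, where the blocked pattern won.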
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
-    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
               format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
-    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
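The buffer-size fix is byte arithmetic: each work-group writes exactly one float32 partial sum, so 4 * num_workgroups bytes suffice and the extra trailing 4 bytes were never used. A self-contained check of that arithmetic (the final reduction of host_partial is presumably a single sum on the host):

    import numpy as np

    num_workgroups = 512
    host_partial = np.empty(num_workgroups, dtype=np.float32)
    # one 4-byte partial sum per work-group; no extra trailing slot is needed
    assert host_partial.nbytes == 4 * num_workgroups
    print('%d bytes for %d work-groups' % (host_partial.nbytes, num_workgroups))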
72 changes: 72 additions & 0 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,27 @@
#include "median9.h"

// clamp (x, y) to the nearest in-bounds pixel and return its value
float get_in(__global __read_only float *in_values, int w, int h, int x, int y)
{
    if (x < 0)
    {
        x = 0;
    }
    else if (x > w - 1)
    {
        x = w - 1;
    }
    if (y < 0)
    {
        y = 0;
    }
    else if (y > h - 1)
    {
        y = h - 1;
    }
    return in_values[y * w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -22,6 +44,43 @@ median_3x3(__global __read_only float *in_values,
    // Note that globally out-of-bounds pixels should be replaced
    // with the nearest valid pixel's value.

    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Local position relative to (0, 0) in workgroup
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // coordinates of the upper left corner of the buffer in image
    // space, including halo
    const int buf_corner_x = x - lx - halo;
    const int buf_corner_y = y - ly - halo;

    // coordinates of our pixel in the local buffer
    const int buf_x = lx + halo;
    const int buf_y = ly + halo;

    // 1D index of thread within our work-group
    const int idx_1D = ly * get_local_size(0) + lx;

    int row;

    // the first buf_w threads each load one column of the buffer,
    // clamping out-of-bounds reads to the nearest valid pixel
    if (idx_1D < buf_w)
    {
        for (row = 0; row < buf_h; row++)
        {
            buffer[row * buf_w + idx_1D] = get_in(in_values, w, h,
                    buf_corner_x + idx_1D, buf_corner_y + row);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Processing code here...
    //
    // Should only use buffer, buf_x, buf_y.

    // Compute 3x3 median for each pixel in core (non-halo) pixels
    //
@@ -31,4 +90,17 @@ median_3x3(__global __read_only float *in_values,

    // Each thread in the valid region (x < w, y < h) should write
    // back its 3x3 neighborhood median.
    if (x < w && y < h)
    {
        out_values[y * w + x] = median9(buffer[(buf_y - 1) * buf_w + buf_x - 1],
                                        buffer[(buf_y - 1) * buf_w + buf_x],
                                        buffer[(buf_y - 1) * buf_w + buf_x + 1],
                                        buffer[(buf_y) * buf_w + buf_x - 1],
                                        buffer[(buf_y) * buf_w + buf_x],
                                        buffer[(buf_y) * buf_w + buf_x + 1],
                                        buffer[(buf_y + 1) * buf_w + buf_x - 1],
                                        buffer[(buf_y + 1) * buf_w + buf_x],
                                        buffer[(buf_y + 1) * buf_w + buf_x + 1]);
    }
}
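The kernel's clamp-to-edge 3x3 median is easy to model on the host for checking small images. A NumPy sketch (the helper name median_3x3_host and the random test image are made up here):

    import numpy as np

    def median_3x3_host(img):
        # reference implementation of the kernel: clamp-to-edge 3x3 median
        h, w = img.shape
        out = np.empty_like(img)
        for y in range(h):
            for x in range(w):
                # clamp neighbor coordinates to the image, mirroring get_in()
                ys = np.clip([y - 1, y, y + 1], 0, h - 1)
                xs = np.clip([x - 1, x, x + 1], 0, w - 1)
                out[y, x] = np.median(img[np.ix_(ys, xs)])
        return out

    img = np.random.rand(8, 8).astype(np.float32)
    print(median_3x3_host(img))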
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
    r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

-program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+curdir = os.path.dirname(os.path.realpath(__file__))
+program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
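Passing options=['-I', curdir] to build() adds the script's own directory to the kernel compiler's include path, so the #include "median9.h" at the top of median_filter.cl resolves no matter which directory the script is launched from; os.path.realpath(__file__) pins down the script's location.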
47 changes: 47 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,47 @@
Part 1:

Maze 1:
Finished after 879 iterations, 261.9836 ms total, 0.298047326507 ms per iteration
Found 2 regions

Maze 2:
Finished after 514 iterations, 152.45928 ms total, 0.296613385214 ms per iteration
Found 35 regions


Part 2:

Maze 1:
Finished after 529 iterations, 157.65824 ms total, 0.298030699433 ms per iteration
Found 2 regions

Maze 2:
Finished after 273 iterations, 81.33136 ms total, 0.297917069597 ms per iteration
Found 35 regions


Part 3:

Maze 1:
Finished after 10 iterations, 3.07216 ms total, 0.307216 ms per iteration
Found 2 regions

Maze 2:
Finished after 10 iterations, 3.03024 ms total, 0.303024 ms per iteration
Found 35 regions

Part 4:

Maze 1:
Finished after 11 iterations, 8.83696 ms total, 0.80336 ms per iteration
Found 2 regions

Maze 2:
Finished after 10 iterations, 8.00088 ms total, 0.800088 ms per iteration
Found 35 regions

For part 4, the "optimization" of reducing redundant global memory reads, by having a single thread per work-group look up each grandparent label once, actually slowed the program down (roughly 0.80 ms per iteration versus roughly 0.30 ms for part 2). The lookups that all threads previously issued in parallel are now serialized in one thread, so the reads and updates happen one at a time, and since global memory access is not the limiting factor on this device, the serialization cost outweighs the saved memory traffic. This implementation would only pay off on hardware where global memory access is much slower relative to compute.


Part 5:
If we replace atomic_min() with min(), the memory updates are no longer serialized, but we lose the guarantee that every update lands in the iteration that issued it. There are two competing effects, increased speed versus lost accuracy within an iteration, and it is hard to say in advance which wins. The parallel speedup is largest when threads are updating different labels; the behavior is simply harder to reason about when the min updates are not atomic. For example, one thread may be lowering a label to 5 while another is lowering it to 2; depending on which write lands last, the label can stall at 5 until the write of 2 finally takes effect. A label can therefore never increase from one iteration to the next, but a decrease can be postponed. Eventually the labels still converge to the same fixed point; only the number of iterations to get there would vary from run to run.
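A toy model of that convergence argument, on a hypothetical 1-D "maze" rather than the real kernel: because every update is a min, labels never increase, so repeated sweeps reach the same fixed point even when individual updates land late.

    import numpy as np

    # walls split the row into regions {0,1}, {3,4,5}, {7}
    walls = np.array([0, 0, 1, 0, 0, 0, 1, 0], dtype=bool)
    labels = np.arange(len(walls))   # every open pixel starts as its own label
    labels[walls] = len(walls)       # walls carry an out-of-range label

    iterations = 0
    changed = True
    while changed:
        old = labels.copy()
        for i in np.where(~walls)[0]:
            # take the min over the pixel and its open neighbors
            neighbors = [labels[j] for j in (i - 1, i + 1)
                         if 0 <= j < len(walls) and not walls[j]]
            labels[i] = min([labels[i]] + neighbors)
        changed = not np.array_equal(old, labels)
        iterations += 1
    print('%d iterations -> %s' % (iterations, labels))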
55 changes: 55 additions & 0 deletions HW3/P5/label_regions.cl
@@ -80,20 +80,75 @@ propagate_labels(__global __read_write int *labels,
    old_label = buffer[buf_y * buf_w + buf_x];

    // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
    // Part 2:
    //if (old_label < w * h)
    //{
    //    buffer[buf_y * buf_w + buf_x] = labels[old_label];
    //}

    // Part 4:
    // once per work-group, a single thread fetches the grandparent labels
    if (lx == 0 && ly == 0)
    {
        int cur_label;
        int prev_label = -25;   // sentinel: no label cached yet
        int prev_grand_label;
        int pixels = buf_w * buf_h;
        int i;
        // iterate over the work-group's pixels
        for (i = 0; i < pixels; i++)
        {
            cur_label = buffer[i];

            // within the picture
            if (cur_label < w * h)
            {
                // if the label equals the previous one, reuse the cached
                // grandparent label
                if (cur_label == prev_label)
                {
                    buffer[i] = prev_grand_label;
                }
                // otherwise fetch the grandparent label, then cache it
                else
                {
                    prev_label = cur_label;
                    prev_grand_label = labels[prev_label];
                    buffer[i] = prev_grand_label;
                }
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // stay in bounds
    new_label = old_label;
    if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
        // We set new_label to the value of old_label, but you will need
        // to adjust this for correctness.
        new_label = old_label;
[Review comment] After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.

        if (new_label < w * h)
        {
            // take the min of the pixel and its four adjacent neighbors
            new_label = min(new_label,
                        min(buffer[(buf_y - 1) * buf_w + buf_x],
                        min(buffer[(buf_y) * buf_w + buf_x - 1],
                        min(buffer[(buf_y) * buf_w + buf_x + 1],
                            buffer[(buf_y + 1) * buf_w + buf_x]))));
        }

        if (new_label != old_label) {
            // CODE FOR PART 3 HERE
            // indicate there was a change this iteration.
            // multiple threads might write this.
            // atomic update
            atomic_min(&labels[old_label], new_label);
            atomic_min(&labels[y * w + x], new_label);

            *(changed_flag) += 1;
            labels[y * w + x] = new_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line needs to be removed (you are doing the atomic_min update in 147).


}
        }
    }
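For reference, the part 4 strategy in propagate_labels can be modeled on the host: one "thread" sweeps the work-group's buffer and memoizes the labels[...] lookup, so a run of equal labels costs a single global read. A NumPy sketch (fetch_grandparents and the toy data are made up here):

    import numpy as np

    def fetch_grandparents(buffer, labels, background):
        # sweep the buffer once, memoizing labels[...] lookups so a run
        # of equal labels costs a single global read (the part 4 strategy)
        prev_label = -25          # sentinel: nothing cached yet
        prev_grand = None
        reads = 0
        out = buffer.copy()
        for i, cur in enumerate(buffer):
            if cur >= background:  # skip wall/out-of-range labels
                continue
            if cur != prev_label:  # only touch "global memory" on a new label
                prev_label = cur
                prev_grand = labels[cur]
                reads += 1
            out[i] = prev_grand
        return out, reads

    labels = np.arange(10)         # grandparent table: identity here for simplicity
    buffer = np.array([3, 3, 3, 5, 5, 3, 10, 10])
    out, reads = fetch_grandparents(buffer, labels, background=10)
    open_pixels = int((buffer < 10).sum())
    print('%s: %d reads instead of %d' % (out, reads, open_pixels))

The saving depends on long runs of equal labels, and it only pays off when the serialized sweep is cheaper than the parallel reads it replaces; as the part 4 timings in P5.txt show, that was not the case on this hardware.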