Hw3 #436

Open · wants to merge 8 commits into base: HW3

17 changes: 16 additions & 1 deletion HW3/P2/mandelbrot.cl
@@ -11,9 +11,24 @@ mandelbrot(__global __read_only float *coords_real,
    float c_real, c_imag;
    float z_real, z_imag;
    int iter;
    float z_real_temp;

    if ((x < w) && (y < h)) {
        c_real = coords_real[y * w + x];
        c_imag = coords_imag[y * w + x];
        z_real = 0;
        z_imag = 0;

        iter = 0;

        // Iterate z <- z^2 + c until |z|^2 >= 4 or max_iter is reached.
        while (z_real * z_real + z_imag * z_imag < 4 && iter < max_iter) {
            z_real_temp = z_real;  // save z_real before overwriting it
            z_real = z_real * z_real - z_imag * z_imag + c_real;
            z_imag = 2 * z_imag * z_real_temp + c_imag;
            iter += 1;
        }

        out_counts[y * w + x] = iter;
    }
}
4 changes: 4 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,4 @@
#0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple
#1: Intel(R) Iris(TM) Graphics 6100 on Apple

configuration ('coalesced', 64, 64): 0.0027708 seconds
35 changes: 26 additions & 9 deletions HW3/P3/sum.cl
@@ -4,12 +4,17 @@ __kernel void sum_coalesced(__global float* x,
                            long N)
{
    float sum = 0;

    size_t i = get_global_id(0);
    size_t local_id = get_local_id(0);

    size_t k = get_global_size(0);
    size_t ls = get_local_size(0);

    // thread i (i.e., with i = get_global_id()) should add x[i],
    // x[i + get_global_size()], ... up to N-1, and store in sum.
    // Striding by the global size keeps neighboring threads on
    // neighboring elements (coalesced access).
    for (uint index = i; index < N; index += k) {
        sum += x[index];
    }

    fast[local_id] = sum;
@@ -24,8 +29,12 @@ __kernel void sum_coalesced(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Binary-tree reduction in local memory: halve the active range each
    // pass; every work-item executes the barrier on every pass.
    for (uint s = ls / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -37,8 +46,12 @@ __kernel void sum_blocked(__global float* x,
                          long N)
{
    float sum = 0;

    size_t i = get_global_id(0);
    size_t local_id = get_local_id(0);

    int k = ceil((float)N / get_global_size(0));
    int ls = get_local_size(0);

    // thread with global_id 0 should add 0..k-1
    // thread with global_id 1 should add k..2k-1
@@ -48,8 +61,8 @@ __kernel void sum_blocked(__global float* x,
    //
    // Be careful that each thread stays in bounds, both relative to
    // size of x (i.e., N), and the range it's assigned to sum.
    for (uint index = k * i; index < k * (i + 1) && index < N; index++) {
        sum += x[index];
    }

    fast[local_id] = sum;
@@ -64,8 +77,12 @@ __kernel void sum_blocked(__global float* x,
    // You can assume get_local_size(0) is a power of 2.
    //
    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Same binary-tree reduction as in sum_coalesced.
    for (uint s = ls / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
    # One float32 partial sum per workgroup.
    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
              format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
    # One float32 partial sum per workgroup.
    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
72 changes: 67 additions & 5 deletions HW3/P4/median_filter.cl
@@ -1,13 +1,30 @@
#include "median9.h"

// Clamp out-of-bounds coordinates to the nearest valid pixel.
static float
fetch(__global __read_only float *in_values,
      int w, int h,
      int x, int y)
{
    if (x < 0) x = 0;
    else if (x >= w) x = w - 1;

    if (y < 0) y = 0;
    else if (y >= h) y = h - 1;

    return in_values[y * w + x];
}


// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
           __global __write_only float *out_values,
           __local float *buffer,
           int w, int h,
           int buf_w, int buf_h,
           const int halo) // width of halo on one side
{
    // Note: It may be easier for you to implement median filtering
    // without using the local buffer, first, then adjust your code to
@@ -18,10 +35,39 @@ median_3x3(__global __read_only float *in_values,
    //
    // It may be helpful to consult HW3 Problem 5, and
    // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
    //
    // Note that globally out-of-bounds pixels should be replaced
    // with the nearest valid pixel's value.


    // Global position of output pixel
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Local position relative to (0, 0) in workgroup
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // coordinates of the upper left corner of the buffer in image
    // space, including halo
    const int buf_corner_x = x - lx - halo;
    const int buf_corner_y = y - ly - halo;

    // coordinates of our pixel in the local buffer
    const int buf_x = lx + halo;
    const int buf_y = ly + halo;

    // 1D index of thread within our work-group
    const int idx_1D = ly * get_local_size(0) + lx;

    int row;

    // The first buf_w work-items each load one column of the halo'd
    // buffer, one row at a time.
    if (idx_1D < buf_w)
        for (row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                fetch(in_values, w, h,
                      buf_corner_x + idx_1D,
                      buf_corner_y + row);
        }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Compute 3x3 median for each pixel in core (non-halo) pixels
    //
@@ -31,4 +77,20 @@ median_3x3(__global __read_only float *in_values,

    // Each thread in the valid region (x < w, y < h) should write
    // back its 3x3 neighborhood median.
    //
    // Should only use buffer, buf_x, buf_y.

    // write output, staying in bounds
    if ((y < h) && (x < w))
        out_values[y * w + x] = median9(buffer[(buf_y-1) * buf_w + buf_x - 1],
                                        buffer[(buf_y-1) * buf_w + buf_x],
                                        buffer[(buf_y-1) * buf_w + buf_x + 1],
                                        buffer[buf_y * buf_w + buf_x - 1],
                                        buffer[buf_y * buf_w + buf_x],
                                        buffer[buf_y * buf_w + buf_x + 1],
                                        buffer[(buf_y+1) * buf_w + buf_x - 1],
                                        buffer[(buf_y+1) * buf_w + buf_x],
                                        buffer[(buf_y+1) * buf_w + buf_x + 1]);
}
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import pylab
import os.path

def round_up(global_size, group_size):
    r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                         properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

# Pass this file's directory as an include path so the OpenCL compiler
# can find median9.h regardless of the working directory.
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
38 changes: 38 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,38 @@
1. Iteration counts and average kernel times after each part

(1) Part 1
- Maze 1 : Finished after 915 iterations, 194.06176 ms total, 0.212089355191 ms per iteration
Found 2 regions
- Maze 2 : Finished after 532 iterations, 111.65112 ms total, 0.209870526316 ms per iteration
Found 35 regions

(2) Part 2
- Maze 1 : Finished after 529 iterations, 109.62104 ms total, 0.207223137996 ms per iteration
Found 2 regions
- Maze 2 : Finished after 273 iterations, 56.87648 ms total, 0.208338754579 ms per iteration
Found 35 regions

(3) Part 3
- Maze 1 : Finished after 10 iterations, 3.1164 ms total, 0.31164 ms per iteration
Found 2 regions
- Maze 2 : Finished after 9 iterations, 2.72808 ms total, 0.30312 ms per iteration
Found 35 regions

(4) Part 4
- Maze 1 : Finished after 16 iterations, 8.2236 ms total, 0.513975 ms per iteration
Found 2 regions
- Maze 2 : Finished after 15 iterations, 7.52448 ms total, 0.501632 ms per iteration
Found 35 regions


2. Part 4 discussion
Based on my empirical results, using a single thread per workgroup to reduce redundant memory accesses actually made the kernel slower than the Part 3 code: on Maze 1 the iteration count rose from 10 to 16, the total kernel time from about 3.1 ms to 8.2 ms, and the per-iteration time from roughly 0.31 ms to 0.51 ms.
Even though the Part 3 code performs redundant global-memory reads, the parallelism of many threads outweighs the savings from serialized, deduplicated access. On my machine, the time saved by avoiding redundant lookups is smaller than the time lost by giving up parallelism.
This could change under other GPU conditions. For example, if global-memory access were very slow and the input image mostly had the same label within a workgroup, the serialized implementation (the Part 4 code) would win; a schematic contrast of the two access patterns is sketched below.
On the other hand, if memory access speed is not the bottleneck and labels vary a lot even within a workgroup, the Part 3 code is the more efficient choice.
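
A minimal sketch of the two access patterns (hypothetical toy kernels for illustration; relabel_parallel and relabel_serial are not the submitted code):

    // Part 3 style: every work-item does its own global lookup, so
    // repeated labels cause redundant reads, but all reads run in parallel.
    __kernel void relabel_parallel(__global int *labels, __global int *out)
    {
        int gid = get_global_id(0);
        out[gid] = labels[out[gid]];
    }

    // Part 4 style: work-item 0 of each group serializes the lookups and
    // skips the global read whenever the label repeats.
    __kernel void relabel_serial(__global int *labels, __global int *out)
    {
        int gid = get_global_id(0);
        if (get_local_id(0) == 0) {
            int last_label = -1, last_value = -1;
            for (int j = gid; j < gid + (int)get_local_size(0); j++) {
                if (out[j] != last_label) {  // only fetch on a label change
                    last_label = out[j];
                    last_value = labels[last_label];
                }
                out[j] = last_value;
            }
        }
    }

Which one wins depends on how often labels repeat and on how expensive a global read is relative to idling the rest of the workgroup.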


3. Part 5 explanation
atomic_min() guarantees that a given memory address is updated by only one thread at a time, so in our code labels[old_label] always ends up holding the true minimum of the competing values. If we used min() instead, the separate read and write could interleave across threads, letting a later write overwrite a smaller label with a larger one, so a value in labels could effectively increase; see the sketch below.
The final result would still be correct, since once the minimum value propagates it keeps being used; however, the performance of the algorithm (time and iteration count) may vary.
Each min() operation would be faster since it does not serialize threads, but the number of iterations would increase, since more passes are needed before the labels converge to the true minimum.
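
A minimal sketch of the race (hypothetical interleaving; k and my_label are illustrative names, not from the submitted kernel):

    // Suppose labels[k] == 9, thread A holds my_label == 3, thread B holds 5.
    //
    // Plain min(): the read and the write are separate steps, so they
    // can interleave:
    //   A reads 9 and computes min(9, 3) == 3
    //   B reads 9 and computes min(9, 5) == 5
    //   A writes 3, then B writes 5  -> labels[k] ends at 5, not 3
    labels[k] = min(labels[k], my_label);

    // atomic_min(): the read-compare-write happens as one indivisible
    // operation, so labels[k] can only decrease and ends at 3.
    atomic_min(&labels[k], my_label);

The extra iterations with min() come from exactly these lost updates: a smaller label that gets overwritten must be rediscovered on a later pass.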
50 changes: 49 additions & 1 deletion HW3/P5/label_regions.cl
@@ -80,20 +80,68 @@ propagate_labels(__global __read_write int *labels,
    old_label = buffer[buf_y * buf_w + buf_x];

    // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

    // PART 2
    // if (old_label < w * h) { // not a background pixel
    //     buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
    // }

    // PART 4
    // Only the thread with (lx, ly) == (0, 0) fetches labels from global
    // memory, reusing the previous lookup when consecutive labels repeat.
    if (lx == 0 && ly == 0) {
        int current_label, current_label_index;
        int last_label = -1;  // label seen on the previous lookup
        int last_value = -1;  // cached labels[last_label]
        int work_group_size_x = get_local_size(0);
        int work_group_size_y = get_local_size(1);

        for (int row = 0; row < work_group_size_y; row++) {
            for (int col = 0; col < work_group_size_x; col++) {

                current_label_index = (row + halo) * buf_w + (col + halo);
                current_label = buffer[current_label_index];

                if (current_label < w * h) { // not a background pixel
                    if (current_label != last_label) {
                        last_label = current_label;
                        last_value = labels[current_label];
                    }
                    buffer[current_label_index] = last_value;
                }

            }
        }
    }

    // Make sure all threads reach the next part after
    // the local buffer is loaded
    barrier(CLK_LOCAL_MEM_FENCE);

    // stay in bounds
    if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
        // We set new_label to the value of old_label, but you will need
        // to adjust this for correctness.
        new_label = old_label;

        if (new_label < w * h) { // not a background pixel

            // Take the minimum label among this pixel and its 4 neighbors.
            int up_pixel = buffer[(buf_y - 1) * buf_w + buf_x];
            int left_pixel = buffer[buf_y * buf_w + buf_x - 1];
            int down_pixel = buffer[(buf_y + 1) * buf_w + buf_x];
            int right_pixel = buffer[buf_y * buf_w + buf_x + 1];

            new_label = min(min(min(min(old_label, up_pixel), left_pixel), down_pixel), right_pixel);
        }

        if (new_label != old_label) {
            // CODE FOR PART 3 HERE
            // indicate there was a change this iteration.
            // multiple threads might write this.
            *(changed_flag) += 1;
            // Merge the old label's region and update this pixel, both
            // atomically so that concurrent minima are not lost.
            atomic_min(&labels[old_label], new_label);
            // labels[y * w + x] = new_label;
            atomic_min(&labels[y * w + x], new_label);
        }
    }
}
5 changes: 4 additions & 1 deletion HW3/P5/label_regions.py
@@ -3,6 +3,9 @@
import pyopencl as cl
import numpy as np
import pylab
# import os

# os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

def round_up(global_size, group_size):
    r = global_size % group_size
@@ -42,7 +45,7 @@ def round_up(global_size, group_size):

program = cl.Program(context, open('label_regions.cl').read()).build(options='')

host_image = np.load('maze2.npy')
host_labels = np.empty_like(host_image)
host_done_flag = np.zeros(1).astype(np.int32)
