Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HW 3 #415

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open

HW 3 #415

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HW3/P2/mandelbrot.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import pylab
import matplotlib.pyplot as pylab

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down
1 change: 1 addition & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The best configuration and time for me was: configuration ('coalesced', 128, 128): 0.00291864 seconds
40 changes: 32 additions & 8 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t i = get_global_id(0);
size_t global_size = get_global_size(0);
size_t group_size = get_local_size(0);

int counter;
// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (counter = 0; counter * global_size + i < N; counter++) {
sum += x[i + counter * global_size];
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,14 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
int k;

for (k = group_size / 2 ;k > 0; k >>= 1) {
if (local_id < k) {
fast[local_id] += fast[local_id + k];
}

barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -38,6 +48,9 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t global_id = get_global_id(0);
size_t group_size = get_local_size(0);

int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
Expand All @@ -48,8 +61,13 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
int count;

for (count = global_id * k; count < (global_id + 1)*k; count++) { // YOUR CODE HERE
if (count < N)
{
sum += x[count];
}
}

fast[local_id] = sum;
Expand All @@ -64,8 +82,14 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
uint d;

for (d = group_size / 2 ;d > 0; d >>= 1) {
if (local_id < d) {
fast[local_id] += fast[local_id + d];
}

barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand Down
68 changes: 68 additions & 0 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
#include "median9.h"

// Bounds-checked replacement for the FETCH helper that Ray provided.
// Reads the value at (x, y) from in_values, indexed as y * width + x.
// A coordinate below 0 is replaced by 0 and a coordinate above the
// last valid index (width - 1 or height - 1) is replaced by that last
// index, so an out-of-range request returns the closest pixel.
float FETCH_new(__global __read_only float *in_values,
                int width, int height,
                int x, int y)
{
    const int last_col = width - 1;
    const int last_row = height - 1;

    // Each axis is snapped independently; the low bound is tested first,
    // and the high bound is only tested when the low bound did not apply.
    const int col = (x < 0) ? 0 : ((x > last_col) ? last_col : x);
    const int row = (y < 0) ? 0 : ((y > last_row) ? last_row : y);

    return in_values[row * width + col];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
Expand Down Expand Up @@ -31,4 +53,50 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int row;

if (idx_1D < buf_w)
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
FETCH_new(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}

barrier(CLK_LOCAL_MEM_FENCE);

if ((y < h && x < w))
{
int top = (buf_y - 1) * buf_w + buf_x;
int middle = buf_y * buf_w + buf_x;
int bottom = (buf_y + 1)* buf_w + buf_x;

out_values[y * w + x] = median9(
buffer[top - 1], buffer[top], buffer[top + 1],
buffer[middle - 1], buffer[middle], buffer[middle + 1],
buffer[bottom - 1], buffer[bottom], buffer[bottom + 1]
);
}


}
62 changes: 62 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
Part 1

Maze 1:
Finished after 878 iterations, 261.55712 ms total, 0.297901047836 ms per iteration
Found 2 regions

Maze 2:
Finished after 517 iterations, 153.9384 ms total, 0.297753191489 ms per iteration
Found 35 regions


Part 2

Maze 1:
Finished after 529 iterations, 158.00224 ms total, 0.298680982987 ms per iteration
Found 2 regions

Maze 2:
Finished after 273 iterations, 81.45792 ms total, 0.298380659341 ms per iteration
Found 35 regions


Part 3

Maze 1:
Finished after 11 iterations, 3.37152 ms total, 0.306501818182 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 2.7204 ms total, 0.302266666667 ms per iteration
Found 35 regions


Part 4

Maze 1:
Finished after 70 iterations, 52.56808 ms total, 0.750972571429 ms per iteration
Found 2 regions

Maze 2:
Finished after 103 iterations, 76.77008 ms total, 0.745340582524 ms per iteration
Found 35 regions


In my case, serializing the "finding grandparents" step is not a win: the
time per iteration goes up by roughly 2.5x (about 0.30 ms in Part 3 versus
about 0.75 ms in Part 4).


Part 5

Suppose that our current label sees 2 other labels, both of which have a
smaller label number than our current one. In that case, if we do atomic
updates, all 3 labels will become the minimum of these 3 labels. However,
if we instead did the "min" first and a plain "reassignment" afterwards,
the order in which the 2 mins and the 2 reassignments execute can make a
difference. For example:

Suppose our current label at a square is 3, and there are two neighbors
with labels 2 and 1. We would like to update 3 -> 1. However, when two
different threads compute min(3, 2) and min(3, 1), they get 2 and 1
respectively. Now, assume that one thread assigns the label to be 1, and
THEN the other assigns it to be 2. UH OH! The label is left at 2 instead
of 1, so we have a problem and will have to run for at least another
iteration to fix it.
57 changes: 56 additions & 1 deletion HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -75,23 +75,78 @@ propagate_labels(__global __read_write int *labels,
// the local buffer is loaded
barrier(CLK_LOCAL_MEM_FENCE);

int current = buf_y * buf_w + buf_x;
// Fetch the value from the buffer that corresponds to
// the pixel for this thread
old_label = buffer[buf_y * buf_w + buf_x];
old_label = buffer[current];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

/*
if (old_label < w * h)
{
buffer[current] = labels[old_label]; // grab grandparent
}
*/


if ((lx == 0) && (ly == 0))
{
int prev_key = -1000;
int prev_result;

for (int i = 0; i < buf_w * buf_h; i++)
{
int this_label = buffer[i];

if (this_label >= w * h)
continue;

if (prev_key == this_label)
{
buffer[i] = prev_result;
}

else
{
prev_key = this_label;
prev_result = labels[prev_key];

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should also update the buffer of the current index.

}
}
}



barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.


if (new_label < w * h)
{
int this = buf_y * buf_w + buf_x;
new_label =
min(buffer[(buf_y + 1) * buf_w + buf_x],
min(buffer[(buf_y - 1) * buf_w + buf_x],
min(buffer[this + 1],
min(buffer[this - 1], new_label
))));
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.

//
atomic_min(&labels[old_label], new_label);

atomic_min(&labels[y * w + x], new_label);

*(changed_flag) += 1;
labels[y * w + x] = new_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be removed, as you are doing the atomic_min in line 148.

}
Expand Down
4 changes: 2 additions & 2 deletions HW3/P5/label_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
import pyopencl as cl
import numpy as np
import pylab
import matplotlib.pyplot as pylab

def round_up(global_size, group_size):
r = global_size % group_size
Expand Down Expand Up @@ -42,7 +42,7 @@ def round_up(global_size, group_size):

program = cl.Program(context, open('label_regions.cl').read()).build(options='')

host_image = np.load('maze1.npy')
host_image = np.load('maze2.npy')
host_labels = np.empty_like(host_image)
host_done_flag = np.zeros(1).astype(np.int32)

Expand Down