Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hw3 #433

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open

Hw3 #433

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,34 @@ mandelbrot(__global __read_only float *coords_real,

float c_real, c_imag;
float z_real, z_imag;
float new_z_real;
int iter;

if ((x < w) && (y < h)) {
// YOUR CODE HERE
// implementing mandelbrot here

// initialize
iter = 0;
z_real =0;
z_imag = 0;
// pixel (x,y) is x*w + y away from (0,0)
// because (x,y) are flipped
c_real = coords_real[x*w + y];
c_imag = coords_imag[x*w + y];
while((z_real*z_real+ z_imag*z_imag <=4) \
&&(iter <= max_iter)){
// Similar to the AVX implementation
new_z_real = (z_real*z_real - z_imag*z_imag) \
+ c_real;
z_imag = (2 * z_real* z_imag) + c_imag;
z_real = new_z_real;
iter = iter + 1;
}



;
out_counts[x*w + y] = iter;
}
}
22 changes: 22 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
The best found configuration is

configuration ('coalesced', 256, 128): 0.00303864 seconds

The devices detected on platform Apple are:

Intel(R) Core(TM) i5-4250U CPU @ 1.30GHz [Type: CPU ]

Maximum clock Frequency: 1300 MHz

Maximum allocable memory size: 1073 MB

## Maximum work group size 1024

HD Graphics 5000 [Type: GPU ]

Maximum clock Frequency: 1000 MHz

Maximum allocable memory size: 402 MB

## Maximum work group size 512

36 changes: 27 additions & 9 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t i = get_global_id(0);
int jump = get_global_size(0);
int localsize = get_local_size(0);
int id_x;

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (id_x = 0 ;i + id_x*jump < N; id_x++) {
sum += x[ i + id_x*jump ];
}

fast[local_id] = sum;
Expand All @@ -24,8 +28,11 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint offset = localsize/2; offset > 0; offset >>= 1) {
if (local_id< offset) {
fast[local_id] += fast[local_id + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
Expand All @@ -39,6 +46,13 @@ __kernel void sum_blocked(__global float* x,
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
int globalid = get_global_id(0);
int localid = get_local_id(0);
int ini ;
int localsize = get_local_size(0);




// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
Expand All @@ -48,8 +62,10 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for ( ini = globalid*k ; ini < (globalid +1)*k ; ini++) {
if (ini < N) {
sum += x[ini];
}
}

fast[local_id] = sum;
Expand All @@ -64,9 +80,11 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint offset = localsize/2; offset > 0; offset >>= 1) {
if (local_id< offset) {
fast[local_id] += fast[local_id + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}
61 changes: 59 additions & 2 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
#include "median9.h"


inline float get_values(__global float *in_values,
                        int w, int h, int new_x, int new_y){
    // Fetch in_values at (new_x, new_y) with clamp-to-edge semantics:
    // coordinates outside [0, w) x [0, h) are snapped to the nearest
    // valid pixel before the global-memory read, so out-of-bounds
    // requests return the border pixel's value.
    const int col = (new_x < 0) ? 0 : ((new_x >= w) ? (w - 1) : new_x);
    const int row = (new_y < 0) ? 0 : ((new_y >= h) ? (h - 1) : new_y);
    return in_values[row * w + col];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
Expand All @@ -22,13 +33,59 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Define variables like in class

// global position of the pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// local position of the pixel in the workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// corner coordinates of the buffer
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of the pixel in the buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// get 1-index of the pixels
const int idx_1D = ly * get_local_size(0) + lx;

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

if (idx_1D < buf_w){
for (int row = 0; row < buf_h; row++){
int new_x = buf_corner_x + idx_1D;
int new_y = buf_corner_y + row;
// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
buffer[row * buf_w + idx_1D] = \
get_values(in_values, w, h, new_x, new_y);
}
}

// now write the output
barrier(CLK_LOCAL_MEM_FENCE);

if((x < w) && (y < h)){
out_values[y * w + x] =\
median9( buffer[ (buf_y-1) * buf_w + buf_x -1],\
buffer[ (buf_y-1) * buf_w + buf_x],\
buffer[ (buf_y-1) * buf_w + buf_x +1],\
buffer[ buf_y * buf_w + buf_x -1], \
buffer[ buf_y * buf_w + buf_x], \
buffer[ buf_y * buf_w + buf_x +1],\
buffer[ (buf_y+1) * buf_w + buf_x -1],\
buffer[ (buf_y+1) * buf_w + buf_x],\
buffer[ (buf_y+1) * buf_w + buf_x +1]);

}


// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}
87 changes: 87 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
Results

********
Part 1
********

--- Maze 1 ----

Finished after 314 iterations, 153.46664 ms total, 0.488747261146 ms per iteration
Found 77 regions

--- Maze 2 ---

Finished after 243 iterations, 106.70872 ms total, 0.439130534979 ms per iteration
Found 113 regions

********
Part 2
********

--- Maze 1 ---


Finished after 132 iterations, 65.11392 ms total, 0.493287272727 ms per iteration
Found 77 regions

--- Maze 2 ---

Finished after 114 iterations, 55.83568 ms total, 0.489786666667 ms per iteration
Found 113 regions

********
Part 3
********

--- Maze 1 ---

Finished after 11 iterations, 5.42896 ms total, 0.493541818182 ms per iteration
Found 60 regions


--- Maze 2 ---

Finished after 11 iterations, 5.3596 ms total, 0.487236363636 ms per iteration
Found 106 regions

********
Part 4
********

--- Maze 1 ---

Finished after 71 iterations, 96.0672 ms total, 1.35305915493 ms per iteration
Found 66 regions

--- Maze 2 ---

Finished after 49 iterations, 66.29168 ms total, 1.35289142857 ms per iteration
Found 106 regions

Looks like using only one thread did not improve anything. Worse, it now takes more iterations; I did not have time to find out why.
Although serializing reduces the number of fetches from global memory, the overall run time is larger.


********
Part 5
********

Changing atomic_min() to a simple min() ?

For now we have :

atomic_min(&labels[old_label],new_label);
atomic_min(&labels[y * w + x], new_label);

and we ask ourselves what would happen with

min(labels[old_label], new_label);
min(labels[y * w + x], new_label);

The atomic operation ensures that only one thread at a time assigns the smallest label to labels[old_label] and labels[y * w + x]. If multiple threads did this simultaneously in parallel, there would be no guarantee that the smallest label value is correctly assigned anymore.

However, this operation will be faster per iteration because it is now done via multithreading.

Also we may expect more iterations because the minimum value stored at the end of one iteration may not be the minimum value for all the threads.


63 changes: 58 additions & 5 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ int
get_clamped_value(__global __read_only int *labels,
int w, int h,
int x, int y)
{
{
if ((x < 0) || (x >= w) || (y < 0) || (y >= h))
return w * h;
return labels[y * w + x];
Expand Down Expand Up @@ -80,20 +80,73 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)



// CODE FOR PART 2

// if (old_label < w*h) {
// buffer[ buf_y * buf_w + buf_x ] = labels[old_label];
// }


// CODE FOR PART 4


// when we have the first thread
if ((lx == 0) && (ly == 0)) {

// initialize variables to use
int last_label = -1 ;
int my_label_new;

// get grandparent
if (old_label < w*h){
last_label = labels[old_label];
}

// loop over rows and columns of the buffer
for (int x_i = halo; x_i < get_local_size(0) + halo; x_i++) {
for (int y_i = halo; y_i < get_local_size(1) + halo; y_i++) {

my_label_new = buffer[(ly+x_i)*buf_w+(lx+y_i)];

if (buffer[(ly+x_i)*buf_w+(lx+y_i)] < w*h) {
// avoid having the same value as the previous one
if (my_label_new != last_label) {
// update the buffer
buffer[(ly+x_i)*buf_w+(lx+y_i)] = labels[my_label_new];
}
}
}
}
}

// stay in bounds
if ((x < w) && (y < h)) {
if (((x < w) && (y < h)) && (old_label < w*h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.
new_label = old_label;

// one pixel becomes the minimum of its 4 neighboring
// pixels and itself
// get the locations in a similar fashion as P4
int left = buffer[ buf_y * buf_w + buf_x - 1];
int right = buffer[ buf_y * buf_w + 1];
int up = buffer[ (buf_y - 1) * buf_w + buf_x ];
int down = buffer[ (buf_y + 1) * buf_w + buf_x ];
// find the minimum
new_label = min(old_label, min( min( min(up,down) , right) , left));



if (new_label != old_label) {
atomic_min(&labels[old_label], new_label);
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;
//labels[y * w + x] = new_label;
atomic_min(&labels[y * w + x], new_label);
}
}
}
Loading