-
Notifications
You must be signed in to change notification settings - Fork 96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hw3 #417
base: HW3
Are you sure you want to change the base?
Hw3 #417
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Hardware | ||
|
||
Intel(R) Core(TM) i7-3537 CPU @2.00GHz | ||
|
||
|
||
Performance: | ||
|
||
configuration ('coalesced', 128, 128): 0.00205134 seconds |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,11 +5,12 @@ __kernel void sum_coalesced(__global float* x, | |
{ | ||
float sum = 0; | ||
size_t local_id = get_local_id(0); | ||
unsigned int global_size = get_global_size(0); | ||
|
||
// thread i (i.e., with i = get_global_id()) should add x[i], | ||
// x[i + get_global_size()], ... up to N-1, and store in sum. | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (int i = get_global_id(0); i < N; i += global_size) { | ||
sum += x[i]; | ||
} | ||
|
||
fast[local_id] = sum; | ||
|
@@ -24,8 +25,11 @@ __kernel void sum_coalesced(__global float* x, | |
// You can assume get_local_size(0) is a power of 2. | ||
// | ||
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (int i = get_local_size(0) / 2; i > 0; i /= 2) { | ||
if (local_id < i) { | ||
fast[local_id] += fast[local_id + i] | ||
} | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
} | ||
|
||
if (local_id == 0) partial[get_group_id(0)] = fast[0]; | ||
|
@@ -38,7 +42,7 @@ __kernel void sum_blocked(__global float* x, | |
{ | ||
float sum = 0; | ||
size_t local_id = get_local_id(0); | ||
int k = ceil(float(N) / get_global_size(0)); | ||
int k = ceil((float)N / get_global_size(0)); | ||
|
||
// thread with global_id 0 should add 0..k-1 | ||
// thread with global_id 1 should add k..2k-1 | ||
|
@@ -48,8 +52,9 @@ __kernel void sum_blocked(__global float* x, | |
// | ||
// Be careful that each thread stays in bounds, both relative to | ||
// size of x (i.e., N), and the range it's assigned to sum. | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (int i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k && | ||
i < N; ++i) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice use of the loop short-cutting |
||
sum += x[i]; | ||
} | ||
|
||
fast[local_id] = sum; | ||
|
@@ -64,8 +69,11 @@ __kernel void sum_blocked(__global float* x, | |
// You can assume get_local_size(0) is a power of 2. | ||
// | ||
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (int i = get_local_size(0) / 2; i > 0; i /= 2) { | ||
if (local_id < i) { | ||
fast[local_id] += fast[local_id + i] | ||
} | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
} | ||
|
||
if (local_id == 0) partial[get_group_id(0)] = fast[0]; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
Part 1: implement updates from neighbors | ||
|
||
Maze 1 | ||
Finished after 894 iterations, 197.5328 ms total, 0.222069508197 ms per iteration | ||
Found 2 regions | ||
|
||
Maze 2 | ||
Finished after 528 iterations, 116.40336 ms total, 0.220460909091 ms per iteration | ||
Found 35 regions | ||
|
||
=============================================================================== | ||
|
||
Part 2: fetch grandparents | ||
|
||
Maze 1 | ||
Finished after 529 iterations, 116.8076 ms total, 0.220808317580 ms per iteration | ||
|
||
Maze 2 | ||
Finished after 274 iterations, 62.08362 ms total, 0.226582554745 ms per iteration | ||
|
||
=============================================================================== | ||
|
||
Part 3: merge parent regions | ||
|
||
Maze 1 | ||
Finished after 10 iterations, 2.4026 ms total, 0.24026 ms per iteration | ||
|
||
Maze 2 | ||
Finished after 9 iterations, 2.17255 ms total, 0.241394444444 ms per iteration | ||
|
||
=============================================================================== | ||
|
||
Part 4: efficient grandparents | ||
|
||
Maze 1 | ||
Finished after 10 iterations, 4.59674 ms total, 0.459674 ms per iteration | ||
|
||
Maze 2 | ||
Finished after 9 iterations, 4.1608 ms total, 0.462311111111 ms per iteration | ||
|
||
It is quite apparent that using a single thread to check the labels in its
workgroup caused performance to worsen. I think this reflects a trade-off
between the overhead of accessing global memory, which we are trying to avoid,
and the speedup we gain from parallelism. Here we can see that using only one
thread — i.e., serializing this part of the algorithm — takes away much of the
benefit we gained from the parallel approach, even though it decreased the
number of accesses to global memory.
|
||
=============================================================================== | ||
|
||
Part 5: no atomic operations | ||
|
||
Without atomic operations, we introduce the possibility of a race condition in
which old_label is updated redundantly: the output of the algorithm is still
correct, but the number of iterations can increase. This is dangerous because
it happens essentially nondeterministically, so in the (very unlikely) worst
case it could cause our algorithm to run relatively slowly, simply because
more work is being done without making progress.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There will probably be some progress, but indeed might be slower. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,19 +81,54 @@ propagate_labels(__global __read_write int *labels, | |
|
||
// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) | ||
|
||
// Part 2 | ||
// | ||
// if (old_label < w * h) { | ||
// buffer[buf_y * buf_w + buf_x] = labels[old_label]; | ||
// } | ||
|
||
// Part 4 | ||
|
||
// Update workgroup labels | ||
if (lx == 0 && ly == 0) { | ||
int max_iter = buf_w * buf_h; | ||
int temp, last; | ||
int prev = -1; | ||
for (int i = 0; i < max_iter; ++i) { | ||
temp = buffer[i]; | ||
if (temp < w * h) { | ||
// if current label is not the same as previous, reset | ||
if (prev != temp) { | ||
prev = temp; | ||
last = labels[prev]; | ||
} | ||
buffer[i] = last; | ||
} | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing barrier after this part. |
||
|
||
// stay in bounds | ||
if ((x < w) && (y < h)) { | ||
// CODE FOR PART 1 HERE | ||
// We set new_label to the value of old_label, but you will need | ||
// to adjust this for correctness. | ||
|
||
// min over all possible values | ||
new_label = old_label; | ||
if (new_label < w * h) { | ||
int row_min = min(buffer[buf_y * buf_w + buf_x - 1], | ||
buffer[buf_y * buf_w + buf_x + 1]); | ||
int col_min = min(buffer[buf_x + (buf_y - 1) * buf_w], | ||
buffer[buf_x + (buf_y + 1) * buf_w]); | ||
new_label = min(row_min, col_min); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should also compare to buffer[buf_y * buf_w + buf_x] |
||
} | ||
|
||
if (new_label != old_label) { | ||
// CODE FOR PART 3 HERE | ||
// indicate there was a change this iteration. | ||
// multiple threads might write this. | ||
*(changed_flag) += 1; | ||
labels[y * w + x] = new_label; | ||
// labels[y * w + x] = new_label; | ||
atomic_min(&labels[old_label], new_label); | ||
atomic_min(&labels[x + y * w], new_label); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing ';' at the end of the line.