-
Notifications
You must be signed in to change notification settings - Fork 96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hw3 #426
base: HW3
Are you sure you want to change the base?
Hw3 #426
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
The best configuration was coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds | ||
|
||
#0: Intel(R) Core(TM) i7-4650U CPU @ 1.70GHz on Apple | ||
#1: HD Graphics 5000 on Apple | ||
coalesced reads, workgroups: 8, num_workers: 4, 0.173174 seconds | ||
coalesced reads, workgroups: 8, num_workers: 8, 0.09775128 seconds | ||
coalesced reads, workgroups: 8, num_workers: 16, 0.05811304 seconds | ||
coalesced reads, workgroups: 8, num_workers: 32, 0.02927688 seconds | ||
coalesced reads, workgroups: 8, num_workers: 64, 0.01506872 seconds | ||
coalesced reads, workgroups: 8, num_workers: 128, 0.00775552 seconds | ||
coalesced reads, workgroups: 16, num_workers: 4, 0.09545584 seconds | ||
coalesced reads, workgroups: 16, num_workers: 8, 0.05122536 seconds | ||
coalesced reads, workgroups: 16, num_workers: 16, 0.03318448 seconds | ||
coalesced reads, workgroups: 16, num_workers: 32, 0.01706696 seconds | ||
coalesced reads, workgroups: 16, num_workers: 64, 0.00832944 seconds | ||
coalesced reads, workgroups: 16, num_workers: 128, 0.00449384 seconds | ||
coalesced reads, workgroups: 32, num_workers: 4, 0.06161328 seconds | ||
coalesced reads, workgroups: 32, num_workers: 8, 0.02915712 seconds | ||
coalesced reads, workgroups: 32, num_workers: 16, 0.01422552 seconds | ||
coalesced reads, workgroups: 32, num_workers: 32, 0.00753888 seconds | ||
coalesced reads, workgroups: 32, num_workers: 64, 0.00383592 seconds | ||
coalesced reads, workgroups: 32, num_workers: 128, 0.00296608 seconds | ||
coalesced reads, workgroups: 64, num_workers: 4, 0.02890824 seconds | ||
coalesced reads, workgroups: 64, num_workers: 8, 0.0142484 seconds | ||
coalesced reads, workgroups: 64, num_workers: 16, 0.0075216 seconds | ||
coalesced reads, workgroups: 64, num_workers: 32, 0.00401608 seconds | ||
coalesced reads, workgroups: 64, num_workers: 64, 0.00291696 seconds | ||
coalesced reads, workgroups: 64, num_workers: 128, 0.00275208 seconds | ||
coalesced reads, workgroups: 128, num_workers: 4, 0.02929296 seconds | ||
coalesced reads, workgroups: 128, num_workers: 8, 0.01506728 seconds | ||
coalesced reads, workgroups: 128, num_workers: 16, 0.00750416 seconds | ||
coalesced reads, workgroups: 128, num_workers: 32, 0.00392192 seconds | ||
coalesced reads, workgroups: 128, num_workers: 64, 0.00282296 seconds | ||
coalesced reads, workgroups: 128, num_workers: 128, 0.00283168 seconds | ||
coalesced reads, workgroups: 256, num_workers: 4, 0.02977544 seconds | ||
coalesced reads, workgroups: 256, num_workers: 8, 0.01534176 seconds | ||
coalesced reads, workgroups: 256, num_workers: 16, 0.00795784 seconds | ||
coalesced reads, workgroups: 256, num_workers: 32, 0.00426864 seconds | ||
coalesced reads, workgroups: 256, num_workers: 64, 0.0028708 seconds | ||
coalesced reads, workgroups: 256, num_workers: 128, 0.00280752 seconds | ||
coalesced reads, workgroups: 512, num_workers: 4, 0.02997592 seconds | ||
coalesced reads, workgroups: 512, num_workers: 8, 0.01532384 seconds | ||
coalesced reads, workgroups: 512, num_workers: 16, 0.00771888 seconds | ||
coalesced reads, workgroups: 512, num_workers: 32, 0.00434744 seconds | ||
coalesced reads, workgroups: 512, num_workers: 64, 0.00287568 seconds | ||
coalesced reads, workgroups: 512, num_workers: 128, 0.00275472 seconds | ||
blocked reads, workgroups: 8, num_workers: 4, 0.18406568 seconds | ||
blocked reads, workgroups: 8, num_workers: 8, 0.11625232 seconds | ||
blocked reads, workgroups: 8, num_workers: 16, 0.08278504 seconds | ||
blocked reads, workgroups: 8, num_workers: 32, 0.04329048 seconds | ||
blocked reads, workgroups: 8, num_workers: 64, 0.0209084 seconds | ||
blocked reads, workgroups: 8, num_workers: 128, 0.01256128 seconds | ||
blocked reads, workgroups: 16, num_workers: 4, 0.11378736 seconds | ||
blocked reads, workgroups: 16, num_workers: 8, 0.07066096 seconds | ||
blocked reads, workgroups: 16, num_workers: 16, 0.04468824 seconds | ||
blocked reads, workgroups: 16, num_workers: 32, 0.02061624 seconds | ||
blocked reads, workgroups: 16, num_workers: 64, 0.01264984 seconds | ||
blocked reads, workgroups: 16, num_workers: 128, 0.03288288 seconds | ||
blocked reads, workgroups: 32, num_workers: 4, 0.06355624 seconds | ||
blocked reads, workgroups: 32, num_workers: 8, 0.03810152 seconds | ||
blocked reads, workgroups: 32, num_workers: 16, 0.02057768 seconds | ||
blocked reads, workgroups: 32, num_workers: 32, 0.01233816 seconds | ||
blocked reads, workgroups: 32, num_workers: 64, 0.03313528 seconds | ||
blocked reads, workgroups: 32, num_workers: 128, 0.08156096 seconds | ||
blocked reads, workgroups: 64, num_workers: 4, 0.0351416 seconds | ||
blocked reads, workgroups: 64, num_workers: 8, 0.01899656 seconds | ||
blocked reads, workgroups: 64, num_workers: 16, 0.01270664 seconds | ||
blocked reads, workgroups: 64, num_workers: 32, 0.0335048 seconds | ||
blocked reads, workgroups: 64, num_workers: 64, 0.08059792 seconds | ||
blocked reads, workgroups: 64, num_workers: 128, 0.08670848 seconds | ||
blocked reads, workgroups: 128, num_workers: 4, 0.0413724 seconds | ||
blocked reads, workgroups: 128, num_workers: 8, 0.02386552 seconds | ||
blocked reads, workgroups: 128, num_workers: 16, 0.01572904 seconds | ||
blocked reads, workgroups: 128, num_workers: 32, 0.04104112 seconds | ||
blocked reads, workgroups: 128, num_workers: 64, 0.08430344 seconds | ||
blocked reads, workgroups: 128, num_workers: 128, 0.06656264 seconds | ||
blocked reads, workgroups: 256, num_workers: 4, 0.03564384 seconds | ||
blocked reads, workgroups: 256, num_workers: 8, 0.02051488 seconds | ||
blocked reads, workgroups: 256, num_workers: 16, 0.01278368 seconds | ||
blocked reads, workgroups: 256, num_workers: 32, 0.03704784 seconds | ||
blocked reads, workgroups: 256, num_workers: 64, 0.07087392 seconds | ||
blocked reads, workgroups: 256, num_workers: 128, 0.04985056 seconds | ||
blocked reads, workgroups: 512, num_workers: 4, 0.0354248 seconds | ||
blocked reads, workgroups: 512, num_workers: 8, 0.02118616 seconds | ||
blocked reads, workgroups: 512, num_workers: 16, 0.01358552 seconds | ||
blocked reads, workgroups: 512, num_workers: 32, 0.03686744 seconds | ||
blocked reads, workgroups: 512, num_workers: 64, 0.04923632 seconds | ||
blocked reads, workgroups: 512, num_workers: 128, 0.03931944 seconds | ||
configuration ('coalesced', 64, 128): 0.00275208 seconds |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
Part 1: | ||
Maze 1: Finished after 888 iterations, 450.07784 ms total, 0.506844414414 ms per iteration | ||
Found 2 regions | ||
|
||
Maze 2: Finished after 516 iterations, 258.12848 ms total, 0.500248992248 ms per iteration | ||
Found 35 regions | ||
|
||
Part 2: | ||
Maze 1: Finished after 529 iterations, 262.70208 ms total, 0.496601285444 ms per iteration | ||
Found 2 regions | ||
|
||
Maze 2: Finished after 273 iterations, 137.47456 ms total, 0.50356981685 ms per iteration | ||
Found 35 regions | ||
|
||
Part 3: | ||
Maze 1: Finished after 10 iterations, 4.89648 ms total, 0.489648 ms per iteration | ||
Found 2 regions | ||
|
||
Maze 2: Finished after 9 iterations, 4.50248 ms total, 0.500275555556 ms per iteration | ||
Found 35 regions | ||
|
||
Part 4: | ||
Maze 1: Finished after 19 iterations, 19.07336 ms total, 1.00386105263 ms per iteration | ||
Found 2 regions | ||
|
||
Maze 2: Finished after 17 iterations, 17.0408 ms total, 1.0024 ms per iteration | ||
Found 35 regions | ||
|
||
Discussion: | ||
Despite avoiding repeating global memory reads in the single threaded sequential scan, the single thread workgroup labeling resulted in double the time to label the maze. This is because we are updating the labels in parallel. Empirically, we are bottlenecked by computation. We don't achieve much speedup given we are reducing global memory reads, so reading is not the issue. Thus, for my hardware, single thread is not a good option. | ||
|
||
There are two general factors that can impact the speed. 1) If the pixels are similar, then there would be more speedup in part 4 since there would be fewer updates. 2) If a GPU can has more core and can perform reads and writes much more quickly, then the speedup in part 4 would also be much more. | ||
|
||
Part 5: | ||
Discussion: | ||
Using atomic_min is useful because it treats reading and writing as one task in our OS scheduler. However, it causes serialized reads and writes. The issue with min instead of atomic_min is that we can run into race conditions. One thread can write a value that is not necessarily the min value after another thread has written the min value. As a result, we would significantly decrease our perfomance since it would take longer to label the entire region. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,10 @@ propagate_labels(__global __read_write int *labels, | |
const int lx = get_local_id(0); | ||
const int ly = get_local_id(1); | ||
|
||
// Local workgroup sizes | ||
const int wgx = get_local_size(1); | ||
const int wgy = get_local_size(0); | ||
|
||
// coordinates of the upper left corner of the buffer in image | ||
// space, including halo | ||
const int buf_corner_x = x - lx - halo; | ||
|
@@ -80,20 +84,54 @@ propagate_labels(__global __read_write int *labels, | |
old_label = buffer[buf_y * buf_w + buf_x]; | ||
|
||
// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) | ||
// grab the grandparent label | ||
// if (old_label < w*h) { | ||
// buffer[buf_y * buf_w + buf_x] = labels[old_label]; | ||
// } | ||
// if we are at the top left corner of the work group, we propogate grandparent | ||
// labels across the group in a serialized fashion | ||
if (lx == 0 && ly == 0) { | ||
int prev, curr; | ||
for (int i = 0; i < wgy; i++) { | ||
for (int j = 0; j < wgx; j++) { | ||
int id = (i + halo)*buf_w + (j + halo); | ||
curr = buffer[id]; | ||
|
||
if (curr < w*h) { | ||
if (curr != prev) { | ||
prev = curr; | ||
buffer[id] = labels[curr]; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If curr is equal to prev, you need to update the value of buffer[id] to the value of the grandparent (of prev). |
||
} | ||
} | ||
} | ||
} | ||
|
||
// grandparent labels have been propogated | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
|
||
|
||
// stay in bounds | ||
if ((x < w) && (y < h)) { | ||
// we must check if old_label is less than w*h since the foreground pixels are bounded by | ||
// w*h and we don't want to blend the background pixels with foreground pixels | ||
if ((x < w) && (y < h) && (old_label < w*h)) { | ||
// CODE FOR PART 1 HERE | ||
// We set new_label to the value of old_label, but you will need | ||
// to adjust this for correctness. | ||
new_label = old_label; | ||
// grab the smallest pixel amoung the current and surrounding pixels | ||
new_label = min(old_label, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After grandparent fetching old_label might be different than buffer[buf_y * buf_w + buf_x] |
||
min(buffer[buf_y*buf_w+buf_x+1], | ||
min(buffer[buf_y*buf_w+buf_x-1], | ||
min(buffer[(buf_y-1)*buf_w+buf_x], buffer[(buf_y+1)*buf_w+buf_x])))); | ||
|
||
if (new_label != old_label) { | ||
// CODE FOR PART 3 HERE | ||
// indicate there was a change this iteration. | ||
// multiple threads might write this. | ||
// merging old and new parent with same new_label | ||
atomic_min(&labels[old_label], new_label); | ||
*(changed_flag) += 1; | ||
labels[y * w + x] = new_label; | ||
atomic_min(&labels[y * w + x], new_label); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that prev is not initialized, and so may have an arbitrary value when read later in line 101.