diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
index 5a11c020..1bed0388 100644
--- a/HW3/P2/mandelbrot.cl
+++ b/HW3/P2/mandelbrot.cl
@@ -9,11 +9,24 @@ mandelbrot(__global __read_only float *coords_real,
     const int y = get_global_id(1);
 
     float c_real, c_imag;
-    float z_real, z_imag;
+    float z_real, z_imag, z_real_temp;
     int iter;
 
     if ((x < w) && (y < h)) {
         // YOUR CODE HERE
-        ;
+        iter=0;
+        z_real=coords_real[y * w + x];
+        z_imag=coords_imag[y * w + x];
+        c_real=coords_real[y * w + x];
+        c_imag=coords_imag[y * w + x];
+        //printf("%f",z_real);
+        while (((z_real*z_real+z_imag*z_imag)<4) && (iter<max_iter)) {
+          z_real_temp=z_real*z_real-z_imag*z_imag+c_real;
+          z_imag=2*z_real*z_imag+c_imag;
+          z_real=z_real_temp;
+          iter=iter+1;
+        }
+        out_counts[y * w + x]=iter;
+        //printf("%i",iter);
     }
 }
diff --git a/HW3/P2/mandelbrot.py b/HW3/P2/mandelbrot.py
index 7c197af2..01b2a894 100644
--- a/HW3/P2/mandelbrot.py
+++ b/HW3/P2/mandelbrot.py
@@ -2,7 +2,9 @@
 import pyopencl as cl
 import numpy as np
 import pylab
-
+import os
+import pdb
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 def round_up(global_size, group_size):
     r = global_size % group_size
     if r == 0:
@@ -45,7 +47,7 @@ def make_coords(center=(-0.575 - 0.575j),
 
     # Create a queue for transferring data and launching computations.
     # Turn on profiling to allow us to check event times.
-    queue = cl.CommandQueue(context, context.devices[0],
+    queue = cl.CommandQueue(context, context.devices[1],
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
@@ -55,16 +57,17 @@ def make_coords(center=(-0.575 - 0.575j),
     real_coords = np.real(in_coords).copy()
     imag_coords = np.imag(in_coords).copy()
 
+
     gpu_real = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
     gpu_imag = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
     gpu_counts = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, real_coords.size * 4)
 
     local_size = (8, 8)  # 64 pixels per work group
     global_size = tuple([round_up(g, l) for g, l in zip(in_coords.shape[::-1], local_size)])
+    pdb.set_trace()
     width = np.int32(in_coords.shape[1])
     height = np.int32(in_coords.shape[0])
     max_iters = np.int32(1024)
-
     cl.enqueue_copy(queue, gpu_real, real_coords, is_blocking=False)
     cl.enqueue_copy(queue, gpu_imag, imag_coords, is_blocking=False)
 
@@ -73,7 +76,6 @@ def make_coords(center=(-0.575 - 0.575j),
                                width, height, max_iters)
 
     cl.enqueue_copy(queue, out_counts, gpu_counts, is_blocking=True)
-
     seconds = (event.profile.end - event.profile.start) / 1e9
     print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out_counts.sum() / 1e6, seconds, (out_counts.sum() / seconds) / 1e6))
     pylab.imshow(np.log(out_counts))
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..1ffe140a
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,5 @@
+For my machine, any configurations with >32 workers resulted in an error. It also seems that when the gloabl size>4096, then the computation does not complete correctly in the blocked reads case (I see an assertion error). 
+
+With that in mind, the best performance was achieved with the greatest number of workgroups and workers
+coalesced reads, workgroups: 512, num_workers: 32, 0.013397824 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.03992816 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
index 4fb771d2..e84d7e6d 100644
--- a/HW3/P3/sum.cl
+++ b/HW3/P3/sum.cl
@@ -5,13 +5,17 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
+    int global_id=get_global_id(0); 
+    //float local_size=get_local_size(0);
+    //int lim=(int)log2(local_size);
+    int i,j,offset;
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
+    for (i=global_id;i<N;i=i+get_global_size(0)) { // YOUR CODE HERE
+        sum=sum+x[i]; 
     }
-
+    //printf("thread[%d], sum=%f \n",global_id,sum);
     fast[local_id] = sum;
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -24,8 +28,15 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    j=1;
+    offset=get_local_size(0)>>j;
+    while(offset>1) {
+    //for (j = 1; j <= lim; j++) { // YOUR CODE HERE
+        offset=get_local_size(0)>>j;
+        //printf("global_id[%d], offset=%d\n",global_id,offset);
+        fast[local_id]=fast[local_id]+fast[local_id+offset];   
+        barrier(CLK_LOCAL_MEM_FENCE);
+        j++;
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +49,12 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int global_id = get_global_id(0);
+    //float local_size=get_local_size(0);
+    int k = ceil((float)(N) / get_global_size(0));
+    //int lim=(int)log2(local_size);
+    int i,j,offset;
+
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,13 +64,14 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (i=global_id*k; i <= (k*(global_id+1)-1); i++) { // YOUR CODE HERE
+        sum=sum+x[i]; 
     }
-
+    
+    
     fast[local_id] = sum;
     barrier(CLK_LOCAL_MEM_FENCE);
-
+    //printf("local size=%f, lim=%d \n",local_size,lim);
     // binary reduction
     //
     // thread i should sum fast[i] and fast[i + offset] and store back
@@ -64,9 +81,18 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    j=1;
+    offset=get_local_size(0)>>j;
+    while(offset>1) {
+    //for (j = 1; j <= lim; j++) { // YOUR CODE HERE
+        offset=get_local_size(0)>>j;
+        //printf("global_id[%d], offset=%d\n",global_id,offset);
+        fast[local_id]=fast[local_id]+fast[local_id+offset];   
+        barrier(CLK_LOCAL_MEM_FENCE);
+        j++;
     }
 
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+    if (local_id == 0) {
+        partial[get_group_id(0)] = fast[0];
+    }
 }
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
index a0d56da2..5df8bc50 100644
--- a/HW3/P3/tune.py
+++ b/HW3/P3/tune.py
@@ -1,6 +1,8 @@
 import pyopencl as cl
 import numpy as np
-
+import pdb
+import os
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 def create_data(N):
     return host_x, x
 
@@ -13,7 +15,7 @@ def create_data(N):
         print("#{0}: {1} on {2}".format(i, d.name, d.platform.name))
     ctx = cl.Context(devices)
 
-    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    queue = cl.CommandQueue(ctx, ctx.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     program = cl.Program(ctx, open('sum.cl').read()).build(options='')
 
@@ -25,8 +27,9 @@ def create_data(N):
     for num_workgroups in 2 ** np.arange(3, 10):
         partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
-        for num_workers in 2 ** np.arange(2, 8):
+        for num_workers in 2 ** np.arange(2, 6):
             local = cl.LocalMemory(num_workers * 4)
+            #pdb.set_trace()
             event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                                           x, partial_sums, local, np.uint64(N))
             cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
@@ -34,6 +37,7 @@ def create_data(N):
             sum_gpu = sum(host_partial)
             sum_host = sum(host_x)
             seconds = (event.profile.end - event.profile.start) / 1e9
+            #pdb.set_trace()
             assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
             times['coalesced', num_workgroups, num_workers] = seconds
             print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
@@ -42,15 +46,18 @@ def create_data(N):
     for num_workgroups in 2 ** np.arange(3, 10):
         partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
-        for num_workers in 2 ** np.arange(2, 8):
+        for num_workers in 2 ** np.arange(2, 6):
+            print("global_size={}".format(num_workers*num_workgroups))
             local = cl.LocalMemory(num_workers * 4)
             event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,),
                                         x, partial_sums, local, np.uint64(N))
+            #pdb.set_trace()
             cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
 
             sum_gpu = sum(host_partial)
             sum_host = sum(host_x)
             seconds = (event.profile.end - event.profile.start) / 1e9
+            
             assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
             times['blocked', num_workgroups, num_workers] = seconds
             print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
index 07bb294c..f126738c 100644
--- a/HW3/P4/median_filter.cl
+++ b/HW3/P4/median_filter.cl
@@ -1,4 +1,76 @@
-#include "median9.h"
+//#include "median9.h"
+
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) < (b)) ? (b) : (a))
+
+#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp
+
+inline float median9(float s0, float s1, float s2,
+                     float s3, float s4, float s5,
+                     float s6, float s7, float s8)
+{
+    // http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html
+    float tmp;
+        
+    cas(s1, s2);
+    cas(s4, s5);
+    cas(s7, s8);
+
+    cas(s0, s1);
+    cas(s3, s4);
+    cas(s6, s7);
+
+    cas(s1, s2);
+    cas(s4, s5);
+    cas(s7, s8);
+
+    cas(s3, s6);
+    cas(s4, s7);
+    cas(s5, s8);
+    cas(s0, s3);
+
+    cas(s1, s4);
+    cas(s2, s5);
+    cas(s3, s6);
+
+    cas(s4, s7);
+    cas(s1, s3);
+
+    cas(s2, s6);
+
+    cas(s2, s3);
+    cas(s4, s6);
+
+    cas(s3, s4);
+
+    return s4;
+}
+
+//Return image value at point (x,y), where the indicies are relative to the global image. If the indicies are out of bounds, choose closest in bounds value instead. 
+inline float fetch_point(__global __read_only float *img, 
+                   int w, int h,
+                   int x, int y)
+{
+    float out_img;
+    while (x<0) {
+        x++;
+    }
+    while (x>w-1){
+        x--;
+    }
+    while (y<0) {
+        y++;
+    }
+    while (y>h-1){
+        y--;
+    }
+    out_img=img[w*y+x];
+
+    return out_img;
+
+}
+    
 
 // 3x3 median filter
 __kernel void
@@ -13,7 +85,27 @@ median_3x3(__global __read_only float *in_values,
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have that working.
 
+    //store calculated median here before transferring 
+    float out_buffer; 
+    
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    if (x < w && y < h){
     // Load into buffer (with 1-pixel halo).
     //
     // It may be helpful to consult HW3 Problem 5, and
@@ -21,14 +113,38 @@ median_3x3(__global __read_only float *in_values,
     //
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
+        if (idx_1D < buf_w) {
+            for (row = 0; row < buf_h; row++) {
+                buffer[row * buf_w + idx_1D] = \
+                    fetch_point(in_values, w, h,
+                          buf_corner_x + idx_1D,
+                          buf_corner_y + row);
+          }
+        }
 
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-    // Compute 3x3 median for each pixel in core (non-halo) pixels
-    //
-    // We've given you median9.h, and included it above, so you can
-    // use the median9() function.
+        // Compute 3x3 median for each pixel in core (non-halo) pixels
+        //
+        // We've given you median9.h, and included it above, so you can
+        // use the median9() function.
+
+        //core point is buffer[(ly+halo)*buf_w+(lx+halo)]. For example, for (lx,ly)=(0,0), their value in the buffer should be (1,1). In the variables below, bc stands for "buffer core."
+        int bc_x=lx+halo;
+        int bc_y=ly+halo;
+        out_buffer = median9(buffer[(bc_y-1)*buf_w+bc_x-1],
+                             buffer[(bc_y-1)*buf_w+bc_x],
+                             buffer[(bc_y-1)*buf_w+bc_x+1],
+                             buffer[(bc_y)*buf_w+bc_x-1],
+                             buffer[(bc_y)*buf_w+bc_x],
+                             buffer[(bc_y)*buf_w+bc_x+1],
+                             buffer[(bc_y+1)*buf_w+bc_x-1],
+                             buffer[(bc_y+1)*buf_w+bc_x],
+                             buffer[(bc_y+1)*buf_w+bc_x+1]);
 
 
+        out_values[y*w+x]=out_buffer;
+  }
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
 }
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
index 1eda1bb9..ad2e0339 100644
--- a/HW3/P4/median_filter.py
+++ b/HW3/P4/median_filter.py
@@ -1,9 +1,10 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
+#import imread
 import pylab
-
+# import os
+# os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 def round_up(global_size, group_size):
     r = global_size % group_size
     if r == 0:
@@ -47,13 +48,18 @@ def numpy_median(image, iterations=10):
 
     # Create a queue for transferring data and launching computations.
     # Turn on profiling to allow us to check event times.
-    queue = cl.CommandQueue(context, context.devices[0],
+    queue = cl.CommandQueue(context, context.devices[1],
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
     program = cl.Program(context, open('median_filter.cl').read()).build(options='')
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
+    pylab.gray()
+
+    pylab.imshow(host_image)
+    pylab.title('original image')
+
     host_image_filtered = np.zeros_like(host_image)
 
     gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
@@ -74,17 +80,24 @@ def numpy_median(image, iterations=10):
 
     # Send image to the device, non-blocking
     cl.enqueue_copy(queue, gpu_image_a, host_image, is_blocking=False)
-
+    #seconds=[]
     num_iters = 10
     for iter in range(num_iters):
-        program.median_3x3(queue, global_size, local_size,
+        event=program.median_3x3(queue, global_size, local_size,
                            gpu_image_a, gpu_image_b, local_memory,
                            width, height,
                            buf_width, buf_height, halo)
-
+     #   seconds.append((event.profile.end - event.profile.start) / 1e9)
         # swap filtering direction
         gpu_image_a, gpu_image_b = gpu_image_b, gpu_image_a
 
     cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True)
-
+    seconds=(event.profile.end - event.profile.start) / 1e9
+    pylab.figure()
+    pylab.imshow(host_image_filtered)
+    pylab.title('after - zoom')
     assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters))
+    
+    print("took {} seconds".format(seconds))
+
+    pylab.show()
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..fb5f963e
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,38 @@
+Part 1:
+Maze1:
+Finished after 865 iterations, 350.717984 ms total, 0.405454316763 ms per iteration
+Found 2 regions
+Maze2:
+Finished after 492 iterations, 212.373632 ms total, 0.431653723577 ms per iteration
+Found 35 regions
+
+Part 2:
+Maze1:
+Finished after 520 iterations, 236.204736 ms total, 0.454239876923 ms per iteration
+Found 2 regions
+Maze2:
+Finished after 269 iterations, 150.651488 ms total, 0.56004270632 ms per iteration
+Found 35 regions
+
+Part 3:
+Maze1:
+Finished after 10 iterations, 6.055488 ms total, 0.6055488 ms per iteration
+Found 2 regions
+Maze2:
+Finished after 9 iterations, 5.425952 ms total, 0.602883555556 ms per iteration
+Found 35 regions
+
+Part 4:
+Maze1:
+Finished after 10 iterations, 18.7112 ms total, 1.87112 ms per iteration
+Found 2 regions
+Maze2:
+Finished after 9 iterations, 16.863136 ms total, 1.87368177778 ms per iteration
+Found 35 regions
+
+
+
+I see that by serializing the grandparenting, the computation takes one fewer iteration to complete, but takes > 2x longer per iteration. It appears that the decreased number of reads to global memory does not balance out the increased serialization. Perhaps if my GPU had a greater number of cores the serialization wouldn't be as noticible. Additionally this would potentially be a better idea if there was some way to begin iterating over parts of the workgroup that had already been grandparented -- since the other threads in the workgroup are just waiting. 
+
+Part 5: 
+In this case, atomic operations prevent race conditions -- access to specific memory locations by multiple threads at once. Using min() instead of atomic_min() still results in the correct answer, and results in a slightly shorter time per iteration, but requires more iterations to complete, since values can get updated redundantly. I found that, on my machine, the difference between using min() and atomic_min() is negligible. 
\ No newline at end of file
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
index 78b986b3..f0f99e02 100644
--- a/HW3/P5/label_regions.cl
+++ b/HW3/P5/label_regions.cl
@@ -27,6 +27,28 @@ get_clamped_value(__global __read_only int *labels,
     return labels[y * w + x];
 }
 
+int 
+find_min(__local int *buffer,
+         int buf_x, int buf_y, 
+         int buf_w,
+         int w, int h){
+    int i,j;
+    //set min_val intially to center value
+    int min_val=buffer[buf_y * buf_w + buf_x];
+    // if not a wall point
+    if (min_val!=w*h){
+        for (i = -1; i <= 1; i++){
+            for (j = -1; j <= 1; j++){
+                //only check on off-diagonals, not corners or center point, or maze walls (=0).
+                if  (i!=j){
+                    min_val=min(min_val,buffer[(buf_y+j)*buf_w+(buf_x+i)]);
+                }
+            }
+        }
+    }
+    return min_val;
+}
+
 __kernel void
 propagate_labels(__global __read_write int *labels,
                  __global __write_only int *changed_flag,
@@ -80,20 +102,60 @@ propagate_labels(__global __read_write int *labels,
     old_label = buffer[buf_y * buf_w + buf_x];
 
     // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
-    
+
+    //part 4
+    //if current thread is first in workgroup
+/*    if (lx==0 && ly==0){
+        int current,idx, row, col;;
+        int previous=w*h;
+        for (row=halo; row< (buf_h-halo); row++){
+            for (col=halo; col< (buf_w-halo);col++){
+                idx=row*buf_w+col;
+
+                //not a wall
+                if (buffer[idx]!=h*w){
+                    //don't look into global memory if not necessary
+                    if (previous == buffer[idx]){
+                        buffer[idx]=current;
+                    }
+                    //otherwise, do grandparenting
+                    else{
+                        previous = buffer[idx];
+                        buffer[idx]=labels[buffer[idx]];
+                        current=buffer[idx];
+                    }
+                }
+            }
+        }   
+    }*/
+    //Part 2
+    //if current pixel is not a wall
+    if (buffer[buf_y * buf_w + buf_x]!=h*w){
+
+        buffer[buf_y * buf_w + buf_x]=labels[buffer[buf_y * buf_w + buf_x]];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
     // stay in bounds
     if ((x < w) && (y < h)) {
         // CODE FOR PART 1 HERE
         // We set new_label to the value of old_label, but you will need
         // to adjust this for correctness.
-        new_label = old_label;
+        new_label = find_min(buffer, 
+                             buf_x, buf_y, 
+                             buf_w,
+                             w, h);
 
         if (new_label != old_label) {
             // CODE FOR PART 3 HERE
+            atomic_min(&labels[old_label],new_label);
+            atomic_min(&labels[y * w + x],new_label);
             // indicate there was a change this iteration.
             // multiple threads might write this.
             *(changed_flag) += 1;
-            labels[y * w + x] = new_label;
+            /*labels[old_label]=min(labels[old_label],new_label);
+            labels[y * w + x] = new_label;*/
         }
     }
 }
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
index c6ce60cb..9ae071b8 100644
--- a/HW3/P5/label_regions.py
+++ b/HW3/P5/label_regions.py
@@ -3,7 +3,9 @@
 import pyopencl as cl
 import numpy as np
 import pylab
-
+import pdb
+import os
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 def round_up(global_size, group_size):
     r = global_size % group_size
     if r == 0:
@@ -36,13 +38,13 @@ def round_up(global_size, group_size):
 
     # Create a queue for transferring data and launching computations.
     # Turn on profiling to allow us to check event times.
-    queue = cl.CommandQueue(context, context.devices[0],
+    queue = cl.CommandQueue(context, context.devices[1],
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
     program = cl.Program(context, open('label_regions.cl').read()).build(options='')
 
-    host_image = np.load('maze1.npy')
+    host_image = np.load('maze2.npy')
     host_labels = np.empty_like(host_image)
     host_done_flag = np.zeros(1).astype(np.int32)
 
@@ -103,6 +105,7 @@ def round_up(global_size, group_size):
             break
         # there were changes, so continue running
         print host_done_flag
+
         if itercount % 100 == 0 and show_progress:
             cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
             pylab.imshow(host_labels)