harvard-cs205 · haosutang · Nov 3, 2015 · Nov 5, 2015 · Nov 5, 2015 · Nov 16, 2015
diff --git a/.git.bfg-report/2015-11-03/11-37-30/cache-stats.txt b/.git.bfg-report/2015-11-03/11-37-30/cache-stats.txt
@@ -0,0 +1,4 @@
+(apply,CacheStats{hitCount=112, missCount=70, loadSuccessCount=70, loadExceptionCount=0, totalLoadTime=185199690, evictionCount=0})
+(tree,CacheStats{hitCount=156, missCount=125, loadSuccessCount=120, loadExceptionCount=0, totalLoadTime=209773886, evictionCount=0})
+(commit,CacheStats{hitCount=36, missCount=36, loadSuccessCount=36, loadExceptionCount=0, totalLoadTime=120618566, evictionCount=0})
+(tag,CacheStats{hitCount=0, missCount=0, loadSuccessCount=0, loadExceptionCount=0, totalLoadTime=0, evictionCount=0})
diff --git a/.git.bfg-report/2015-11-03/11-37-30/deleted-files.txt b/.git.bfg-report/2015-11-03/11-37-30/deleted-files.txt
@@ -0,0 +1 @@
+2c8c249ccb705d86464062748879e243832d8748 176626884 image.npz
diff --git a/.git.bfg-report/2015-11-03/11-37-30/object-id-map.old-new.txt b/.git.bfg-report/2015-11-03/11-37-30/object-id-map.old-new.txt
@@ -0,0 +1,27 @@
+091da4a92561434ea3f83e168d774dbc823a71cd afdfad4da10a213564adc019c73daeb080c290a0
+0caec684df270136f8994b7a440b8887b021cdb1 0dd0f85e4a361c1f8fd3b7891e7f0ff00ad0dd12
+0df4ba3d1ed8a6d000c73824771848d548811852 d57f19df041e727a091b36f620749c4e90825222
+132c45fec95b1f06e4b1dacc4d038924df13e3fa 69cdd1004db36b05ba68e5c84e2b431398c9c5c4
+296463fa9ca352573ad27c9f0a62e655e2879c1c d73744ccd976030a9c5747f55e4dce4dc01df1e0
+2b503b3cc9a6af6b025068d5762da9f94c2bcc35 f4d29581c2968b5600a123c46cd4fc80d1455dd9
+2fefccc72fbdc602a98e6a8863a7cdf6fbddf486 cf408e405f649226421180edf98460af7b789476
+46fde603928535d91b6c20d7493143f182a9184f a67a3b65c2c256336da432066e3d545f7715915f
+4bb1c42e82f65f2a7e11e9962aa440b593d21591 c9deb5a4f6ea76943f3402e792201c6c0eb1b35a
+578aebef8837a52a9f3e5e2a1067e36853aa595f eb74eae1d92ca72455a4a7350800dbbcae27338d
+5bfee183d6a3b393f000f3da7120f84b3e4d4453 dcd8b9481bea25b9fb0f1cf7688dc296bfd04459
+5c983813b314e1fac2acbcb9cd39a9970a1d9e4b 7b1e7fc39ea621bf61b13cdd66583e2f10bc4be0
+5d5e3d32301a8c0c9e27639a660b08ddf64a5789 99b4dc236c9d8af19db510a00a369ad6782abb99
+6d98fe5bb47c0eae6314f4f989dfeb75be9f0e66 3c948f6c78933d19cd8f057b5bb60cac4e23a19f
+748b9880bc9e50c870c346f232e10516ce64ce2e 40bde769b599c15cce591f27adf38d1dba12901e
+7e5250aa6b8adaaa9ce0b540b7078d786c7cefdd 8892eaa24bd6e1c45d71db9a2eb14ffba27534df
+99f98ab588d3b23bac83f48f60f9409c9ca212fc 4db29f94190931a9a5ba5d33739a72dac8d831cf
+9b33a4f616f08d2ac4287b276356359b14007995 606b3fd3348545e369b86adbd9174c49bd3d3e45
+9d4d1e802c5a532bf35d08b8da3453961b254839 f449d7d26ed71c06375cd37b101a74fb97ba3fbf
+a24b78d0e8f0ab353fb87b587ec19a0b068b7747 8ea78ff263a3e5d33e63c5d844a76f4759f97987
+c6561bc15047eb9a747df5191dca6fd82d5ef1a5 0f838e1096934173d1d32381fefc268f7aecebb0
+cc57ed6753b50abcca1fc445d6463afba82e1649 eaeab959cb57844deade137f9a8af0b15c501e47
+dfe20862f1d2c717304f1d954e08f8c66bfd75a2 2b4100f071b42206517dec46c3164fd87119d357
+e7b96f1a622b61522e20ba3dedea59db270620a6 8100ff1fc52e7d3fed6a970266048e9453bf8e78
+eb2cc6604a44f567dc5e8f938113a4afe085aa48 a665be5a65b8d8ed1b957bc07f53603f7aeda456
+f25e969fe1ebbc63a2dca79e220b524cb570a0f0 3535c11c926356a8874f6446bbd72a718727439b
+f75fc3591a08d64048d6c3d5d379a69cf45c69df 7cbba8ca1361e48e1502ee09bb1275c8c076c255
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 *~
 *.pyc
 *.pyxbldc
+*.npz
diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
@@ -10,10 +10,21 @@ mandelbrot(__global __read_only float *coords_real,
 
     float c_real, c_imag;
     float z_real, z_imag;
+    float temp;
     int iter;
 
     if ((x < w) && (y < h)) {
-        // YOUR CODE HERE
-        ;
+        z_real = 0.;
+        c_real = coords_real[y*w+x];
+        z_imag = 0.;
+        c_imag = coords_imag[y*w+x];
+        iter = 0;
+        while(((z_real*z_real+z_imag*z_imag) < 4) && (iter < max_iter)){
+            temp = z_real*z_real -z_imag*z_imag+c_real;
+            z_imag = 2*z_real*z_imag+c_imag;
+            z_real = temp;
+            iter ++;
+        }
+        out_counts[y*w+x] = iter;
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
@@ -0,0 +1,91 @@
+Best configuration:
+configuration ('coalesced', 128, 128): 0.0022836 seconds
+
+----------------------
+#0: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz on Apple
+#1: Iris on Apple
+coalesced reads, workgroups: 8, num_workers: 4, 0.13820736 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.07522864 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.03894968 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.01939744 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.01012384 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.00510888 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.07443104 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.0354476 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.01942088 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.0095256 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00377312 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.0030688 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.03869232 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.01968616 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.00996952 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.0052508 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.002854 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00294048 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.019068 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.0098072 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.00523128 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00287544 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.00305128 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.002908 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.01902536 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.00996336 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.00519712 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.0028336 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00265088 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.0022836 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.01938496 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.01029512 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.00518696 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00288824 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.00251904 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.0024184 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.02045848 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.0104252 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.00544816 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00294584 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00273072 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.00257088 seconds
+blocked reads, workgroups: 8, num_workers: 4, 0.21957688 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.13403528 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.08049544 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.04249248 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.01939344 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.01121992 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.12281136 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.0711032 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.040288 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.01929832 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.01115656 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.02065632 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.06618664 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.03625648 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.01941776 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.0112864 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.020172 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.05790488 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.03469184 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.018126 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.01120904 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.02006592 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.0561036 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.06538024 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.034544 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.01909208 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01207088 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.02292568 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.06212224 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.04734296 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.0348528 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.01931432 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.0112232 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.02347664 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.04581624 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.03153152 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.03445376 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.01904256 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.01207264 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.02433416 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.03053264 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.02399624 seconds
+configuration ('coalesced', 128, 128): 0.0022836 seconds
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
@@ -8,8 +8,8 @@ __kernel void sum_coalesced(__global float* x,
 
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
+    for (uint i=get_global_id(0); i<N; i = i+get_global_size(0)) { // YOUR CODE HERE
+        sum = sum + x[i]; // YOUR CODE HERE 
     }
 
     fast[local_id] = sum;
@@ -24,8 +24,13 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    uint gs = get_local_size(0);
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id < s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +43,7 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +53,9 @@ __kernel void sum_blocked(__global float* x,
     // 
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (uint i=k*get_global_id(0);i<k*(get_global_id(0)+1);i++) { // YOUR CODE HERE
+        if(i<N)
+            sum += x[i]; // YOUR CODE HERE
     }
 
     fast[local_id] = sum;
@@ -64,8 +70,13 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+
+    uint gs = get_local_size(0);
+    for(uint s = gs/2; s > 0; s >>= 1) {
+        if(local_id < s) {
+          fast[local_id] += fast[local_id+s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];

diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
     times = {}
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
                   format(num_workgroups, num_workers, seconds))
 
     for num_workgroups in 2 ** np.arange(3, 10):
-        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
         host_partial = np.empty(num_workgroups).astype(np.float32)
         for num_workers in 2 ** np.arange(2, 8):
             local = cl.LocalMemory(num_workers * 4)

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
@@ -31,4 +31,56 @@ median_3x3(__global __read_only float *in_values,
 
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+    int row;
+
+    if (idx_1D < buf_w)
+        for (row = 0; row < buf_h; row++) {
+
+            // Handle boundary case, use the closest pixel
+            int this_x = buf_corner_x + idx_1D;
+            int this_y = buf_corner_y + row;
+            if(this_x>=w) this_x = w-1;
+            else if(this_x<0) this_x = 0;
+            if(this_y>=h) this_y = h-1;
+            else if(this_y<0) this_y = 0;
+            buffer[row * buf_w + idx_1D] = in_values[this_y * w + this_x];
+        }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // write output
+    if((y<h) && (x<w))  //stay in bound
+      //Apply median filter within buffer
+      out_values[y*w+x] = median9(buffer[buf_x-1+(buf_y-1)*buf_w],
+                                  buffer[buf_x+(buf_y-1)*buf_w],
+                                  buffer[buf_x+1+(buf_y-1)*buf_w],
+                                  buffer[buf_x-1+buf_y*buf_w],
+                                  buffer[buf_x+buf_y*buf_w],
+                                  buffer[buf_x+1+buf_y*buf_w],
+                                  buffer[buf_x-1+(buf_y+1)*buf_w],
+                                  buffer[buf_x+(buf_y+1)*buf_w],
+                                  buffer[buf_x+1+(buf_y+1)*buf_w]);
+
+
 }
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
 from __future__ import division
 import pyopencl as cl
 import numpy as np
-import imread
 import pylab
+import os.path
 
 def round_up(global_size, group_size):
     r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
                             properties=cl.command_queue_properties.PROFILING_ENABLE)
     print 'The queue is using the device:', queue.device.name
 
-    program = cl.Program(context, open('median_filter.cl').read()).build(options='')
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
 
     host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
     host_image_filtered = np.zeros_like(host_image)

diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
@@ -0,0 +1,47 @@
+Part 1. implement updates from neighbors
+[Maze1]
+Finished after 880 iterations, 277.56768 ms total, 0.315417818182 ms per iteration
+Found 2 regions
+
+[Maze2]
+Finished after 516 iterations, 162.2668 ms total, 0.314470542636 ms per iteration
+Found 35 regions
+
+--------------------
+Part 2. fetch grandparents
+[Maze1]
+Finished after 529 iterations, 166.75776 ms total, 0.315232060491 ms per iteration
+Found 2 regions
+
+[Maze2]
+Finished after 273 iterations, 85.42904 ms total, 0.312926886447 ms per iteration
+Found 35 regions
+
+--------------------
+Part 3. merge parent regions
+[Maze1]
+Finished after 10 iterations, 3.08848 ms total, 0.308848 ms per iteration
+Found 2 regions
+
+[Maze2]
+Finished after 9 iterations, 2.77736 ms total, 0.308595555556 ms per iteration
+Found 35 regions
+
+--------------------
+Part 4. efficient grandparents
+[Maze1]
+Finished after 884 iterations, 273.95088 ms total, 0.30989918552 ms per iteration
+Found 2 regions
+
+[Maze2]
+Finished after 517 iterations, 161.49896 ms total, 0.312377098646 ms per iteration
+Found 35 regions
+
+With efficient grandparents, the program runs about 60% slower than in Part 2, suggesting that serializing the grandparent lookups to avoid redundant memory reads does not pay off the way we expected. A possible reason is that memory reads are not that expensive compared with other operations such as assignments, so the running time is not bound by memory reads. In other words, the time saved on memory reads is trivial compared with the time added by serialized label propagation. Using a single thread to loop over all elements in the local group gives up the advantages of parallel propagation, and therefore results in more iterations. The per-iteration speed is now bound by that single thread (in my program, the first one), but this effect is not significant in this case.
+
+This is the case on my laptop with an Intel Iris (1536 MB). Things might be different on other graphics cards where memory reads are really slow; there, this kind of serialization would help.
+
+--------------------
+Part 5. no atomic operations
+
+The advantage of using an atomic operation is that it performs the comparison and the write in one step, and therefore serializes memory access and avoids write conflicts. Using min() does not have this effect: we cannot be sure which thread writes the label last when several threads access the same element at the same time. However, labeling is not a one-time process: it takes multiple iterations, propagating the labels until the change flag is no longer set. Therefore, using min() will not affect the final labeled result, but it may take slightly more iterations.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		2c8c249ccb705d86464062748879e243832d8748 176626884 image.npz