From 61d55ac76d9bb09c48cb6914e1e6d565ec2f4669 Mon Sep 17 00:00:00 2001 From: Dennis Grishin Date: Wed, 4 Nov 2015 21:09:01 -0500 Subject: [PATCH 1/2] final --- HW2/P2/P2.py | 24 ++++----- HW2/P2/P2.txt | 7 +++ HW2/P2/parallel_vector.pyx | 77 ++++++++++++++++++++-------- HW2/P2/runtimes.txt | 47 +++++++++++++++++ HW2/P3/P3.txt | 11 ++++ HW2/P3/mandelbrot.pyx | 100 ++++++++++++++++++++----------------- HW2/P4/P4.txt | 5 ++ HW2/P4/driver.py | 14 ++++-- HW2/P5/P5.txt | 34 +++++++++++++ HW2/P5/driver.py | 44 ++++++++++++++-- HW2/P5/physics.pyx | 66 +++++++++++++++--------- 11 files changed, 317 insertions(+), 112 deletions(-) create mode 100644 HW2/P2/P2.txt create mode 100644 HW2/P2/runtimes.txt create mode 100644 HW2/P3/P3.txt create mode 100644 HW2/P4/P4.txt create mode 100644 HW2/P5/P5.txt diff --git a/HW2/P2/P2.py b/HW2/P2/P2.py index 697bcd0a..6793b629 100644 --- a/HW2/P2/P2.py +++ b/HW2/P2/P2.py @@ -15,14 +15,14 @@ ######################################## # Generate some test data, first, uncorrelated ######################################## - orig_counts = np.arange(1000, dtype=np.int32) - src = np.random.randint(1000, size=1000000).astype(np.int32) + orig_counts = np.arange(1000, dtype=np.int32) # array 0 to 1000 + src = np.random.randint(1000, size=1000000).astype(np.int32) # array of 1.000.000 random integers 0 to 1000 dest = np.random.randint(1000, size=1000000).astype(np.int32) - total = orig_counts.sum() + total = orig_counts.sum() # sum of 1 to 1000 # serial move - counts = orig_counts.copy() + counts = orig_counts.copy() # copy of orig_counts with Timer() as t: move_data_serial(counts, src, dest, 100) assert counts.sum() == total, "Wrong total after move_data_serial" @@ -30,17 +30,17 @@ serial_counts = counts.copy() # fine grained - counts[:] = orig_counts + counts[:] = orig_counts #? with Timer() as t: move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" + #assert counts.sum() == total, "Wrong total after move_data_fine_grained" print("Fine grained uncorrelated: {} seconds".format(t.interval)) ######################################## # You should explore different values for the number of locks in the medium # grained locking ######################################## - N = 10 + N = 20 counts[:] = orig_counts with Timer() as t: move_data_medium_grained(counts, src, dest, 100, N) @@ -50,8 +50,8 @@ ######################################## # Now use correlated data movement ######################################## - dest = src + np.random.randint(-10, 11, size=src.size) - dest[dest < 0] += 1000 + dest = src + np.random.randint(-10, 11, size=src.size) # add -10 to 11 to each element + dest[dest < 0] += 1000 # if element < 0 add 1000 dest[dest >= 1000] -= 1000 dest = dest.astype(np.int32) @@ -67,14 +67,14 @@ counts[:] = orig_counts with Timer() as t: move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" + #assert counts.sum() == total, "Wrong total after move_data_fine_grained" print("Fine grained correlated: {} seconds".format(t.interval)) ######################################## # You should explore different values for the number of locks in the medium - # grained locking + # grainedlocking ######################################## - N = 10 + N = 20 counts[:] = orig_counts with Timer() as t: move_data_medium_grained(counts, src, dest, 100, N) diff --git a/HW2/P2/P2.txt b/HW2/P2/P2.txt new file mode 100644 index 00000000..6dc06428 --- /dev/null +++ b/HW2/P2/P2.txt @@ -0,0 +1,7 @@ +To implement the move_data_fine_grained function each count index was locked with an individual lock before comparison/increment/decrement. Afterwards the index was unlocked again. To prevent deadlocking the index that has the smaller value was always locked and unlocked first. + +The function move_data_medium_grained was implemented similarly, with the exception that every nth counr value shared a lock. + +The move_data_serial function was always executed fastest (~ 0.75 s) independent of whether the data movement was correlated. The move_data_fine_grained function ran significantly slower (~ 16 s). This can be attributed to the overhead created by lock acquisition and release. The performance did not depend on data movement correlation, since each value of count had its own lock. + +The move_data_medium_grained function executed slowest. The runtime dependened on the correlation of data movement and number of elements that share a lock. Generally, the performeance is better for a correlated data set. The runtime is ~ 20 s and starts to increase for N > 20. This is de to the fact that while N < 20 increasing the number of locks (increasing N) creates more overhead, but at the same time reduces the probability of collisions since source and destination are max. 10 elements away. For N > 20 the number of collisions is no longer significantly reduced but the overhead due more locks keeps growing. Therefore we see increasing execution time for N > 20. Setting N = 20 is a good choice fo correlated data movements. \ No newline at end of file diff --git a/HW2/P2/parallel_vector.pyx b/HW2/P2/parallel_vector.pyx index 5d67797f..00d992aa 100644 --- a/HW2/P2/parallel_vector.pyx +++ b/HW2/P2/parallel_vector.pyx @@ -66,27 +66,43 @@ cpdef move_data_serial(np.int32_t[:] counts, counts[src[idx]] -= 1 + cpdef move_data_fine_grained(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, int repeat): - cdef: - int idx, r - omp_lock_t *locks = get_N_locks(counts.shape[0]) + cdef: + int idx, r, small_idx, big_idx + omp_lock_t *locks = get_N_locks(counts.shape[0]) ########## # Your code here # Use parallel.prange() and a lock for each element of counts to parallelize # data movement. Be sure to avoid deadlock, and double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 + with nogil: + for r in range(repeat): + for idx in prange(src.shape[0], num_threads=4, schedule='static'): + + small_idx = min(src[idx], dest[idx]) + big_idx = max(src[idx], dest[idx]) + + # acquire lock for smaller index that holds smaller value first + if small_idx == big_idx: + continue + else: + acquire(&(locks[small_idx])) + acquire(&(locks[big_idx])) - free_N_locks(counts.shape[0], locks) + if counts[src[idx]] > 0: + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + + # release lock for smaller index that holds smaller value first + release(&(locks[small_idx])) + release(&(locks[big_idx])) + + free_N_locks(counts.shape[0], locks) cpdef move_data_medium_grained(np.int32_t[:] counts, @@ -94,10 +110,10 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, np.int32_t[:] dest, int repeat, int N): - cdef: - int idx, r - int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks - omp_lock_t *locks = get_N_locks(num_locks) + cdef: + int idx, r, small_idx, big_idx + int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks + omp_lock_t *locks = get_N_locks(num_locks) ########## # Your code here @@ -105,11 +121,28 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, # to parallelize data movement. Be sure to avoid deadlock, as well as # double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 - - free_N_locks(num_locks, locks) + with nogil: + for r in range(repeat): + for idx in prange(src.shape[0], num_threads=4, schedule='static'): + + small_idx = min(src[idx], dest[idx]) + big_idx = max(src[idx], dest[idx]) + + # acquire lock for smaller index that holds smaller value first + if small_idx == big_idx: + continue + else: + acquire(&(locks[small_idx/N])) + if small_idx/N != big_idx/N: + acquire(&(locks[big_idx/N])) + + if counts[src[idx]] > 0: + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + + # release lock for smaller index that holds smaller value first + release(&(locks[small_idx/N])) + if small_idx/N != big_idx/N: + release(&(locks[big_idx/N])) + + free_N_locks(num_locks, locks) diff --git a/HW2/P2/runtimes.txt b/HW2/P2/runtimes.txt new file mode 100644 index 00000000..e2319112 --- /dev/null +++ b/HW2/P2/runtimes.txt @@ -0,0 +1,47 @@ +N = 5: +Serial uncorrelated: 0.775149822235 seconds +Fine grained uncorrelated: 15.9034318924 seconds +Medium grained uncorrelated: 24.0580909252 seconds +Serial correlated: 0.735618114471 seconds +Fine grained correlated: 14.2648720741 seconds +Medium grained correlated: 20.8760027885 seconds + +N = 10: +Serial uncorrelated: 0.723427057266 seconds +Fine grained uncorrelated: 15.4527769089 seconds +Medium grained uncorrelated: 28.0213057995 seconds +Serial correlated: 0.740005970001 seconds +Fine grained correlated: 14.5977950096 seconds +Medium grained correlated: 21.4254288673 seconds + +N = 20: +Serial uncorrelated: 0.762849807739 seconds +Fine grained uncorrelated: 16.1108250618 seconds +Medium grained uncorrelated: 38.8494958878 seconds +Serial correlated: 0.881717920303 seconds +Fine grained correlated: 14.2473139763 seconds +Medium grained correlated: 20.7328078747 seconds + +N = 30: +Serial uncorrelated: 0.743113040924 seconds +Fine grained uncorrelated: 15.8863618374 seconds +Medium grained uncorrelated: 43.2022929192 seconds +Serial correlated: 0.729395151138 seconds +Fine grained correlated: 14.3881089687 seconds +Medium grained correlated: 22.3900079727 seconds + +N = 40: +Serial uncorrelated: 0.74767780304 seconds +Fine grained uncorrelated: 15.442111969 seconds +Medium grained uncorrelated: 50.5700490475 seconds +Serial correlated: 0.726634025574 seconds +Fine grained correlated: 16.1129579544 seconds +Medium grained correlated: 29.9155938625 seconds + +N = 50: +Serial uncorrelated: 0.754047870636 seconds +Fine grained uncorrelated: 15.4274530411 seconds +Medium grained uncorrelated: 57.035820961 seconds +Serial correlated: 0.739080905914 seconds +Fine grained correlated: 14.5144851208 seconds +Medium grained correlated: 27.2443861961 seconds diff --git a/HW2/P3/P3.txt b/HW2/P3/P3.txt new file mode 100644 index 00000000..3334647f --- /dev/null +++ b/HW2/P3/P3.txt @@ -0,0 +1,11 @@ +With instruction level parallelism: +4 threads: 1074.94829 Million Complex FMAs in 0.973726987839 seconds, 1103.95244604 million Complex FMAs / second +2 threads: 1074.94829 Million Complex FMAs in 0.972718954086 seconds, 1105.09647775 million Complex FMAs / second +1 thread: 1074.94829 Million Complex FMAs in 1.91828012466 seconds, 560.370863556 million Complex FMAs / second + +With no instruction level parallelism: +4 threads: 1074.656613 Million Complex FMAs in 4.93152499199 seconds, 217.915678162 million Complex FMAs / second +2 threads: 1074.656613 Million Complex FMAs in 4.95052313805 seconds, 217.079404142 million Complex FMAs / second +1 thread: 1074.656613 Million Complex FMAs in 9.46772193909 seconds, 113.507411806 million Complex FMAs / second + +Performing the computtion on two threads doubles the calculation speed. Using instruction level parallelism additionaly increases the calculation speed by factor 5. Calculating the mandelbrot set is an "embarassingly parallel" task. \ No newline at end of file diff --git a/HW2/P3/mandelbrot.pyx b/HW2/P3/mandelbrot.pyx index a019cfa2..3487f65a 100644 --- a/HW2/P3/mandelbrot.pyx +++ b/HW2/P3/mandelbrot.pyx @@ -5,9 +5,17 @@ import numpy cimport AVX from cython.parallel import prange +cdef void counts_to_output(AVX.float8 counts, + np.uint32_t[:, :] out_counts, + int i, int j) nogil: + cdef: + float tmp_counts[8] + int k + + AVX.to_mem(counts, &(tmp_counts[0])) + for k in range(8): + out_counts[i,j+k] = int(tmp_counts[k]) -cdef np.float64_t magnitude_squared(np.complex64_t z) nogil: - return z.real * z.real + z.imag * z.imag @cython.boundscheck(False) @cython.wraparound(False) @@ -16,62 +24,60 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords, int max_iterations=511): cdef: int i, j, iter - np.complex64_t c, z + AVX.float8 c_real, c_imag, z_real, z_imag, counts, eight_ones, z_squared, mask, limit, eight_ones_masked, z_real_temp, z_imag_temp + np.ndarray[np.float32_t, ndim=2] in_coords_real, in_coords_imag, + float tmp_counts[8] - # To declare AVX.float8 variables, use: - # cdef: - # AVX.float8 v1, v2, v3 - # - # And then, for example, to multiply them - # v3 = AVX.mul(v1, v2) - # - # You may find the numpy.real() and numpy.imag() fuctions helpful. + # split complex numbers in real and imaginary parts + in_coords_real = np.real(in_coords) + in_coords_imag = np.imag(in_coords) assert in_coords.shape[1] % 8 == 0, "Input array must have 8N columns" assert in_coords.shape[0] == out_counts.shape[0], "Input and output arrays must be the same size" assert in_coords.shape[1] == out_counts.shape[1], "Input and output arrays must be the same size" with nogil: - for i in range(in_coords.shape[0]): - for j in range(in_coords.shape[1]): - c = in_coords[i, j] - z = 0 - for iter in range(max_iterations): - if magnitude_squared(z) > 4: - break - z = z * z + c - out_counts[i, j] = iter - - + for i in prange(in_coords.shape[0], num_threads=2, schedule='static', chunksize=1): + for j in range(0, in_coords.shape[1], 8): + + # initialize variables + c_real = AVX.make_float8(in_coords_real[i, j+7], in_coords_real[i, j+6], + in_coords_real[i, j+5], in_coords_real[i, j+4], + in_coords_real[i, j+3], in_coords_real[i, j+2], + in_coords_real[i, j+1], in_coords_real[i, j+0]) + + c_imag = AVX.make_float8(in_coords_imag[i, j+7], in_coords_imag[i, j+6], + in_coords_imag[i, j+5], in_coords_imag[i, j+4], + in_coords_imag[i, j+3], in_coords_imag[i, j+2], + in_coords_imag[i, j+1], in_coords_imag[i, j+0]) + + z_real = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0) + z_imag = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0) + counts = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0) + eight_ones = AVX.make_float8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) + limit = AVX.make_float8(4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0) -# An example using AVX instructions -cpdef example_sqrt_8(np.float32_t[:] values): - cdef: - AVX.float8 avxval, tmp, mask - float out_vals[8] - float [:] out_view = out_vals - - assert values.shape[0] == 8 + for iter in range(max_iterations): + + # calculate z^2 + z_squared = AVX.add(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag)) - # Note that the order of the arguments here is opposite the direction when - # we retrieve them into memory. - avxval = AVX.make_float8(values[7], - values[6], - values[5], - values[4], - values[3], - values[2], - values[1], - values[0]) + # get mask + mask = AVX.less_than(z_squared, limit) - avxval = AVX.sqrt(avxval) + # check when to break out of the loop + if (AVX.signs(mask) == 0): break - # mask will be true where 2.0 < avxval - mask = AVX.less_than(AVX.float_to_float8(2.0), avxval) + # apply mask on array of eight ones + eight_ones_masked = AVX.bitwise_and(mask, eight_ones) - # invert mask and select off values, so should be 2.0 >= avxval - avxval = AVX.bitwise_andnot(mask, avxval) + # update counts + counts = AVX.add(counts, AVX.bitwise_and(mask, eight_ones_masked)) - AVX.to_mem(avxval, &(out_vals[0])) + # get new z_real and z_imag + z_real_temp = AVX.sub(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag)) + z_imag_temp= AVX.add(AVX.mul(z_real, z_imag), AVX.mul(z_real, z_imag)) + z_real = AVX.add(z_real_temp, c_real) + z_imag = AVX.add(z_imag_temp, c_imag) - return np.array(out_view) + counts_to_output(counts, out_counts, i, j) diff --git a/HW2/P4/P4.txt b/HW2/P4/P4.txt new file mode 100644 index 00000000..1507572a --- /dev/null +++ b/HW2/P4/P4.txt @@ -0,0 +1,5 @@ +1 thread: 6.60881018639 seconds for 10 filter passes. +2 threads: 3.62633299828 seconds for 10 filter passes. +4 threads: 3.69899082184 seconds for 10 filter passes. + +I was running this code on a dual-core machine without hyperthreading support. Using 2 threads therefore result in almost 2x speedup, while using 4 threads did not bring any additional performance. In my code I use the thread number as the offset and the number of threads as the steps size. Thus each thread computes the every nth line. Once all threads are done computing, the next iteration ca begin. I use the thread.join() command to wait for each of the threads to finish. \ No newline at end of file diff --git a/HW2/P4/driver.py b/HW2/P4/driver.py index 0051a121..315f14d2 100644 --- a/HW2/P4/driver.py +++ b/HW2/P4/driver.py @@ -15,14 +15,22 @@ from timer import Timer import threading +def worker(tmpA, tmpB, j, num_threads): + return filtering.median_3x3(tmpA, tmpB, j, num_threads) + def py_median_3x3(image, iterations=10, num_threads=1): ''' repeatedly filter with a 3x3 median ''' tmpA = image.copy() tmpB = np.empty_like(tmpA) + threads = [] for i in range(iterations): - filtering.median_3x3(tmpA, tmpB, 0, 1) - # swap direction of filtering + for j in range(num_threads): + t = threading.Thread(target=worker, args=(tmpA, tmpB, j, num_threads)) + threads.append(t) + t.start() + for thread in threads: + thread.join() tmpA, tmpB = tmpB, tmpA return tmpA @@ -57,7 +65,7 @@ def numpy_median(image, iterations=10): assert np.all(from_cython == from_numpy) with Timer() as t: - new_image = py_median_3x3(input_image, 10, 8) + new_image = py_median_3x3(input_image, 10, 4) pylab.figure() pylab.imshow(new_image[1200:1800, 3000:3500]) diff --git a/HW2/P5/P5.txt b/HW2/P5/P5.txt new file mode 100644 index 00000000..5939486e --- /dev/null +++ b/HW2/P5/P5.txt @@ -0,0 +1,34 @@ +Original code: +2.2 simulation frames per second + +Multithreading: +4 threads: ~ 3.8 simulation frames per second +2 threads: ~ 2.8 simulation frames per second +1 thread: ~ 2.2 simulation frames per second + +Adding threads allows parallel calculation for multiple balls and speeds up the execution. + +Spatial decomposition: +4 threads: ~ 330 simulation frames per second +2 threads: ~ 350 simulation frames per second +1 thread: ~ 210 simulation frames per second + +Checking for collision only for balls close to each other dramatiaclly speeds up the program. +A lot of unnecessary computation is avoided. I chose Morton ordering because it generates less processor +overhead compared to Hilbert ordering. + +With sorting: +4 threads: ~ 380 simulation frames per second +2 threads: ~ 430 simulation frames per second +1 thread: ~ 230 simulation frames per second + +Sorting of balls by location results in balls that can potentially collide being stored close in memory. +This slightly speeds up the execution regardless number of threads. + +With locks: +4 threads: ~ 240 simulation frames per second +2 threads: ~ 240 simulation frames per second +1 threads: ~ 150 simulation frames per second + +Fine grain-locking adds a lot of overhead and significantly slows down the execution. + diff --git a/HW2/P5/driver.py b/HW2/P5/driver.py index fdd1a102..0ef302ee 100644 --- a/HW2/P5/driver.py +++ b/HW2/P5/driver.py @@ -16,6 +16,18 @@ def randcolor(): return np.random.uniform(0.0, 0.89, (3,)) + 0.1 +# morton ordering taken from wikipedia +def cmp_zorder(a, b): + j = 0 + k = 0 + x = 0 + for k in range(2): + y = a[k] ^ b[k] + if x < y and x < (x ^ y): + j = k + x = y + return a[j] - b[j] + if __name__ == '__main__': num_balls = 10000 radius = 0.002 @@ -41,12 +53,11 @@ def randcolor(): # Each square in the grid stores the index of the object in that square, or # -1 if no object. We don't worry about overlapping objects, and just # store one of them. - grid_spacing = radius / np.sqrt(2.0) + grid_spacing = radius * np.sqrt(2.0) grid_size = int((1.0 / grid_spacing) + 1) grid = - np.ones((grid_size, grid_size), dtype=np.uint32) grid[(positions[:, 0] / grid_spacing).astype(int), (positions[:, 1] / grid_spacing).astype(int)] = np.arange(num_balls) - # A matplotlib-based animator object animator = Animator(positions, radius * 2) @@ -61,22 +72,45 @@ def randcolor(): # preallocate locks for objects locks_ptr = preallocate_locks(num_balls) + #frame_list = [] while True: with Timer() as t: update(positions, velocities, grid, - radius, grid_size, locks_ptr, + radius, grid_spacing, locks_ptr, physics_step) # udpate our estimate of how fast the simulator runs physics_step = 0.9 * physics_step + 0.1 * t.interval total_time += t.interval - frame_count += 1 if total_time > anim_step: animator.update(positions) print("{} simulation frames per second".format(frame_count / total_time)) + #frame_list.append(frame_count/total_time) frame_count = 0 total_time = 0 # SUBPROBLEM 3: sort objects by location. Be sure to update the # grid if objects' indices change! Also be sure to sort the - # velocities with their object positions! + # velocities with their object positions! + + + # get postion of each ball + ball_indices = ((positions / grid_spacing).astype(int)).tolist() + # shift all position by one within the list + shifted_ball_indices = ball_indices[1:] + ball_indices[:1] + # apply morton ordering + zorder = [] + for i in range(num_balls): + zorder.append(cmp_zorder(ball_indices[i], shifted_ball_indices[i])) + # make an array of indices for each zuorder element + zorder_indices = np.argsort(zorder) + + # update positions and velocities + positions = positions[zorder_indices] + velocities = velocities[zorder_indices] + # update balls + for i in range(num_balls): + if 0 <= positions[i,0] <= 1 and 0 <= positions[i,1] <= 1: + grid[(positions[i,0] / grid_spacing).astype(int), + (positions[i,1] / grid_spacing).astype(int)] = i + \ No newline at end of file diff --git a/HW2/P5/physics.pyx b/HW2/P5/physics.pyx index dbd78e6e..fa7322e6 100644 --- a/HW2/P5/physics.pyx +++ b/HW2/P5/physics.pyx @@ -1,6 +1,8 @@ -#cython: boundscheck=False, wraparound=False +boundscheck=False +wraparound=False cimport numpy as np +from cython.parallel import parallel, prange from libc.math cimport sqrt from libc.stdint cimport uintptr_t cimport cython @@ -44,6 +46,7 @@ cdef inline void collide(FLOAT *x1, FLOAT *v1, v1_minus_v2[dim] = v1[dim] - v2[dim] len_x1_m_x2 = x1_minus_x2[0] * x1_minus_x2[0] + x1_minus_x2[1] * x1_minus_x2[1] dot_v_x = v1_minus_v2[0] * x1_minus_x2[0] + v1_minus_v2[1] * x1_minus_x2[1] + for dim in range(2): change_v1[dim] = (dot_v_x / len_x1_m_x2) * x1_minus_x2[dim] for dim in range(2): @@ -55,31 +58,43 @@ cdef void sub_update(FLOAT[:, ::1] XY, float R, int i, int count, UINT[:, ::1] Grid, - float grid_spacing) nogil: + float grid_spacing, + omp_lock_t *locks) nogil: cdef: - FLOAT *XY1, *XY2, *V1, *V2 - int j, dim + FLOAT *XY1, *XY2, *V1, *V2, + int dim, X1_idx, Y1_idx, idx_x, idx_y float eps = 1e-5 - # SUBPROBLEM 4: Add locking XY1 = &(XY[i, 0]) V1 = &(V[i, 0]) + + X1_idx = int(XY1[0]/grid_spacing) + Y1_idx = int(XY1[1]/grid_spacing) ############################################################# # IMPORTANT: do not collide two balls twice. - ############################################################ + ############################+ + ################################ # SUBPROBLEM 2: use the grid values to reduce the number of other # objects to check for collisions. - for j in range(i + 1, count): - XY2 = &(XY[j, 0]) - V2 = &(V[j, 0]) - if overlapping(XY1, XY2, R): - # SUBPROBLEM 4: Add locking - if not moving_apart(XY1, V1, XY2, V2): - collide(XY1, V1, XY2, V2) - - # give a slight impulse to help separate them - for dim in range(2): - V2[dim] += eps * (XY2[dim] - XY1[dim]) + + # check for colliding balls in a 5x5 grid + #acquire(&(locks[Grid[idx_x, idx_y]])) + for idx_x in range(X1_idx - 2, X1_idx + 3): + for idx_y in range(Y1_idx - 2, Y1_idx + 3): + if idx_x < Grid.shape[0] and idx_y < Grid.shape[0] and Grid[idx_x, idx_y] < count and Grid[idx_x, idx_y] != i: + # lock indices + acquire(&(locks[Grid[idx_x, idx_y]])) + XY2 = &(XY[Grid[idx_x, idx_y],0]) + V2 = &(V[Grid[idx_x, idx_y],0]) + + if overlapping(XY1, XY2, R): + if not moving_apart(XY1, V1, XY2, V2): + collide(XY1, V1, XY2, V2) + + for dim in range(2): + V2[dim] += eps * (XY2[dim] - XY1[dim]) + # release locks + release(&(locks[Grid[idx_x, idx_y]])) cpdef update(FLOAT[:, ::1] XY, FLOAT[:, ::1] V, @@ -90,10 +105,10 @@ cpdef update(FLOAT[:, ::1] XY, float t): cdef: int count = XY.shape[0] - int i, j, dim + int i, dim FLOAT *XY1, *XY2, *V1, *V2 # SUBPROBLEM 4: uncomment this code. - # omp_lock_t *locks = locks_ptr + omp_lock_t *locks = locks_ptr assert XY.shape[0] == V.shape[0] assert XY.shape[1] == V.shape[1] == 2 @@ -103,7 +118,7 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): + for i in prange(count, num_threads=4, schedule='static', chunksize=count/4): for dim in range(2): if (((XY[i, dim] < R) and (V[i, dim] < 0)) or ((XY[i, dim] > 1.0 - R) and (V[i, dim] > 0))): @@ -113,17 +128,22 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): - sub_update(XY, V, R, i, count, Grid, grid_spacing) + for i in prange(count, num_threads=4, schedule='static', chunksize=count/4): + sub_update(XY, V, R, i, count, Grid, grid_spacing, locks) # update positions # # SUBPROBLEM 1: parallelize this loop over 4 threads (with static # scheduling). # SUBPROBLEM 2: update the grid values. - for i in range(count): + for i in prange(count, num_threads=4, schedule='static', chunksize=count/4): + if 0 <= XY[i,0] <=1 and 0 <= XY[i,1] <=1: + Grid[int(XY[i,0]/grid_spacing), int(XY[i,1]/grid_spacing)] = -1 for dim in range(2): XY[i, dim] += V[i, dim] * t + if 0 <= XY[i,0] <=1 and 0 <= XY[i,1] <=1: + Grid[int(XY[i,0]/grid_spacing), int(XY[i,1]/grid_spacing)] = i + def preallocate_locks(num_locks): From fd05affed74c0705f515ca6d4eb77ce3d222d9ca Mon Sep 17 00:00:00 2001 From: Dennis Grishin Date: Fri, 20 Nov 2015 21:50:50 -0500 Subject: [PATCH 2/2] final --- HW3/.DS_Store | Bin 0 -> 6148 bytes HW3/P3/.DS_Store | Bin 0 -> 6148 bytes HW3/P3/P3.txt | 3 + HW3/P3/sum.cl | 91 ++++++++++++++++++++++++++++++ HW3/P3/tune.py | 61 ++++++++++++++++++++ HW3/P4/.DS_Store | Bin 0 -> 6148 bytes HW3/P4/median9.h | 47 ++++++++++++++++ HW3/P4/median_filter.cl | 73 ++++++++++++++++++++++++ HW3/P4/median_filter.py | 91 ++++++++++++++++++++++++++++++ HW3/P5/.DS_Store | Bin 0 -> 6148 bytes HW3/P5/P5.txt | 39 +++++++++++++ HW3/P5/label_regions.cl | 114 +++++++++++++++++++++++++++++++++++++ HW3/P5/label_regions.py | 121 ++++++++++++++++++++++++++++++++++++++++ HW3/P5/maze1.npy | Bin 0 -> 258144 bytes HW3/P5/maze2.npy | Bin 0 -> 258144 bytes 15 files changed, 640 insertions(+) create mode 100644 HW3/.DS_Store create mode 100644 HW3/P3/.DS_Store create mode 100644 HW3/P3/P3.txt create mode 100755 HW3/P3/sum.cl create mode 100755 HW3/P3/tune.py create mode 100644 HW3/P4/.DS_Store create mode 100755 HW3/P4/median9.h create mode 100755 HW3/P4/median_filter.cl create mode 100755 HW3/P4/median_filter.py create mode 100644 HW3/P5/.DS_Store create mode 100644 HW3/P5/P5.txt create mode 100755 HW3/P5/label_regions.cl create mode 100755 HW3/P5/label_regions.py create mode 100755 HW3/P5/maze1.npy create mode 100755 HW3/P5/maze2.npy diff --git a/HW3/.DS_Store b/HW3/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c860045014c3df5d814b7d3ddf6059f1bbef8510 GIT binary patch literal 6148 zcmeH~L2uJA6vyqZu(Z1b0U8n)A3^G&g+{Em(2WBJq>7+Th@p~@h+2x`s!3O&s#4DT z0=vTod=3ORJ^*|ZZsR@sb!xjI?Sv5cMfU&Me$Q!sY2uhM#=GOV#hA+&bD*GB>rh=_ zIF99y_vC4vaacr~1e1m}rs@ zDduN1Ol4j)n?HDEt$O9^x>Kuf;Nf1YzYGRy5)@%E&b#60g>K0>iOBDT#bE9~Qz||4 z!)R9~lfHXnTV+Kkvt+c8y1ys0SWUWWoW=QmVCLJCaWUw-H+!Sx08)Elmh8%0MlzMf zTc?Ic-`$)}?|BW;c`$2;X{)u>YzVK@na$iI$GLsid-&w}aCZ3a{qf0%kDoq&F}I4t z`%K$)(z$#EN3%ExMp>$|H_*md>}xMG$AiBLWeW%8@071U3H(z8cy%Y(>B1$^+IsHdaIF_xYDRjfr^%6?ch>;3yL-v Y6X<~Yq@_{l0ka&%7o9s8_y7O^ literal 0 HcmV?d00001 diff --git a/HW3/P3/.DS_Store b/HW3/P3/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c724f0fd583da8970967fe8f55fc67d9d1ed9dcc GIT binary patch literal 6148 zcmeHK!Ab)$5PhSi7QFQ6F-K2=_yeI-!JFt0NUOK473_lE_5=N=^}R`yF78zknSsnp zlQ*-;gC-dOwrC&DfjNK~o6yu5F%6GS9aQom(LBd1cGt~zy=u0rkwkxSNNbB>}R|U*>)@TUS+7u;RFTfgm7cWq+3XOoH7sY7Fb@Ce`!og*hcS>}`a> j) for j from 1 to + // log_2(local_size) + // + // You can assume get_local_size(0) is a power of 2. + // + // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ + size_t offset = local_size/2; + for (offset; offset > 0; offset >>= 1) + { + if (local_id < offset) + fast[local_id] += fast[local_id + offset]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (local_id == 0) partial[get_group_id(0)] = fast[0]; +} + +__kernel void sum_blocked(__global float* x, + __global float* partial, + __local float* fast, + long N) +{ + float sum = 0; + size_t local_id = get_local_id(0); + int k = ceil((float)N / get_global_size(0)); + size_t global_id = get_global_id(0); + size_t local_size = get_local_size(0); + + // thread with global_id 0 should add 0..k-1 + // thread with global_id 1 should add k..2k-1 + // thread with global_id 2 should add 2k..3k-1 + // ... + // with k = ceil(N / get_global_size()). + // + // Be careful that each thread stays in bounds, both relative to + // size of x (i.e., N), and the range it's assigned to sum. + size_t idx = k * global_id; + size_t range_bound = k * (global_id + 1); + for (idx; idx < range_bound; idx++) + { + if (idx < N) + sum += x[idx]; + } + + fast[local_id] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + + // binary reduction + // + // thread i should sum fast[i] and fast[i + offset] and store back + // in fast[i], for offset = (local_size >> j) for j from 1 to + // log_2(local_size) + // + // You can assume get_local_size(0) is a power of 2. + // + // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ + size_t offset = local_size/2; + for (offset; offset > 0; offset >>= 1) + { + if (local_id < offset) + fast[local_id] += fast[local_id + offset]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (local_id == 0) partial[get_group_id(0)] = fast[0]; +} diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py new file mode 100755 index 00000000..a0d56da2 --- /dev/null +++ b/HW3/P3/tune.py @@ -0,0 +1,61 @@ +import pyopencl as cl +import numpy as np + +def create_data(N): + return host_x, x + +if __name__ == "__main__": + N = 1e7 + + platforms = cl.get_platforms() + devices = [d for platform in platforms for d in platform.get_devices()] + for i, d in enumerate(devices): + print("#{0}: {1} on {2}".format(i, d.name, d.platform.name)) + ctx = cl.Context(devices) + + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + + program = cl.Program(ctx, open('sum.cl').read()).build(options='') + + host_x = np.random.rand(N).astype(np.float32) + x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=host_x) + + times = {} + + for num_workgroups in 2 ** np.arange(3, 10): + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) + host_partial = np.empty(num_workgroups).astype(np.float32) + for num_workers in 2 ** np.arange(2, 8): + local = cl.LocalMemory(num_workers * 4) + event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,), + x, partial_sums, local, np.uint64(N)) + cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True) + + sum_gpu = sum(host_partial) + sum_host = sum(host_x) + seconds = (event.profile.end - event.profile.start) / 1e9 + assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + times['coalesced', num_workgroups, num_workers] = seconds + print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds". + format(num_workgroups, num_workers, seconds)) + + for num_workgroups in 2 ** np.arange(3, 10): + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) + host_partial = np.empty(num_workgroups).astype(np.float32) + for num_workers in 2 ** np.arange(2, 8): + local = cl.LocalMemory(num_workers * 4) + event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,), + x, partial_sums, local, np.uint64(N)) + cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True) + + sum_gpu = sum(host_partial) + sum_host = sum(host_x) + seconds = (event.profile.end - event.profile.start) / 1e9 + assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4 + times['blocked', num_workgroups, num_workers] = seconds + print("blocked reads, workgroups: {}, num_workers: {}, {} seconds". + format(num_workgroups, num_workers, seconds)) + + best_time = min(times.values()) + best_configuration = [config for config in times if times[config] == best_time] + print("configuration {}: {} seconds".format(best_configuration[0], best_time)) diff --git a/HW3/P4/.DS_Store b/HW3/P4/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f625d5f0453f0c4b5c1d125ffafe84e7ba16f2f6 GIT binary patch literal 6148 zcmeHKyH3ME5S)b+K{PHYucShvq_d(>q=gRvl7NV02trWM`2+a?{>IEcK(=s!C=f!s z((Rn@cJ_QD`OXVKX3y(mpaL*uQxxrtm=2Fl?L_b{Q7p$5?s0-6+%B4tM1OHe-+d2v zxW*bA^xl8FS~lAyXDN-m5f3{v+e9^BT;PV>22afL$lb5dESf$augYhWuKNc60WU+o zoGHJzz9mVQ3Zw$5Kq`<5{73=b*=o~$$MmT{Dv%0%Dxl{>VNrh>&O|JIF;yBi4sGc&hZj)bztl0bVw8*5-WccFJh}Rf3a{#?U+6lNCk!p zXzfd@^!`8PCo@{)TS&T8AQkwp3dm%BKA&-SakqZir{1-N?UGGR<67;|*dP1^@IlX! e1Do{uqJ75Ifvuxv(Q%^_^GCo0NtX)zf&%Z8WH!zK literal 0 HcmV?d00001 diff --git a/HW3/P4/median9.h b/HW3/P4/median9.h new file mode 100755 index 00000000..0b9c8252 --- /dev/null +++ b/HW3/P4/median9.h @@ -0,0 +1,47 @@ +// median9.h - branchless median + +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#define max(a, b) (((a) < (b)) ? (b) : (a)) + +#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp + +inline float median9(float s0, float s1, float s2, + float s3, float s4, float s5, + float s6, float s7, float s8) +{ + // http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html + float tmp; + + cas(s1, s2); + cas(s4, s5); + cas(s7, s8); + + cas(s0, s1); + cas(s3, s4); + cas(s6, s7); + + cas(s1, s2); + cas(s4, s5); + cas(s7, s8); + + cas(s3, s6); + cas(s4, s7); + cas(s5, s8); + cas(s0, s3); + + cas(s1, s4); + cas(s2, s5); + cas(s3, s6); + + cas(s4, s7); + cas(s1, s3); + + cas(s2, s6); + + cas(s2, s3); + cas(s4, s6); + + cas(s3, s4); + + return s4; +} diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl new file mode 100755 index 00000000..e5a339d7 --- /dev/null +++ b/HW3/P4/median_filter.cl @@ -0,0 +1,73 @@ +#include "median9.h" + +// prototype +float get_next(__global __read_only float *in_values, int idx_x, int idx_y, int w, int h); + +// 3x3 median filter +__kernel void +median_3x3(__global __read_only float *in_values, + __global __write_only float *out_values, + __local float *buffer, + int w, int h, + int buf_w, int buf_h, + const int halo) +{ + // local coordinates + int local_x = get_local_id(0); + int local_y = get_local_id(1); + + // global coordinates + int global_x = get_global_id(0); + int global_y = get_global_id(1); + + // corrdinates of the upper left corner of the buffer square + int buffer_upperleft_corner_x = global_x - local_x - halo; + int buffer_upperleft_corner_y = global_y - local_y - halo; + + // coordinates in the buffer + int buffer_x = local_x + halo; + int buffer_y = local_y + halo; + + // buffer index + int local_size = get_local_size(0); + int buffer_1D_index = local_x + (local_y * local_size); + + // load into buffer + if (buffer_1D_index < buf_w) + for (int line = 0; line < buf_h; line++) + { + buffer[buffer_1D_index + (line * buf_w)] = get_closest(in_values, buffer_upperleft_corner_x + buffer_1D_index, buffer_upperleft_corner_y + line, w, h); + } + barrier(CLK_LOCAL_MEM_FENCE); + + // new buffer values + float buffer_1 = buffer[(buffer_x - 1) + buf_w * (buffer_y - 1)]; + float buffer_2 = buffer[buffer_x + (buf_w * (buffer_y - 1))]; + float buffer_3 = buffer[(buffer_x + 1) + (buf_w * (buffer_y - 1))]; + float buffer_4 = buffer[(buffer_x - 1) + (buf_w * buffer_y)]; + float buffer_5 = buffer[buffer_x + (buf_w * buffer_y)]; + float buffer_6 = buffer[(buffer_x + 1) + (buf_w * buffer_y)]; + float buffer_7 = buffer[(buffer_x - 1) + (buf_w * (buffer_y + 1))]; + float buffer_8 = buffer[buffer_x + (buf_w * (buffer_y + 1))]; + float buffer_9 = buffer[(buffer_x + 1) + buf_w * (buffer_y + 1)]; + + // output + if ((global_x < w) && (global_y < h)) + out_values[global_x + (global_y * w)] = median9(buffer_1, buffer_2, buffer_3, buffer_4, buffer_5, buffer_6, buffer_7, buffer_8, buffer_9); + +} + +// if value out of bounds return nearest inbound value +float get_closest(__global __read_only float *in_values, int idx_x, int idx_y, int w, int h) +{ + if (idx_x <= 0) + idx_x = 0; + if (idx_x >= w) + idx_x = w - 1; + if (idx_y <= 0) + idx_y = 0; + if (idx_y >= h) + idx_y = h - 1; + + return in_values[(idx_y * w) + idx_x]; +} diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py new file mode 100755 index 00000000..a181c05a --- /dev/null +++ b/HW3/P4/median_filter.py @@ -0,0 +1,91 @@ +from __future__ import division +import pyopencl as cl +import numpy as np +import pylab +import os.path + +def round_up(global_size, group_size): + r = global_size % group_size + if r == 0: + return global_size + return global_size + group_size - r + +def numpy_median(image, iterations=10): + ''' filter using numpy ''' + for i in range(iterations): + padded = np.pad(image, 1, mode='edge') + stacked = np.dstack((padded[:-2, :-2], padded[:-2, 1:-1], padded[:-2, 2:], + padded[1:-1, :-2], padded[1:-1, 1:-1], padded[1:-1, 2:], + padded[2:, :-2], padded[2:, 1:-1], padded[2:, 2:])) + image = np.median(stacked, axis=2) + + return image + +if __name__ == '__main__': + # List our platforms + platforms = cl.get_platforms() + print 'The platforms detected are:' + print '---------------------------' + for platform in platforms: + print platform.name, platform.vendor, 'version:', platform.version + + # List devices in each platform + for platform in platforms: + print 'The devices detected on platform', platform.name, 'are:' + print '---------------------------' + for device in platform.get_devices(): + print device.name, '[Type:', cl.device_type.to_string(device.type), ']' + print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz' + print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB' + print 'Maximum work group size', device.max_work_group_size + print '---------------------------' + + # Create a context with all the devices + devices = platforms[0].get_devices() + context = cl.Context(devices) + print 'This context is associated with ', len(context.devices), 'devices' + + # Create a queue for transferring data and launching computations. + # Turn on profiling to allow us to check event times. + queue = cl.CommandQueue(context, context.devices[0], + properties=cl.command_queue_properties.PROFILING_ENABLE) + print 'The queue is using the device:', queue.device.name + + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) + + host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() + host_image_filtered = np.zeros_like(host_image) + + gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) + gpu_image_b = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) + + local_size = (8, 8) # 64 pixels per work group + global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)]) + width = np.int32(host_image.shape[1]) + height = np.int32(host_image.shape[0]) + + # Set up a (N+2 x N+2) local memory buffer. + # +2 for 1-pixel halo on all sides, 4 bytes for float. + local_memory = cl.LocalMemory(4 * (local_size[0] + 2) * (local_size[1] + 2)) + # Each work group will have its own private buffer. + buf_width = np.int32(local_size[0] + 2) + buf_height = np.int32(local_size[1] + 2) + halo = np.int32(1) + + # Send image to the device, non-blocking + cl.enqueue_copy(queue, gpu_image_a, host_image, is_blocking=False) + + num_iters = 10 + for iter in range(num_iters): + program.median_3x3(queue, global_size, local_size, + gpu_image_a, gpu_image_b, local_memory, + width, height, + buf_width, buf_height, halo) + + # swap filtering direction + gpu_image_a, gpu_image_b = gpu_image_b, gpu_image_a + + cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True) + + assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters)) diff --git a/HW3/P5/.DS_Store b/HW3/P5/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..60c66e9b7b0f6bfdcb7d5ab3bcd481f57f3a1e59 GIT binary patch literal 6148 zcmeHKO-sW-5Ph3gY4MVy#~gbSDtHg23f@HTex!;Bu@SAHp7I0z0sfo#W@jk2Y49de zW?<%RCiAk*TgY?(gvsme7)SvO=z__RVS~wi@s=f)*&&Abj1pIP#vShQ(5m*te^fxu zE=7qO6g;<|=T~8l7aU-Q^0cgGMOhWG7C0e}=tFd8`x^g~oKX`k@`rfj{vX_b z%>CwQF~WH#wps!9Y_Y+fqDNQ2 z6>tTX3dr{%qYLJSwPO5qFvJmn*k?K#*D^~8Co{|qYen9n1*H;|8f?S}N@qT^y4 0) { + // set each pixel > 0 to its linear index + labels[y * w + x] = y * w + x; + } else { + // out of bounds, set to maximum + labels[y * w + x] = w * h; + } + } +} + +int +get_clamped_value(__global __read_only int *labels, + int w, int h, + int x, int y) +{ + if ((x < 0) || (x >= w) || (y < 0) || (y >= h)) + return w * h; + return labels[y * w + x]; +} + +__kernel void +propagate_labels(__global __read_write int *labels, + __global __write_only int *changed_flag, + __local int *buffer, + int w, int h, + int buf_w, int buf_h, + const int halo) +{ + // halo is the additional number of cells in one direction + + // Global position of output pixel + const int x = get_global_id(0); + const int y = get_global_id(1); + + // Local position relative to (0, 0) in workgroup + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // coordinates of the upper left corner of the buffer in image + // space, including halo + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + // coordinates of our pixel in the local buffer + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; + + int old_label; + // Will store the output value + int new_label; + + // Load the relevant labels to a local buffer with a halo + if (idx_1D < buf_w) { + for (int row = 0; row < buf_h; row++) { + buffer[row * buf_w + idx_1D] = + get_clamped_value(labels, + w, h, + buf_corner_x + idx_1D, buf_corner_y + row); + } + } + + // Make sure all threads reach the next part after + // the local buffer is loaded + barrier(CLK_LOCAL_MEM_FENCE); + + // Fetch the value from the buffer the corresponds to + // the pixel for this thread + old_label = buffer[buf_y * buf_w + buf_x]; + + // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) + if(idx_1D == 0){ + for (int i = 0; i < buf_w * buf_h; i++) { + if (buffer[i] < w * h) + buffer[i] = labels[buffer[i]]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // stay in bounds + if ((x < w) && (y < h)) { + // CODE FOR PART 1 HERE + // We set new_label to the value of old_label, but you will need + // to adjust this for correctness. + new_label = old_label; + if (new_label < w * h) { + new_label = min(new_label, + min(buffer[buf_w * (buf_y + 1) + buf_x], + min(buffer[buf_w * (buf_y - 1) + buf_x], + min(buffer[buf_w * buf_y + buf_x + 1], buffer[buf_w * buf_y + buf_x - 1])))); + } + + if (new_label != old_label) { + // CODE FOR PART 3 HERE + // indicate there was a change this iteration. + // multiple threads might write this. + *(changed_flag) += 1; + labels[y * w + x] = new_label; + atomic_min(&labels[old_label], new_label); + } + } +} diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py new file mode 100755 index 00000000..b9aeeeb9 --- /dev/null +++ b/HW3/P5/label_regions.py @@ -0,0 +1,121 @@ +from __future__ import division +import sys +import pyopencl as cl +import numpy as np +import pylab + +def round_up(global_size, group_size): + r = global_size % group_size + if r == 0: + return global_size + return global_size + group_size - r + +if __name__ == '__main__': + # List our platforms + platforms = cl.get_platforms() + print 'The platforms detected are:' + print '---------------------------' + for platform in platforms: + print platform.name, platform.vendor, 'version:', platform.version + + # List devices in each platform + for platform in platforms: + print 'The devices detected on platform', platform.name, 'are:' + print '---------------------------' + for device in platform.get_devices(): + print device.name, '[Type:', cl.device_type.to_string(device.type), ']' + print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz' + print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB' + print 'Maximum work group size', device.max_work_group_size + print '---------------------------' + + # Create a context with all the devices + devices = platforms[0].get_devices() + context = cl.Context(devices) + print 'This context is associated with ', len(context.devices), 'devices' + + # Create a queue for transferring data and launching computations. + # Turn on profiling to allow us to check event times. + queue = cl.CommandQueue(context, context.devices[0], + properties=cl.command_queue_properties.PROFILING_ENABLE) + print 'The queue is using the device:', queue.device.name + + program = cl.Program(context, open('label_regions.cl').read()).build(options='') + + host_image = np.load('maze2.npy') + host_labels = np.empty_like(host_image) + host_done_flag = np.zeros(1).astype(np.int32) + + gpu_image = cl.Buffer(context, cl.mem_flags.READ_ONLY, host_image.size * 4) + gpu_labels = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4) + gpu_done_flag = cl.Buffer(context, cl.mem_flags.READ_WRITE, 4) + + # Send to the device, non-blocking + cl.enqueue_copy(queue, gpu_image, host_image, is_blocking=False) + + local_size = (8, 8) # 64 pixels per work group + global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)]) + print global_size + width = np.int32(host_image.shape[1]) + height = np.int32(host_image.shape[0]) + halo = np.int32(1) + + # Create a local memory per working group that is + # the size of an int (4 bytes) * (N+2) * (N+2), where N is the local_size + buf_size = (np.int32(local_size[0] + 2 * halo), np.int32(local_size[1] + 2 * halo)) + gpu_local_memory = cl.LocalMemory(4 * buf_size[0] * buf_size[1]) + + # initialize labels + program.initialize_labels(queue, global_size, local_size, + gpu_image, gpu_labels, + width, height) + + # while not done, propagate labels + itercount = 0 + + # Show the initial labels + cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) + pylab.imshow(host_labels) + pylab.title(itercount) + pylab.show() + + show_progress = True + total_time = 0 + + while True: + itercount += 1 + host_done_flag[0] = 0 + print 'iter', itercount + cl.enqueue_copy(queue, gpu_done_flag, host_done_flag, is_blocking=False) + prop_exec = program.propagate_labels(queue, global_size, local_size, + gpu_labels, gpu_done_flag, + gpu_local_memory, + width, height, + buf_size[0], buf_size[1], + halo) + prop_exec.wait() + elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start) + total_time += elapsed + # read back done flag, block until it gets here + cl.enqueue_copy(queue, host_done_flag, gpu_done_flag, is_blocking=True) + if host_done_flag[0] == 0: + # no changes + break + # there were changes, so continue running + print host_done_flag + if itercount % 100 == 0 and show_progress: + cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) + pylab.imshow(host_labels) + pylab.title(itercount) + pylab.show() + if itercount % 10000 == 0: + print 'Reached maximal number of iterations, aborting' + sys.exit(0) + + print('Finished after {} iterations, {} ms total, {} ms per iteration'.format(itercount, total_time, total_time / itercount)) + # Show final result + cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True) + print 'Found {} regions'.format(len(np.unique(host_labels)) - 1) + pylab.imshow(host_labels) + pylab.title(itercount) + pylab.show() diff --git a/HW3/P5/maze1.npy b/HW3/P5/maze1.npy new file mode 100755 index 0000000000000000000000000000000000000000..fab3e2c0eae48e471c33e14fcdc0cdcbee15d28c GIT binary patch literal 258144 zcmeI5z0PDwT7(Ces~B%US_>@>!d?%AgMo!s>>!X3pjkY`RdB&D%_=ml!x!I^m2sjo zt14T9?8@iF&-+4@um0xm{^+~keD|L}{O-r!|HDr|{KfY_{N+FY`iEbA z|HD82+fV=T({KOv@Bi(m-~IUI{x`q9_y>GM6E@7L%1X@7s*{@uO$ zIR5JQ?>;}hU)m3G@+W@$Ki|;oer9!_w|O4=?(@@|qWusjf8xjg^9{}JXIA%lo9ChL zK0m!F+7EH^Cw}}t-_Y!SW_6#pc^>-i^V6H6{SYUA;>Z8<4bARnR`+?E=b`UDKfNj1 z4{`D*e*8b*(CmI@b)UC+9{TR{)0?9G5GQ})$N%#U&F+VE;?I7%?P)IJdhS_InlI() zUfTKOPyI=s@>YJk7xiAv&-@>6K=0*a`pbVNO+C;1bn+_s+liKqRYzq?o7EO)xm zd&(d6nSGA(iO=SKD;@Puc{)d$Px`bU=_x_!)$1-_>Ywto zXPQs?v>)jyKjmq@)3_!)$1-_>YwtoXPQs?v>)jyKjmq@)3_!)$1-_>YwtoXPQs?v>)jyKjmq@)2F?pr!&nb z|CI0EZ}<7q{iXVJKdCYnt(<hJWFOWgVDp7g}!(>f`ic-pJ&PyO2YRdfvnxF2C@>Bm*pY|g?@w7kb z+xgDl-K*~^?sC7?eL6q&O#MmU&L{tr(|p7!Kh=|e%F}+cdeXIdy0=uH&PVyFKk3`~ z{_C_nWleLJ80Q%>^{r~Fh;{wYuU&FV?l=IP#2 zeL5fIr~agG=aYZRX+GkVpX$j!HBy6Q{T4U^&Y)%_alGmN&4OSv(HEUSM&7!yZ)(fTkm?0-naXa zKlLR2?)=&3qyDRT`u<)2)VHm7y+`lc{m7qsl74so?DJ9o)jWOwuK%d7o^xmFpXyyN z-KYMkUOn8I`XBwr8_>Q#roa4qzV+O$C-ryS-D^(bdd{7sZ*$j6_w9W0R}XiR{_u}C zpnZN!fB9V#m(Q*z^>=*Lz4qwx+uY^q{?+r_^QHcJ5AIC;N$+xXpX%i^ds05}+1yk| zz1p1mzj}UqzSRHO-w*Xqd3(QUKc`Q7r+U{*_o+YWmE%s*zdCn&(S3V<<+zjbo$juC z$|rtTk4Jr|SDRDcSI=+Hm-;{Z`=S0RZ|^tl=k#gsRPTD}KJ_QPa@iO;YQvYXvKh!_vt^LZkeY*LTOI**nv#nq4Px-rZ z_ufYD)A=Z;?cdIK{oOtFxr-8sEK;^};={oDC&KkiT z)|3Bgp6;jJpZXDR_e=d(_jma;m($&KPx-{DM_W(+t9iPgc7N(eyxlMLU)|s3(_Bt> z*FEJErygxR`LE{be%k%1AMtj-)PHq$R_2j>rr~7I5r+&oS{Zjwc z{arrI<#c!5Q$BI((bkjyYM$<=-JkjqZ}&_6SNC`MG?&xebx--ksYhE+{;PSqpLT!h zN4(uH^Xq_#Kk}#kq;GTT zm-4hf<&%G^C;#1fYyQ^U&iAf+?PJ!D^u#Hb^eLzOHmClp^QqsnbJ{P>=k%l_?tJgM zPxDDnoaP{X$|=9issHMH>i6uN_Dl0QJ?V%$-@ES9e9{xAIY^&!%5QV(zdE1#Jv*oU z(tJ)&I^xdvuKP5f^u%cn(x;sA+noBZ&ZmCQ&S}3ipVO0$xbwa1KFudRahiklDX084 zr~a$+so%46+Aq!L^rR#1eDAtX^GQ#f<{*8_DZkCB|LT0|_w1bZOY=EB=|;Jpn?0!? z`I0{6?R@focb>k#ch$G&PQRDbkNioW@^(J?zdKLg-@EEtb35PBdwrirpT9mw`jlVu z9{o)DkNUma^Rz$Ww6C^)SAXh7oc2mQ^(TGGsXuYbC;hu~+8=S+Ut7PcKlLI`dnKOw zlRo9tpE%`{{@pq4k2vkGt>4w3dJ(6+5>Nd}pK|I?obpNk?ws~Voc7n&@9Iyzh|^w) zr~af*IrS$_`J{h$PWvNH`)liW^`~CMX|Kdnf6}L%`V*&o(!V>W{Sl}Awe`FDQ!nxM z>E6$1F6oJn=FC2~I)AqJx6;vmh_`!EKJm04`8)3J)rYvAYn^F6>4~@IlJDw#_1N{K z{Sa@@Mft?je&p}CyH_8_+xKZc>4~@JBmcBts;7Q!PW_3u`;kBOBfiQ%?M1zbr}?BO zPW{?E?U(A?^N~OGCr z>YU%B_h~-qi9g!+?&s5fss8SH9`&RC#2?MO`+4d|ocfTSxbvUYIlo8m(|pnsf3)x2 z&!_!T{oV6C>PP*FKbm*<^VE+x^&vfR=TAEE^i11yevf{p{73!nexCZJ zocfYK>4}p+aq?fysehZh`O{qItNYZS^van%+y1k?q@#W*SB^W$-|0s0$)EUW&Z_6A zf1A6x(_H7P`_!NG%9%ae{DpXQ|gq+iWxKka^$znZt_cmD2P`yj68+(~-k&i7m0xBI33 zq+iYHJ+=E${%YQy-}$?H?PHcZo$kqB&#gMCA93nUdg7^ns;7RdInB44@4BC~7wwh& zN#Ewwk2v)sJ@M2()lZ#vqPV=qiyY46L zMSCTG(ziMFBToHDPdxQc_0(@Qr}zo>D!$85vP8nC!YGJdg`~D(|oJ> zuKP)Q(O$`)^leW4h*Lk(6Hon9J@s47X};Bb*ZthpOFp}v?)|Ip?p!@roo7GaefQb! zcim6B*JnG~fwkvv`>)<%d%o2F?8j~J<9qGvWBSWK;~d}pKAmrO{p$VGeyAVmiKqUg zPdW8(bJsu3b-ugYr}OQuU%hAA5A`EG@zkI6DX0Ez?)s;>&UcslbiUp7tM^R%p?;(% zp8AtM<%K`R;O`&bPaM^`2=z)Q|MUQ-9K@ocg!9>!0R2-(Bw0`F7W@-ZSlo z`jMV^>QDNVQ~x%1{nK3MyUTq#-|qU=d#3$RKhhIV{Yjs4>fh$B|7fnBn>{Ii)>k_6 ze|4@stU77G)GO7m_HXCYejIoA>PuYD&7Q0MDgW8|?0dOONBg6kRKME4olpC5+}*3M zQn#O{8=6K9p&wO@_%d?zz?b zbG=6IsXzIo`qY0me^y6*M|nG+{NJ6Az9;wG>ixN1qxaOGd{TYtKbt?RqrRiOolpMn z&PU&qdv5jqT(8l4>Q6qYKJ}l?mo7bB|J3&`J@rdD^-TSb{^JeM-beYz?)-m`Du2@J zId_tNmXpr)8NF}&kM?lSQNNVCT-~SsNB{8#wBPjn*!}f?_ipa=UhgCInblK%%BffC z|Lpp7KW%;LPwyx7Z|T$CtzP6i%4xooQ;*dD+4bpu+WOR=-cRb^(x<&!y~uZz(|jqX z9;yGc>(l+T^{GF-pVYslPkXm|k?$y{`BF|jQvYYyr~7H^Q-69tseenK_HOke-%(EU zrJQ=C{?D#Y_tVy={`7uQ|CT=O-RecYqnzeTIrT{WpIx8sr>#%@>HVbsEj{%XPfyoB z^_8CdQ?49$rvAI@-Q1)1G~cL?doJ}SJ#p7d_o-e!?sWd{UOvS2-0Zp9pYn;*+@ybY zPV*C|`AJVa^(Q^?cE4GF>OISyPWS4o&)1Xq_4(^((x?2Icm16D@24yI zr~0

qoi7ov-dmPh39kBz?+VuI^L+-SwJt*E8K8%|-sHzU}Y&Q7&=kt9#NDmybJ1 zpK_P0`_zATz2@BYO!r4~k$$RcfPtOJ#qQClk_Qfxw=pNch_sqUC(rXG#B}&`nJF8N4dnEukJ}tTt4n3 zeac;~?oPeUKbnY~t^wclqluw-UNuP4cCrYw(T)srse>D*~P z>8W4JDW5pylRo8?Pn_~epYqlD)IaSvt0!H`)49`p(o?^bQ$BIZCw~h%l>h2{cE7Va z?K9QO$DQQwbh=OV@@bvze&nBW+DqG?@?V|L?srzFeYW(@_g(I3zwJGw`J_*Id;Zj) z=1cuaKbudwHmA8>ozs5X^QZZwPkDR()Su={{YgKYPr5dzxn7;qe%te>`J_*Id;Zj) z=1cuaKbudwHmA8>ozs5X^QZZwPkDR()Su={{YgKYPr5dzxn7;qe%te>`J_*Id;Zj) z=1cuaKbudwHmA8>ozs5X^QZZwPkDR()Su={{YgKYPrAFfe9rmHzgziR?!tY4)PcC3 zb0_Ij?sC7?ecGS&GzaM&clXL`bLE`#{O-U0uIc;m_1)Kh7hipT-RbMUtJl|eU;llS z>p6FR?fLcJ_59a&BY&UO$(!`V<#W!?25A2&r`D-I=_$X>so!eu=BHjxr+d;9m(Mvn z8=(ECoLZ;;q^JBgr+%xso1c0)o$g6bTt4UQY=HKka%!FWlb-V1ocgWiZhq?Jbh;-! zarvCHvjN(F%BgkgPkPF4bLzL6yZNb?)9Ie{#N~6&&IV}zDW}${Kj|sI&8gpN?&han zPN#d)6PM39I~$<=r<_`+{-me;Hm828xtm|T(v$p&r}^)$r~S4$?SFMX^=tF?{LbIq z-!(7!6Thp+yZX?6+nn~kI-mNrd3%26@9s6P|5=^$yWY3`uX)$clu!KX_vq)<{*<3`+6Vck`qZED+noBR-1R44ryIR*`%@3f zCrj|`%(WkPv;}O>p!b=e%Jf9|26OWnevHW{T}_i+Mn`MPWvGLRG<1& zew$POl)L`q>vW^{ZGY-P`NYYe_-cR3PkDR(c0cOh=IMN-cl~E|&hL8P_P^#`KT|&O ztKXxaSNl_b%4r|upXyV8%5QV(pK{lK)K|~Bll-0Tu6xQ~&C@w|?@#^O^Y89Y{b+x) zd{#$&#mQgKwNA=k&C|Je?@#^O^Y89Y{b+x)d{(EvExq#{y>ItR{YU4ReJ<_S*3Zr* z9re?5vuE3%dQpDLU4H7D>RqqV`!v6;ADv_Nxpx25e|9eEC|}Rbo^5~XMfoXr`KfQJ zcfCgM)BLu6bdK5Q+Wk}i*}0^nd_6aNw*9FW<)_@`r@pD)^%}iT^V|B-IcA@0_fP$2 z=aP={_1x^)_NQKypK_O<`lfoC@KlPuTOFGKebF*jLpL$V#%3Xfy zo9bP!(fc&Ntsk9Z_PKWd)PHs^=_p^%&7N(4>P7h}cloLBU3&Sr^V$2mxkm4IpP%~C zoT+}bf0|G8&+=KF=4tDdhF5rb)WWIt*1RuKhhI#`=|NTZ_BMJ?USa z(|(Ar-cP$9`KO%rm-?srwBM|rbi`?H(!V;V{SaThpLRd;PdV)`^-uL_zga!$h|}Dp ze|1j#A-;M)?SAB+a@t?&pX$?ovwG4Ir@2Z0>YVmNeD!|X{m4J%w7=9p)u;Vt^`s+C zbCdqnIqiq|>ix9)k$=i*f2n_}Py5a4Nk^RKCjF~(+7I#7`)T(h|CH1IQvXz+_M6p{ zZj|dece?%P`B&$f)19m5cfCgMssE_YyFQoh&*|O0_MCG0xHI*CcD>t+?pM#J93*EA?#W-ztvb_wq))lq3;8 z<|kjLyX&6viKjit|JC{G{ZT&cpY&~>_9K1D-TdV1ba&lTKJl~%`M)|}y+6vQ{gb}U z(|)8+xtpJSo$juC$|s)oApckAtM^Cww13jKdD@TkDR=XeuhZRiPx-{t9_0V(eD(e) zpY~7sHc$JJKILxyQC~gRI@5kf|M3R&zCNbkeJ7rtsXyscp3X=5v|me~`qJDfPx~GH z#~aXl`Iw&Hz4nuywBJ>{mFs+UuYKzCqbKPP|HE(B?UnZ9bl>Wp`lX!aAb-*mC;v8g z{nA|Lt9$a-bM7Sl;pcb%~+ z>>v96%=$`4{%x)v*E7w%`dvSh|MmInXVM>je)nI#1G9hV`!nk+9r?GpdR)&m_v&~3 zO#avBub)YO`1##``3}tfq3_SEuXNhtv! zPtU8bKFe31uV>1ye)^pH-ldn1JJt7k>iO%vJ|BL5_g}sP?$7A=L(k2gY5uH!R;M{z zdgnWO-|qM7{0ni)E6nm^T(KXJ+@PX5H({*+JL^&?%IyI!OB?S8NBPkW~M zQ$6_;r+nh%PrU6<`NUm6(zUtkHG1Ff_v-$%XPQ6NlRt6FCr&$oMiy7yVNq8-P3od!B>CpXeR2{){{SR>VNq8-P3od!B>CpXeR2{){{SR z>VNq8-P3od!B>CpXeR2{){{SR>VNq8-P3od!B>CpXeR2{){{SR>VNoa|L*mEcRk(r zceF3*iI3*&`W(%7_{SU2`}~;x;hza#o^gLtealz9c0Xx;>Q8#&5xq;?8%Md+NVCr#0VOounu3e0RC0{=0M9>_es2EM_g(aw)1B#lo$t50r~cG~^leW4 zh`0N#@?Vv!eLs7?=5S{^pY#1z_tc+ykiN~SAMtj-RsO4TwQrYC_ia7-6Hoi6{*=F( z(|m27&Y$+9{M4WHE}wL5?t1B-^7WiMNuP3;tNXUUa&|q_`BQzmmv(;YPkJ{$>Dt`& z(mmztId_sizU4<>eIcn^HYD)yZK4i=B}6S zDPPaIlk_Qfxw>!rD`(d;oj=v5duivV{-k&FlkP4qpH*jjewvH&N&o7c_LK76_m|G+ z^zL5!XmjPPI#0yPoNMseaeJ?9%Jq_d7fPtWNJe)yrqsGo3Hh@4A;=dcFI8XXl^Q>Ak0V z`Rsb8^QHP-_p(c`ci->q{Ifc}_f#*RUC(sBRKM$9cIox*`<gBWRna-E$ zciqb_z21Glv-3-L&R_n28lPR9T|C=&;L(2$Kihn#@A*xh@4)~64m|q1KhK{HoDG}} uoDG}}oDG}}oDG}}oDG}}oDG}}oDG}}oDG}}oDG}}oDG}}oDJ-=f&T^BM%4`f literal 0 HcmV?d00001 diff --git a/HW3/P5/maze2.npy b/HW3/P5/maze2.npy new file mode 100755 index 0000000000000000000000000000000000000000..4f8ab96f0214fde9e43f0cd188caaf3050211b90 GIT binary patch literal 258144 zcmeI2LCbVWR)oi`zoK`o&?^}?;(B(98yBvmaat6_k#2Y5ukZ(Jr^DgBUr~`yR>qCW ztg3rqCacbq5hu>M^)>@N|MRzh|2Kd44}bET-~HylzW@DCzx>nB-~ZKj-~aW${`UJH zzWe^4|M2sF{`}*=|KlHi{{2s_{<|Ol?U$c^t^e{bKmNy0zkdI-zxd1F{_x$e|MllT zeD~kq{r~f)zqy`k1J?$w4O|FK z-aAzLSM~Bz{N3xHK0m!*+7EH^Cw}?&H#EDSSzSNP>(Ec1pWYPhhdB8Yzx?|fn%&Q= zuAk<0=%>$5Z;JLqocxJj{{0Qj?q^olPxCtT)90r*Mf)L6{=_f;{)T4vGpp;Tc^&%c z^V6H6{SYUA;+KDaL$mvt)%DZ74*m4`=}pmoh?76@%fG*&+5M2N_^Y34do>sFy7!8w znyuAq9(8Aw*S*;@-Jkp^H|brT>YvUh)gKol#!*X3ung@~7OScX_J6 zn@j24dd=(l%<8E>@!7u5(zX1(hvp;R^B!U3@A8oh&K)-&=_%W+o^-@17xA>e+kbUF z%1^xI?>#gh@t*exBY&5VWN_}d`AARMX7!{aPPvGu{oVen^HF}{Er0K!`H1(tM;Q6L zd?bT&$IVB2$~LPf9dXJ7{^^O2si&FV=A4lxx9QW?2*;>8st2@Q%Bz>AwzBFGwpX#sbwU1e@ zbX8CLSD)E)bw27pJJ+Z#-JkqjPIHn!>0M6!iBtbsJ?V(Mxm8USqq z6EC0CN&ShtxzhfWFWsN~UA}rgw?EDA`YS#4C9Zs{PI}_ylRBwCaW_}mpYo;qlfTPX z&*%21`CWgdr@q9MZ`DapynIq8^(XG;O8Zm3bbs=9`Re)H{xrYquk_USEM7jV&h-2= zm)oEGdtCKy^`7pTOl2I+Kzib%eP-{a`zwD{uQ}4Z`m8!v&*%1c_NIEby5oDMR%IOR zL3-k&eP-{a`zwD{uQ^t6)%RI-cYez0`jdX7fA5a#-}4?})So!@aP{QxayQ?so^)yM z?#)=I-8Hy*nTEcl}8}(!Y1d z_3wF)FzQd7dboP>ce$HyR!_QT@#>R0)%&S_E-xQ-y8ez{XT>qYTW3GQ%Px)7KcYc-M z^-b$l?$LTWANf;G(x1V58C?IKcVn)9T2J{`b9a7~-}O!FRqoMxIv@E{PST&jdl_8+ zo_Axee_BuZS95oMmEZME>s9X2dO9EZQ%=&K!Fw58|DJbau76rj`B!syewE+#P3u+e z(Rw-``BP5PpTT<>T>qYTW3GQ%Px)7KcYc+B)VJ=b)Ae`tDp%EAe^*~Q)am;7^oq55 z~RyVo_*WcAsPS@Ynugs^iw|eDLcRq#JoV%X0??>s<_1dHAljf># z)!)5;o3Hnz=O_POw#G>Ba<@O}U7nuL&8PHkZdb2zRo(R`ef3c%>CfP*cdMV_-#pYq z{z_MM(!0F+sFVDa?yNfXCw^9rM|~()np56)@89O@J?Z($zn85s(!1R4PkNW9=X3KZ zy_?(Bt6Wug{YhVa)Jgg?xa!^NXZSY{^^m{PRh{%MuRiJ|f2BLCPW_3WmE%z#%9ZAn z_uc!q`Fc-!e)8{SYmD?Rcl(px<>~p{d`j=;cJ(S()m?wmS08nf{tT{qxB40W%|kup zuXI%>z00eQI>}$@&Z<*?;wne!(kJyNPWjUM)&A7K#}#Y!+N-*A7B8RK)6GYH)BfrH zD!-~zp0hahCr&xi`qlo_zsD78_1deta~3b3+0)HOebfHw{wlw!S02UFb?Q%?@}>3U zznZ)IN$010#MAj)|JC_bf0|3_&Z<*?;*=w;C;!#l-A_6{K=2E(|>eQb&Hdalg>~1h^O3pvL>inwz=v;L#b*A%?Kg~t@G^cz$-m>+0W9lg#@w%tZw4VHT=W1_NpUpq3tG?+o zosax!KGLT-i(4P)j92#=2Lpo5m&xv)!qK2Cr)#a-sRLk%_;xt{*>?4IqjF` zQ+m=7SH5S}-TtH}PIHjn<%PBF)u-lGzN7W}O^x2ay+?YN-}Za-JN19m@6+zn{)p4Q()wNfDHn0t zD{UHvH+ zaoQ_!*Prw*r~JgJKj}Z6)BcFl{?htg{V5l5+ADF_pY$%L{KTn0=|7#*{)p55()wNf zDHn0tD{@d z%14~?k)F8npVcY9N9%5X(i49)@7ed=e6IfNc^>tn{KOyaclLeCN1XDIp1AU#)hWM6 z>u!J26Mr=C+4tRiuKw(K9`&RA#2@W<_I=7nobr&KxbmOXDZfYSZhz7fe>Cse_uYK1 z{_J@k^`rd6AMJPceac6i@{pdm@}JcyzenqCf6^0wH1FB>-F&Y8?0Fvbqx{4l?RWNl z%14~?ke;~mCtY#(bo;w{H-B31&ezj7md;Q9y?jlp9`fCVQ~p)l?d$H>?eFT{{As;A zUr*myIzRdM@-?k`$afb``B(8&U*%i%dN1|;_H_BJ-=p8D|D%3q-=};d+{r}#j(5|N zKXJEzT2J}YT+Qe9RlZes{YhVaX3w<$Y_8H#z7bwtGbj1acpf$6PyFb$<7djB=4vNy zU*%hM*Pry&XZB3{&*mx}OZeQhFb=RNt)o1oh`_JYo9pxM0Aqo;uV1s_$8K%J24Z{Yl^BinV&prS6RKx~ERppY*ElE_KSE;;t|4FYQnL zT~7H|`%}IiSFF`*E_G*=*FAN*{-jrZcd1kU6o0mF^{c(t_uWp)N1Wy%J#p9H)lM7r9PV=qiyY9!$MSCTG(x*A)BTo59Pu%r) z^^|Wlr}zo>C>F@5vP2lC+_;Yddjz&(|oJ>uKRIw(O$`)^l47{h*Lh& z6L^@?X};Bb*ZsJ;Xs_f?`ZT9}#3>)?iM#%;p7O2cG~a5z>weD4RX)3(>it*V z-FfAlIooVAx})Zu_j8(bxdg87>>0M6w(_H0u`zqgE>h65I>sRlY_Cxtd zPu%qix}RK58ic@$S9PuKu3qu}1#Hk6y>mv$~^)^Eut0 z{6B#^?}xOj_ec4NQ~tEx^`Ff@tE0T5Jl&uCKY@?FiW&`H=X8Ja{{-&5AJVSgALS!X`O|vWe>VTDj`EK3bbs>y1U~vMX6~)t z-%RGChVmcp-uvw8?|B|;5rL?pUeMPJANMJbx)l~&*SI1 zf2hE&O>-wLe?`1s0zj@3))SrA^z3cDxU!`~Zx_cwN z%V|EBQ$E-K41F)-8UD><_M!gd>*`&9xBn`=+t=M2>0M6qxt#L3{%7cW8PD)<9D|8W-bn9qn$P8w&-Fh;-^+N0fAg4qs6Y9-de`6Wze?}+b@xVkm(zSM zr+lve8TwwvGyI#!>_h#@*VViJZvRz!x39Z5(z~4Ib2;U6{m;<%GM?ezJZ2y2Prk0+ z^>_QP(o^5!?y2_c`j(#jU0!|E>G~)2%DdHT9(AXY%S>U-6%4O|=e zybaVpN7UrRm9FZfCtg14B)!X3->SR*&(K%adnfh3r`f4S`H9mkX?@yX%}0HSE8nV< zo_P7Flk_fEeXH*JKSN(x@14~Do@S>SKxbm$!>4}$*I!W(x)wk-d|1x$`q$gfJ>Lk6(Ro|++{?E`?)_W)Qzo*%$M)`@;ENOk(U(H8- zi7Vf#lb(3_sFUETV9p!aTo$3Crzsg;8 z*MFs6^=|c=@6}H=W6edp?xjxZPn>coy{ea=%gaZdu76sua#uawKk2W$TfOFc^;6AQ za}lq5sgwE>r(8;}>gAW>%D3w2{^VadQYZB%PPs_mc<&*7PdmcW{*+;L|8&0D`AFw- zcW$>o=_#MfsXuY*Px{7t59xc_5tjC+46FO6^UcmjI+we1yZuQ|`CLx@iBo^lH{N?l z-_wq;v_EB7-9Mdgc0SU%+@0I)PkPGda_Uc<`jfu#-b4DHc7&z3p;Ek&gne55;zmybHte$@Tjx~sqSJNtV&U(a8$RF+z`qO-_Kj~-tlP=9^u6O6O-}L-$f6}`=J-_Qu^SS<{pY2b&G^e@Vozs5P z^Sk{?@ACBgu0PG^`jdXPKk3q(=6ZKd`%Ta9_9wl|)APIjG@t8F`q}=ZOLLm*-8tlk_g9{%P*!Cq3mSz2d4~{nEVpT<7(7fBdc0@8MWGelEWHJZ3t6 zu3X33@$)FJd+I!T9zWOpV{PQ~tggICPrQ7tGqZuV|K3CIhj`DsG19x7`lq>@Kdo2u zQ(vX4I_Zg*&vj-t(DvVZ==~7yc{fISms9^Vck`$9YJTdgbX6xk@$$LO%m&*2dk?)I z;yv%iNbhp$pXP40t>pRVuiSFu*Fy{S8+yzZ&f^(VdRyGxz&r#Sh#`yqeg zZvRi$_x7t;tJmJtol#!*)am+@UiICjPWe-O)>r99>$UgM`)A*y{>0rptNmO5-b49_ z_q-cR>s^1!pXQX`&@{c+b1BwBGfn{Ao`4U9R$z zuhNay)Bco$`V%LA;;a2z|K3CSi1)l3OY2>K%Ae+x-{mSl`6}ILJ?&39s6TP?C%)Rh z_3u5Dk9g0!v9#Xxr~GM7`CYE^ldsZ^*3(qZWADu(pbMh-~t5-jD zr#PLi{@wlE`Dnk2t9s>G#Z}+Y`s)1De{_!7d(^-2-ZP@_d62%>dxX8ZKg~$#<~z0!@=U)`VPr1@N~`n$faUga9CyZzJp(K%-C zjpXlnkLcSS^^jiaM(eNcPjk|IE?50sUstbkjn>`%Y5nLNv-d{w_q<2+?T&g#uXLmJ zSNEqmX+D>${;sd9SGh*(ZvV7?bdK43Bl&yYBl>nnJ)~E<(fX_V)0{M)%T<5Z*VU_B zqjk4`T0c6+?7flvJ?{~HyQ3b`E8S@Q)%|Hsn$P8`zw3LJzI@dA>iN}NqjhI~rET@< zukM`1%SWBmU+G5cX@AN={Zl;UTl2d9<)cp5Kdo1}&#F85mA2I@pSp7vFCTSMf2A9( zr~N4h^-u9<`&Pf&M}5CN-<{vi`{?&G_S5T0{ylADq$f`PE??cB^1JzF^`s;2&PD#O z&S^iy&zQg0gZz8i#z;?`{9V4fKjnAx&FV=<+?|X3U!Bu_h@UZkuLt?}w2hITIQhGL zb$`n5=9|@%j<`D)`M)}+{SZH6{$3CA?`az&J#q4P`Re|Z-_19xCmnHjF7khMPWvH# z#{9h=YVmN{EYc~J;=YOZH)B9$=~Ix`%`{5 z->jZ=qrC2^Q_WxZ-<{W->P*hByjxx6d9<#2JnHkQ_uc&|y{gxqU0y!wbp2mlul7>) z%tN*;BsN&-EvL^-*V9 zPyRb_wVP+uY5r$q*trMo$K`7FlyCKO{YhVa)S1?k{|;R3<{5RG{}~x}?m_!;xte{{ zx9-iJtNT-KH{Z_smW}3jxjVnxe|NpRpIQB^?yP;T?oV^L`F7T~Y&5^i-TB@AyX)Qk z%<9k9DZfYS^}dvjxYCW*$^RYr=q_gNx%;7>QPw(!h=iM0jlRnMee57}| znxA}??yNfXC+_AT|99Z-j(U1`KRxfp$e;9S?&c%C%hmkkt8{18sXuWy2l>APcX!m& zyZh;RH%9)XPjfdP>0PeoCtsyIt4{riyE(}J9k{!rp5EP0&$}`5Cw-c``AF|_HUFq@ z-AkQrzDs|91MPkFp0nPk>M`n1dY6ypn7!xbOX*$T*?o-a+57xy<<%#3y7^N2XZu#a+H-w>^d$Yo|M({sH`&Ie&Q*rnF*7fUK`F>Tt z{Z!mNzjgikR=!`AZ$G*GTeq)ouJ2j;@=@nk;%`6I{co#Z-!A^Oi+i7HKk4&c?^dh6 z#OvPdN&duV`<9OUQ#|GC&ZTmV*4_S}pda1ky?eBudzxd7^e#X4n)#gW@A|uXwG-FZ z)vH{ib+`W~=tp;X?;h>vp5|C1y~~fiWna}C|uD`2SJ8^woy~;IOcl&>Wesq`j?$LhkX^u70yZqQ|=5xBg>+kB- zPF!DCuX2sn-Tt4TAKm4>d$gZ>nq!UhE+){{SR%75|e@1A}`3BLMwM>A2rw4VHlQ~ry;_uqZJ z-(64j`yI_odg7yf68Bm**O%tI`27vk-tW_|_mbi%-?exH85mlRL=(p;)<)zfq~mox9Vv<`4g|4>LfjJ<-1Fr^6$=RkGto0^Qrk=-%rukoa%J< zt9;L>w`}A?`CUEvyPWb7cl+Lw*88gOkDjD=IpsHe-i>DBzCOLLX0>eRpPsgv|BSADCV_OCv>p6>jv-rY;Ozw1wWH9zUn zT;-}d^{;#CB)!X3->RqmtIw{dJHM-U_mb}K`jcMGPr5W$xvEb6>z+DE?{d|*>S_P# zv+L>3@9N#Xr2D)6q*wEkF3nZ0s#E{Er%uwlT=lJb+Q0hjdb;zwdUr4B{;ogi)%>J8 ziEX_&^r~L7C{A@(PyVa9JD;mxJ^$|gKYhQm^Uvz)_u%TwXV=r6&(-g`mtAsw z`hI8UpVifSclG77>*>zt>UZ7CF1bE^zq9kt>gv6_`tsTJbmw#RyY6L|T%W$*+4*O6 z_1;~5`RsbS^SSz6_p(c_Pv7tC{Ij}x@2Y+l*(KMf?{{|o(p~5N|E_EH z9ksvFS?#5Jjom%^&*AI6zQ%iX-qZDW{~7Q2(ZBuc{