HW3 #419

Open
wants to merge 21 commits into base: HW3
Changes from all commits
159 changes: 134 additions & 25 deletions HW2/P2/P2.py
@@ -10,6 +10,7 @@
import numpy as np
from timer import Timer
from parallel_vector import move_data_serial, move_data_fine_grained, move_data_medium_grained
import matplotlib.pyplot as plt

if __name__ == '__main__':
########################################
@@ -21,6 +22,7 @@

total = orig_counts.sum()


# serial move
counts = orig_counts.copy()
with Timer() as t:
@@ -29,23 +31,66 @@
print("Serial uncorrelated: {} seconds".format(t.interval))
serial_counts = counts.copy()

# fine grained
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained uncorrelated: {} seconds".format(t.interval))
### fine grained, 4 threads, uncorrelated, multilock ####
# counts[:] = orig_counts
# with Timer() as t:
# move_data_fine_grained(counts, src, dest, 100, 4)
# assert counts.sum() == total, "Wrong total after move_data_fine_grained"
# print("Fine grained uncorrelated: {} seconds".format(t.interval))
# for t in threads:

### fine grained, multithread, uncorrelated ###
# Save output for multiple Threads
threads = range(1,10)
time_fine_threads_uncor = []

for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100, th)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Number of Threads: {}".format(th))
print("Fine grained uncorrelated: {} seconds".format(t.interval))
time_fine_threads_uncor.append(t.interval)

########################################
# You should explore different values for the number of locks in the medium
# grained locking
########################################
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Medium grained uncorrelated: {} seconds".format(t.interval))

### Medium grained, 4 threads, uncorrelated, multi-lock ###
# Create a range of N values for Part 1
N_buffer = np.arange(1,21)

# Create a few N values
N_pts = [1,2,4,5,10]

# Save Medium grain time results
output_time_mg_uncor = []
output_time_mg_uncor_pts = []

# for N in N_buffer:
# counts[:] = orig_counts
# with Timer() as t:
# move_data_medium_grained(counts, src, dest, 100, N, 4)
# assert counts.sum() == total, "Wrong total after move_data_medium_grained"
# if (N == N_pts).any():
# print("Number of Locks: {}".format(N))
# print("Medium grained uncorrelated: {} seconds".format(t.interval))
# output_time_mg_uncor_pts.append(t.interval)
# output_time_mg_uncor.append(t.interval)

### Medium Grain, Multithread, Uncorrelated ###
time_med_threads_uncor = []
for th in threads:
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N, th)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Number of Threads: {}".format(th))
print("Medium grained uncorrelated: {} seconds".format(t.interval))
time_med_threads_uncor.append(t.interval)

########################################
# Now use correlated data movement
@@ -62,21 +107,85 @@
assert counts.sum() == total, "Wrong total after move_data_serial"
print("Serial correlated: {} seconds".format(t.interval))
serial_counts = counts.copy()

### fine grained, 4 threads, correlated, multilock ####
# counts[:] = orig_counts
# with Timer() as t:
# move_data_fine_grained(counts, src, dest, 100, 4)
# assert counts.sum() == total, "Wrong total after move_data_fine_grained"
# print("Fine grained correlated: {} seconds".format(t.interval))

# fine grained
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained correlated: {} seconds".format(t.interval))

### fine grained, multithread, correlated ###
time_fine_threads_cor = []
for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100, th)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Number of Locks: {}".format(th))
print("Fine grained uncorrelated: {} seconds".format(t.interval))
time_fine_threads_cor.append(t.interval)

########################################
# You should explore different values for the number of locks in the medium
# grained locking
########################################
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Medium grained correlated: {} seconds".format(t.interval))

### medium grained, 4 threads, correlated, multilock ####
# output_time_mg_cor = []
# output_time_mg_cor_pts = []
# for N in N_buffer:
# counts[:] = orig_counts
# with Timer() as t:
# move_data_medium_grained(counts, src, dest, 100, N, 4)
# assert counts.sum() == total, "Wrong total after move_data_medium_grained"
# if (N == N_pts).any():
# print("Number of Locks: {}".format(N))
# print("Medium grained correlated: {} seconds".format(t.interval))
# output_time_mg_cor_pts.append(t.interval)
# output_time_mg_cor.append(t.interval)


### Medium Grain, Multithread, Correlated ###
time_med_threads_cor = []
N = 10
for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N, th)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Number of Threads: {}".format(th))
print("Medium grained uncorrelated: {} seconds".format(t.interval))
time_med_threads_cor.append(t.interval)


### Multi-N plotting ###
# plt.figure(figsize=(10,8))
# plt.plot(N_buffer, output_time_mg_uncor)
# plt.plot(N_buffer, output_time_mg_cor)
# plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'Uncorrelated')
# plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated')
# plt.title("Time of Array Shuffle Correlated vs. Uncorrelated (Medium Grain Locking)")
# plt.xlabel("N")
# plt.ylabel("Completion Time")
# plt.legend(loc=2)
# plt.show()

### Multithread plotting ###
plt.figure(figsize=(10,8))

# Uncorrelated
plt.plot(threads, time_fine_threads_uncor, label=u'Fine Uncorrelated')
plt.plot(threads, time_med_threads_uncor, label=u'Medium Uncorrelated')

# Correlated
plt.plot(threads, time_fine_threads_cor, label=u'Fine Correlated')
plt.plot(threads, time_med_threads_cor, label=u'Medium Correlated')

# plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated')
# plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated')
plt.title("Time of Multithreaded Array Suffle Correlated vs. Uncorrelated")
plt.xlabel("Threads")
plt.ylabel("Completation Time")
plt.legend(loc=2)
plt.show()
42 changes: 42 additions & 0 deletions HW2/P2/P2.txt
@@ -0,0 +1,42 @@
### Part A ###

# Uncorrelated
Serial uncorrelated: 0.327378034592 seconds
Fine grained uncorrelated: 8.42167806625 seconds

Number of Locks: 1
Medium grained uncorrelated: 9.59998893738 seconds
Number of Locks: 2
Medium grained uncorrelated: 10.9623169899 seconds
Number of Locks: 4
Medium grained uncorrelated: 11.0328228474 seconds
Number of Locks: 5
Medium grained uncorrelated: 11.3029088974 seconds
Number of Locks: 10
Medium grained uncorrelated: 9.50029206276 seconds


# Correlated
Serial correlated: 0.472969055176 seconds
Fine grained correlated: 7.73167490959 seconds

Number of Locks: 1
Medium grained correlated: 9.56373286247 seconds
Number of Locks: 2
Medium grained correlated: 8.43163609505 seconds
Number of Locks: 4
Medium grained correlated: 8.61915707588 seconds
Number of Locks: 5
Medium grained correlated: 8.97014594078 seconds
Number of Locks: 10
Medium grained correlated: 11.4742889404 seconds

For this part of the problem, we fixed the number of threads at 4 and varied N, the number of elements guarded by each lock, in the medium-grained data move. The plotted results show little difference in execution time for the uncorrelated data move, i.e. how much data each lock covers does not change the performance. For the correlated data, coarser locking (larger N) improves performance. This makes sense: the source and destination of a correlated move lie close together, so when one lock covers both, a thread only has to acquire a single lock. The ideal value should be N = 20, since the source and destination are never more than 10 elements away from one another in either direction, so a block of 20 elements will usually cover both. However, my results tell a different story: there are bad performance spikes at N = 10 and N = 20. Based on the results I would pick N between 10 and 20.
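
To make that reasoning concrete, here is a small standalone sketch. The data generation is my assumption of the correlated pattern described above (destinations at most 10 elements from their sources), not the course's actual generator:

import numpy as np

size = 1000
src = np.random.randint(0, size, size)
# assumed correlated pattern: destination within +/-10 of the source
dest = (src + np.random.randint(-10, 11, size)) % size

for N in [1, 2, 4, 5, 10, 20]:
    # lock index = element index // N, so a move needs only one lock when
    # both endpoints fall inside the same block of N elements
    one_lock = (src // N == dest // N).mean()
    print("N = {:2d}: {:.0%} of moves need a single lock".format(N, one_lock))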

### Part B ###

Serial uncorrelated: 0.36355805397 seconds
Serial correlated: 0.428131818771 seconds

In Part B we were required to fix N at 10 and see how performance varies with the number of threads available to move the data. From the plot we can see a point of diminishing returns at 4 threads: performance improves up to 4 threads but degrades quickly as we add more. Moreover, the threaded results are much worse than the serial results; this is due to the synchronization needed between threads to move the data.
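
As a quick way to read those curves, a minimal sketch (assuming the threads, time_fine_threads_uncor, and time_med_threads_uncor lists collected in P2.py above are still in scope) that normalizes each threaded time by the serial baseline:

# speedup below 1.0 means the threaded move is slower than serial
serial_time = 0.36355805397  # Part B serial uncorrelated time, from above
for th, tf, tm in zip(threads, time_fine_threads_uncor, time_med_threads_uncor):
    print("threads={}: fine {:.3f}x, medium {:.3f}x".format(
        th, serial_time / tf, serial_time / tm))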

Binary file added HW2/P2/P2_multilock.png
Binary file added HW2/P2/P2_mutlithread.png
104 changes: 81 additions & 23 deletions HW2/P2/parallel_vector.pyx
@@ -1,12 +1,10 @@
# turn off bounds checking & wraparound for arrays
#cython: boundscheck=False, wraparound=False
# cython: boundscheck=False, wraparound=False

##################################################
# setup and helper code
##################################################


from cython.parallel import parallel, prange
from cython.parallel import parallel, prange, threadid
from openmp cimport omp_lock_t, \
omp_init_lock, omp_destroy_lock, \
omp_set_lock, omp_unset_lock, omp_get_thread_num
@@ -15,7 +13,6 @@ from libc.stdlib cimport malloc, free
import numpy as np
cimport numpy as np


# lock helper functions
cdef void acquire(omp_lock_t *l) nogil:
omp_set_lock(l)
@@ -45,11 +42,71 @@ cdef void free_N_locks(int N, omp_lock_t *locks) nogil:

free(<void *> locks)

# My function for checking lock conditions for fine-grained locking
cdef void check_lock(int idx, omp_lock_t *locks, np.int32_t[:] counts,
np.int32_t[:] src, np.int32_t[:] dest) nogil:

# If the source index is greater, grab its lock first
# This prevents deadlocking
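# (every thread takes the higher-indexed lock first, a single global order)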
if src[idx] > dest[idx]:
acquire(&locks[src[idx]])
acquire(&locks[dest[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])
release(&locks[dest[idx]])

# If the destination index is greater, grab its lock first
# Also prevents deadlocking
elif src[idx] < dest[idx]:
acquire(&locks[dest[idx]])
acquire(&locks[src[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])
release(&locks[dest[idx]])

# If the indices are equal, only grab one lock
# This prevents double locking
else:
acquire(&locks[src[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])


# My function updated for coarser-grained (medium) locking
cdef void check_lock_med(int idx, int N, omp_lock_t *locks, np.int32_t[:] counts,
np.int32_t[:] src, np.int32_t[:] dest) nogil:

# We now compare src[idx]/N and dest[idx]/N: each lock guards a block
# of N consecutive elements, so index/N is the index of the owning lock
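# (e.g. with N = 10, elements 0..9 map to lock 0 and elements 10..19 to lock 1)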
if src[idx]/N > dest[idx]/N:
acquire(&locks[src[idx]/N])
acquire(&locks[dest[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])
release(&locks[dest[idx]/N])

elif src[idx]/N < dest[idx]/N:
acquire(&locks[dest[idx]/N])
acquire(&locks[src[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])
release(&locks[dest[idx]/N])

else:
acquire(&locks[src[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])


##################################################
# Your code below
##################################################

cpdef move_data_serial(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
@@ -65,11 +122,12 @@ cpdef move_data_serial(np.int32_t[:] counts,
counts[dest[idx]] += 1
counts[src[idx]] -= 1


# Updated move_data_fine_grained for parallel implementation
cpdef move_data_fine_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat):
int repeat,
int threads):
cdef:
int idx, r
omp_lock_t *locks = get_N_locks(counts.shape[0])
@@ -79,12 +137,11 @@ cpdef move_data_fine_grained(np.int32_t[:] counts,
# Use parallel.prange() and a lock for each element of counts to parallelize
# data movement. Be sure to avoid deadlock, and double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

for r in xrange(repeat):
for idx in prange(src.shape[0], nogil=True, num_threads=threads):
if counts[src[idx]] > 0:
check_lock(idx, locks, counts, src, dest)

free_N_locks(counts.shape[0], locks)

@@ -93,7 +150,8 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat,
int N):
int N,
int threads):
cdef:
int idx, r
int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks
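# (ceiling division: e.g. 100 elements with N = 30 yields 4 locks, not 3)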
@@ -105,11 +163,11 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
# to parallelize data movement. Be sure to avoid deadlock, as well as
# double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

free_N_locks(num_locks, locks)

for r in xrange(repeat):
for idx in prange(src.shape[0], nogil=True, num_threads=threads):
if counts[src[idx]] > 0:
check_lock_med(idx, N, locks, counts, src, dest)

free_N_locks(num_locks, locks)
