HW3 #419

Open
wants to merge 21 commits into base: HW3
Changes from all commits
159 changes: 134 additions & 25 deletions HW2/P2/P2.py
@@ -10,6 +10,7 @@
import numpy as np
from timer import Timer
from parallel_vector import move_data_serial, move_data_fine_grained, move_data_medium_grained
import matplotlib.pyplot as plt

if __name__ == '__main__':
########################################
@@ -21,6 +22,7 @@

total = orig_counts.sum()


# serial move
counts = orig_counts.copy()
with Timer() as t:
@@ -29,23 +31,66 @@
print("Serial uncorrelated: {} seconds".format(t.interval))
serial_counts = counts.copy()

# fine grained
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained uncorrelated: {} seconds".format(t.interval))
### fine grained, 4 threads, uncorrelated, multilock ####
# counts[:] = orig_counts
# with Timer() as t:
# move_data_fine_grained(counts, src, dest, 100, 4)
# assert counts.sum() == total, "Wrong total after move_data_fine_grained"
# print("Fine grained uncorrelated: {} seconds".format(t.interval))
# for t in threads:

### fine grained, multithread, uncorrelated ###
# Save output for multiple Threads
threads = range(1,10)
time_fine_threads_uncor = []

for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100, th)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Number of Threads: {}".format(th))
print("Fine grained uncorrelated: {} seconds".format(t.interval))
time_fine_threads_uncor.append(t.interval)

########################################
# You should explore different values for the number of locks in the medium
# grained locking
########################################
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Medium grained uncorrelated: {} seconds".format(t.interval))

### Medium grained, 4 threads, uncorrelated, multi-lock ###
# Create a range of N values for Part 1
N_buffer = np.arange(1,21)

# Create a few N values
N_pts = [1,2,4,5,10]

# Save Medium grain time results
output_time_mg_uncor = []
output_time_mg_uncor_pts = []

# for N in N_buffer:
# counts[:] = orig_counts
# with Timer() as t:
# move_data_medium_grained(counts, src, dest, 100, N, 4)
# assert counts.sum() == total, "Wrong total after move_data_medium_grained"
# if (N == N_pts).any():
# print("Number of Locks: {}".format(N))
# print("Medium grained uncorrelated: {} seconds".format(t.interval))
# output_time_mg_uncor_pts.append(t.interval)
# output_time_mg_uncor.append(t.interval)

### Medium Grain, Multithread, Uncorrelated ###
time_med_threads_uncor = []
for th in threads:
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N, th)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Number of Threads: {}".format(th))
print("Medium grained uncorrelated: {} seconds".format(t.interval))
time_med_threads_uncor.append(t.interval)

########################################
# Now use correlated data movement
@@ -62,21 +107,85 @@
assert counts.sum() == total, "Wrong total after move_data_serial"
print("Serial correlated: {} seconds".format(t.interval))
serial_counts = counts.copy()

### fine grained, 4 threads, correlated, multilock ####
# counts[:] = orig_counts
# with Timer() as t:
# move_data_fine_grained(counts, src, dest, 100, 4)
# assert counts.sum() == total, "Wrong total after move_data_fine_grained"
# print("Fine grained correlated: {} seconds".format(t.interval))

# fine grained
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained correlated: {} seconds".format(t.interval))

### fine grained, multithread, correlated ###
time_fine_threads_cor = []
for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100, th)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Number of Locks: {}".format(th))
print("Fine grained uncorrelated: {} seconds".format(t.interval))
time_fine_threads_cor.append(t.interval)

########################################
# You should explore different values for the number of locks in the medium
# grained locking
########################################
N = 10
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Medium grained correlated: {} seconds".format(t.interval))

### medium grained, 4 threads, correlated, multilock ####
# output_time_mg_cor = []
# output_time_mg_cor_pts = []
# for N in N_buffer:
# counts[:] = orig_counts
# with Timer() as t:
# move_data_medium_grained(counts, src, dest, 100, N, 4)
# assert counts.sum() == total, "Wrong total after move_data_medium_grained"
# if (N == N_pts).any():
# print("Number of Locks: {}".format(N))
# print("Medium grained correlated: {} seconds".format(t.interval))
# output_time_mg_cor_pts.append(t.interval)
# output_time_mg_cor.append(t.interval)


### Medium Grain, Multithread, Correlated ###
time_med_threads_cor = []
N = 10
for th in threads:
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N, th)
assert counts.sum() == total, "Wrong total after move_data_medium_grained"
print("Number of Threads: {}".format(th))
print("Medium grained uncorrelated: {} seconds".format(t.interval))
time_med_threads_cor.append(t.interval)


### Multi-N plotting ###
# plt.figure(figsize=(10,8))
# plt.plot(N_buffer, output_time_mg_uncor)
# plt.plot(N_buffer, output_time_mg_cor)
# plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'Uncorrelated')
# plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated')
# plt.title("Time of Array Shuffle Correlated vs. Uncorrelated (Medium Grain Locking)")
# plt.xlabel("N")
# plt.ylabel("Completion Time")
# plt.legend(loc=2)
# plt.show()

### Multithread plotting ###
plt.figure(figsize=(10,8))

# Uncorrelated
plt.plot(threads, time_fine_threads_uncor, label=u'Fine Uncorrelated')
plt.plot(threads, time_med_threads_uncor, label=u'Medium Uncorrelated')

# Correlated
plt.plot(threads, time_fine_threads_cor, label=u'Fine Correlated')
plt.plot(threads, time_med_threads_cor, label=u'Medium Correlated')

# plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated')
# plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated')
plt.title("Time of Multithreaded Array Suffle Correlated vs. Uncorrelated")
plt.xlabel("Threads")
plt.ylabel("Completation Time")
plt.legend(loc=2)
plt.show()
42 changes: 42 additions & 0 deletions HW2/P2/P2.txt
@@ -0,0 +1,42 @@
### Part A ###

# Uncorrelated
Serial uncorrelated: 0.327378034592 seconds
Fine grained uncorrelated: 8.42167806625 seconds

Number of Locks: 1
Medium grained uncorrelated: 9.59998893738 seconds
Number of Locks: 2
Medium grained uncorrelated: 10.9623169899 seconds
Number of Locks: 4
Medium grained uncorrelated: 11.0328228474 seconds
Number of Locks: 5
Medium grained uncorrelated: 11.3029088974 seconds
Number of Locks: 10
Medium grained uncorrelated: 9.50029206276 seconds


# Correlated
Serial correlated: 0.472969055176 seconds
Fine grained correlated: 7.73167490959 seconds

Number of Locks: 1
Medium grained correlated: 9.56373286247 seconds
Number of Locks: 2
Medium grained correlated: 8.43163609505 seconds
Number of Locks: 4
Medium grained correlated: 8.61915707588 seconds
Number of Locks: 5
Medium grained correlated: 8.97014594078 seconds
Number of Locks: 10
Medium grained correlated: 11.4742889404 seconds

For this part of the problem, we fixed the number of threads at 4 and varied N, the number of elements guarded by each lock, in the medium-grained data move. The plotted results show little difference in execution time for the uncorrelated data move, i.e. how much data each lock covers does not change the performance. For the correlated data, coarser locking (larger N) improves performance. This makes sense: the source and destination of a correlated move lie close together, so when one lock covers both, a thread only has to acquire a single lock. The ideal value should be N = 20, since the source and destination are never more than 10 elements away from one another in either direction, so a block of 20 elements will usually cover both. However, my results tell a different story: there are bad performance spikes at N = 10 and N = 20. Based on the results I would pick N between 10 and 20.
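
To make that reasoning concrete, here is a small standalone sketch. The data generation is my assumption of the correlated pattern described above (destinations at most 10 elements from their sources), not the course's actual generator:

import numpy as np

size = 1000
src = np.random.randint(0, size, size)
# assumed correlated pattern: destination within +/-10 of the source
dest = (src + np.random.randint(-10, 11, size)) % size

for N in [1, 2, 4, 5, 10, 20]:
    # lock index = element index // N, so a move needs only one lock when
    # both endpoints fall inside the same block of N elements
    one_lock = (src // N == dest // N).mean()
    print("N = {:2d}: {:.0%} of moves need a single lock".format(N, one_lock))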

### Part B ###

Serial uncorrelated: 0.36355805397 seconds
Serial correlated: 0.428131818771 seconds

In Part B we were required to fix N at 10 and see how performance varies with the number of threads available to move the data. From the plot we can see a point of diminishing returns at 4 threads: performance improves up to 4 threads but degrades quickly as we add more. Moreover, the threaded results are much worse than the serial results; this is due to the synchronization needed between threads to move the data.
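
As a quick way to read those curves, a minimal sketch (assuming the threads, time_fine_threads_uncor, and time_med_threads_uncor lists collected in P2.py above are still in scope) that normalizes each threaded time by the serial baseline:

# speedup below 1.0 means the threaded move is slower than serial
serial_time = 0.36355805397  # Part B serial uncorrelated time, from above
for th, tf, tm in zip(threads, time_fine_threads_uncor, time_med_threads_uncor):
    print("threads={}: fine {:.3f}x, medium {:.3f}x".format(
        th, serial_time / tf, serial_time / tm))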

Binary file added HW2/P2/P2_multilock.png
Binary file added HW2/P2/P2_mutlithread.png
104 changes: 81 additions & 23 deletions HW2/P2/parallel_vector.pyx
@@ -1,12 +1,10 @@
# turn off bounds checking & wraparound for arrays
#cython: boundscheck=False, wraparound=False
# cython: boundscheck=False, wraparound=False

##################################################
# setup and helper code
##################################################


from cython.parallel import parallel, prange
from cython.parallel import parallel, prange, threadid
from openmp cimport omp_lock_t, \
omp_init_lock, omp_destroy_lock, \
omp_set_lock, omp_unset_lock, omp_get_thread_num
@@ -15,7 +13,6 @@ from libc.stdlib cimport malloc, free
import numpy as np
cimport numpy as np


# lock helper functions
cdef void acquire(omp_lock_t *l) nogil:
omp_set_lock(l)
@@ -45,11 +42,71 @@ cdef void free_N_locks(int N, omp_lock_t *locks) nogil:

free(<void *> locks)

# My function for checking lock conditions for fine-grained locking
cdef void check_lock(int idx, omp_lock_t *locks, np.int32_t[:] counts,
np.int32_t[:] src, np.int32_t[:] dest) nogil:

# If the source index is greater, grab its lock first
# This prevents deadlocking
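# (every thread takes the higher-indexed lock first, a single global order)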
if src[idx] > dest[idx]:
acquire(&locks[src[idx]])
acquire(&locks[dest[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])
release(&locks[dest[idx]])

# If the destination index is greater, grab its lock first
# Also prevents deadlocking
elif src[idx] < dest[idx]:
acquire(&locks[dest[idx]])
acquire(&locks[src[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])
release(&locks[dest[idx]])

# If the indices are equal, only grab one lock
# This prevents double locking
else:
acquire(&locks[src[idx]])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]])


# My function updated for coarser-grained (medium) locking
cdef void check_lock_med(int idx, int N, omp_lock_t *locks, np.int32_t[:] counts,
np.int32_t[:] src, np.int32_t[:] dest) nogil:

# We now compare src[idx]/N and dest[idx]/N: each lock guards a block
# of N consecutive elements, so index/N is the index of the owning lock
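# (e.g. with N = 10, elements 0..9 map to lock 0 and elements 10..19 to lock 1)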
if src[idx]/N > dest[idx]/N:
acquire(&locks[src[idx]/N])
acquire(&locks[dest[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])
release(&locks[dest[idx]/N])

elif src[idx]/N < dest[idx]/N:
acquire(&locks[dest[idx]/N])
acquire(&locks[src[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])
release(&locks[dest[idx]/N])

else:
acquire(&locks[src[idx]/N])
counts[dest[idx]] += 1
counts[src[idx]] -= 1
release(&locks[src[idx]/N])


##################################################
# Your code below
##################################################

cpdef move_data_serial(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
@@ -65,11 +122,12 @@ cpdef move_data_serial(np.int32_t[:] counts,
counts[dest[idx]] += 1
counts[src[idx]] -= 1


# Updated move_data_fine_grained for parallel implementation
cpdef move_data_fine_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat):
int repeat,
int threads):
cdef:
int idx, r
omp_lock_t *locks = get_N_locks(counts.shape[0])
@@ -79,12 +137,11 @@ cpdef move_data_fine_grained(np.int32_t[:] counts,
# Use parallel.prange() and a lock for each element of counts to parallelize
# data movement. Be sure to avoid deadlock, and double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

for r in xrange(repeat):
for idx in prange(src.shape[0], nogil=True, num_threads=threads):
if counts[src[idx]] > 0:
check_lock(idx, locks, counts, src, dest)

free_N_locks(counts.shape[0], locks)

@@ -93,7 +150,8 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat,
int N):
int N,
int threads):
cdef:
int idx, r
int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks
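# (ceiling division: e.g. 100 elements with N = 30 yields 4 locks, not 3)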
@@ -105,11 +163,11 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
# to parallelize data movement. Be sure to avoid deadlock, as well as
# double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

free_N_locks(num_locks, locks)

for r in xrange(repeat):
for idx in prange(src.shape[0], nogil=True, num_threads=threads):
if counts[src[idx]] > 0:
check_lock_med(idx, N, locks, counts, src, dest)

free_N_locks(num_locks, locks)
