DATA310-monte-carlo

# DATA-3100
# Adefoluke Shemsu

setwd("~/Documents/Education/Penn/Classes/DATA 310/Week 2")

#1. In your own words, explain what a probability distribution is and why you might use one.

# Probability distribution describes any values and probabilities of a variable or vector within a specified range.

#2. X ~ N(26,54) (i.e. mean of 26 and variance of 54) 

#a. p(24 < X < 30) = ?

(pnorm(30, 26, sqrt(54))) - (pnorm(24, 26, sqrt(54))) 

# p(24 < X < 30) = .314 or 31.4% (rounded from .3141458)

#b. Find lb and ub on the symmetric CI such that p(lb < X < ub) = .9

((100 - 90) / 2 + 90) / 100

qnorm(0.95, 26, sqrt(54)) #ub = 38.08716
qnorm((1-0.95), 26, sqrt(54)) #lb = 13.91284

#3. Y ~ N(43,23) (i.e. mean of 43 and variance of 23) 

#a. p(40 < Y < 50) = ?

(pnorm(50, 43, sqrt(23))) - (pnorm(40, 43, sqrt(23))) 

# p(40 < Y < 50) = .662 or 66.2% (rounded from .6619928)

#b. The symmetric CI such p(lb < Y < ub) = .99

((100 - 99) / 2 + 99) / 100

qnorm(0.995, mean = 43, sd = sqrt(23)) #ub = 55.35324
qnorm(1-0.995, mean = 43, sd = sqrt(23)) #lb = 30.64676

#4. Modify the R code YahtzeeFirstRoll.R to answer the following questions:

# Original code - will modify in (a) and (b)

# How many dice
numdice = 5
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)  
# Sets the seed
set.seed(12221979)

for (s in 1:numsims) {
  
  # Draws uniform r.v.
  uniform <- runif(n = numdice, min = 0, max = 1)
  # Initializes a vector
  dice <- matrix(-9, nrow = numdice, ncol = 1)  
  # Transforms uniform r.v. into dice rolls
  # Applies inverse sampling 
  dice[uniform >= 0 & uniform <= (1/6)] <- 1
  dice[uniform > (1/6) & uniform <= (2/6)] <- 2
  dice[uniform > (2/6) & uniform <= (3/6)] <- 3
  dice[uniform > (3/6) & uniform <= (4/6)] <- 4
  dice[uniform > (4/6) & uniform <= (5/6)] <- 5
  dice[uniform > (5/6) & uniform <= (6/6)] <- 6
  
  # Finds the mode of dice and stores in maxnum
  uniqv <- unique(dice)
  maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
  
  # Stores how many dice equal the mode
  maxnumber[s] <- sum(dice == maxnum)
}

#a. Approximate the probability of the “maxnumber” random variable equaling one when rolling four dice

# How many dice
numdice = 4
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)  
# Sets the seed
set.seed(12221979)

for (s in 1:numsims) {
  
  # Draws uniform r.v.
  uniform <- runif(n = numdice, min = 0, max = 1)
  # Initializes a vector
  dice <- matrix(-9, nrow = numdice, ncol = 1)  
  # Transforms uniform r.v. into dice rolls
  # Applies inverse sampling 
  dice[uniform >= 0 & uniform <= (1/6)] <- 1
  dice[uniform > (1/6) & uniform <= (2/6)] <- 2
  dice[uniform > (2/6) & uniform <= (3/6)] <- 3
  dice[uniform > (3/6) & uniform <= (4/6)] <- 4
  dice[uniform > (4/6) & uniform <= (5/6)] <- 5
  dice[uniform > (5/6) & uniform <= (6/6)] <- 6
  
  # Finds the mode of dice and stores in maxnum
  uniqv <- unique(dice)
  maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
  
  # Stores how many dice equal the mode
  maxnumber[s] <- sum(dice == maxnum)
}

table(maxnumber)
mean(maxnumber == 1) * 100

#The probability of the “maxnumber” random variable equaling one when rolling four dice is ~27.8%.

#b. Approximate the probability of the “maxnumber” random variable equaling three or more when we roll six dice

# How many dice
numdice = 6
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)  
# Sets the seed
set.seed(12221979)

for (s in 1:numsims) {
  
  # Draws uniform r.v.
  uniform <- runif(n = numdice, min = 0, max = 1)
  # Initializes a vector
  dice <- matrix(-9, nrow = numdice, ncol = 1)  
  # Transforms uniform r.v. into dice rolls
  # Applies inverse sampling 
  dice[uniform >= 0 & uniform <= (1/6)] <- 1
  dice[uniform > (1/6) & uniform <= (2/6)] <- 2
  dice[uniform > (2/6) & uniform <= (3/6)] <- 3
  dice[uniform > (3/6) & uniform <= (4/6)] <- 4
  dice[uniform > (4/6) & uniform <= (5/6)] <- 5
  dice[uniform > (5/6) & uniform <= (6/6)] <- 6
  
  # Finds the mode of dice and stores in maxnum
  uniqv <- unique(dice)
  maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
  
  # Stores how many dice equal the mode
  maxnumber[s] <- sum(dice == maxnum)
}

table(maxnumber)
mean(maxnumber >= 3)*100

#The probability of the “maxnumber” random variable equaling one when rolling four dice is ~36.6%.

#c. In Yahtzee, there are two ways to roll what is called a “large straight”. You either need to a) roll a one, a two, 
# a three, a four, and a five on your five dice, b) roll a two, a three, a four, a five, and a six on your five dice. 
# Approximate the probability of rolling a large straight (hint: you can identify that a large straight has been 
# rolled when value of “maxnum” is 1 and the sum of the five dice rolls equals 15 or 19).

# How many dice
numdice = 5
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)  
# Initializes a vector that contains the outcome of each simulation
sumdice <- matrix(NA, nrow = numsims, ncol = 1) 
# Sets the seed
set.seed(12221979)

for (s in 1:numsims) {
  
  # Draws uniform r.v.
  uniform <- runif(n = numdice, min = 0, max = 1)
  # Initializes a vector
  dice <- matrix(-9, nrow = numdice, ncol = 1)  
  # Transforms uniform r.v. into dice rolls
  # Applies inverse sampling 
  dice[uniform >= 0 & uniform <= (1/6)] <- 1
  dice[uniform > (1/6) & uniform <= (2/6)] <- 2
  dice[uniform > (2/6) & uniform <= (3/6)] <- 3
  dice[uniform > (3/6) & uniform <= (4/6)] <- 4
  dice[uniform > (4/6) & uniform <= (5/6)] <- 5
  dice[uniform > (5/6) & uniform <= (6/6)] <- 6
  
  # Finds the mode of dice and stores in maxnum
  uniqv <- unique(dice)
  maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
  
  # Stores how many dice equal the mode
  maxnumber[s] <- sum(dice == maxnum)
  
  # Stores the sum of the dice
  sumdice[s] <- sum(dice)
}

table(maxnumber)
table(sumdice)

100 * mean(maxnumber == 1 & (sumdice == 15 | sumdice == 20))

#The probability of rolling a large straight--measured by the probability of the “maxnumber” variable equaling 1--
# given that the “sumdice” variable is 15 or 20, is ~3.1%.

#5. We are going to write a Monte Carlo simulation in R to simulate the flipping of a series of coins.

#a. Before you start working on this problem, write down a series of 25 made-up coin flips without using R or any other
# randomizing device (e.g., {HTTHTHTHHTHTTTTTHTHTHTHTT}).

HTHHTHTHTHTHTHHHHHTTTTHHH

#b. Modify the R code FlippingCoinsPartial.R to simulate the approximate cdf of the random variable equal to the longest 
# streak in a series of 25 fair coin flips when you run 100,000 simulations. Every spot where you observe “XXX” in the 
# code indicates something that you will need to fill in for your simulation to work.

# Sets the seed
set.seed(12221979)

# Defines the number of coins being flipped
numcoins = 25
# Defines the number of simulations
numsims = 100000
# Defines the probability that a coin is heads
probheads = 0.5

# Initializes a vector that contains the longest streak in every simulation
longstreak <- matrix(NA, nrow = numsims, ncol = 1) 

# Uses a for loop to loop over simulations
for (s in 1:numsims) {
  
  # Uses rbinom() function to draw a series Bernoulli r.v.s 
  # The number r.v.s in the series is defined by the parameter numcoins
  coins <- rbinom(numcoins, size = 1, prob = probheads)
  # Initializes a vector that contains the streak after every coin
  streak <- matrix(NA, nrow = numcoins, ncol = 1) 
  # Initializes the streak to 1 after the first coin
  streak[1] <- 1
  # Uses a for loop to calculate streak after every coin
  for (c in 2:numcoins) {
    
    streak[c] <- 1 + streak[c-1]*(coins[c] == coins[c-1])
    
  }
  
  # Identifies the longest streak and stores in longest streak vector
  longstreak[s] <- max(streak)
  
}

# Uses table command to create a pdf
pdf <- table(longstreak) / numsims
# Creates the cdf by looping over the pdf
cdf <- pdf
for (c in 2:dim(cdf)) {
  cdf[c] <- cdf[c-1] + pdf[c]
}

cdfdf <- as.data.frame(cdf)

# c. According to the cdf, what is the probability of having a streak of more than 5 coins show up the same?

head(cdfdf)

(1 - cdfdf$Freq[cdfdf$longstreak == 5]) * 100

# The probability of having a streak of more than 5 coins show up the same is 29.67%.

# d. Using the series of coin flips you constructed before beginning this problem, what is the probability of having a 
# streak longer than your longest streak?

(1 - cdfdf$Freq[cdfdf$longstreak == 3]) * 100

# The probability of having a longer streak is 84.92%.