-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDATA310-monte-carlo
273 lines (201 loc) · 8.73 KB
/
DATA310-monte-carlo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# DATA-3100
# Adefoluke Shemsu
setwd("~/Documents/Education/Penn/Classes/DATA 310/Week 2")
#1. In your own words, explain what a probability distribution is and why you might use one.
# Probability distribution describes any values and probabilities of a variable or vector within a specified range.
#2. X ~ N(26,54) (i.e. mean of 26 and variance of 54)
#a. p(24 < X < 30) = ?
(pnorm(30, 26, sqrt(54))) - (pnorm(24, 26, sqrt(54)))
# p(24 < X < 30) = .314 or 31.4% (rounded from .3141458)
#b. Find lb and ub on the symmetric CI such that p(lb < X < ub) = .9
((100 - 90) / 2 + 90) / 100
qnorm(0.95, 26, sqrt(54)) #ub = 38.08716
qnorm((1-0.95), 26, sqrt(54)) #lb = 13.91284
#3. Y ~ N(43,23) (i.e. mean of 43 and variance of 23)
#a. p(40 < Y < 50) = ?
(pnorm(50, 43, sqrt(23))) - (pnorm(40, 43, sqrt(23)))
# p(40 < Y < 50) = .662 or 66.2% (rounded from .6619928)
#b. The symmetric CI such p(lb < Y < ub) = .99
((100 - 99) / 2 + 99) / 100
qnorm(0.995, mean = 43, sd = sqrt(23)) #ub = 55.35324
qnorm(1-0.995, mean = 43, sd = sqrt(23)) #lb = 30.64676
#4. Modify the R code YahtzeeFirstRoll.R to answer the following questions:
# Original code - will modify in (a) and (b)
# How many dice
numdice = 5
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)
# Sets the seed
set.seed(12221979)
for (s in 1:numsims) {
# Draws uniform r.v.
uniform <- runif(n = numdice, min = 0, max = 1)
# Initializes a vector
dice <- matrix(-9, nrow = numdice, ncol = 1)
# Transforms uniform r.v. into dice rolls
# Applies inverse sampling
dice[uniform >= 0 & uniform <= (1/6)] <- 1
dice[uniform > (1/6) & uniform <= (2/6)] <- 2
dice[uniform > (2/6) & uniform <= (3/6)] <- 3
dice[uniform > (3/6) & uniform <= (4/6)] <- 4
dice[uniform > (4/6) & uniform <= (5/6)] <- 5
dice[uniform > (5/6) & uniform <= (6/6)] <- 6
# Finds the mode of dice and stores in maxnum
uniqv <- unique(dice)
maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
# Stores how many dice equal the mode
maxnumber[s] <- sum(dice == maxnum)
}
#a. Approximate the probability of the “maxnumber” random variable equaling one when rolling four dice
# How many dice
numdice = 4
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)
# Sets the seed
set.seed(12221979)
for (s in 1:numsims) {
# Draws uniform r.v.
uniform <- runif(n = numdice, min = 0, max = 1)
# Initializes a vector
dice <- matrix(-9, nrow = numdice, ncol = 1)
# Transforms uniform r.v. into dice rolls
# Applies inverse sampling
dice[uniform >= 0 & uniform <= (1/6)] <- 1
dice[uniform > (1/6) & uniform <= (2/6)] <- 2
dice[uniform > (2/6) & uniform <= (3/6)] <- 3
dice[uniform > (3/6) & uniform <= (4/6)] <- 4
dice[uniform > (4/6) & uniform <= (5/6)] <- 5
dice[uniform > (5/6) & uniform <= (6/6)] <- 6
# Finds the mode of dice and stores in maxnum
uniqv <- unique(dice)
maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
# Stores how many dice equal the mode
maxnumber[s] <- sum(dice == maxnum)
}
table(maxnumber)
mean(maxnumber == 1) * 100
#The probability of the “maxnumber” random variable equaling one when rolling four dice is ~27.8%.
#b. Approximate the probability of the “maxnumber” random variable equaling three or more when we roll six dice
# How many dice
numdice = 6
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)
# Sets the seed
set.seed(12221979)
for (s in 1:numsims) {
# Draws uniform r.v.
uniform <- runif(n = numdice, min = 0, max = 1)
# Initializes a vector
dice <- matrix(-9, nrow = numdice, ncol = 1)
# Transforms uniform r.v. into dice rolls
# Applies inverse sampling
dice[uniform >= 0 & uniform <= (1/6)] <- 1
dice[uniform > (1/6) & uniform <= (2/6)] <- 2
dice[uniform > (2/6) & uniform <= (3/6)] <- 3
dice[uniform > (3/6) & uniform <= (4/6)] <- 4
dice[uniform > (4/6) & uniform <= (5/6)] <- 5
dice[uniform > (5/6) & uniform <= (6/6)] <- 6
# Finds the mode of dice and stores in maxnum
uniqv <- unique(dice)
maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
# Stores how many dice equal the mode
maxnumber[s] <- sum(dice == maxnum)
}
table(maxnumber)
mean(maxnumber >= 3)*100
#The probability of the “maxnumber” random variable equaling one when rolling four dice is ~36.6%.
#c. In Yahtzee, there are two ways to roll what is called a “large straight”. You either need to a) roll a one, a two,
# a three, a four, and a five on your five dice, b) roll a two, a three, a four, a five, and a six on your five dice.
# Approximate the probability of rolling a large straight (hint: you can identify that a large straight has been
# rolled when value of “maxnum” is 1 and the sum of the five dice rolls equals 15 or 19).
# How many dice
numdice = 5
# How many simulations
numsims = 1000000
# Initializes a vector that contains the outcome of each simulation
maxnumber <- matrix(NA, nrow = numsims, ncol = 1)
# Initializes a vector that contains the outcome of each simulation
sumdice <- matrix(NA, nrow = numsims, ncol = 1)
# Sets the seed
set.seed(12221979)
for (s in 1:numsims) {
# Draws uniform r.v.
uniform <- runif(n = numdice, min = 0, max = 1)
# Initializes a vector
dice <- matrix(-9, nrow = numdice, ncol = 1)
# Transforms uniform r.v. into dice rolls
# Applies inverse sampling
dice[uniform >= 0 & uniform <= (1/6)] <- 1
dice[uniform > (1/6) & uniform <= (2/6)] <- 2
dice[uniform > (2/6) & uniform <= (3/6)] <- 3
dice[uniform > (3/6) & uniform <= (4/6)] <- 4
dice[uniform > (4/6) & uniform <= (5/6)] <- 5
dice[uniform > (5/6) & uniform <= (6/6)] <- 6
# Finds the mode of dice and stores in maxnum
uniqv <- unique(dice)
maxnum <- uniqv[which.max(tabulate(match(dice, uniqv)))]
# Stores how many dice equal the mode
maxnumber[s] <- sum(dice == maxnum)
# Stores the sum of the dice
sumdice[s] <- sum(dice)
}
table(maxnumber)
table(sumdice)
100 * mean(maxnumber == 1 & (sumdice == 15 | sumdice == 20))
#The probability of rolling a large straight--measured by the probability of the “maxnumber” variable equaling 1--
# given that the “sumdice” variable is 15 or 20, is ~3.1%.
#5. We are going to write a Monte Carlo simulation in R to simulate the flipping of a series of coins.
#a. Before you start working on this problem, write down a series of 25 made-up coin flips without using R or any other
# randomizing device (e.g., {HTTHTHTHHTHTTTTTHTHTHTHTT}).
HTHHTHTHTHTHTHHHHHTTTTHHH
#b. Modify the R code FlippingCoinsPartial.R to simulate the approximate cdf of the random variable equal to the longest
# streak in a series of 25 fair coin flips when you run 100,000 simulations. Every spot where you observe “XXX” in the
# code indicates something that you will need to fill in for your simulation to work.
# Sets the seed
set.seed(12221979)
# Defines the number of coins being flipped
numcoins = 25
# Defines the number of simulations
numsims = 100000
# Defines the probability that a coin is heads
probheads = 0.5
# Initializes a vector that contains the longest streak in every simulation
longstreak <- matrix(NA, nrow = numsims, ncol = 1)
# Uses a for loop to loop over simulations
for (s in 1:numsims) {
# Uses rbinom() function to draw a series Bernoulli r.v.s
# The number r.v.s in the series is defined by the parameter numcoins
coins <- rbinom(numcoins, size = 1, prob = probheads)
# Initializes a vector that contains the streak after every coin
streak <- matrix(NA, nrow = numcoins, ncol = 1)
# Initializes the streak to 1 after the first coin
streak[1] <- 1
# Uses a for loop to calculate streak after every coin
for (c in 2:numcoins) {
streak[c] <- 1 + streak[c-1]*(coins[c] == coins[c-1])
}
# Identifies the longest streak and stores in longest streak vector
longstreak[s] <- max(streak)
}
# Uses table command to create a pdf
pdf <- table(longstreak) / numsims
# Creates the cdf by looping over the pdf
cdf <- pdf
for (c in 2:dim(cdf)) {
cdf[c] <- cdf[c-1] + pdf[c]
}
cdfdf <- as.data.frame(cdf)
# c. According to the cdf, what is the probability of having a streak of more than 5 coins show up the same?
head(cdfdf)
(1 - cdfdf$Freq[cdfdf$longstreak == 5]) * 100
# The probability of having a streak of more than 5 coins show up the same is 29.67%.
# d. Using the series of coin flips you constructed before beginning this problem, what is the probability of having a
# streak longer than your longest streak?
(1 - cdfdf$Freq[cdfdf$longstreak == 3]) * 100
# The probability of having a longer streak is 84.92%.