-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpg_Permutation_Glassfrog.Rmd
348 lines (264 loc) · 11.8 KB
/
pg_Permutation_Glassfrog.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
---
title: "20241014_Treatment_Limma_Models_Revised"
author: "cecilia"
date: "10/14/24"
output: html_document
Here is the link to all the data and scripts
https://dataverse.tdl.org/dataset.xhtml?persistentId=doi:10.18738/T8/RHYJQ5
This is the code I would use:
20221201_Treatment_Limma_Models_Revised.Rmd
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```
```{r echo = T}
library(dplyr)
library(limma)
library(tidyverse)
library(tibble)
library(ggplot2)
```
Functions used throughout
Function for calculating q values -- number and proportion of randomly generated t stastics that are greater than or equal to the observed t statistic. The proportion indicates the empirical p-value adjustment
t = a matrix of observed t statistics generated by limma
t.rand = a list of n elements each of which is matrix of t statistics generated by limma after randomization of sample association with treatment and preference
n = number of permutation iterations
```{r echo = T}
calcQ = function(t, t.rand, n){
q = matrix(0, nrow = nrow(t), ncol = ncol(t))
t = abs(t) # sign indicated the directionality, magnitude determines significance comparisons should be of magnitude not directionality
t.rand = lapply(t.rand, abs)
for(i in 1 : n){
temp = t.rand[[i]]
#for(c in 1 : 4){
for(r in 1 : nrow(t)){
if(is.na(temp[r])){
# Partial NA coefficients for 1 probe(s)
}
else if(temp[r] >= t[r]){
q[r] = q[r] + 1 # every time randamized t statistic value is great than the observed t statistic add one, so you get a total count
}
}
}
#}
q_num = q
q = q_num / n # divided the number of randomized p-values less than the obs p-value by the number of iterations
q = cbind.data.frame(GeneID = row.names(t), q_num = q_num, q=q)
}
```
Function to extract summary statistics from permutation analyses.
t.rand = a list of n elements each of which is matrix of t statistics generated by permuation follow by limma (permute_Dis_FfPref_MmPref_tvalue)
t = a matrix of observed t statistics generated by limma
p = a matrix of observed p-values generated by limma
q = a matrix of q-values generated by permutation analysis (calcQ)
```{r echo = T}
# t.rand = t_rand_Dis_FfPref_MmPref
# t = t_Dis_FfPref_MmPref
# p = p_Dis_FfPref_MmPref
# q = q_Dis_FfPref_MmPref
permStats = function(t.rand, t, p, q){
t.rand = data.frame(do.call(cbind, t.rand))
temp_permStats= t.rand %>%
mutate(t_rand_median = apply(., 1, median),
t_rand_mean = apply(. , 1, mean),
t_rand_sd = apply(., 1, sd),
upper_rand_97_5 = apply(., 1, function(x) quantile(x, probs = .975)),
lower_rand_2_5 = apply(., 1, function(x) quantile(x, probs = .025)),
abs_rand_95 = apply(., 1, function(x) quantile(abs(x), probs = .95))) %>%
rownames_to_column('GeneID') %>%
select(GeneID, t_rand_median, t_rand_mean, t_rand_sd, upper_rand_97_5, lower_rand_2_5, abs_rand_95)
t = data.frame(t)
colnames(t) <- "t_observed"
t <- t %>%
rownames_to_column('GeneID')
p = data.frame(p)
colnames(p) <- "p_value"
p <- p %>%
rownames_to_column('GeneID')
t_p_obs<-left_join(t, p)
temp_permStats<-left_join(t_p_obs, temp_permStats)
temp_permStats<-left_join(temp_permStats, q)
temp_permStats<-cbind.data.frame(temp_permStats,
p_sig = ifelse(temp_permStats$p_value<0.05, 1, 0),
q_sig = ifelse(temp_permStats$q<0.05, 2, 0))
temp_permStats<-cbind.data.frame(temp_permStats,
SigInfo = temp_permStats$p_sig + temp_permStats$q_sig)
permStats_final<-temp_permStats %>%
select(-p_sig, -q_sig)
return(permStats_final)
}
```
```{r echo = T}
plot_PxEmpP <-function(Obs_Perm_Df, plot_title) {
both = length(which(Obs_Perm_Df$SigInfo == 3))
emp_pval = length(which(Obs_Perm_Df$SigInfo == 2))
limma_p = length(which(Obs_Perm_Df$SigInfo == 1))
neither = length(which(Obs_Perm_Df$SigInfo == 0))
discordant_genes = length(which(Obs_Perm_Df$p_value < 0.05))
ggplot(Obs_Perm_Df, aes(x=-log(p_value, 10),
y = -log(q, 10),
color = as.factor(SigInfo)))+
geom_point(alpha = 0.5) +
xlim(0,4)+
ylim(0,4)+
scale_color_manual(name = "significant genes",
values = c("#d9d9d9", "#525252", "#000000", "#67000d"),
labels = c(paste("neither", neither, sep = "-"),
paste("p value", limma_p, sep = "-"),
paste("q value", emp_pval, sep = "-"),
paste("both", both, sep = "-")))+
theme_classic() +
xlab("p value")+
ylab("permutation q value") +
ggtitle(paste(plot_title, discordant_genes, sep = "-")) +
NULL
}
```
```{r echo = T}
direction_significance<-function(Obs_Perm_Df) {
dir<- cbind.data.frame(Obs_Perm_Df, dir = ifelse(Obs_Perm_Df$t_observed > 0, 1, -1))
signif<-cbind.data.frame(dir, signif = ifelse(dir$q < 0.05, 1, 0))
sig_dir <- signif %>%
mutate(sig_dir = dir*signif) %>%
select(-dir, -signif)
return(sig_dir)
}
```
Loading Sailfin raw counts.
```{r echo = T}
SailfinCounts = read.csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/final_subset_grouped.csv") #change path as needed
SailfinCounts<-SailfinCounts %>%
column_to_rownames("geneID")
nrow(SailfinCounts)#18340 rows(i.e. genes)
```
```{r echo = T}
Behavior=read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/meta_glassfrog_pg.csv") #change path as needed
head(Behavior)
#rename select columns -- these are the parameters for the design and contrast matrices
pigmentation = Behavior$pigmented
# Check the renamed variables
print(pigmentation)
# Now you can proceed to use these variables in your design matrix:
design <- model.matrix(~ 0+pigmentation , data = Behavior)
```
# Sailfin Differential Expression : Treatment ('up' = Ff, 'down' = Mm)
Treatment:
1 = higher expression in in female social context; -1 = higher expression in mate choice context
```{r}
design = model.matrix(~0+pigmentation)
colnames(design)= c("pigmented", "unpigmented")
contr.matrix = makeContrasts('pgm - unpgm'="pigmented - unpigmented", levels = colnames(design))
contr.matrix
print(design)
# Apply the voom transformation, accounting for mean-variance trend
vxx <- voom(SailfinCounts, design, plot = TRUE)
# Fit linear model
vfitX <- lmFit(vxx, design)
# Create contrast matrix to directly compare pigmented vs unpigmented
contrast_matrix <- makeContrasts( pigmented - unpigmented, levels = colnames(coef(vfitX)))
# makeContrasts(pigmented_vs_unpigmented = pigmented - 0, levels = design) this is wrong matrix
# Fit contrasts to the linear model
vfitX <- contrasts.fit(vfitX, contrasts = contrast_matrix)
# Apply empirical Bayes moderation to improve estimates
efitX <- eBayes(vfitX)
# Plot histogram of log-transformed p-values
hist(-log10(efitX$p.value),
breaks = 50,
col = "lightgray",
main = "Histogram of -log10(p-values)",
xlab = "-log10(p-value)",
ylab = "Frequency")
# Plot histogram of raw p-values
hist(efitX$p.value,
breaks = 50,
col = "lightblue",
main = "Histogram of Raw P-values",
xlab = "P-value",
ylab = "Frequency")
# Get unadjusted results for differentially expressed genes (p-value < 0.05)
unadjusted_results <- decideTests(efitX, p.value = 0.05, adjust.method = "none")
summary(unadjusted_results)
# Show top differentially expressed genes
top_genes <- topTable(efitX, coef = 1, sort.by = "P")
head(top_genes)
# Write the unadjusted results to a CSV file
write.fit(efitX,
unadjusted_results,
"/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/Results_pigmented_vs_unpigmented_unadjusted_p_vals.csv",
adjust = "none",
sep = ",",
row.names = TRUE)
```
Function for limma permutations to generate a null distribution of t statistics. Permutation randomizes the order of the treatment and preference so each gets assigned to a new randomized individual. For each iteration, a limma analysis tests for an associations of expression and preference between in the female social context only
treat = individual treatment
pref = individual preference score
exp = expression data set
n = the number of permutation iterations
```{r echo = T}
permute_Treatment_tvalue = function(treat, exp, n){
t.rand = list()
treat_pref = cbind.data.frame(treatment = treat) #
for(i in 1 : n){
# Randomize the order of treatment and preference scores - individuals will be assigned to different treatments and scores. It does not dissociate and treatment and preference.
rand_treat_pref = sample_n(treat_pref,9)
# rand_treat_pref = sample_n(treat_pref, 15, replace = TRUE)
treatment_rand = rand_treat_pref$treatment
design.rand = model.matrix(~0+treatment_rand) ### ~1
colnames(design.rand)= c("pigmented", "unpigmented")
contr.matrix_rand = makeContrasts('pgmted - unpgmted'="pigmented - unpigmented", levels = colnames(design.rand))
print(design.rand)
v.rand = voom(exp, design.rand, plot=F)
vfit.rand = lmFit(v.rand, design.rand)
vfit.rand = contrasts.fit(vfit.rand, contrasts = contr.matrix_rand)
efit.rand = eBayes(vfit.rand)
t.rand[[i]] = efit.rand[["t"]]
}
return(t.rand) #outputs a list of n elements of t statistics
}
```
```{r echo = T, eval=FALSE}
set.seed(12345)
t_rand_Treatment = permute_Treatment_tvalue(pigmentation,
SailfinCounts,
1500) #takes a long time
```
```{r echo = T}
t_treatment<-efitX[["t"]]
p_treatment<-efitX[["p.value"]]
q_treatment = calcQ(t_treatment, t_rand_Treatment , 1500) #this number should m
length(which(q_treatment$q < 0.05))
length(which(q_treatment$q < 0.025))
length(which(q_treatment$q < 0.01))
Obs_Perm_Treatment<-permStats(t_rand_Treatment,
t_treatment,
p_treatment,
q_treatment)
Obs_Perm_Treatment<-direction_significance(Obs_Perm_Treatment)
#Obs_Perm_Treatment<-Obs_Perm_Treatment%>%
# rename("Treatment" = "sig_dir")
write.csv(Obs_Perm_Treatment, "/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv", row.names = FALSE) #change path as needed
```
```{r echo=T}
Obs_Perm_Treatment <- read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv") #change path as needed
plot_PxEmpP(Obs_Perm_Treatment, "Association with pigmentation")
ggsave("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/frog_Treatment_pval_qval.pdf") #change path as needed
```
question: when we doesn't take batch into consideration ,the result looks really weird, the batch effect looks couldn't be removed because the data size is two small, and the batch effect is too strong
```{r}
sessionInfo()
```
```{r}
library(dplyr)
# Read the data from the file
Obs_Perm_Treatment <- read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv")
# Filter genes with SigInfo == 3 (both significant)
both_significant_genes <- Obs_Perm_Treatment %>%
filter(SigInfo == 3) %>%
select(GeneID, p_value, q, t_observed, t_rand_median, t_rand_mean) %>%
arrange(p_value) # Sort by p-value in ascending order
# Save the resulting data to a CSV file
output_file <- "/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/both_significant_genes.csv"
write.csv(both_significant_genes, file = output_file, row.names = FALSE)
# Display a message to indicate completion
cat("Both significant genes and their details have been saved to:", output_file, "\n")
```