pg_Permutation_Glassfrog.Rmd

---
title: "20241014_Treatment_Limma_Models_Revised"
author: "cecilia"
date: "10/14/24"
output: html_document
Here is the link to all the data and scripts
https://dataverse.tdl.org/dataset.xhtml?persistentId=doi:10.18738/T8/RHYJQ5
This is the code I would use:
20221201_Treatment_Limma_Models_Revised.Rmd

---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
```

```{r echo = T}
library(dplyr)
library(limma)
library(tidyverse)
library(tibble)
library(ggplot2)

```
Functions used throughout

Function for calculating q values -- number and proportion of randomly generated t stastics that are greater than or equal to the observed t statistic. The proportion indicates the empirical p-value adjustment

t = a matrix of observed t statistics generated by limma 
t.rand = a list of n elements each of which is matrix of t statistics generated by limma after randomization of sample association with treatment and preference
n = number of permutation iterations

```{r echo = T}
calcQ = function(t, t.rand, n){
  q = matrix(0, nrow = nrow(t), ncol = ncol(t))
  t = abs(t) # sign indicated the directionality, magnitude determines significance comparisons should be of magnitude not directionality
  t.rand = lapply(t.rand, abs)
  for(i in 1 : n){
    
    temp = t.rand[[i]]
    
    #for(c in 1 : 4){
      for(r in 1 : nrow(t)){
        if(is.na(temp[r])){
          # Partial NA coefficients for 1 probe(s)
        }
        else if(temp[r] >= t[r]){
          q[r] = q[r] + 1 # every time randamized t statistic value is great than the observed t statistic add one, so you get a total count
        }
      }
    }
  #}
  q_num = q
  q = q_num / n # divided the number of randomized p-values less than the obs p-value by the number of iterations
  q = cbind.data.frame(GeneID = row.names(t), q_num = q_num, q=q)
}
```

Function to extract summary statistics from permutation analyses. 
t.rand = a list of n elements each of which is matrix of t statistics generated by permuation follow by limma (permute_Dis_FfPref_MmPref_tvalue)
t = a matrix of observed t statistics generated by limma 
p = a matrix of observed p-values generated by limma 
q = a matrix of q-values generated by permutation analysis (calcQ)

```{r echo = T}
# t.rand = t_rand_Dis_FfPref_MmPref
# t = t_Dis_FfPref_MmPref 
# p = p_Dis_FfPref_MmPref 
# q = q_Dis_FfPref_MmPref

permStats = function(t.rand, t, p, q){
  t.rand = data.frame(do.call(cbind, t.rand))
  temp_permStats= t.rand %>% 
    mutate(t_rand_median = apply(., 1, median), 
           t_rand_mean = apply(. , 1, mean), 
           t_rand_sd = apply(., 1, sd), 
           upper_rand_97_5 = apply(., 1, function(x) quantile(x, probs = .975)), 
           lower_rand_2_5 = apply(., 1, function(x) quantile(x, probs = .025)), 
           abs_rand_95 = apply(., 1, function(x) quantile(abs(x), probs = .95))) %>%
    rownames_to_column('GeneID') %>%
    select(GeneID, t_rand_median, t_rand_mean, t_rand_sd, upper_rand_97_5, lower_rand_2_5, abs_rand_95)
  t = data.frame(t) 
  colnames(t) <- "t_observed"
    t <- t %>%
      rownames_to_column('GeneID')
  p = data.frame(p) 
  colnames(p) <- "p_value"
    p <- p %>%
      rownames_to_column('GeneID')
  t_p_obs<-left_join(t, p)
  temp_permStats<-left_join(t_p_obs, temp_permStats)
  temp_permStats<-left_join(temp_permStats, q)
  temp_permStats<-cbind.data.frame(temp_permStats, 
                              p_sig = ifelse(temp_permStats$p_value<0.05, 1, 0), 
                              q_sig = ifelse(temp_permStats$q<0.05, 2, 0))
    temp_permStats<-cbind.data.frame(temp_permStats, 
                                SigInfo = temp_permStats$p_sig + temp_permStats$q_sig)
    permStats_final<-temp_permStats %>%
      select(-p_sig, -q_sig)
  return(permStats_final)
}
    
 
```

```{r echo = T}

plot_PxEmpP <-function(Obs_Perm_Df, plot_title) {

both = length(which(Obs_Perm_Df$SigInfo == 3))
emp_pval = length(which(Obs_Perm_Df$SigInfo == 2))
limma_p = length(which(Obs_Perm_Df$SigInfo == 1))
neither = length(which(Obs_Perm_Df$SigInfo == 0))
discordant_genes = length(which(Obs_Perm_Df$p_value < 0.05))

ggplot(Obs_Perm_Df, aes(x=-log(p_value, 10), 
                                            y = -log(q, 10), 
                                       color = as.factor(SigInfo)))+
  geom_point(alpha = 0.5) + 
  xlim(0,4)+
  ylim(0,4)+
  scale_color_manual(name = "significant genes", 
                     values = c("#d9d9d9", "#525252", "#000000", "#67000d"), 
                     labels = c(paste("neither", neither, sep = "-"),
                                paste("p value", limma_p, sep = "-"), 
                                paste("q value", emp_pval, sep = "-"), 
                                paste("both", both, sep = "-")))+
  theme_classic() +
  xlab("p value")+
  ylab("permutation q value") +
  ggtitle(paste(plot_title, discordant_genes, sep = "-")) +
  NULL
}

```

```{r echo = T}

direction_significance<-function(Obs_Perm_Df) {
  dir<- cbind.data.frame(Obs_Perm_Df, dir = ifelse(Obs_Perm_Df$t_observed > 0, 1, -1))
  signif<-cbind.data.frame(dir, signif = ifelse(dir$q < 0.05, 1, 0))
  sig_dir <- signif %>%
    mutate(sig_dir = dir*signif) %>%
    select(-dir, -signif)
  return(sig_dir)
}

```

Loading Sailfin raw counts. 
```{r  echo = T}
SailfinCounts = read.csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/final_subset_grouped.csv") #change path as needed
SailfinCounts<-SailfinCounts %>%
  column_to_rownames("geneID")

nrow(SailfinCounts)#18340 rows(i.e. genes)
```

```{r echo = T}
Behavior=read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/meta_glassfrog_pg.csv")  #change path as needed

head(Behavior)

#rename select columns -- these are the parameters for the design and contrast matrices
pigmentation = Behavior$pigmented

# Check the renamed variables
print(pigmentation)


# Now you can proceed to use these variables in your design matrix:
design <- model.matrix(~ 0+pigmentation , data = Behavior)
``` 


# Sailfin Differential Expression : Treatment ('up' = Ff, 'down' = Mm)
Treatment: 
1 = higher expression in in female social context; -1 = higher expression in mate choice context

```{r}
design = model.matrix(~0+pigmentation)
colnames(design)= c("pigmented", "unpigmented")

contr.matrix = makeContrasts('pgm - unpgm'="pigmented - unpigmented", levels = colnames(design))
contr.matrix
print(design)


# Apply the voom transformation, accounting for mean-variance trend
vxx <- voom(SailfinCounts, design, plot = TRUE)

# Fit linear model
vfitX <- lmFit(vxx, design)

# Create contrast matrix to directly compare pigmented vs unpigmented
contrast_matrix <- makeContrasts( pigmented - unpigmented, levels = colnames(coef(vfitX)))
 # makeContrasts(pigmented_vs_unpigmented = pigmented - 0, levels = design) this is wrong matrix

# Fit contrasts to the linear model
vfitX <- contrasts.fit(vfitX, contrasts = contrast_matrix)

# Apply empirical Bayes moderation to improve estimates
efitX <- eBayes(vfitX)

# Plot histogram of log-transformed p-values
hist(-log10(efitX$p.value), 
     breaks = 50, 
     col = "lightgray", 
     main = "Histogram of -log10(p-values)", 
     xlab = "-log10(p-value)", 
     ylab = "Frequency")

# Plot histogram of raw p-values
hist(efitX$p.value, 
     breaks = 50, 
     col = "lightblue", 
     main = "Histogram of Raw P-values", 
     xlab = "P-value", 
     ylab = "Frequency")


# Get unadjusted results for differentially expressed genes (p-value < 0.05)
unadjusted_results <- decideTests(efitX, p.value = 0.05, adjust.method = "none")
summary(unadjusted_results)

# Show top differentially expressed genes
top_genes <- topTable(efitX, coef = 1, sort.by = "P")
head(top_genes)

# Write the unadjusted results to a CSV file
write.fit(efitX, 
          unadjusted_results, 
          "/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/Results_pigmented_vs_unpigmented_unadjusted_p_vals.csv",  
          adjust = "none", 
          sep = ",", 
          row.names = TRUE)

```

Function for limma permutations to generate a null distribution of t statistics. Permutation randomizes the order of the treatment and preference so each gets assigned to a new randomized individual. For each iteration, a limma analysis tests for an associations of expression and preference between in the female social context only

treat = individual treatment
pref = individual preference score
exp = expression data set
n = the number of permutation iterations

```{r echo = T}
permute_Treatment_tvalue = function(treat,  exp, n){
  t.rand = list()
  treat_pref = cbind.data.frame(treatment = treat) #
  for(i in 1 : n){
        # Randomize the order of treatment and preference scores - individuals will be assigned to different treatments and scores. It does not dissociate and treatment and preference. 
    rand_treat_pref = sample_n(treat_pref,9)
   # rand_treat_pref = sample_n(treat_pref, 15, replace = TRUE)
    treatment_rand = rand_treat_pref$treatment

    
    design.rand = model.matrix(~0+treatment_rand)  ### ~1
    
    colnames(design.rand)= c("pigmented", "unpigmented")

    contr.matrix_rand = makeContrasts('pgmted - unpgmted'="pigmented - unpigmented", levels = colnames(design.rand))
    print(design.rand)

    v.rand = voom(exp, design.rand, plot=F)
    vfit.rand = lmFit(v.rand, design.rand)
    vfit.rand = contrasts.fit(vfit.rand, contrasts = contr.matrix_rand)
    efit.rand = eBayes(vfit.rand)
    t.rand[[i]] = efit.rand[["t"]]
  }
  
  return(t.rand) #outputs a list of n elements of t statistics 
}

```


```{r echo = T, eval=FALSE}
 set.seed(12345)
 
t_rand_Treatment = permute_Treatment_tvalue(pigmentation, 
                                      SailfinCounts,
                                      1500) #takes a long time


```

```{r echo = T}
  
t_treatment<-efitX[["t"]]
p_treatment<-efitX[["p.value"]]


 q_treatment = calcQ(t_treatment, t_rand_Treatment , 1500) #this number should m
  length(which(q_treatment$q < 0.05))
  length(which(q_treatment$q < 0.025))
  length(which(q_treatment$q < 0.01))
  
Obs_Perm_Treatment<-permStats(t_rand_Treatment,
                                      t_treatment, 
                                      p_treatment, 
                                      q_treatment)

Obs_Perm_Treatment<-direction_significance(Obs_Perm_Treatment)
#Obs_Perm_Treatment<-Obs_Perm_Treatment%>%
#  rename("Treatment" = "sig_dir")


write.csv(Obs_Perm_Treatment, "/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv", row.names = FALSE)  #change path as needed
```


```{r echo=T}
Obs_Perm_Treatment <- read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv") #change path as needed

plot_PxEmpP(Obs_Perm_Treatment, "Association with pigmentation")

ggsave("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/frog_Treatment_pval_qval.pdf")  #change path as needed

```
question: when we doesn't take batch into consideration ,the result looks really weird, the batch effect looks couldn't be removed because the data size is two small, and the batch effect is too strong 

```{r}
sessionInfo()
```

```{r}
library(dplyr)

# Read the data from the file
Obs_Perm_Treatment <- read_csv("/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/glassfrog_Obs_Perm_Treatment.csv")

# Filter genes with SigInfo == 3 (both significant)
both_significant_genes <- Obs_Perm_Treatment %>%
  filter(SigInfo == 3) %>%
  select(GeneID, p_value, q, t_observed, t_rand_median, t_rand_mean) %>%
  arrange(p_value) # Sort by p-value in ascending order

# Save the resulting data to a CSV file
output_file <- "/stor/work/FRI_321G_RY_Spring2024/Summer2024/glassfrogs/Cecilia_Fall/permute/both_significant_genes.csv"
write.csv(both_significant_genes, file = output_file, row.names = FALSE)

# Display a message to indicate completion
cat("Both significant genes and their details have been saved to:", output_file, "\n")


```