-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfw_canon.R
124 lines (98 loc) · 2.61 KB
/
fw_canon.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env Rscript
# Max Brown 2021; Wellcome Sanger Institute
###############
## Libraries ##
###############
library(argparse)
library(data.table)
######################
## Argument Parsing ##
######################
parser <- ArgumentParser()
parser$add_argument("-t", "--tsv", help = "input TSV file from fasta_windows output.")
args <- parser$parse_args()
###############
## Functions ##
###############
# get the kmers from the colnames of out data table
get_kmers <- function(x) {
colnms <- colnames(x)
matches <- regmatches(colnms, gregexpr("A*|G*|C*|T*", colnms))
matches_list <- lapply(matches, function(e) paste0(e, collapse = ""))
kmers <- unlist(matches_list)
kmers[kmers != ""]
}
# reverse complement a (vector of)
# string of DNA letters
revcomp <- function(x) {
# kmer assignment vector
kmers <- character(length(x))
# loop over vector of kmers
for (i in seq_len(length(x))) {
# bases assignement vector
vector <- character(length(kmers[1]))
bases <- strsplit(x[i], "")[[1]]
for (j in seq_len(length(bases))) {
# complement
switch(bases[j],
"A" = {
bases[j] <- "T"
},
"G" = {
bases[j] <- "C"
},
"C" = {
bases[j] <- "G"
},
"T" = {
bases[j] <- "A"
}
)
vector[j] <- bases[j]
}
# and reverse
kmers[i] <- paste0(rev(vector), collapse = "")
}
return(kmers)
}
# from a full set of non-canonical kmers
# generate the set of canonical kmers
generate_canonical_kmers <- function(x) {
kmers <- x
rev_comp_kmers <- revcomp(x)
rev_comp_kmers[!(kmers < rev_comp_kmers)]
}
# aggregate columns of our data with the same name
agg <- function(df) {
do.call(cbind, lapply(split(as.list(df), names(df)), function(x) {
Reduce(`+`, x)
}))
}
##########
## Main ##
##########
File <- args$tsv
if (is.null(File)) {
stop("[-]\tNo input file detected.")
}
dat <- fread(File)
# get kmers from our input data
kmers <- get_kmers(dat)
# rename columns
colnames(dat)[
!grepl(pattern = "ID|description|start|end", x = colnames(dat))
] <-
ifelse(kmers < revcomp(kmers), kmers, revcomp(kmers))
# aggregate data on equivalent colnames
# and set as DT
dat2 <- agg(dat)
dat2 <- setDT(as.data.frame(dat2))
# generate canonical kmers and lexicographically order them
canon_kmers <- generate_canonical_kmers(kmers)
canon_kmers_ordered <- canon_kmers[order(canon_kmers)]
# apply this to output
setcolorder(dat2, c(colnames(dat)[
grepl(pattern = "ID|description|start|end", x = colnames(dat))
], canon_kmers_ordered))
# write to stdout
fwrite(x = dat2, file = "", sep = "\t")