-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenerateExonJunctionSequences.py
executable file
·489 lines (441 loc) · 15.6 KB
/
GenerateExonJunctionSequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright:
Copyright Universite of Sherbrooke, departement of biochemistry and
departement of computation.
Contact:
sarah.belhamiti@usherbrooke.ca
Date:
December 2017
Description:
This script aims to compute coordinates of introns and to create junctions
sequences. Junctions corrsponds to the 100 upstreams nucleotides of an
intron and to the 100 dowstream nucleotides of the same introns.
"""
import sys
import Bio
import os
import re
from pprint import pprint
from operator import itemgetter
import argparse
def importFastaChromosome(directory, chr):
"""Import the fasta file containing an entire chromosome sequence.
:param directory: directory name containing all chromosome fasta.
:type directory: string
:param chr: chromosome wanted.
:type chr: string
:returns: sequence, entire chromosome sequence.
:rtype: string
"""
filename = directory + '/Homo_sapiens.GRCh38.dna.chromosome.' + \
chr.split('r')[1] + '.fa'
with open(filename) as f: # file opening
content = f.read()
l = content.split('\n')
if l[0].startswith('>'):
header = l[0]
sequence = "".join(l[1:])
return sequence
def readLineTranscriptInfo(l):
"""Reads a line from the file containing infromations about transcripts.
This file is downloaded from Ensembl via biomart. The human assembly is
GRCh38p12.
:param l: line from the transcript file.
:type l: string
:returns: dicoL, contains all info of the line but parsed.
:rtype: dictionary
"""
words = l.split('\t')
dicoL = {'geneId' : words[0],
'trId' : words[1],
'Chromosome' : words[2],
'Biotype' : words[3],
'5utrStart' : words[4],
'5utrEnd' : words[5],
'3utrStart' : words[6],
'3utrEnd' : words[7],
'exonStart' : words[8],
'exonEnd' : words[9],
'Rank' : words[10],
'Strand' : words[11].rstrip()}
return dicoL
def OrderInformationBiomart(directory, inputfilename):
"""Creates one line per transcript with all feature in it.
Here we read the file containing informations about transcripts and parse
it. In the input file have informations about exons, UTR, chromosomes,
strand and biotype. One line contains informations about one enxon. UTR are
given on the first exon (5UTR) and on the last exon (3UTR). Introns coord
are computated and added to the others. Then all informations are regrouped
to fit in one line.
:param directory: name of the directorycontaining the file.
:type directory: string
:param inputfilename: name of the file containing transcripts informations.
:type inputfilename: string
:returns: InformationPerGeneAndTranscript, corresponds to one line of the
output.
:rtype: string
"""
old_transcript = ""
InformationPerGeneAndTranscript = {}
inputfile = open(directory+inputfilename,"r")
for line in inputfile:
if 'ENS' in line:
dicoL = readLineTranscriptInfo(line)
if old_transcript != dicoL['trId']:
var_rank = ""
var_startExon = ""
var_endExon = ""
start5 = ""
end5 = ""
start3 = ""
end3 = ""
# if new transcript assignation
var_rank = dicoL['Rank']
var_startExon = dicoL['exonStart']
var_endExon = dicoL['exonEnd']
start5 = dicoL['5utrStart']
end5 = dicoL['5utrEnd']
start3 = dicoL['3utrStart']
end3 = dicoL['3utrEnd']
old_transcript = dicoL['trId']
else: # add information for transcript with several exons
# only 3UTR is update because it's given at the end of the tr
var_rank = var_rank + ";" + dicoL['Rank']
var_startExon = var_startExon + ";" + dicoL['exonStart']
var_endExon = var_endExon + ";" + dicoL['exonEnd']
start3 = dicoL['3utrStart']
end3 = dicoL['3utrEnd']
InformationPerGeneAndTranscript[dicoL['geneId']+"-"+dicoL['trId']] \
= dicoL['Chromosome'] + "|" + dicoL['Biotype'] + "|" + \
start5 + "|" + end5 + "|" + start3 + "|" + end3 + "|" + \
var_startExon + "|" + var_endExon + "|" + var_rank + "|" + \
dicoL['Strand']
return InformationPerGeneAndTranscript
def ExonTotal(rank, strand, start_exon, end_exon):
"""Retrieves from one transcript all its exons.
:param rank: number of exon.
:type rank: integer
:param strand: strand of the transcript, can be 1 for forward and -1 for
reverse.
:type strand: string
:param start_exon: all exon start for a transcript.
:type start_exon: integer
:param end_exon: all exon end for a transcript.
:type end_exon: integer
:returns: exon_total, array of array with start and end of each exon.
:rtype: list
"""
i = 0
exon_total = []
while i < rank:
exon = []
if (strand == "1"): # if forward gene
exon.append(int(start_exon[i]))
exon.append(int(end_exon[i]))
i += 1
else: # if reverse gene
exon.append(int(end_exon[i]))
exon.append(int(start_exon[i]))
i += 1
exon_total.append(exon)
return exon_total
def CreateStartIntron(exon_total, strand):
"""Computes all introns starts for a transcript.
:param exon_total: all start and end of each exon from a transcript.
:type exon_total: list
:param strand: strand of the transcript, can be 1 for forward and -1 for
reverse.
:type strand: string
:returns: start_intron, all introns starts from a transcript.
:rtype: list
"""
start_intron = []
for i in exon_total:
if i[1] != exon_total[-1][1]:
# EXCEPT end last exon
if strand == "1":
intronStart = int( i[1] ) + 1
else:
intronStart = int( i[1] ) - 1
start_intron.append(intronStart)
return start_intron
def CreateEndIntron(exon_total, rank, strand):
"""Computes all introns ends for a transcript.
:param exon_total: all start and end of each exon from a transcript.
:type exon_total: list
:param rank: contain all exon ranks from a transcript.
:type rank: list
:param strand: strand of the transcript, can be 1 for forward and -1 for
reverse.
:type strand: string
:returns: end_intron, all introns ends from a transcript.
:rtype: list
"""
end_intron = []
for i in exon_total:
if int(i[0]) != int(exon_total[-len(rank)][0]):
# EXCEPT start firt exon
if strand == "1":
introndEnd = int( i[0] ) - 1
else:
introndEnd = int( i[0] ) + 1
end_intron.append(introndEnd)
return end_intron
def IntronTotal(start_intron, end_intron):
"""Gets all introns coordinates from a transcript.
:param start_intron: all introns starts from a transcript.
:type start_intron: list
:param end_intron: all introns ends from a transcript.
:type end_intron: list
:returns: intron_total, all start and end of each intron from a transcript.
:rtype: list
"""
i = 0
intron_total = []
while i < len(start_intron): # for number of intron
intron = []
intron.append( str(start_intron[i]) )
intron.append( str(end_intron[i]) )
intron_total.append(intron)
i += 1
return intron_total
def AddIntron(intron_total, Intron):
""" Adds new introns in genes intron.
:param intron_total: all introns from a transcript.
:type intron_total: list
:param Intron: all intron from transcriptome.
:type Intron: list
:returns: Intron, updated with new introns.
:rtype: list
"""
for i in intron_total:
if i not in Intron:
Intron.append(i)
return Intron
def AddTranscriptPerIntron(intron_total, Dico, transcriptID):
"""Creates a dictionary with all introns and their transcript ID.
:param intron_total: all introns from a transcript.
:type intron_total: list
:param Dico: contiains for each intron, the transcript it's belong.
:type Dico: dictionary
:param transcriptID: Ensembl transcript ID.
:type transcriptID: string
:returns: Dico, with new introns.
:rtype: dictionary
"""
for i in intron_total:
if i[0]+'-'+i[1] not in Dico:
# if this intron is not in dico
Dico[i[0]+'-'+i[1]] = transcriptID
else:
last_transcript = Dico.get( i[0]+'-'+i[1] )
Dico[ i[0]+'-'+i[1] ] = last_transcript +"-"+ transcriptID
return Dico
def reverseSequence(Sequence):
""" Reverse complement a DNA sequence.
:param Sequence: DNA sequence that will be reversed.
:type Sequence: string
:returns: Sequence, the initial DNA sequence but reverse complemented.
:rtype: string
"""
reverse = ""
nucleotides = {'A' : 'T',
'T' : 'A',
'C' : 'G',
'G' : 'C'}
for n in Sequence:
if n in nucleotides:
tmp = nucleotides[n]
else :
tmp = n # in some sequences there is many N or other letter
reverse += tmp
return reverse
def Fasta(chrSequence, start, end, strand):
"""Gets a gene fasta sequence from Ensembl.
:param geneID: Ensembl Id of a gene.
:type geneID: string
:param chromosome: name of the gene chromosome
:type chromosome: string
:param start: start of the gene.
:type start: integer
:param end: end of the gene.
:type end: integer
:param strand: strand of the gene.
:type strand: string
:param specie: name of the specie.
:type specie: string
:returns: DNA sequence of the gene.
:rtype: string
"""
seq = chrSequence[start-1:end-1]
# if strand == '1':
#
# else:
# seq = chrSequence[end-1:start-1]
# seq = reverseSequence(seq)
return seq
def CreateSequence(dirFasta, dirData, inputfilename, IntronPerGene, InfoPerGene,
extension, chromosome):
"""Writes output with all junction sequences.
:param directory: name of the directory containing the file.
:type directory: string
:param inputfilename: name of the file containing transcripts informations.
:type inputfilename: string
:param IntronPerGene: contains all genes and all its introns.
:type IntronPerGene: dictionary
:param InfoPerGene: contains informations of genes.
:type InfoPerGene: dictionary
:param extension: length of the upstream sequence and dowstream sequence we
retrieve.
:type extension: integer
:param chromosome: current chromosome.
:type chromosome: string
"""
chrSequence = importFastaChromosome(dirFasta, chromosome)
output = open(dirData+inputfilename.split(".")[0]+"_Sequence.txt","w")
for gene in IntronPerGene:# for every intron by gene
if gene in InfoPerGene:
geneID = gene
chromosome = InfoPerGene[gene].split("\t")[0]
biotype = InfoPerGene[gene].split("\t")[1]
strand = InfoPerGene[gene].split("\t")[2]
for intron in IntronPerGene[gene]:
start_intron = int(intron[0])
end_intron = int(intron[1])
if strand == "1":
start_sup = start_intron - 1 - extension
end_sup = start_intron - 1
start_inf = end_intron + 1
end_inf = end_intron + 1 + extension
sequence_amont = Fasta(chrSequence, start_sup, end_sup, strand)
sequence_aval = Fasta(chrSequence, start_sup, end_sup, strand)
else :
start_sup = start_intron + 1 + extension
end_sup = start_intron + 1
start_inf = end_intron - 1
end_inf = end_intron - 1 - extension
sequence_amont = Fasta(chrSequence, end_sup, start_sup, strand)
sequence_aval = Fasta(chrSequence, end_sup, start_sup, strand)
sequence_amont = reverseSequence(sequence_amont)
sequence_aval = reverseSequence(sequence_aval)
sequence = sequence_amont + sequence_aval
output.write(">" + geneID + "|" + "-".join(intron) + \
"\n" + sequence + "\n")
output.close()
def CreateIndex(directory, inputfilename, InfoPerGeneAndTranscript,
ExonPerTranscript, IntronPerTranscript):
"""Writes output with all transcripts informations.
:param directory: name of the directorycontaining the file.
:type directory: string
:param inputfilename: name of the file containing transcripts informations.
:type inputfilename: string
:param InfoPerGeneAndTranscript: contains all genes and transcripts info.
:type InfoPerGeneAndTranscript: dictionary
:param ExonPerTranscript: contains all transcripts and their exons.
:type ExonPerTranscript: dictionary
:param IntronPerTranscript: contains all transcripts and all its introns.
:type IntronPerTranscript: dictionary
"""
output = open(directory + inputfilename.split(".")[0] + "_Index.txt","w")
for gene in InfoPerGeneAndTranscript:
exonList = ""
intronList = ""
geneID = gene.split("-")[0]
transcriptID = gene.split("-")[1]
chromosome = InfoPerGeneAndTranscript[gene].split("|")[0]
biotype = InfoPerGeneAndTranscript[gene].split("|")[1]
start5 = InfoPerGeneAndTranscript[gene].split("|")[2]
end5 = InfoPerGeneAndTranscript[gene].split("|")[3]
start3 = InfoPerGeneAndTranscript[gene].split("|")[4]
end3 = InfoPerGeneAndTranscript[gene].split("|")[5]
start_exon = InfoPerGeneAndTranscript[gene].split("|")[6].split(";")
strand = InfoPerGeneAndTranscript[gene].split("|")[9]
if transcriptID in ExonPerTranscript:
exon = ExonPerTranscript[transcriptID]
for couple in exon:
exonList = exonList+'-'.join(str(x) for x in couple)+";"
else:
print "error exon", geneID, transcriptID
if transcriptID in IntronPerTranscript:
intron = IntronPerTranscript[transcriptID]
for couple in intron:
intronList = intronList + '-'.join(couple) + ";"
else:
print "error intron", geneID, transcriptID
output.write(transcriptID +"|"+ geneID +"|"+ chromosome +"|"+ strand +\
"|"+ biotype +"|"+ exonList[:-1] +"|"+ intronList[:-1] +"|"+ \
start5 +"|"+ end5 +"|"+ start3 +"|"+ end3 +"\n")
output.close()
def build_arg_parser():
GITDIR = os.getcwd()+'/'
parser = argparse.ArgumentParser(description = 'ReplaceInformationBiomart')
parser.add_argument ('-p', '--path', default = GITDIR)
parser.add_argument ('-chr', '--chromosome', default = 'chrY')
parser.add_argument ('-ext', '--extension', default = 100)
return parser
def main(path, inputfilename, extension, chromosome):
old_gene = ""
IntronPerGene = {}
ExonPerTranscript = {}
IntronPerTranscript = {}
TranscriptPerIntron = {}
InfoPerGene = {}
dirData = path + 'Data/' + chromosome + '/'
dirFasta = path + 'Data/Fasta'
InfoPerGeneAndTr = OrderInformationBiomart(dirData, inputfilename)
for geneTr in InfoPerGeneAndTr:
infoTr = InfoPerGeneAndTr[geneTr].split("|")
geneID = geneTr.split("-")[0]
transcriptID = geneTr.split("-")[1]
chr = infoTr[0]
biotype = infoTr[1]
start5 = infoTr[2]
end5 = infoTr[3]
start3 = infoTr[4]
end3 = infoTr[5]
start_exon = infoTr[6].split(";")
end_exon = infoTr[7].split(";")
rank = infoTr[8].split(";")
strand = infoTr[9]
# if (re.compile('[1-9, X, Y]').search(chr) and not
# re.compile('[A-W, Z]').search(chr)):
InfoPerGene[geneID] = chr +'\t'+ biotype +'\t'+ strand
exon_total = ExonTotal(len(rank), strand, start_exon, end_exon)
# in some case, exon are not in the good order, so we sort them to
# avoid to get bad introns (their coords ar computed using the order
# of lines in the input file)
if strand == '1':
exon_total = sorted(exon_total, key=itemgetter(0))
else:
exon_total = sorted(exon_total, key=itemgetter(0), reverse=True)
start_intron = CreateStartIntron(exon_total, strand)
end_intron = CreateEndIntron(exon_total, rank, strand)
intron_total = IntronTotal(start_intron, end_intron)
ExonPerTranscript[transcriptID] = exon_total
IntronPerTranscript[transcriptID] = intron_total
if (old_gene != geneID):
Intron = []
Intron = AddIntron(intron_total, Intron)
old_gene = geneID
else:
# if other transcript
Intron = AddIntron(intron_total, Intron)
old_gene = geneID
IntronPerGene[geneID] = Intron
TranscriptPerIntron = AddTranscriptPerIntron(intron_total,
TranscriptPerIntron, transcriptID)
##Create file Sequence
CreateSequence(dirFasta, dirData, inputfilename, IntronPerGene, InfoPerGene,
extension, chromosome)
CreateIndex(dirData, inputfilename, InfoPerGeneAndTr, ExonPerTranscript,
IntronPerTranscript)
if __name__ == '__main__':
parser = build_arg_parser()
arg = parser.parse_args()
path = arg.path
chromosome = arg.chromosome
extension = arg.extension
inputfilename = 'HS_transcript_unspliced_'+chromosome+'.txt'
main(path, inputfilename, extension, chromosome)