-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerateShuffle.py
485 lines (429 loc) · 17.8 KB
/
generateShuffle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
#!/usr/bin/env python
# -*- coding: utf-8 -*-:
"""
Copyright:
Copyright Universite of Sherbrooke, departement of biochemistry and
departement of computation.
Date:
June 2019
Description:
This script aims to generate all random fasta needed. It will generate one
fasta file for each segment location but also for junction. The id of each
fasta is like : biotype:Trid:Start-End:Strand, exept for intron where the
biotype is at the end. To generate random sequences we retrieve the sequence
of the location we want, we transformed it into a list, then we shufle it
(change all position of all element randomly), to finally join the shuffled
list with no characters to recreate the sequence.
Data availability:
* fasta sequences used with G4RNA screener were download from Ensembl. The human assembly is GRCh38p12.
* transcripts information were downloaded from ensembl via ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/
Abbreviation:
* bt for biotype
* tr for transcript
* dico for dictionnary
* w for word or words
* dir for directory
* cpt for compteur (counter)
"""
import os
import re
import math
import random
import argparse
from Bio import SeqIO
from pprint import pprint
import recurentFunction as rF
def shuffleSeq(seq):
"""Shuffle a sequence.
This function aims to shuffle a fasta sequence to randomize its sequence.
The fasta sequence is imported from a fasta file, then converted into a
list (one element corrresponds to one nucleotide), the list is shuffled and
tehn joined with nothing to recreate the sequence.
:param seq: sequence to shuffle.
:type seq: string
:returns: seq, sequence shuffled
:rtype: string
"""
seq = list(seq)
random.shuffle(seq)
seq = ''.join(seq)
return seq
def importFastaChromosome(directory, chr):
"""Import the fasta file containing an entire chromosome sequence.
:param directory: directory name containing all chromosome fasta.
:type directory: string
:param chr: chromosome wanted.
:type chr: string
:returns: sequence, entire chromosome sequence.
:rtype: string
"""
filename = directory + 'Homo_sapiens.GRCh38.dna.chromosome.' + str(chr) + '.fa'
with open(filename) as f: # file opening
content = f.read()
l = content.split('\n')
if l[0].startswith('>'):
header = l[0]
sequence = "".join(l[1:])
return sequence
def writeFasta(outDir, dicoFasta, chr, location):
"""From a dictionary {id : seq}, write a fasta file.
:param outDir: name of the directory where the output file need to be writen.
:type outDir: string
:param dicoFasta: {id : seq}
:type dicoFasta: dictionary
:param location: name of the chromosome.
:type location: string
"""
if location == 'junction':
output = open(outDir +'Shuffle_chr'+chr+'_junction.fas', "w")
for id in dicoFasta:
output.write(id + "\n")
nbLine = math.ceil( float( len(dicoFasta[id]) ) / 60 )
cpt1 = 0
cpt2 = 60
for i in range(0,int(nbLine)) :
output.write(dicoFasta[id][cpt1:cpt2] + "\n")
# to have a new line after 60 characters
cpt1 += 60
cpt2 += 60
output.close()
else:
output = open(outDir +'Shuffle_chr'+chr+'.fas', "w")
for id in dicoFasta:
output.write(id + "\n")
nbLine = math.ceil( float( len(dicoFasta[id]) ) / 60 )
cpt1 = 0
cpt2 = 60
for i in range(0,int(nbLine)) :
output.write(dicoFasta[id][cpt1:cpt2] + "\n")
# to have a new line after 60 characters
cpt1 += 60
cpt2 += 60
output.close()
def reverseSequence(Sequence):
""" Reverse complement a DNA sequence.
:param Sequence: DNA sequence that will be reversed.
:type Sequence: string
:returns: Sequence, the initial DNA sequence but reverse complemented.
:rtype: string
"""
reverse = ""
nucleotides = {'A' : 'T',
'T' : 'A',
'C' : 'G',
'G' : 'C'}
for n in Sequence:
if n in nucleotides:
tmp = nucleotides[n]
else :
tmp = n # in some sequences there is many N or other letter
reverse += tmp
return reverse
def getUTRCoordByStrand(w, dicoLine):
"""Retrieves coords of UTR depending on the strand.
:param w: contains all words of one line of the index file.
:type w: list
:param dicoLine: contains all info on a line (except UTR).
:type dicoLine: dictionary
:returns: dicoLine, contains all info on a line, updated with UTR.
:rtype: dictionary
"""
if dicoLine['Strand'] == '1':
dicoLine.update({'start5' : w[7],
'end5' : w[8],
'start3' : w[9],
'end3' : w[10]})
else:
dicoLine.update({'start5' : w[8],
'end5' : w[7],
'start3' : w[10],
'end3' : w[9]})
return dicoLine
def parseIndex(w):
"""Transforme a line from the index file into a dictionary.
As one line of the index file contain many information, we parsed it into
a dictionnary to access easly to different informations.
:param w: contains all words of one line of the index file.
:type w: list
:returns: dicoLine, contains all information of one index line.
:rtype: dictionary
"""
dicoLine = {'idTr' : w[0],
'idGene' : w[1],
'Chromosome' : w[2],
'Strand' : w[3],
'geneBiotype' : w[4],
'exonList' : w[5].split(';'),
'intronList' : w[6].split(';')}
dicoLine.update(getUTRCoordByStrand(w, dicoLine))
return dicoLine
def createCoordSpliceSite(coord1, coord2, dicoLine):
"""Depending on the strand, create coordinates of splice sites.
Creates coordnates of Donor splice site and Acceptor splice site, depending
of the strand. The coordinates are +-59 nucletides of the intron coords
because we only want to detect G4 that are close to it.
:param coord1: start of the intron.
:type coord1: int
:param coord2: end of the intron.
:type coord2: int
:param dicoLine: line of the index file.
:type dicoLine: dictionary
:returns: idDonor, idAcceptor, id of the Donor splice site and of the
Acceptor splice sites.
:rtype: strings
"""
if dicoLine['Strand'] == '1':
donorStart = coord1 - 59
donorEnd = coord1 + 58
acceptorStart = coord2 - 59
acceptorEnd = coord2 + 58
else:
donorStart = coord1 + 59
donorEnd = coord1 - 58
acceptorStart = coord2 + 59
acceptorEnd = coord2 - 58
idDonor = 'donor:'+dicoLine['Chromosome']+':'+str(donorStart)+'-'+str(donorEnd)+':'+dicoLine['Strand']
idAcceptor = 'acceptor:'+dicoLine['Chromosome']+':'+str(acceptorStart)+'-'+str(acceptorEnd)+':'+dicoLine['Strand']
return idDonor, idAcceptor
def addExonIntron(type, dico, dicoLine):
"""Add to the index of location exon, intron and also spliced sites.
First the exon or intron location is created and added to the index
location. If the location is an intron, we create Donor spliced site and
Acceptor splice site using the +- 59 nucleotides upstream and dowstream
of each intron coords.
:param type: name of the location, exon or intron.
:type type: string
:param dico: aims to contain all location of a chromosome.
:type dico: dictionary
:param dicoLine: line of the index file.
:type dicoLine: dictionary
:returns: dico, updated with new locations.
:rtype: dictionary
"""
listName = type+'List'
for elem in dicoLine[listName]:
if elem:
elemId = type+':'+dicoLine['Chromosome']+':'+elem+':'+dicoLine['Strand']
if elemId not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][elemId] = {}
if type == 'intron' and elem:
coord1 = int(elem.split('-')[0])
coord2 = int(elem.split('-')[1])
idDonor,idAcceptor = createCoordSpliceSite(coord1, coord2, dicoLine)
if idDonor not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][idDonor] = {}
if idAcceptor not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][idAcceptor] = {}
if dicoLine['idTr'] in dicoBt:
dico[ dicoLine['idGene'] ][elemId].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
if type == 'intron' and elem:
dico[ dicoLine['idGene'] ][idDonor].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
dico[ dicoLine['idGene'] ][idAcceptor].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
return dico
def overlaps(interval1, interval2):
"""Computes the distance or overlap between two interval.
Compute : min(ends) - max(starts). If > 0, return the number of bp of
overlap, if 0, they are book-ended and if < 0 return the distance in
bp between them.
:param interval1: contain the start and the end of a OQs.
:type interval1: list of int
:param interval2: contain the start and the end of a gff feature.
:type interval2: list of int
:returns: min(ends) - max(starts)
:rtype: int
"""
return min(interval1[1], interval2[1]) - max(interval1[0], interval2[0])
def findCDS(coord5UTR, coord3UTR, coordExon):
"""Finds where is the CDS in an exon.
We compute the overlap between UTR and exon. Then, depending of the overlap,
we find CDS coords. If the there is a perfect overlap UTR-Exon then there is
no CDS. If there no overlap then the exon is an CDS. Finally if there is an
overlap, we compute the CDS coords.
:param coord5UTR: coords of a 5UTR.
:type coord5UTR: list of int
:param coord3UTR: coords of a 3UTR.
:type coord3UTR: list of int
:param coordExon: coords of an exon.
:type coordExon: list of int
:returns: coordCDS, coords of a CDS.
:rtype: list of int
"""
overlap5UTR = overlaps(coord5UTR, coordExon)
overlap3UTR = overlaps(coord3UTR, coordExon)
if (overlap5UTR == coord5UTR[1]-coord5UTR[0] or
overlap3UTR == coord3UTR[1]-coord3UTR[0] or
overlap5UTR == coord5UTR[0]-coord5UTR[1] or
overlap3UTR == coord3UTR[0]-coord3UTR[1]):
coordCDS = []
elif overlap5UTR < 0 and overlap3UTR < 0:
#Aucun overlap
coordCDS = coordExon
elif overlap5UTR > 0:
if coordExon[0] < coordExon[1]:
#forward strand
startCDS = coord5UTR[1] + 1
endCDS = coordExon[1]
coordCDS = [startCDS, endCDS]
else:
#reverse strand
startCDS = coord5UTR[1] - 1
endCDS = coordExon[1]
coordCDS = [startCDS, endCDS]
elif overlap3UTR > 0:
if coordExon[0] < coordExon[1]:
#forward strand
startCDS = coordExon[0]
endCDS = coord3UTR[0] - 1
coordCDS = [startCDS, endCDS]
else:
#reverse strand
startCDS = coordExon[0]
endCDS = coord3UTR[1] + 1
coordCDS = [startCDS, endCDS]
else:
if coord5UTR[0] == coord5UTR[1] or coord3UTR[0] == coord3UTR[1]:
coordCDS = [ coordExon[0], coordExon[1] ]
return coordCDS
def addUTRCDS(dicoLine, dico, dicoBt):
"""Add to the index of location UTR and CDS.
CDS coords are computed from the comparison of exon coords and UTR coords.
All those information are added to the index of location.
:param dicoLine: contains all info on a line (except UTR).
:type dicoLine: dictionary
:param dico: aims to contain all location of a chromosome.
:type dico: dictionary
:param dicoBt: contains all biotype of all transcripts.
:type dicoBt: dictionary
:returns: dico, updated with new locations.
:rtype: dictionary
"""
exonList = [ [int(e.split('-')[0]), int(e.split('-')[1])] for e in dicoLine['exonList'] ]
if dicoLine['start5'] and dicoLine['end5']:
coords5UTr = [ int(dicoLine['start5']), int(dicoLine['end5']) ]
id5UTR = '5UTR:'+dicoLine['Chromosome']+':'+dicoLine['start5']+'-'+dicoLine['end5']+':'+dicoLine['Strand']
if id5UTR not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][id5UTR] = {}
if dicoLine['idTr'] in dicoBt:
dico[ dicoLine['idGene'] ][id5UTR].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
else:
coords5UTr = [0, 0]
if dicoLine['start3'] and dicoLine['end3']:
coords3UTR = [ int(dicoLine['start3']), int(dicoLine['end3']) ]
id3UTR = '3UTR:'+dicoLine['Chromosome']+':'+dicoLine['start3']+'-'+dicoLine['end3']+':'+dicoLine['Strand']
if id3UTR not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][id3UTR] = {}
if dicoLine['idTr'] in dicoBt:
dico[ dicoLine['idGene'] ][id3UTR].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
else:
coords3UTR = [0, 0]
if dicoLine['idTr'] in dicoBt:
if rF.addTypeTr(dicoBt[ dicoLine['idTr'] ]) == 'Coding':
coordsCDS = [ findCDS(coords5UTr, coords3UTR, e) for e in exonList]
coordsCDS = [CDS for CDS in coordsCDS if CDS]
for CDS in coordsCDS:
CDS = '-'.join(str(x) for x in CDS)
elemId = 'CDS:'+dicoLine['Chromosome']+':'+CDS+':'+dicoLine['Strand']
if elemId not in dico[ dicoLine['idGene'] ]:
dico[ dicoLine['idGene'] ][elemId] = {}
if dicoLine['idTr'] in dicoBt:
dico[ dicoLine['idGene'] ][elemId].update({dicoLine['idTr'] : dicoBt[ dicoLine['idTr'] ]})
return dico
def getDicoIndex(filename, dicoBt):
"""Parses the Index file into a dictionary to get all locations.
Browse all transcript in the index file and computes CDS, Donor splice site
and Acceptor splice site coordinates from other coords feature (exon,
intron and UTR).
:param filename: name of the index file,
:type filename: string
:param dicoBt: contains bt of all transcripts.
:type dicoBt: dictionary
:returns: dico, index file parsed to obtaine all locations informations.
:rtype: dictionary
"""
dico = {}
with open(filename) as f:
lines = f.read().splitlines()
for l in lines:
l = l.rstrip()
words = l.split('|')
dicoLine = parseIndex(words)
if dicoLine['idGene'] not in dico:
dico[ dicoLine['idGene'] ] = {}
dico.update(addExonIntron('exon', dico, dicoLine))
dico.update(addExonIntron('intron', dico, dicoLine))
if dicoLine['start5'] or dicoLine['start3']:
dico.update(addUTRCDS(dicoLine, dico, dicoBt))
return dico
def createFasta(dico, fastaFile, chr, outputDir):
"""Create fasta file from the index of location.
Each sequence corresponds to one location. The id is like :
Gene:location:chr:start-end:Tr1-bt|Tr2-bt
Location of only one nucleotides are not added to the fasta file.
The sequence is retrieved from a fasta file containing the entire
chromosome. This file have been dowloaded from the ensembl FTP.
:param dico: aims to contain all location of a chromosome.
:type dico: dictionary
:param fastaFile: directory containing fasta of all chromosomes.
:type fastaFile: string
:param chr: name of the chromosome.
:type chr: string
:param outputDir: name of the output directory.
:type outputDir: string
"""
fastaRandom = {}
chrSeq = importFastaChromosome(fastaFile, chr)
for gene in dico:
for location in dico[gene]:
start = int(location.split(':')[2].split('-')[0])
end = int(location.split(':')[2].split('-')[1])
if end != start:
strand = location.split(':')[3]
if strand == '1':
seq = chrSeq[start-1:end-1]
else:
seq = chrSeq[end-1:start-1]
seq = reverseSequence(seq)
randomSeq = shuffleSeq(seq)
listTr = [ tr+'-'+dico[gene][location][tr] for tr in dico[gene][location] ]
id = '>'+gene+':'+location+':'+'|'.join(listTr)
fastaRandom[id] = randomSeq
writeFasta(outputDir, fastaRandom, chr, 'all')
def fromFasta(filename, outDir, chr, dico):
fastaOri = SeqIO.parse(open(filename), 'fasta')
fastaShuf = {}
for fasta in fastaOri:
name, seq = fasta.id, str(fasta.seq)
seq = shuffleSeq(seq)
gene = name.split('|')[0]
coords = name.split('|')[1]
for loc in dico[gene]:
if coords in loc:
location = loc
chr = location.split(':')[1]
coords = location.split(':')[2]
strand = location.split(':')[3]
listTr = [ tr+'-'+dico[gene][location][tr] for tr in dico[gene][location] ]
name = '>'+gene+':junction:'+chr+':'+coords+':'+strand+':'+'|'.join(listTr)
fastaShuf[name] = seq
writeFasta(outputDir, fastaShuf, chr, 'junction')
def build_arg_parser():
parser = argparse.ArgumentParser(description = 'generateRandom')
GITDIR = os.getcwd()+'/'
parser.add_argument ('-p', '--path', default = GITDIR)
parser.add_argument ('-chr', '--chromosome', default = 'Y')
return parser
if __name__ == '__main__':
parser = build_arg_parser()
arg = parser.parse_args()
chr = arg.chromosome
path = arg.path
fastaFile = path+'/Fasta/'
outputDir = path+'/chr'+chr+'/'
fastaJunction = path+'/chr'+chr+'/HS_transcript_unspliced_chr'+chr+'_Sequence.txt'
print(chr)
dicoBt = rF.createDictionaryBiotypeByTranscript(path+
'/transcriptType/transcriptType_chr'+chr+'.txt')
dicoInfo = getDicoIndex(path+'/chr'+chr+'/HS_transcript_unspliced_chr'+
chr+'_Index.txt', dicoBt)
createFasta(dicoInfo, fastaFile, chr, outputDir)
fromFasta(fastaJunction, outputDir, chr, dicoInfo)