-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreadBlastpG4Q.py
109 lines (100 loc) · 3.79 KB
/
readBlastpG4Q.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python3.6.7
# -*- coding: utf-8 -*-:v
"""
Copyright:
Copyright Universite of Sherbrooke, departement of biochemistry and
departement of computation.
Date:
September 2019
Description:
This script read blastn output. It is used one time for the pG4r vs rG4.
For the other side, see readBlast. The only output is printed on the
terminal, and it is : the number of 'perfect hit', and the number of
blast file without any hit. A perfect hit is when chromosome and strand
from both blasted fasta are the same.
Usage:
python ~/PATH/readBlast.py
"""
import sys
import os
import re
import argparse
from pprint import pprint
from Bio.Blast import NCBIXML
def build_arg_parser():
parser = argparse.ArgumentParser(description = 'readBlast')
GITDIR = os.getcwd()+'/'
parser.add_argument ('-p', '--path', default = GITDIR)
parser.add_argument ('-nHit', '--nHit', default = 5)
return parser
def GetBestHit(path, nHit):
dicoHitG4Loc = {}
dicoHitLoc= {}
filelist = sorted(os.listdir(path))# recuperation of all file name in the directory
bestHit = {} # initialisation of the dictionary
cptEmpty = 0
cptHit = 0
for blastFile in filelist:
tmp = blastFile.split(':')
chrQ = tmp[1]
strandQ = tmp[3].rstrip().split('.')[0]
namequery = blastFile.split('.')[0]
locationT = tmp[4]
blastFile = path+blastFile
blast = NCBIXML.parse(open(blastFile,'rU'))
for record in blast:
if record.alignments:
hit = False
hitN = 0
if len(record.alignments) < nHit:
maxHit = len(record.alignments) -1
else:
maxHit = nHit
while not hit and hitN < maxHit:
# print(record.alignments[0])
strandT = record.alignments[hitN].title.split(':')[3]
chrT = record.alignments[hitN].title.split(':')[0].split('r')[1]
locationQ = record.alignments[hitN].title.split(':')[5]
if strandT == '+':
strandT = '1'
else:
strandT = '-1'
hitN += 1
if strandT == strandQ and chrT == chrQ:
cptHit += 1
hit = True
hitLoc = locationQ+'-'+locationT
if hitLoc not in dicoHitG4Loc:
dicoHitG4Loc[hitLoc] = 0
dicoHitG4Loc[hitLoc] += 1
# else:
# hitLoc = locationQ+'-'+locationT
# if hitLoc not in dicoHitLoc:
# dicoHitLoc[hitLoc] = 0
# dicoHitLoc[hitLoc] += 1
# if not hit and hitN < maxHit-1:
# hitN = 0
# while hitN < maxHit:
# print(record.alignments[0])
# strandT = record.alignments[hitN].title.split(':')[4]
# chrT = record.alignments[hitN].title.split(':')[2].split('r')[1]
# locationQ = record.alignments[hitN].title.split(':')[6]
# if strandT == '+':
# strandT = '1'
# else:
# strandT = '-1'
# hitN += 1
else:
cptEmpty += 1
print(nHit)
print('Hit : ' + str(cptHit))
print('No hit at all : ' + str(cptEmpty))
pprint(dicoHitG4Loc)
#pprint(dicoHitLoc)
print('--------------')
if __name__ == '__main__':
parser = build_arg_parser()
arg = parser.parse_args()
path = arg.path
nHit = int(arg.nHit)
GetBestHit(path, nHit)