-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmake_regions_file.py
69 lines (55 loc) · 1.51 KB
/
make_regions_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import sys
import re, pprint
import os
import gzip
def get_pos(line):
    """Return the integer position stored in the first tab-separated field.

    Args:
        line: One text row; only the text before the first tab is used.

    Returns:
        The first field converted with ``int``.

    Raises:
        ValueError: If the first field is not a valid integer literal
            (for example an empty or blank line).
    """
    # Only the first column is needed, so stop splitting after the first tab.
    first_field = line.split('\t', 1)[0]
    return int(first_field)
# read positions from a file and write alternating OUT/IN regions
def _write_region_pair(outfp, out_start, in_start, in_end):
    """Write one OUT row followed by the IN row that comes right after it."""
    outfp.write("{}\t{}\tOUT\n".format(out_start, in_start - 1))
    outfp.write("{}\t{}\tIN\n".format(in_start, in_end))


def get_regions(filename, outfile):
    """Write alternating OUT/IN position ranges derived from ``filename``.

    The input is read line by line; leading lines starting with ``#`` are
    skipped and the position of every following line is taken with
    ``get_pos`` (first tab-separated field). A line whose position equals the
    previous position, or is exactly one greater, extends the current IN
    range. Any other position closes the current range and starts a new one.

    For each IN range two tab-separated rows are written to ``outfile``:
    ``<start>\\t<end>\\tOUT`` for the gap before the range, then
    ``<start>\\t<end>\\tIN`` for the range itself. The first OUT row always
    starts at 1, so it is ``1\\t0`` when the first position is 1.

    Args:
        filename: Path of the input file. If it is not an existing regular
            file the function returns without creating ``outfile``.
            NOTE(review): positions are presumably sorted ascending (the
            script's usage text calls this a sorted file) -- confirm.
        outfile: Path of the file to (over)write with the region rows.

    Returns:
        None.

    Raises:
        ValueError: If a data line does not start with an integer field.
    """
    # escape gracefully if file doesn't exist
    if not os.path.isfile(filename):
        return

    # Both handles are managed by the with-statement so they are closed even
    # when a malformed line makes get_pos raise.
    with open(filename) as f, open(outfile, "w") as outfp:
        first_line = f.readline()
        # skip comment lines
        while first_line.startswith("#"):
            first_line = f.readline()

        # readline() returns '' at end of file: there are no data lines, so
        # leave the output file empty instead of failing on int('').
        if not first_line:
            return

        out_start = 1
        in_start = prev_pos = get_pos(first_line)

        for line in f:
            cur_pos = get_pos(line)
            if cur_pos != prev_pos and cur_pos != prev_pos + 1:
                # entered a new region: flush the one that just ended
                _write_region_pair(outfp, out_start, in_start, prev_pos)
                in_start = cur_pos
                out_start = prev_pos + 1
            prev_pos = cur_pos

        # flush the region that was still open when the input ended
        _write_region_pair(outfp, out_start, in_start, prev_pos)
##################################
# main part
if __name__ == "__main__":
    # sys.argv holds the script name plus the two required arguments, so a
    # correct invocation has exactly three entries.
    if len(sys.argv) != 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # instead of falling through to an IndexError below.
        sys.exit("usage: " + os.path.basename(sys.argv[0])
                 + " <sorted file> <outfile>")

    chrom_file = sys.argv[1]
    out_file = sys.argv[2]
    print("looking at " + chrom_file)
    get_regions(chrom_file, out_file)