-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfilter_supermatrix.py
executable file
·146 lines (127 loc) · 6.62 KB
/
filter_supermatrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python
#
# filter_supermatrix.py created 2017-09-24
# python3 update 2022-04-26
# add option for key taxa 2023-01-20
'''filter_supermatrix.py last modified 2023-01-20
filter a supermatrix by coverage based on a list of partitions
filter_supermatrix.py -a matrix.phy -p partitions.txt -o filtered_matrix.phy
for partitioned alignments, formats (-f) include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
change minimum coverage requirement with -c
retrieve gene name froms the pair stats file using --pair-stats
filter_supermatrix.py -a matrix.phy -p partitions.txt -o filtered_matrix.phy --pair-stats align_pair_stats.tab
pair stats file is the output of align_pair_stats.py, or any file as:
1-1000 GENE
or can include species or SwissProt ID
Genus_species_1-1000 sp|123456|GENE
get alignment pair stats by first running blast_to_align_pairs.py
'''
import sys
import os
import argparse
import time
import gzip
from collections import Counter
from Bio import AlignIO
def get_partitions(partitionfile):
'''read comma-delimited partition information and return a list of tuples'''
partitions = [] # list of tuples of intervals
for line in open(partitionfile,'r'):
line = line.strip()
if line:
blocks = line.split(",") # split "1:136,137:301,..." into ['1:136', '137:301',...]
for block in blocks:
alignindex = tuple( int(i) for i in block.split(":") ) # split '1:136' into ( 1,136 )
partitions.append(alignindex)
print( "# read {} partitions from {} {}".format(len(partitions), partitionfile, time.asctime() ), file=sys.stderr )
return partitions
def parts_to_genes(pairstatsfile):
'''read tabular pair-wise gene stats, return a dict where key is partition and value is gene'''
part_to_gene = {}
print( "# reading partitions and gene names from {} {}".format(pairstatsfile, time.asctime() ), file=sys.stderr )
for line in open(pairstatsfile,'r'):
line = line.strip()
if line:
lsplits = line.split("\t")
if lsplits[0]=="partition": # skip first line
continue
gene = lsplits[1].split("|")[-1]
partition = lsplits[0].split("_")[-1]
part_to_gene[partition] = gene
print( "# found {} gene names {}".format(len(part_to_gene), time.asctime() ), file=sys.stderr )
return part_to_gene
def check_alignments(fullalignment, alignformat, partitions, COVTHRESHOLD, genenamedict, key_species_list):
'''read large alignment, return a new alignment where low-coverage partitions are removed'''
if fullalignment.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# reading alignment {} as gzipped {}\n".format(fullalignment, time.asctime() ) )
else: # otherwise assume normal open
opentype = open
sys.stderr.write("# reading alignment {} {}\n".format(fullalignment, time.asctime() ) )
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
num_species = len(alignedseqs)
newalign = alignedseqs[:,0:0] # start with blank alignment
newpartitions = []
print( "# alignment has {} taxa with {} positions {}".format(num_species, alignedseqs.get_alignment_length(), time.asctime() ), file=sys.stderr )
newnamedict = {} # key is new partition, value is gene name from old partition
print( "# filtering partitions {}".format( time.asctime() ), file=sys.stderr )
for part in partitions:
alignpart = alignedseqs[:, part[0]-1:part[1] ] # alignment of each partition only
completecounter = {0:0, 1:0, 2:0}
is_species_present = {} # key is species, value is boolean
for i,seqrec in enumerate(alignpart):
species = seqrec.id
seqlen = len(seqrec.seq)
lettercounts = Counter(str(seqrec.seq).replace("X","-"))
occupancyscore = 2 # by default is present, reassign if absent or partial
if lettercounts["-"] == seqlen: # seq is all gaps, so no seq
occupancyscore = 0 # set to 0 if all gaps
elif lettercounts["-"] >= seqlen * 0.5: # partial means half or more of sequence is gaps
occupancyscore = 1 # set to 1 if partial
completecounter[occupancyscore] += 1
if species in key_species_list:
if occupancyscore > 0:
is_species_present[species] = 1
else:
is_species_present[species] = 0
fullcoverage = 1.0*completecounter[2]/num_species
if key_species_list is None:
key_species_included = 1
else:
key_species_included = sum( is_species_present.values ) * 1.0 / len(key_species_list)
if fullcoverage >= COVTHRESHOLD and key_species_included >= COVTHRESHOLD:
newindices = "{}:{}".format( newalign.get_alignment_length()+1, newalign.get_alignment_length()+part[1]-part[0]+1 )
newpartitions.append( newindices )
newalign += alignpart
if genenamedict: # transfer name to new indices
newnamedict[newindices] = genenamedict.get("{}-{}".format(*part),None)
else:
print( "COVERAGE OF PARTITION {} IS {:.2f}, REMOVING".format( part, fullcoverage ), file=sys.stderr )
print( "# final alignment has {} partitions with {} sites {}".format( len(newpartitions), newalign.get_alignment_length() , time.asctime() ), file=sys.stderr )
return newalign, newpartitions, newnamedict
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-c','--completeness', type=float, default=0.5, help="completeness threshold for keeping partitions [0.5]")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-o','--output', help="output name for new alignment", required=True)
parser.add_argument('-p','--partition', help="partition file for splitting large alignments")
parser.add_argument('-r','--required-species', nargs='*', help="list of species that must end up in the alignments")
parser.add_argument('--pair-stats', help="pair stats file for gene names, to use with RAxML style partition format")
args = parser.parse_args(argv)
partitions = get_partitions(args.partition)
genenames = parts_to_genes(args.pair_stats) if args.pair_stats else None
filteredalignment, partitionlist, genenames = check_alignments(args.alignment, args.format, partitions, args.completeness, genenames, args.required_species )
AlignIO.write(filteredalignment, args.output, args.format)
print( "# Supermatrix written to {} {}".format(args.output, time.asctime() ), file=sys.stderr )
with open("{}.partition.txt".format(args.output),'w') as pf:
if args.pair_stats:
for part in partitionlist:
print( "model, {} = {}-{}".format( genenames[part], *part.split(":") ), file=pf )
else:
print( ",".join(partitionlist), file=pf )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)