-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcheck_supermatrix_alignments.py
executable file
·353 lines (319 loc) · 16.6 KB
/
check_supermatrix_alignments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#!/usr/bin/env python
#
# check_supermatrix_alignments.py created 2017-03-13
'''check_supermatrix_alignments.py v1.4 2022-07-07
tool to quickly check for abnormal sequences in fasta alignments
check_supermatrix_alignments.py -a matrix.phy -p partitions.txt
for partitioned alignments, formats (-f) include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
large matrices can be gzipped, as .gz
output consists of tab delimited fields:
Species Partitions Number-missing Percent-missing Number-partial Percent-partial
for optional occupancy matrix file:
check_supermatrix_alignments.py -a matrix.phy -p partitions.txt -m occupancy_matrix.tab
where matrix consists of three values for:
present (2), partial (1), and absent (0)
percent coverage (instead of binary) can be instead printed with --percent
to check for long runs of infrequent amino acids (likely from misassembly)
add option -b, partitions must be specified with -p
this cannot be run with --percent
check_supermatrix_alignments.py -b -a matrix.aln -p partitions.txt -m break_matrix.tab
matrix order can be changed based on a phylogenetic tree in nexus format
check_supermatrix_alignments.py -a matrix.phy -p partitions.txt -m occupancy_matrix.tab -T tree.nex
to optionally include gene names in the matrix, use --pair-stats
pair stats file is the output of align_pair_stats.py, or any file as:
1-1000 GENE
or can include species or SwissProt ID
Genus_species_1-1000 sp|123456|GENE
'''
import sys
import os
import argparse
import time
import gzip
from collections import defaultdict,Counter
from Bio import AlignIO
from Bio import Phylo
def get_partitions(partitionfile):
'''read comma-delimited partition information and return a list of tuples'''
partitions = [] # list of tuples of intervals
for line in open(partitionfile,'r'):
line = line.strip()
if line:
if line.count("=") > 0: # for RAxML type format
# WAG, homologs_134_11213 = 1-221
block = line.split("=",1)[-1].strip() # should be "1-221"
alignindex = tuple( int(i) for i in block.split("-") ) # split "1-221" into ( 1,221 )
partitions.append(alignindex)
else: # for short format
blocks = line.split(",") # split "1:136,137:301,..." into ['1:136', '137:301',...]
for block in blocks:
alignindex = tuple( int(i) for i in block.split(":") ) # split '1:136' into ( 1,136 )
partitions.append(alignindex)
print( "# read {} partitions from {} {}".format(len(partitions), partitionfile, time.asctime() ), file=sys.stderr )
return partitions
def parts_to_genes(pairstatsfile):
'''read tabular pair-wise gene stats, return a dict where key is partition and value is gene'''
part_to_gene = {}
print( "# reading partitions and gene names from {} {}".format(pairstatsfile, time.asctime() ), file=sys.stderr )
for line in open(pairstatsfile,'r'):
line = line.strip()
if line:
lsplits = line.split("\t")
if lsplits[0]=="partition": # skip first line
continue
gene = lsplits[1].split("|")[-1].replace("_HUMAN","")
partition = tuple( int(i) for i in lsplits[0].split("_")[-1].split("-") )
part_to_gene[partition] = gene
print( "# found {} gene names {}".format(len(part_to_gene), time.asctime() ), file=sys.stderr )
return part_to_gene
def check_alignments(fullalignment, alignformat, partitions, makematrix=False, writepercent=False):
'''read large alignment, return the dict where key is species and value is number of gap-only sequences'''
gapdict = {} # must set all values to zero in order to not skip full taxa
halfgapdict = defaultdict(int)
totaloccs = defaultdict(int)
occmatrix = [] if makematrix else None
if fullalignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
print( "# reading alignment {} as gzipped {}".format(fullalignment, time.asctime() ), file=sys.stderr )
else: # otherwise assume normal open
opentype = open
print( "# reading alignment {} {}".format(fullalignment, time.asctime() ), file=sys.stderr )
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
print( "# Alignment contains {} taxa for {} sites, including gaps".format( len(alignedseqs), alignedseqs.get_alignment_length() ) , file=sys.stderr )
for seqrec in alignedseqs: # all species must be counted once
gapdict[seqrec.id] = 0
if occmatrix is not None:
occmatrix.append([seqrec.id]) # initate each species as a new list with ID
if partitions is None: # for cases where partitions are not given
partitions = [ tuple([1,alignedseqs.get_alignment_length()]) ]
for part in partitions:
alignpart = alignedseqs[:, part[0]-1:part[1] ] # alignment of each partition only
for i,seqrec in enumerate(alignpart):
species = seqrec.id
seqlen = len(seqrec.seq)
lettercounts = Counter(str(seqrec.seq).replace("X","-"))
gapcount = lettercounts.get("-",0) + lettercounts.get("?",0)
if writepercent: # write integer of percent covered by each gene
occupancyscore = 100 - 100*gapcount/seqlen
gapdict[species] += gapcount
halfgapdict[species] += seqlen
else: # meaning stick with absent-partial-present scheme
occupancyscore = 2 # by default is present, reassign if absent or partial
if gapcount == seqlen: # seq is all gaps, so no seq
gapdict[species] += 1
occupancyscore = 0 # set to 0 if all gaps
elif gapcount >= seqlen * 0.5: # partial means half or more of sequence is gaps
halfgapdict[species] += 1
occupancyscore = 1 # set to 1 if partial
totaloccs[occupancyscore] += 1
if occmatrix: # if building matrix, add that value to the matrix at sequence i
occmatrix[i].append(str(occupancyscore))
totalspots = sum(totaloccs.values())
if writepercent:
print( "# Matrix has {} ({:.2f}%) complete and {} ({:.2f}%) empty out of {} total genes {}".format( totaloccs[100], 100.0*totaloccs[100]/totalspots, totaloccs[0], 100.0*totaloccs[0]/totalspots, totalspots , time.asctime() ), file=sys.stderr )
else:
print( "# Matrix has {} ({:.2f}%) complete and {} ({:.2f}%) partial out of {} total genes {}".format( totaloccs[2], 100.0*totaloccs[2]/totalspots, totaloccs[1], 100.0*totaloccs[1]/totalspots, totalspots , time.asctime() ), file=sys.stderr )
return gapdict, halfgapdict, occmatrix
def recheck_tabular_output(tabularfile):
gapdict = defaultdict(int)
halfgapdict = defaultdict(int)
totaloccs = defaultdict(int)
if tabularfile.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
print( "# reading tabular file {} as gzipped {}".format(tabularfile, time.asctime() ), file=sys.stderr )
else: # otherwise assume normal open
opentype = open
print( "# reading tabular file {} {}".format(tabularfile, time.asctime() ), file=sys.stderr )
# parse tabular
for line in opentype(tabularfile,'rt'):
if line.strip():
lsplits = line.split("\t")
species = lsplits[0]
if species=="Species": # meaning header line, so skip
continue
for occupancyscore in lsplits[1:]:
occupancyscore = int(occupancyscore)
if occupancyscore==0:
gapdict[species] += 1
elif occupancyscore==1:
halfgapdict[species] += 1
elif occupancyscore==2:
pass
else:
sys.stderr.write("WARNING: unrecognized value in matrix {} for species {}\n".format(occupancyscore, species) )
totaloccs[occupancyscore] += 1
totalspots = sum(totaloccs.values())
print( "# Matrix has {} ({:.2f}%) complete and {} ({:.2f}%) partial out of {} total genes {}".format( totaloccs[2], 100.0*totaloccs[2]/totalspots, totaloccs[1], 100.0*totaloccs[1]/totalspots, totalspots , time.asctime() ), file=sys.stderr )
occmatrix = lsplits[1:] # use last values, which will be reused as partition length
return gapdict, halfgapdict, occmatrix
def count_breaks(fullalignment, alignformat, partitions, makematrix=False, MAXBREAKTHRES=2):
'''read large alignment, return two dicts where key is species and values are number of unbroken sequences and sum of breaks'''
species_breaks = defaultdict(int) # total constant breaks by species
species_corrects = defaultdict(int) # number of genes with no breaks by species
occmatrix = [] if makematrix else None
if fullalignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
print( "# reading alignment {} as gzipped {}".format(fullalignment, time.asctime() ), file=sys.stderr )
else: # otherwise assume normal open
opentype = open
print( "# reading alignment {} {}".format(fullalignment, time.asctime() ), file=sys.stderr )
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
numtaxa = len(alignedseqs)
alignlength = alignedseqs.get_alignment_length()
print( "# Alignment contains {} taxa for {} sites, including gaps".format( numtaxa, alignlength ) , file=sys.stderr )
for seqrec in alignedseqs: # all species must be counted once
if occmatrix is not None:
occmatrix.append([seqrec.id]) # initate each species as a new list with ID
for part in partitions:
aa_freq_by_site = {} # key is site number, value is Counter
const_breaker_thres = {} # key is site number, value is int of number of species / number of AAs
most_common_by_site = {} # key is site number, value is most frequent letter
alignpart = alignedseqs[:, part[0]-1:part[1] ] # alignment of each partition only
partlength = alignpart.get_alignment_length()
# get base frequency of each site
for j in range(partlength):
aligncolumn = str(alignpart[:,j])
aa_freq = Counter(aligncolumn.replace("X","-").replace("?","-"))
aa_freq_by_site[j] = aa_freq
num_aas = len(aa_freq)
breakmax = numtaxa/num_aas
const_breaker_thres[j] = breakmax
most_common_by_site[j] = aa_freq.most_common(1)[0][0]
# count number of times each taxa breaks constant site rules
for i,seqrec in enumerate(alignpart):
longest_break = 0 # float of sum of 1/f for the longest segment of uncommon AAs
runningscore = 0
total_breaks = 0
previous_letter = '-'
previous_common = '-'
species = seqrec.id
seqlen = len(seqrec.seq)
# if species is all gaps, assign as negative 1, for absent
gapcount = str(seqrec.seq).replace("X","-").replace("?","-").count("-")
if gapcount==seqlen:
occmatrix[i].append("-1")
continue
# check frequency of each letter, and sum of inverse frequencies of long runs
for k,letter in enumerate(str(seqrec.seq)):
if letter=="X" or letter=="?":
letter = "-"
# break at long gaps, either for this species, or if most common part is gap
if (letter=="-" and letter==previous_letter) or (most_common_by_site[k]=='-' and previous_common=='-'):
if runningscore > longest_break:
longest_break = runningscore
total_breaks += runningscore
runningscore = 0
# only keep running total if site is rare
# meaning frequency of letter is less than N/n
# where N is number of taxa, and n is number of AAs at that site
# that is, if there are 4 AAs across 100 taxa, then the const_breaker_thres is 25
#
# for example, to calculate inverse_freq
# if ratio is 99-G to 1-S
# then the inverse_freq for all G-taxa would be 1/99, so appx 0.01 pts
# while for the S-taxon, the score would be 1/1, so 1pt
# this severely penalizes any given AA occurring even twice, so only really highlights true constant breaks
elif aa_freq_by_site[k][letter] < const_breaker_thres[k]:
inverse_freq = 1.0 / aa_freq_by_site[k][letter]
runningscore += inverse_freq
# break at frequent or constant sites
elif letter=="P" or letter=="C": # never break at proline and cysteine
inverse_freq = 1.0 / aa_freq_by_site[k][letter]
runningscore += inverse_freq
else:
if runningscore > longest_break:
longest_break = runningscore
total_breaks += runningscore
runningscore = 0
previous_letter = letter
previous_common = most_common_by_site[k]
species_breaks[species] += total_breaks
occmatrix[i].append("{:.2f}".format(longest_break))
if total_breaks <= MAXBREAKTHRES:
species_corrects[species] += 1
return species_corrects, species_breaks, occmatrix
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-b','--breaks', action="store_true", help="score consecutive breaks instead of counting coverage")
parser.add_argument('-B','--break-limit', type=int, default=2, help="max consecutive score to count gene as unbroken [default:2]")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-H','--header', action="store_true", help="include header line")
parser.add_argument('-m','--matrix-out', help="name for optional matrix-occupancy output file")
parser.add_argument('-D','--matrix-delimiter', default="\t", help="delimiter for matrix file [default is tab]")
parser.add_argument('-T','--matrix-tree', help="optional Nexus-format tree to reorder matrix")
parser.add_argument('-p','--partition', help="partition file for splitting large alignments")
parser.add_argument('--percent', action="store_true", help="report values in matrix by percent complete")
parser.add_argument('--pair-stats', help="pair stats file for gene names, to use alternate matrix header format")
parser.add_argument('-r','--rerun-tabular', help="rerun and recalculate results use the tabular output file as input")
args = parser.parse_args(argv)
if args.rerun_tabular is None:
if args.partition is None: # turn on percent mode if there is only
print( "# no partitions given, calculating percent of total dataset" , file=sys.stderr )
args.percent = True
partitions = None
else:
partitions = get_partitions(args.partition)
genenames = parts_to_genes(args.pair_stats) if args.pair_stats else None
# check for breaks
if args.breaks:
if args.percent:
sys.exit("WARNING: PERCENT MODE --percent CANNOT BE USED WITH BREAKS -b")
maindict, secondarydict, occmatrix = count_breaks(args.alignment, args.format, partitions, args.matrix_out, args.break_limit)
# otherwise check for coverage
else:
if args.rerun_tabular:
maindict, secondarydict, partitions = recheck_tabular_output(args.rerun_tabular)
else:
maindict, secondarydict, occmatrix = check_alignments(args.alignment, args.format, partitions, args.matrix_out, args.percent)
if args.matrix_out and occmatrix:
print( "# writing matrix to {} {}".format(args.matrix_out, time.asctime() ), file=sys.stderr )
with open(args.matrix_out,'w') as mo:
# generate header line
if args.pair_stats:
headerline = ["Species"] + ["{}--{}".format(part[0], genenames.get(part,"None")) for part in partitions]
elif args.partition is None:
headerline = ["Species","PercentOccupancy"]
else:
headerline = ["Species"] + ["{}-{}".format(*part) for part in partitions]
print( args.matrix_delimiter.join(headerline) , file=mo )
# print occupancy by each species
if args.matrix_tree: # use order from a rooted nexus-format tree
occdict = {} # convert occmatrix from list of lists to dict of lists
for occrow in occmatrix:
occdict[occrow[0]] = occrow[1:]
tree = Phylo.read(args.matrix_tree,"nexus")
for clade in tree.get_terminals():
cleanname = str(clade.name).replace("'","").replace('"','')
try:
print( args.matrix_delimiter.join( [cleanname] + occdict[cleanname] ) , file=mo )
except KeyError:
print( "WARNING: CANNOT FIND TAXA {} IN ALIGNMENT, SKIPPING".format(cleanname) , file=sys.stderr )
else: # just use default order from the alignment
for occbysplist in occmatrix:
print( args.matrix_delimiter.join(occbysplist) , file=mo )
if args.percent: # just print total percentage of sites
if args.header:
# 0 1 2 3
print( "Species\tSites\tGaps\tG%" , file=wayout )
for k,v in sorted(maindict.items(), reverse=True, key=lambda x: x[1]):
print( "{}\t{}\t{}\t{:.2f}".format(k, secondarydict[k], v, v*100.0/secondarydict[k]) , file=wayout )
elif args.breaks:
if args.header:
# 0 1 2 3 4
print( "Species\tPartitions\tComplete\tM%\tBreaksum" , file=wayout )
numparts = len(partitions)
for k,v in sorted(maindict.items(), reverse=True, key=lambda x: x[1]):
print( "{}\t{}\t{}\t{:.2f}\t{}".format(k, numparts, v, v*100.0/numparts, secondarydict[k]) , file=wayout )
else: # only print normal output if not in percent mode
if args.header:
# 0 1 2 3 4 5
print( "Species\tPartitions\tMissing\tM%\tPartial\tP%" , file=wayout )
numparts = len(partitions)
for k,v in sorted(maindict.items(), reverse=True, key=lambda x: x[1]):
print( "{}\t{}\t{}\t{:.2f}\t{}\t{:.2f}".format(k, numparts, v, v*100.0/numparts, secondarydict[k], secondarydict[k]*100.0/numparts) , file=wayout )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)