-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsplit_supermatrix_to_taxa.py
executable file
·80 lines (69 loc) · 3.4 KB
/
split_supermatrix_to_taxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
#
# split_supermatrix_to_taxa.py created 2017-09-18
'''split_supermatrix_to_taxa.py v1.0 2021-11-10
tool to split supermatrices into fasta files for each taxa
split_supermatrix_to_taxa.py -a matrix.phy -p partitions.txt -d taxa_dir -f phylip-relaxed
for partitioned alignments, formats (-f) include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
large matrices can be gzipped, as .gz
'''
import sys
import os
import argparse
import time
import gzip
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def get_partitions(partitionfile):
'''read comma-delimited partition information and return a list of tuples'''
partitions = [] # list of tuples of intervals
for line in open(partitionfile,'r'):
line = line.strip()
if line:
blocks = line.split(",") # split "1:136,137:301,..." into ['1:136', '137:301',...]
for block in blocks:
alignindex = tuple( int(i) for i in block.split(":") ) # split '1:136' into ( 1,136 )
partitions.append(alignindex)
sys.stderr.write( "# read {} partitions from {} {}\n".format(len(partitions), partitionfile, time.asctime() ) )
return partitions
def split_taxa(fullalignment, alignformat, taxadir, partitions):
'''read large alignment and write one fasta file for each taxa containing all proteins'''
if fullalignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write( "# reading alignment {} as gzipped {}\n".format(fullalignment, time.asctime() ) )
else: # otherwise assume normal open
opentype = open
sys.stderr.write( "# reading alignment {} {}\n".format(fullalignment, time.asctime() ) )
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
sys.stderr.write( "# alignment contains {} taxa with {} sites {}\n".format( len(alignedseqs), alignedseqs.get_alignment_length(), time.asctime() ) )
filecounter = 0
for seqrec in alignedseqs:
taxafilename = "{}.fasta".format( os.path.join(taxadir, seqrec.id) )
with open(taxafilename,'w') as tf:
for part in partitions:
seqpart = seqrec.seq[part[0]-1:part[1]] # correct alignment positions to python index
if str(seqpart).replace("-","").replace("X",""): # ignore completely empty entries
newseq = SeqRecord( seqrec.seq[part[0]-1:part[1]] , id="{}_{}-{}".format(seqrec.id, *part), description="" )
tf.write( newseq.format("fasta") )
filecounter += 1
sys.stderr.write( "# split alignment into {} files {}\n".format(filecounter, time.asctime() ) )
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-d','--taxa-directory', default="./", help="optional directory for taxa files")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-p','--partition', help="partition file for splitting large alignments")
args = parser.parse_args(argv)
if not os.path.isdir(args.taxa_directory):
if os.path.isfile(args.taxa_directory):
raise OSError("ERROR: Cannot create directory {}, exiting".format(args.taxa_directory) )
else:
os.mkdir(args.taxa_directory)
partitions = get_partitions(args.partition)
split_taxa(args.alignment, args.format, args.taxa_directory, partitions)
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)