Skip to content

Commit

Permalink
Merge pull request #188 from debbiemarkslab/develop
Browse files Browse the repository at this point in the history
Version 0.0.5 release
  • Loading branch information
thomashopf authored Sep 16, 2018
2 parents 97ef967 + 370df64 commit ba7afed
Show file tree
Hide file tree
Showing 7 changed files with 208 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ before_script:
- export DISPLAY=:99.0
- sh -e /etc/init.d/xvfb start
- sleep 3
- wget http://marks.hms.harvard.edu/evcouplings_test_cases.tar.gz
- wget https://marks.hms.harvard.edu/evcouplings/models/data/evcouplings_test_cases.tar.gz
- tar -xf evcouplings_test_cases.tar.gz -C $HOME/
script:
- coverage run -m unittest discover -s test -p "Test*.py"
Expand Down
35 changes: 32 additions & 3 deletions evcouplings/align/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
ALPHABET_RNA_NOGAP = "ACGU"
ALPHABET_RNA = GAP + ALPHABET_RNA_NOGAP

HMMER_PREFIX_WARNING = "# WARNING: seq names have been made unique by adding a prefix of"


def read_fasta(fileobj):
"""
Expand Down Expand Up @@ -111,7 +113,7 @@ def write_aln(sequences, fileobj, width=80):
)


def read_stockholm(fileobj, read_annotation=False):
def read_stockholm(fileobj, read_annotation=False, raise_hmmer_prefixes=True):
"""
Generator function to read Stockholm format alignment
file (e.g. from hmmer).
Expand All @@ -129,6 +131,10 @@ def read_stockholm(fileobj, read_annotation=False):
Stockholm alignment file
read_annotation : bool, optional (default=False)
Read annotation columns from alignment
raise_hmmer_prefixes : bool, optional (default: True)
HMMER adds number prefixes to sequence identifiers if
identifiers are not unique. If True, the parser will
raise an exception if the alignment has such prefixes.
Returns
-------
Expand Down Expand Up @@ -165,6 +171,17 @@ def read_stockholm(fileobj, read_annotation=False):
"Header missing. {}".format(line.rstrip())
)

if raise_hmmer_prefixes and line.startswith(HMMER_PREFIX_WARNING):
raise ValueError(
"HMMER added identifier prefixes to alignment because of non-unique "
"sequence identifiers. Either some sequence identifier is present "
"twice in the sequence database, or your target sequence identifier is "
"the same as an identifier in the database. In the first case, please fix "
"your sequence database. In the second case, please choose a different "
"sequence identifier for your target sequence that does not overlap with "
"the sequence database."
)

# annotation lines
if line.startswith("#"):
if read_annotation:
Expand Down Expand Up @@ -584,7 +601,8 @@ def from_dict(cls, sequences, **kwargs):

@classmethod
def from_file(cls, fileobj, format="fasta",
a3m_inserts="first", **kwargs):
a3m_inserts="first", raise_hmmer_prefixes=True,
**kwargs):
"""
Construct an alignment object by reading in an
alignment file.
Expand All @@ -598,6 +616,12 @@ def from_file(cls, fileobj, format="fasta",
a3m_inserts : {"first", "delete"}, optional (default: "first")
Strategy to deal with inserts in a3m alignment files
(see read_a3m documentation for details)
raise_hmmer_prefixes : bool, optional (default: True)
HMMER adds number prefixes to sequence identifiers in Stockholm
files if identifiers are not unique. If True, the parser will
raise an exception if a Stockholm alignment has such prefixes.
**kwargs
Additional arguments to be passed to class constructor
Returns
-------
Expand All @@ -618,7 +642,12 @@ def from_file(cls, fileobj, format="fasta",
seqs[seq_id] = seq
elif format == "stockholm":
# only reads first Stockholm alignment contained in file
ali = next(read_stockholm(fileobj, read_annotation=True))
ali = next(
read_stockholm(
fileobj, read_annotation=True,
raise_hmmer_prefixes=raise_hmmer_prefixes
)
)
seqs = ali.seqs
annotation["GF"] = ali.gf
annotation["GC"] = ali.gc
Expand Down
51 changes: 45 additions & 6 deletions evcouplings/align/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,40 @@
)


def _verify_sequence_id(sequence_id):
"""
Verify if a target sequence identifier is in proper
format for the pipeline to run without errors
(not none, and contains no whitespace)
Parameters
----------
id : str
Target sequence identifier to verify
Raises
------
InvalidParameterError
If sequence identifier is not valid
"""
if sequence_id is None:
raise InvalidParameterError(
"Target sequence identifier (sequence_id) must be defined and "
"cannot be None/null."
)

try:
if len(sequence_id.split()) != 1 or len(sequence_id) != len(sequence_id.strip()):
raise InvalidParameterError(
"Target sequence identifier (sequence_id) may not contain any "
"whitespace (spaces, tabs, ...)"
)
except AttributeError:
raise InvalidParameterError(
"Target sequence identifier (sequence_id) must be a string"
)


def _make_hmmsearch_raw_fasta(alignment_result, prefix):
"""
HMMsearch results do not contain the query sequence
Expand Down Expand Up @@ -79,7 +113,9 @@ def _add_gaps_to_query(query_sequence_ali, ali):
if len(match_index) != query_sequence_ali.L:
raise ValueError(
"HMMsearch result {} does not have a one-to-one"
" mapping to the query sequence columns".format(ar["raw_alignment_file"])
" mapping to the query sequence columns".format(
alignment_result["raw_alignment_file"]
)
)

gapped_query_sequence = ""
Expand Down Expand Up @@ -663,10 +699,8 @@ def existing(**kwargs):
# Target sequence of alignment
sequence_id = kwargs["sequence_id"]

if sequence_id is None:
raise InvalidParameterError(
"Parameter sequence_id must be defined"
)
# check if sequence identifier is valid
_verify_sequence_id(sequence_id)

# First, find focus sequence in alignment
focus_index = None
Expand Down Expand Up @@ -997,6 +1031,9 @@ def jackhmmer_search(**kwargs):
)
prefix = kwargs["prefix"]

# check if sequence identifier is valid
_verify_sequence_id(kwargs["sequence_id"])

# make sure output directory exists
create_prefix_folders(prefix)

Expand Down Expand Up @@ -1258,6 +1295,9 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
)
prefix = kwargs["prefix"]

# check if sequence identifier is valid
_verify_sequence_id(kwargs["sequence_id"])

# make sure output directory exists
create_prefix_folders(prefix)

Expand Down Expand Up @@ -1342,7 +1382,6 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
"hittable_file": ali["domtblout"],
}


# convert the raw output alignment to fasta format
# and add the appropriate query sequecne
raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
Expand Down
Loading

0 comments on commit ba7afed

Please sign in to comment.