Merge pull request #188 from debbiemarkslab/develop

Version 0.0.5 release
debbiemarkslab · Sep 16, 2018 · ba7afed · ba7afed
2 parents 97ef967 + 370df64
commit ba7afed
Show file tree

Hide file tree

Showing 7 changed files with 208 additions and 53 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -24,7 +24,7 @@ before_script:
 - export DISPLAY=:99.0
 - sh -e /etc/init.d/xvfb start
 - sleep 3
-- wget http://marks.hms.harvard.edu/evcouplings_test_cases.tar.gz
+- wget https://marks.hms.harvard.edu/evcouplings/models/data/evcouplings_test_cases.tar.gz
 - tar -xf evcouplings_test_cases.tar.gz -C $HOME/
 script:
 - coverage run -m unittest discover -s test -p "Test*.py"

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
@@ -30,6 +30,8 @@
 ALPHABET_RNA_NOGAP = "ACGU"
 ALPHABET_RNA = GAP + ALPHABET_RNA_NOGAP
 
+HMMER_PREFIX_WARNING = "# WARNING: seq names have been made unique by adding a prefix of"
+
 
 def read_fasta(fileobj):
     """
@@ -111,7 +113,7 @@ def write_aln(sequences, fileobj, width=80):
 )
 
 
-def read_stockholm(fileobj, read_annotation=False):
+def read_stockholm(fileobj, read_annotation=False, raise_hmmer_prefixes=True):
     """
     Generator function to read Stockholm format alignment
     file (e.g. from hmmer).
@@ -129,6 +131,10 @@ def read_stockholm(fileobj, read_annotation=False):
         Stockholm alignment file
     read_annotation : bool, optional (default=False)
         Read annotation columns from alignment
+    raise_hmmer_prefixes : bool, optional (default: True)
+        HMMER adds number prefixes to sequence identifiers if
+        identifiers are not unique. If True, the parser will
+        raise an exception if the alignment has such prefixes.
 
     Returns
     -------
@@ -165,6 +171,17 @@ def read_stockholm(fileobj, read_annotation=False):
                 "Header missing. {}".format(line.rstrip())
             )
 
+        if raise_hmmer_prefixes and line.startswith(HMMER_PREFIX_WARNING):
+            raise ValueError(
+                "HMMER added identifier prefixes to alignment because of non-unique "
+                "sequence identifiers. Either some sequence identifier is present "
+                "twice in the sequence database, or your target sequence identifier is "
+                "the same as an identifier in the database. In the first case, please fix "
+                "your sequence database. In the second case, please choose a different "
+                "sequence identifier for your target sequence that does not overlap with "
+                "the sequence database."
+            )
+
         # annotation lines
         if line.startswith("#"):
             if read_annotation:
@@ -584,7 +601,8 @@ def from_dict(cls, sequences, **kwargs):
 
     @classmethod
     def from_file(cls, fileobj, format="fasta",
-                  a3m_inserts="first", **kwargs):
+                  a3m_inserts="first", raise_hmmer_prefixes=True,
+                  **kwargs):
         """
         Construct an alignment object by reading in an
         alignment file.
@@ -598,6 +616,12 @@ def from_file(cls, fileobj, format="fasta",
         a3m_inserts : {"first", "delete"}, optional (default: "first")
             Strategy to deal with inserts in a3m alignment files
             (see read_a3m documentation for details)
+        raise_hmmer_prefixes : bool, optional (default: True)
+            HMMER adds number prefixes to sequence identifiers in Stockholm
+            files if identifiers are not unique. If True, the parser will
+            raise an exception if a Stockholm alignment has such prefixes.
+        **kwargs
+            Additional arguments to be passed to class constructor
 
         Returns
         -------
@@ -618,7 +642,12 @@ def from_file(cls, fileobj, format="fasta",
                 seqs[seq_id] = seq
         elif format == "stockholm":
             # only reads first Stockholm alignment contained in file
-            ali = next(read_stockholm(fileobj, read_annotation=True))
+            ali = next(
+                read_stockholm(
+                    fileobj, read_annotation=True,
+                    raise_hmmer_prefixes=raise_hmmer_prefixes
+                )
+            )
             seqs = ali.seqs
             annotation["GF"] = ali.gf
             annotation["GC"] = ali.gc

diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
@@ -41,6 +41,40 @@
 )
 
 
+def _verify_sequence_id(sequence_id):
+    """
+    Verify if a target sequence identifier is in proper
+    format for the pipeline to run without errors
+    (not none, and contains no whitespace)
+        
+    Parameters
+    ----------
+    id : str
+        Target sequence identifier to verify
+
+    Raises
+    ------
+    InvalidParameterError
+        If sequence identifier is not valid
+    """
+    if sequence_id is None:
+        raise InvalidParameterError(
+            "Target sequence identifier (sequence_id) must be defined and "
+            "cannot be None/null."
+        )
+
+    try:
+        if len(sequence_id.split()) != 1 or len(sequence_id) != len(sequence_id.strip()):
+            raise InvalidParameterError(
+                "Target sequence identifier (sequence_id) may not contain any "
+                "whitespace (spaces, tabs, ...)"
+            )
+    except AttributeError:
+        raise InvalidParameterError(
+            "Target sequence identifier (sequence_id) must be a string"
+        )
+
+
 def _make_hmmsearch_raw_fasta(alignment_result, prefix):
     """
     HMMsearch results do not contain the query sequence
@@ -79,7 +113,9 @@ def _add_gaps_to_query(query_sequence_ali, ali):
         if len(match_index) != query_sequence_ali.L:
             raise ValueError(
                 "HMMsearch result {} does not have a one-to-one"
-                " mapping to the query sequence columns".format(ar["raw_alignment_file"])
+                " mapping to the query sequence columns".format(
+                    alignment_result["raw_alignment_file"]
+                )
             )
 
         gapped_query_sequence = ""
@@ -663,10 +699,8 @@ def existing(**kwargs):
     # Target sequence of alignment
     sequence_id = kwargs["sequence_id"]
 
-    if sequence_id is None:
-        raise InvalidParameterError(
-            "Parameter sequence_id must be defined"
-        )
+    # check if sequence identifier is valid
+    _verify_sequence_id(sequence_id)
 
     # First, find focus sequence in alignment
     focus_index = None
@@ -997,6 +1031,9 @@ def jackhmmer_search(**kwargs):
     )
     prefix = kwargs["prefix"]
 
+    # check if sequence identifier is valid
+    _verify_sequence_id(kwargs["sequence_id"])
+
     # make sure output directory exists
     create_prefix_folders(prefix)
 
@@ -1258,6 +1295,9 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
     )
     prefix = kwargs["prefix"]
 
+    # check if sequence identifier is valid
+    _verify_sequence_id(kwargs["sequence_id"])
+
     # make sure output directory exists
     create_prefix_folders(prefix)
 
@@ -1342,7 +1382,6 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
         "hittable_file": ali["domtblout"],
     }
 
-
     # convert the raw output alignment to fasta format 
     # and add the appropriate query sequecne
     raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)