diff --git a/.travis.yml b/.travis.yml
index f5da67a9..4861796e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,7 +24,7 @@ before_script:
 - export DISPLAY=:99.0
 - sh -e /etc/init.d/xvfb start
 - sleep 3
-- wget http://marks.hms.harvard.edu/evcouplings_test_cases.tar.gz
+- wget https://marks.hms.harvard.edu/evcouplings/models/data/evcouplings_test_cases.tar.gz
 - tar -xf evcouplings_test_cases.tar.gz -C $HOME/
 script:
 - coverage run -m unittest discover -s test -p "Test*.py"
diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
index 6714e6cc..0de50fd5 100644
--- a/evcouplings/align/alignment.py
+++ b/evcouplings/align/alignment.py
@@ -30,6 +30,8 @@
 ALPHABET_RNA_NOGAP = "ACGU"
 ALPHABET_RNA = GAP + ALPHABET_RNA_NOGAP
 
+HMMER_PREFIX_WARNING = "# WARNING: seq names have been made unique by adding a prefix of"
+
 
 def read_fasta(fileobj):
     """
@@ -111,7 +113,7 @@ def write_aln(sequences, fileobj, width=80):
     )
 
 
-def read_stockholm(fileobj, read_annotation=False):
+def read_stockholm(fileobj, read_annotation=False, raise_hmmer_prefixes=True):
     """
     Generator function to read Stockholm format alignment
     file (e.g. from hmmer).
@@ -129,6 +131,10 @@
         Stockholm alignment file
     read_annotation : bool, optional (default=False)
         Read annotation columns from alignment
+    raise_hmmer_prefixes : bool, optional (default: True)
+        HMMER adds number prefixes to sequence identifiers if
+        identifiers are not unique. If True, the parser will
+        raise an exception if the alignment has such prefixes.
 
     Returns
     -------
@@ -165,6 +171,17 @@
                 "Header missing. {}".format(line.rstrip())
             )
 
+        if raise_hmmer_prefixes and line.startswith(HMMER_PREFIX_WARNING):
+            raise ValueError(
+                "HMMER added identifier prefixes to alignment because of non-unique "
+                "sequence identifiers. Either some sequence identifier is present "
+                "twice in the sequence database, or your target sequence identifier is "
+                "the same as an identifier in the database. In the first case, please fix "
+                "your sequence database. In the second case, please choose a different "
+                "sequence identifier for your target sequence that does not overlap with "
+                "the sequence database."
+            )
+
         # annotation lines
         if line.startswith("#"):
             if read_annotation:
@@ -584,7 +601,8 @@ def from_dict(cls, sequences, **kwargs):
 
     @classmethod
     def from_file(cls, fileobj, format="fasta",
-                  a3m_inserts="first", **kwargs):
+                  a3m_inserts="first", raise_hmmer_prefixes=True,
+                  **kwargs):
         """
         Construct an alignment object by reading in an
         alignment file.
@@ -598,6 +616,12 @@ def from_file(cls, fileobj, format="fasta",
         a3m_inserts : {"first", "delete"}, optional (default: "first")
             Strategy to deal with inserts in a3m alignment files
             (see read_a3m documentation for details)
+        raise_hmmer_prefixes : bool, optional (default: True)
+            HMMER adds number prefixes to sequence identifiers in Stockholm
+            files if identifiers are not unique. If True, the parser will
+            raise an exception if a Stockholm alignment has such prefixes.
+        **kwargs
+            Additional arguments to be passed to class constructor
 
         Returns
         -------
@@ -618,7 +642,12 @@ def from_file(cls, fileobj, format="fasta",
                 seqs[seq_id] = seq
         elif format == "stockholm":
             # only reads first Stockholm alignment contained in file
-            ali = next(read_stockholm(fileobj, read_annotation=True))
+            ali = next(
+                read_stockholm(
+                    fileobj, read_annotation=True,
+                    raise_hmmer_prefixes=raise_hmmer_prefixes
+                )
+            )
             seqs = ali.seqs
             annotation["GF"] = ali.gf
             annotation["GC"] = ali.gc
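
The new `raise_hmmer_prefixes` guard can be exercised directly. Below is a minimal sketch; the two-sequence Stockholm snippet is fabricated for illustration, and only the warning line's prefix matters (it must match `HMMER_PREFIX_WARNING` above):

```python
from io import StringIO

from evcouplings.align.alignment import Alignment

# fabricated Stockholm input carrying the HMMER uniqueness warning
STO = (
    "# STOCKHOLM 1.0\n"
    "# WARNING: seq names have been made unique by adding a prefix of \"<seq#>|\"\n"
    "1|QUERY_SEQ ---ACDEF\n"
    "2|QUERY_SEQ ---ACDEH\n"
    "//\n"
)

# default behaviour: fail fast instead of silently keeping mangled identifiers
try:
    Alignment.from_file(StringIO(STO), format="stockholm")
except ValueError as e:
    print("caught expected error:", e)

# opting out restores the previous permissive behaviour
ali = Alignment.from_file(
    StringIO(STO), format="stockholm", raise_hmmer_prefixes=False
)
print(ali.ids)
```
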
diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
index 4c4fe9ee..8732b047 100644
--- a/evcouplings/align/protocol.py
+++ b/evcouplings/align/protocol.py
@@ -41,6 +41,40 @@
 )
 
 
+def _verify_sequence_id(sequence_id):
+    """
+    Verify if a target sequence identifier is in proper
+    format for the pipeline to run without errors
+    (not None, and contains no whitespace)
+
+    Parameters
+    ----------
+    sequence_id : str
+        Target sequence identifier to verify
+
+    Raises
+    ------
+    InvalidParameterError
+        If sequence identifier is not valid
+    """
+    if sequence_id is None:
+        raise InvalidParameterError(
+            "Target sequence identifier (sequence_id) must be defined and "
+            "cannot be None/null."
+        )
+
+    try:
+        if len(sequence_id.split()) != 1 or len(sequence_id) != len(sequence_id.strip()):
+            raise InvalidParameterError(
+                "Target sequence identifier (sequence_id) may not contain any "
+                "whitespace (spaces, tabs, ...)"
+            )
+    except AttributeError:
+        raise InvalidParameterError(
+            "Target sequence identifier (sequence_id) must be a string"
+        )
+
+
 def _make_hmmsearch_raw_fasta(alignment_result, prefix):
     """
     HMMsearch results do not contain the query sequence
@@ -79,7 +113,9 @@ def _add_gaps_to_query(query_sequence_ali, ali):
     if len(match_index) != query_sequence_ali.L:
         raise ValueError(
             "HMMsearch result {} does not have a one-to-one"
-            " mapping to the query sequence columns".format(ar["raw_alignment_file"])
+            " mapping to the query sequence columns".format(
+                alignment_result["raw_alignment_file"]
+            )
         )
 
     gapped_query_sequence = ""
@@ -663,10 +699,8 @@ def existing(**kwargs):
     # Target sequence of alignment
     sequence_id = kwargs["sequence_id"]
 
-    if sequence_id is None:
-        raise InvalidParameterError(
-            "Parameter sequence_id must be defined"
-        )
+    # check if sequence identifier is valid
+    _verify_sequence_id(sequence_id)
 
     # First, find focus sequence in alignment
     focus_index = None
@@ -997,6 +1031,9 @@ def jackhmmer_search(**kwargs):
     )
     prefix = kwargs["prefix"]
 
+    # check if sequence identifier is valid
+    _verify_sequence_id(kwargs["sequence_id"])
+
     # make sure output directory exists
     create_prefix_folders(prefix)
 
@@ -1258,6 +1295,9 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
     )
     prefix = kwargs["prefix"]
 
+    # check if sequence identifier is valid
+    _verify_sequence_id(kwargs["sequence_id"])
+
     # make sure output directory exists
     create_prefix_folders(prefix)
 
@@ -1342,7 +1382,6 @@ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
         "hittable_file": ali["domtblout"],
     }
 
-
     # convert the raw output alignment to fasta format
     # and add the appropriate query sequence
     raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
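
Since `_verify_sequence_id()` is now the single chokepoint for identifier validation across `existing()`, `jackhmmer_search()`, and the hmmsearch path, its contract is easy to check in isolation. A small sketch:

```python
from evcouplings.align.protocol import _verify_sequence_id
from evcouplings.utils.config import InvalidParameterError

# a well-formed identifier passes silently
_verify_sequence_id("RASH_HUMAN")

# None, embedded/leading whitespace, and non-string values are all rejected
for bad in [None, "RASH HUMAN", " RASH_HUMAN", 42]:
    try:
        _verify_sequence_id(bad)
    except InvalidParameterError as e:
        print("rejected {!r}: {}".format(bad, e))
```
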
diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py
index 433c916f..1c11ec23 100644
--- a/evcouplings/compare/sifts.py
+++ b/evcouplings/compare/sifts.py
@@ -14,40 +14,36 @@
 """
 from os import path
-import json
 from collections import OrderedDict
 from copy import deepcopy
 
 import pandas as pd
 import requests
-import numpy as np
 
 from evcouplings.align.alignment import (
     Alignment, read_fasta, parse_header
 )
-
 from evcouplings.align.protocol import (
-    jackhmmer_search, hmmbuild_and_search, _make_hmmsearch_raw_fasta
+    jackhmmer_search, hmmbuild_and_search
 )
-
 from evcouplings.align.tools import read_hmmer_domtbl
-from evcouplings.compare.mapping import alignment_index_mapping, map_indices
+from evcouplings.compare.mapping import map_indices
 from evcouplings.utils.system import (
-    get_urllib, ResourceError, valid_file, tempdir
+    get_urllib, ResourceError, valid_file, tempdir, temp
 )
 from evcouplings.utils.config import (
-    parse_config, check_required
+    parse_config, check_required, InvalidParameterError
 )
 from evcouplings.utils.helpers import range_overlap
 
-UNIPROT_MAPPING_URL = "http://www.uniprot.org/mapping/"
-SIFTS_URL = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_uniprot.csv.gz"
+UNIPROT_MAPPING_URL = "https://www.uniprot.org/mapping/"
+SIFTS_URL = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_segments_observed.csv.gz"
 SIFTS_REST_API = "http://www.ebi.ac.uk/pdbe/api/mappings/uniprot_segments/{}"
 
 # TODO: make this default parametrization more explicit (e.g. a config file in repository)
 # these parameters are fed as a default into SIFTS.by_alignment so that the method can be
 # easily used without a configuration file/any further setup
-JACKHMMER_CONFIG = """
+HMMER_CONFIG = """
 prefix:
 sequence_id:
 sequence_file:
@@ -115,12 +111,15 @@ def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"):
     return r.text
 
 
-def find_homologs(**kwargs):
+def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
     """
     Identify homologs using jackhmmer or hmmbuild/hmmsearch
 
     Parameters
     ----------
+    pdb_alignment_method : {"jackhmmer", "hmmsearch"},
+        optional (default: "jackhmmer")
+        Sequence alignment method used for searching the PDB
     **kwargs
         Passed into jackhmmer / hmmbuild_and_search protocol
         (see documentation for available options)
@@ -135,7 +134,7 @@
     """
     # load default configuration
-    config = parse_config(JACKHMMER_CONFIG)
+    config = parse_config(HMMER_CONFIG)
 
     # update with overrides from kwargs
     config = {
@@ -152,10 +151,10 @@
     )
 
     # run hmmsearch (possibly preceded by hmmbuild)
-    if kwargs["pdb_alignment_method"] == "hmmsearch":
+    if pdb_alignment_method == "hmmsearch":
         # set up config to run hmmbuild_and_search on the unfiltered alignment file
         updated_config = deepcopy(config)
-        updated_config["alignment_file"] = config["raw_focus_alignment_file"]
+        updated_config["alignment_file"] = config.get("raw_focus_alignment_file")
         ar = hmmbuild_and_search(**updated_config)
 
     # For hmmbuild and search, we have to read the raw focus alignment file
@@ -166,7 +165,7 @@
     # run jackhmmer against sequence database
     # at this point we have already checked to ensure
     # that the input is either jackhmmer or hmmsearch
-    else:
+    elif pdb_alignment_method == "jackhmmer":
         ar = jackhmmer_search(**config)
 
         with open(ar["raw_alignment_file"]) as a:
@@ -176,6 +175,11 @@
         # if necessary
         with open(config["prefix"] + "_raw.fasta", "w") as f:
             ali.write(f)
+    else:
+        raise InvalidParameterError(
+            "Invalid pdb_alignment_method selected. Valid options are: " +
+            ", ".join(["jackhmmer", "hmmsearch"])
+        )
 
     # read hmmer hittable and simplify
     hits = read_hmmer_domtbl(ar["hittable_file"])
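
With `pdb_alignment_method` promoted to an explicit parameter, dispatch no longer depends on a key being present in `**kwargs`, and unknown values fail fast. A hypothetical invocation; identifiers and paths are placeholders, and a real run also needs the search database configured (see the defaults in `HMMER_CONFIG`):

```python
from evcouplings.compare.sifts import find_homologs

# default: jackhmmer search starting from the target sequence
result = find_homologs(
    prefix="output/myquery_pdb",
    sequence_id="MYQUERY_ID",
    jackhmmer="jackhmmer",
)

# alternative: hmmbuild + hmmsearch starting from an existing alignment
result = find_homologs(
    pdb_alignment_method="hmmsearch",
    prefix="output/myquery_pdb",
    sequence_id="MYQUERY_ID",
    raw_focus_alignment_file="output/myquery_raw_focus.fasta",
    hmmbuild="hmmbuild",
    hmmsearch="hmmsearch",
)

# anything else now raises InvalidParameterError:
# find_homologs(pdb_alignment_method="blast")
```
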
@@ -268,7 +272,7 @@ def __init__(self, sifts_table_file, sequence_file=None):
         )
 
         # final table has still some entries where lengths do not match,
-        # remove thlse
+        # remove these
         self.table = self.table.query(
             "(resseq_end - resseq_start) == (uniprot_end - uniprot_start)"
         )
@@ -288,7 +292,7 @@
         """
         Create modified SIFTS mapping table (based on
         file at SIFTS_URL). For some of the entries,
-        the Uniprot sequence ranges do not map to a.
+        the Uniprot sequence ranges do not map to a
         SEQRES sequence range of the same length. These
         PDB IDs will be entirely replaced by a segment-
         based mapping extracted from the SIFTS REST API.
@@ -325,12 +329,14 @@ def extract_rows(M, pdb_id):
             return res
 
-        get_urllib(SIFTS_URL, sifts_table_file)
+        # download SIFTS table (gzip-compressed csv) to temp file
+        temp_download_file = temp()
+        get_urllib(SIFTS_URL, temp_download_file)
 
         # load table and rename columns for internal use, if SIFTS
         # ever decided to rename theirs
         table = pd.read_csv(
-            sifts_table_file, comment="#",
+            temp_download_file, comment="#",
             compression="gzip"
         ).rename(
             columns={
@@ -346,11 +352,17 @@
             }
         )
 
+        # TODO: remove the following if new segment-based table proves to be a robust solution
+        """
+        # this block disabled for now due to use of new table
+        # based on observed UniProt segments
+        # - can probably be removed eventually
+
         # identify problematic PDB IDs
         problematic_ids = table.query(
             "(resseq_end - resseq_start) != (uniprot_end - uniprot_start)"
         ).pdb_id.unique()
-
+
         # collect new mappings from segment based REST API
         res = []
         for i, pdb_id in enumerate(problematic_ids):
@@ -363,12 +375,17 @@
         # remove bad PDB IDs from table and add new mapping
         new_table = table.loc[~table.pdb_id.isin(problematic_ids)]
+
+        # also disabled due to use of new table based on observed
+        # UniProt segments - can probably be removed eventually
+
         new_table = new_table.append(
             pd.DataFrame(res).loc[:, table.columns]
         )
+        """
 
         # save for later reuse
-        new_table.to_csv(sifts_table_file, index=False)
+        table.to_csv(sifts_table_file, index=False)
 
     def _add_uniprot_ids(self):
         """
@@ -386,7 +403,7 @@ def _add_uniprot_ids(self):
         # add column to dataframe
         self.table.loc[:, "uniprot_id"] = self.table.loc[:, "uniprot_ac"].map(ac_to_id)
 
-    def create_sequence_file(self, output_file):
+    def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100):
         """
         Create FASTA sequence file containing all UniProt
         sequences of proteins in SIFTS. This file is required
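
Downloading to a throwaway location means `sifts_table_file` only ever holds the processed, uncompressed table, so re-reading it later cannot trip over the gzip compression of the raw download. The pattern in isolation, using the two helpers visible in the diff (the output path is a placeholder):

```python
import pandas as pd

from evcouplings.utils.system import get_urllib, temp

SIFTS_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/"
    "flatfiles/csv/uniprot_segments_observed.csv.gz"
)

# fetch the gzip-compressed CSV into a temporary file ...
temp_download_file = temp()
get_urllib(SIFTS_URL, temp_download_file)

# ... parse it, stating the compression explicitly since the temp
# path carries no .gz suffix for pandas to infer from ...
table = pd.read_csv(temp_download_file, comment="#", compression="gzip")

# ... and persist only the processed, uncompressed table
table.to_csv("pdb_mapping_table.csv", index=False)
```
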
""" ids = self.table.uniprot_ac.unique().tolist() - CHUNK_SIZE = 1000 - chunks = [ - ids[i:i + CHUNK_SIZE] for i in range(0, len(ids), CHUNK_SIZE) + # retrieve sequences in chunks since ID mapping service + # tends to fail on large requests + id_chunks = [ + ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size) ] - with open(output_file, "w") as f: - for ch in chunks: - # fetch sequence chunk - seqs = fetch_uniprot_mapping(ch) + # store individual retrieved chunks as list of strings + seq_chunks = [] + + # keep track of how many retries were necessary and + # abort if number exceeds max_retries + num_retries = 0 + + for ch in id_chunks: + # fetch sequence chunk; + # if there is a problem retry as long as we stay within + # maximum number of retries + while True: + try: + seqs = fetch_uniprot_mapping(ch) + break + except requests.ConnectionError as e: + # count as failed try + num_retries += 1 + + # if we retried too often, abort + if num_retries > max_retries: + raise ResourceError( + "Could not fetch sequences for SIFTS mapping tables from UniProt since " + "maximum number of retries after connection errors was exceeded. Retry " + "at a later time, or call SIFTS.create_sequence_file() with a higher value " + "for max_retries." + ) from e + + # rename identifiers in sequence file, so + # we can circumvent Uniprot sequence identifiers + # being prefixed by hmmer if a hit has exactly the + # same identifier as the query sequence + seqs = seqs.replace( + ">sp|", ">evsp|", + ).replace( + ">tr|", ">evtr|", + ) - # then store to FASTA file - f.write(seqs) + assert seqs.endswith("\n") + + # store for writing + seq_chunks.append(seqs) + + # store sequences to FASTA file in one go at the end + with open(output_file, "w") as f: + f.write("".join(seq_chunks)) self.sequence_file = output_file @@ -608,8 +672,6 @@ def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs): Find structures by sequence alignment between query sequence and sequences in PDB. - # TODO: offer option to start from HMM profile for this - Parameters ---------- min_overlap : int, optional (default: 20) @@ -621,11 +683,29 @@ def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs): protein in PDB structures). Should be set to False to identify homomultimeric contacts. **kwargs - Passed into jackhmmer_search protocol - (see documentation for available options). - Additionally, if "prefix" is given, individual - mappings will be saved to files suffixed by - the respective key in mapping table. + Defines the behaviour of find_homologs() function + used to find homologs by sequence alignment: + - which alignment method is used + (pdb_alignment_method: {"jackhmmer", "hmmsearch"}, + default: "jackhmmer"), + - parameters passed into the protocol for the selected + alignment method (evcouplings.align.jackhmmer_search or + evcouplings.align.hmmbuild_and_search). 
diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py
index e453bd20..cf7ee323 100644
--- a/evcouplings/couplings/protocol.py
+++ b/evcouplings/couplings/protocol.py
@@ -234,7 +234,7 @@ def infer_plmc(**kwargs):
     # store useful information about model in outcfg
     outcfg.update({
         "num_sites": plmc_result["num_valid_sites"],
-        "num_sequences": plmc_result["num_valid_seqs"],
+        "num_valid_sequences": plmc_result["num_valid_seqs"],
         "effective_sequences": plmc_result["effective_samples"],
         "region_start": plmc_result["region_start"],
     })
@@ -607,7 +607,7 @@ def mean_field(**kwargs):
     # store useful information about model in outcfg
     outcfg.update({
         "num_sites": model.L,
-        "num_sequences": model.N_valid,
+        "num_valid_sequences": model.N_valid,
         "effective_sequences": float(round(model.N_eff, 1)),
         "region_start": int(model.index_list[0]),
     })
diff --git a/evcouplings/visualize/pairs.py b/evcouplings/visualize/pairs.py
index dfadb321..4a9f27db 100644
--- a/evcouplings/visualize/pairs.py
+++ b/evcouplings/visualize/pairs.py
@@ -1204,6 +1204,10 @@ def enrichment_pymol_script(enrichment_table, output_file,
         # medium
         t.loc[t.iloc[boundary1:boundary2].index, "color"] = "orange"
+
+        # set the boundary for number of residues to be rendered as spheres
+        sphere_boundary = boundary2
+
     else:
         t = deepcopy(enrichment_table)
@@ -1248,8 +1252,11 @@
             t.loc[t.iloc[prior_boundary:boundary].index, "color"] = 'color{}'.format(idx)
             prior_boundary = boundary
 
+        # set the boundary for number of residues to be rendered as spheres
+        sphere_boundary = boundary_list[1]
+
     if sphere_view:
-        t.loc[t.iloc[0:boundary].index, "show"] = "spheres"
+        t.loc[t.iloc[0:sphere_boundary].index, "show"] = "spheres"
 
     if chain is not None:
         chain_sel = ", chain '{}'".format(chain)
diff --git a/setup.py b/setup.py
index ca44e7be..1d5b1964 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     name='evcouplings',
 
     # Version:
-    version='0.0.4',
+    version='0.0.5',
 
     description='A Framework for evolutionary couplings analysis',
     long_description=readme,
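
Note that the `num_sequences` → `num_valid_sequences` rename in the couplings stage output is a breaking change for anything that reads `outcfg` written by earlier runs. A defensive lookup for a hypothetical downstream consumer that must handle output from both versions:

```python
def get_num_valid_sequences(outcfg):
    """
    Read the valid-sequence count from couplings stage output,
    accepting both the old (pre-0.0.5) and the new key name.
    """
    try:
        return outcfg["num_valid_sequences"]
    except KeyError:
        # output written by evcouplings <= 0.0.4
        return outcfg["num_sequences"]

print(get_num_valid_sequences({"num_valid_sequences": 12345}))  # new key
print(get_num_valid_sequences({"num_sequences": 12345}))        # old key
```
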