diff --git a/README.md b/README.md index b868fc2..4fd022b 100644 --- a/README.md +++ b/README.md @@ -156,3 +156,5 @@ EVcouplings is developed in the labs of [Debora Marks](http://marks.hms.harvard. * Rob Sheridan * Christian Dallago * Joe Min +* Aaron Kollasch +* Lood van Niekerk diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index fa02863..9741166 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -94,9 +94,11 @@ align: # sequence database (specify possible databases and paths in "databases" section below) database: uniref100 - # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. - # To save compute time, this computation is normally carried out in the couplings stage - compute_num_effective_seqs: False + # Method for sequence weight calculation. Weights calculated here will be reused by plmc in the couplings stage + # (requiring up-to-date plmc version supporting --weights command line argument) + # Options: nogaps (recommended new strategy, parallelized), withgaps (legacy strategy, parallelized), + # legacy: old single-threaded implementation, empty/none (do not compute sequence weights in align stage) + sequence_weights: nogaps # Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in # the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index f52d83f..5c71ed3 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -12,7 +12,7 @@ from pathlib import Path import numpy as np -from numba import jit +from numba import jit, prange, get_num_threads, set_num_threads from evcouplings.utils.calculations import entropy from evcouplings.utils.helpers import DefaultOrderedDict, wrap @@ -896,7 +896,7 @@ def __ensure_mapped_matrix(self): self.matrix, self.alphabet_map ) - def set_weights(self, identity_threshold=0.8): + def set_weights(self, identity_threshold=0.8, method="withgaps", weight_fileobj=None, cpu=None): """ Calculate weights for sequences in alignment by clustering all sequences with sequence identity @@ -916,12 +916,82 @@ def set_weights(self, identity_threshold=0.8): ---------- identity_threshold : float, optional (default: 0.8) Sequence identity threshold + method : str, optional (default: "withgaps") + Weight calculation method to use. Options are: + * "nogaps": Exclude gaps from identity calculation; parallelized execution with cpu threads (recommended) + * "withgaps": Include gaps in identity calculation; parallelized execution with cpu threads. Result will + be the same as for "legacy" (default for backward compatibility) + * "legacy": Inverse cluster weights, counting matches between gap characters towards identity. + Single-threaded execution using legacy implementation (faster for single-CPU execution, exploiting + symmetry of sequence identities) + * "uniform": Initialize all weights to 1 (i.e. no weighting) + weight_fileobj : file-like object (default: None) + Load weights from a numpy file. This flag will override the "method" parameter. Can contain weights + (all <= 1.0) or inverse weights/cluster members (all >= 1.0) + cpu : int, optional (default: None) + Number of parallel threads to use for computation. Set to None to use all available CPUs + (as returned by numba.get_num_threads()). Legacy computation will always be single CPU only. """ self.__ensure_mapped_matrix() - self.num_cluster_members = num_cluster_members( - self.matrix_mapped, identity_threshold - ) + available_weight_methods = ["nogaps", "withgaps", "legacy", "uniform"] + if weight_fileobj is None and method not in available_weight_methods: + raise ValueError( + "Invalid weight method selected, options are: " + (", ".join(available_weight_methods)) + ) + + if weight_fileobj is not None: + try: + weights = np.array([ + float(line.strip()) for line in weight_fileobj if line.strip() != "" + ]) + except ValueError as e: + raise ValueError( + "Invalid sequence weight file, must contain one floating point number per sequence" + ) from e + + if len(weights) != self.N: + raise ValueError( + f"Number of weights in file ({len(weights)}) does not match " + + f"number of sequences in alignment ({self.N})" + ) + + if (weights <= 1.0).all(): + # turn into cluster members, invert again below (may lead to some floating point inaccuracy) + self.num_cluster_members = 1.0 / weights + elif (weights >= 1.0).all(): + # turn into inverse + self.num_cluster_members = weights + else: + raise ValueError("Weights must all be <= 1.0 or >= 1.0 (inverse number)") + + elif method == "nogaps" or method == "withgaps": + if method == "nogaps": + exclude_value = self.alphabet_map[self.alphabet_default] + else: + exclude_value = -1 + + # get number of threads set for numba to restore after computation + if cpu is not None: + threads_before = get_num_threads() + set_num_threads(cpu) + + self.num_cluster_members = num_cluster_members_parallel( + self.matrix_mapped, identity_threshold, exclude_value + ) + + # reset number of threads if we changed it before + if cpu is not None: + set_num_threads(threads_before) + + elif method == "legacy": + self.num_cluster_members = num_cluster_members_legacy( + self.matrix_mapped, identity_threshold + ) + else: + assert method == "uniform" + self.num_cluster_members = np.ones(self.N) + self.weights = 1.0 / self.num_cluster_members # reset frequencies, since these were based on @@ -929,6 +999,34 @@ def set_weights(self, identity_threshold=0.8): self._frequencies = None self._pair_frequencies = None + def save_weights(self, fileobj, inverse_weights=False, digits=8): + """ + Save sequence weights to file + + Parameters + ---------- + fileobj : file-like obj + File object to write sequences to (in "w" / text writing mode) + inverse_weights : bool, optional (default: False) + If True, save number of sequence cluster members (i.e. inverse weights) + digits : int, optional (default: 6) + Number of decimal places to use for string formatting of weights + """ + if self.weights is None: + raise ValueError( + "No weights available for saving, need to call set_weights() with appropriate parameters first" + ) + + if inverse_weights: + values = self.num_cluster_members + fmt_string = "{v}\n" + else: + values = self.weights + fmt_string = "{v:.^{digits}f}\n" + + for v in values: + fileobj.write(fmt_string.format(v=v, digits=digits)) + @property def frequencies(self): """ @@ -991,7 +1089,7 @@ def pair_frequencies(self): return self._pair_frequencies - def identities_to(self, seq, normalize=True): + def identities_to(self, seq, normalize=True, exclude_invalid=False): """ Calculate sequence identity between sequence and all sequences in the alignment. @@ -1001,17 +1099,30 @@ def identities_to(self, seq, normalize=True): normalize : bool, optional (default: True) Calculate relative identity between 0 and 1 by normalizing with length of alignment + exclude_invalid : bool, optional (default: False) + Exclude gaps and lowercase characters from identity + calculation. False is default for backwards compatiblity, + but it is recommended to use True. """ self.__ensure_mapped_matrix() - # make sure this doesnt break with strings + # make sure this doesn't break with strings seq = np.array(list(seq)) - seq_mapped = map_matrix(seq, self.alphabet_map) - ids = identities_to_seq(seq_mapped, self.matrix_mapped) + + # value to exclude from identity calculation (-1 if gaps should be included) + if exclude_invalid: + exclude_value = self.alphabet_map[self.alphabet_default] + else: + exclude_value = -1 + + ids = identities_to_seq( + seq_mapped, self.matrix_mapped, exclude_value + ) if normalize: - return ids / self.L + l = (seq_mapped != exclude_value).sum() + return ids / l else: return ids @@ -1154,7 +1265,7 @@ def pair_frequencies(matrix, seq_weights, num_symbols, fi): @jit(nopython=True) -def identities_to_seq(seq, matrix): +def identities_to_seq(seq, matrix, exclude_value): """ Calculate number of identities to given target sequence for all sequences in the matrix @@ -1168,6 +1279,9 @@ def identities_to_seq(seq, matrix): N x L matrix containing N sequences of length L. Matrix must be mapped to range(0, num_symbols) using map_matrix function + exclude_value : int + Value >= 0 in mapped sequences that will be excluded from identity calculation, e.g. gap or lowercase character. + Set to -1 to enable legacy behaviour which includes gaps in identity calculation. Returns ------- @@ -1178,10 +1292,13 @@ def identities_to_seq(seq, matrix): N, L = matrix.shape identities = np.zeros((N, )) + # iterate through sequences in matrix for i in range(N): id_i = 0 + + # iterate through positions for j in range(L): - if matrix[i, j] == seq[j]: + if matrix[i, j] == seq[j] and matrix[i, j] != exclude_value: id_i += 1 identities[i] = id_i @@ -1190,11 +1307,18 @@ def identities_to_seq(seq, matrix): @jit(nopython=True) -def num_cluster_members(matrix, identity_threshold): +def num_cluster_members_legacy(matrix, identity_threshold): """ Calculate number of sequences in alignment within given identity_threshold of each other + Note: This function will treat gaps like any other amino acid, + i.e. a gap in either sequence will be counted as a match; + potentially giving misleading results (which is consistent + with how most early tools handled this case). At this point, + we recommend to use num_cluster_members_parallel to + exclude gaps from the calculation. + Parameters ---------- matrix : np.array @@ -1216,7 +1340,7 @@ def num_cluster_members(matrix, identity_threshold): L = 1.0 * L # minimal cluster size is 1 (self) - num_neighbors = np.ones((N)) + num_neighbors = np.ones((N, )) # compare all pairs of sequences for i in range(N - 1): @@ -1231,3 +1355,79 @@ def num_cluster_members(matrix, identity_threshold): num_neighbors[j] += 1 return num_neighbors + + +@jit(nopython=True, parallel=True) +def num_cluster_members_parallel(matrix, identity_threshold, exclude_value): + """ + Calculate number of sequences in alignment + within given identity_threshold of each other + + Note: Unlike num_cluster_members_legacy, this function allows to exclude gaps + from the calculation, which is now the recommended approach. + + Note: When excluding gaps from the calculation, the resulting matrix will + typically not be symmetric. + + The function will by default use the available number of threads + (as returned by numba.get_num_threads()). If a different number should + be used, the caller is responsible to set the number of threads with + numba.set_num_threads + + Parameters + ---------- + matrix : np.array + N x L matrix containing N sequences of length L. + Matrix must be mapped to range(0, num_symbols) using + map_matrix function + identity_threshold : float + Sequences with at least this pairwise identity will be + grouped in the same cluster. + exclude_value : int + Value >= 0 in matrix that will be excluded from identity calculation, e.g. gap or lowercase character. + Set to -1 to enable legacy behaviour num_cluster_members_legacy which includes gaps in identity calculation. + + Returns + ------- + np.array + Vector of length N containing number of cluster + members for each sequence (inverse of sequence + weight) + """ + N, L = matrix.shape + + # minimal cluster size is 1 (self) but for parallelization we set the self-hit below inside the loop + # and initialize to zero here + num_neighbors = np.zeros((N, )) + + # determine relevant sequence lengths; only count positions without gaps unlike explicitly requesting + # legacy behaviour (in this case all entry of L_seq will be == L) + L_seq = np.sum(matrix != exclude_value, axis=1) + + # compare all pairs of sequences; we cannot assume symmetry of the resulting matrix here due to exclusion of + # gaps (this is also convenient for parallelizing the outer loop); no speedup from using a separate function + # with regular range(N) in single-thread case so can always use this function + for i in prange(N): + # always set cluster size to 1 / self-hit (i == j) + num_neighbors_i = 1 + + # compare to all other sequences (do not assume symmetry per comment above) + for j in range(N): + # simply skip on self-hit, we already initialized cluster size to 1 above + if i == j: + continue + + # compare all positions between both sequences, not counting gaps or other invalid characters + # unless specifically requested + matches = 0 + for k in range(L): + if matrix[i, k] == matrix[j, k] and matrix[i, k] != exclude_value: + matches += 1 + + # calculate identity as fraction of non-gapped positions (i.e. similarity will typically be asymmetric) + if matches / L_seq[i] >= identity_threshold: + num_neighbors_i += 1 + + num_neighbors[i] = num_neighbors_i + + return num_neighbors diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 7fa6282..a38adce 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -849,7 +849,7 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * [ "prefix", "seqid_filter", "hhfilter", "minimum_sequence_coverage", "minimum_column_coverage", - "compute_num_effective_seqs", "theta", + "theta", ] ) @@ -952,8 +952,10 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * # compute effective number of sequences # (this is intended for cases where coupling stage is - # not run, but this number is wanted nonetheless) - if kwargs["compute_num_effective_seqs"]: + # not run, but this number is wanted nonetheless); + # handles legacy style sequence weights (not reused in couplings stage) as well + # as new-style sequence weights (reused by plmc in couplings stage) + if kwargs.get("compute_num_effective_seqs") or kwargs.get("sequence_weights"): # make sure we only compute N_eff on the columns # that would be used for model inference, dispose # the rest @@ -962,8 +964,35 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * else: cut_ali = ali.select(columns=~lc_cols) - # compute sequence weights - cut_ali.set_weights(kwargs["theta"]) + # handle case where we want to forward sequence weights to couplings stage + # (i.e. not for old compute_num_effective_seqs parameter) + if kwargs.get("sequence_weights") is not None: + seq_weight_file = prefix + "_sequence_weights.csv" + else: + seq_weight_file = None + + # compute sequence weights, reuse existing weights file if available if specified via sequence_weights param + if kwargs.get("reuse_alignment") and seq_weight_file is not None and valid_file(seq_weight_file): + # reload weights + with open(seq_weight_file) as f: + cut_ali.set_weights(weight_fileobj=f) + else: + # otherwise compute weights from scratch; default to legacy strategy if sequence_weights argument + # not specified (for legacy compute_num_effective_seqs case) + cut_ali.set_weights( + identity_threshold=kwargs["theta"], + method=kwargs.get("sequence_weights", "legacy"), + cpu=kwargs.get("cpu"), + ) + + # save weights to file for reuse by plmc (one weight per line in text format) + if seq_weight_file is not None: + with open(seq_weight_file, "w") as f: + cut_ali.save_weights(f) + + # add sequence weight file to outcfg to forward to couplings stage + if seq_weight_file is not None: + outcfg["sequence_weight_file"] = seq_weight_file # N_eff := sum of all sequence weights n_eff = float(cut_ali.weights.sum()) @@ -979,9 +1008,9 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * }) # save sequence weights to file and add to output config - outcfg["sequence_weights_file"] = prefix + "_inverse_sequence_weights.csv" + outcfg["inverse_sequence_weight_file"] = prefix + "_inverse_sequence_weights.csv" inv_seq_weights.to_csv( - outcfg["sequence_weights_file"], index=False + outcfg["inverse_sequence_weight_file"], index=False ) else: n_eff = None diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index 66940ce..5ac6247 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -81,6 +81,7 @@ def infer_plmc(**kwargs): segments (passed through) """ + # note: sequence_weight_file not enforced here for backwards compatibility check_required( kwargs, [ @@ -213,6 +214,7 @@ def infer_plmc(**kwargs): lambda_h=kwargs["lambda_h"], lambda_J=lambda_J, lambda_g=kwargs["lambda_group"], + weight_file=kwargs.get("sequence_weight_file"), cpu=kwargs["cpu"], binary=kwargs["plmc"], ) diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py index 79c592b..959a91f 100644 --- a/evcouplings/couplings/tools.py +++ b/evcouplings/couplings/tools.py @@ -52,7 +52,8 @@ def parse_plmc_log(log): "seqs": re.compile("(\d+) valid sequences out of (\d+)"), "sites": re.compile("(\d+) sites out of (\d+)"), "region": re.compile("Region starts at (\d+)"), - "samples": re.compile("Effective number of samples: (\d+\.\d+)"), + # first match group catches ' (to 1 decimal place)' when loading existing weights file + "samples": re.compile("Effective number of samples(.*): (\d+\.\d+)"), "optimization": re.compile("Gradient optimization: (.+)") } @@ -95,7 +96,7 @@ def parse_plmc_log(log): pass valid_seqs, total_seqs = map(int, matches["seqs"]) - eff_samples = float(matches["samples"][0]) + eff_samples = float(matches["samples"][1]) opt_status = matches["optimization"][0] return ( @@ -127,7 +128,7 @@ def run_plmc(alignment, couplings_file, param_file=None, focus_seq=None, alphabet=None, theta=None, scale=None, ignore_gaps=False, iterations=None, lambda_h=None, lambda_J=None, lambda_g=None, - cpu=None, binary="plmc"): + weight_file=None, cpu=None, binary="plmc"): """ Run plmc on sequence alignment and store files with model parameters and pair couplings. @@ -175,6 +176,9 @@ def run_plmc(alignment, couplings_file, param_file=None, lambda_g : float, optional (default: None) group l1-regularization strength on couplings If None, plmc default will be used. + weight_file : str, optional (default: None) + Path to file with sequence weights (one per line) + to use instead of calculating weights with plmc. cpu : Number of cores to use for running plmc. Note that plmc has to be compiled in openmp mode to runnable with multiple cores. @@ -254,6 +258,9 @@ def run_plmc(alignment, couplings_file, param_file=None, if lambda_g is not None: cmd += ["-lg", str(lambda_g)] + if weight_file is not None: + cmd += ["-w", weight_file] + # Number of cores to use for calculation if cpu is not None: cmd += ["-n", str(cpu)] @@ -265,6 +272,15 @@ def run_plmc(alignment, couplings_file, param_file=None, # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) + # save plmc output to separate log file for easier inspection if problems occur + with open(couplings_file + ".log", "w") as f: + f.write(f"cmd: {' '.join(cmd)}\n") + f.write(f"return_code: {return_code}\n") + f.write(f"stderr:\n") + f.write(stderr + "\n\n") + f.write(f"stdout:\n") + f.write(stdout + "\n") + # TODO: remove this segfault-hunting output if fixed in plmc if return_code != 0: # check if we got valid output from plmc by parsing it