fixed issues with flake8 pre-commit

saroudant · saroudant · commit 798ce3c218d6 · 2023-04-24T17:21:58.000+02:00
diff --git a/.flake8 b/.flake8
@@ -1,2 +1,8 @@
 [flake8]
 max-line-length = 120
+per-file-ignores =
+    tests/*: D10, E74
+    setup.py: D10
+    docs/conf.py: D100
+    sobolev_alignment/__init__.py: D104,F401
+    sobolev_alignment/*: E203
diff --git a/sobolev_alignment/data_normalisation.py b/sobolev_alignment/data_normalisation.py
diff --git a/sobolev_alignment/feature_analysis.py b/sobolev_alignment/feature_analysis.py
@@ -1,4 +1,11 @@
-"""FEATURE_ANALYSIS"""
+"""
+Feature analysis.
+
+@author: Soufiane Mourragui
+
+This module contains all the code used in the Taylor expansion for the Gaussian/Matern
+kernel.
+"""
 
 import gc
 import logging
@@ -24,7 +31,8 @@ def higher_order_contribution(
 
     Compute the features corresponding to the Taylor expansion of the kernel, i.e. $x_j exp^{-\gamma xx^T}$ for
     linear features. Returns a sparse pandas DataFrame containing all the features (columns) by samples (rows).
-    We here critically rely on the sparsity of the data-matrix to speed up computations. The current implementation is relevant in two cases:
+    We here critically rely on the sparsity of the data-matrix to speed up computations. The current implementation
+    is relevant in two cases:
     -When dimensionality is small
     -When data is sparse.
 
@@ -54,8 +62,9 @@ def higher_order_contribution(
         could lead to crash.
 
     return_matrix: bool, default to False
-        If True, then returns simply the feature-matrix without feature-naming. In cases when feature names are
-        not relevant (e.g. computing the proportion of non-linearities), return_matrix=True can help speed-up the process.
+        If True, then returns simply the feature-matrix without feature-naming. In cases when feature names
+        are not relevant (e.g. computing the proportion of non-linearities), return_matrix=True can help
+        speed-up the process.
 
     Returns
     -------
@@ -96,7 +105,7 @@ def higher_order_contribution(
 
 
 def _combination_to_idx(idx, p):
-    """Transforms a combination (tuple of feature idx) into an indicative function.
+    r"""Transform a combination (tuple of feature idx) into an indicative function.
 
     Parameters
     ----------
@@ -118,7 +127,7 @@ def _combination_to_idx(idx, p):
 
 
 def basis(x, k, gamma):
-    """Computed the basis function for a single gene, except offset term.
+    r"""Compute the basis function for a single gene, except offset term.
 
     Parameters
     ----------
diff --git a/sobolev_alignment/generate_artificial_sample.py b/sobolev_alignment/generate_artificial_sample.py
@@ -1,5 +1,5 @@
 """
-GENERATE ARTIFICIAL SAMPLE
+Generate artificial samples.
 
 @author: Soufiane Mourragui
 
@@ -26,7 +26,7 @@ def generate_samples(
     return_dist: bool = False,
 ):
     """
-    Generates artificial gene expression profiles.
+    Generate artificial gene expression profiles.
 
     Note to developers: this method has been designed to be used with scvi-tools classes. Other VAE
     implementations may break here.
@@ -111,7 +111,7 @@ def parallel_generate_samples(
     n_jobs=1,
 ):
     """
-    Generates artificial gene expression profiles.
+    Generate artificial gene expression profiles.
 
     Wrapper of parallelize generate_samples, running several threads in parallel.
     <b>Note to developers</b>: this function needs to be changed if applied to other VAE model
diff --git a/sobolev_alignment/interpolated_features.py b/sobolev_alignment/interpolated_features.py
@@ -1,3 +1,9 @@
+"""
+Compute interpolated features.
+
+@author: Soufiane Mourragui
+"""
+
 import numpy as np
 import pandas as pd
 import scipy
diff --git a/sobolev_alignment/kernel_operations.py b/sobolev_alignment/kernel_operations.py
@@ -1,3 +1,10 @@
+"""
+Kernel operations.
+
+@author: Soufiane Mourragui
+
+Custom scripts for specific matrix operations.
+"""
 import numpy as np
 
 
diff --git a/sobolev_alignment/krr_approx.py b/sobolev_alignment/krr_approx.py
@@ -255,7 +255,7 @@ def _save_coefs(self):
             self._process_coef_ridge_falkon()
 
     def _process_coef_ridge_sklearn(self):
-        """Save and process (i.e. transform to torch.Tensor) the coefficients obtained after kernel ridge regression with scikit-learn implementation."""
+        """Save the KRR coefficients from the scikit-learn implementation as a torch.Tensor."""
         self.sample_weights_ = torch.Tensor(self.ridge_clf_.dual_coef_)
         self.ridge_samples_idx_ = np.arange(self.training_data_.shape[0])
 
diff --git a/sobolev_alignment/krr_model_selection.py b/sobolev_alignment/krr_model_selection.py
@@ -1,5 +1,5 @@
 """
-<h2>Kernel Ridge Regression (KRR) model search</h2>
+Kernel Ridge Regression (KRR) model search.
 
 @author: Soufiane Mourragui
 
diff --git a/sobolev_alignment/multi_krr_approx.py b/sobolev_alignment/multi_krr_approx.py
@@ -1,6 +1,12 @@
-import torch
+"""
+Multi KRR approximation.
+
+@author: Soufiane Mourragui
 
-# Falkon import
+Scripts supporting the naive integration of several KRRs. No gain is provided by such an approach.
+"""
+
+import torch
 
 
 class MultiKRRApprox:
@@ -12,31 +18,32 @@ class MultiKRRApprox:
     """
 
     def __init__(self):
+        """Create instance."""
         self.krr_regressors = []
 
     def predict(self, X: torch.Tensor):
-        """Predict latent factor values given a tensor"""
+        """Predict latent factor values given a tensor."""
         prediction = [clf.transform(torch.Tensor(X)).detach().numpy() for clf in self.krr_regressors]
         prediction = torch.Tensor(prediction)
         prediction = torch.mean(prediction, axis=0)
 
         return prediction
 
     def transform(self, X: torch.Tensor):
-        """Predict latent factor values given a tensor"""
+        """Predict latent factor values given a tensor."""
         return self.predict(X)
 
     def anchors(self):
-        """Return anchors"""
+        """Return anchors."""
         return self.anchors_
 
     def process_clfs(self):
-        """Process the different classifiers"""
+        """Process the different classifiers."""
         self.anchors_ = torch.cat([clf.anchors() for clf in self.krr_regressors])
         self.sample_weights_ = torch.cat([clf.sample_weights_ for clf in self.krr_regressors])
         self.sample_weights_ = 1 / len(self.krr_regressors) * self.sample_weights_
         self.kernel_ = self.krr_regressors[0].kernel_
 
     def add_clf(self, clf):
-        """Add a classifier"""
+        """Add a classifier."""
         self.krr_regressors.append(clf)
diff --git a/sobolev_alignment/scvi_model_search.py b/sobolev_alignment/scvi_model_search.py
@@ -1,5 +1,5 @@
 """
-<h2>scVI model search</h2>
+scVI model search.
 
 @author: Soufiane Mourragui
 
@@ -132,7 +132,7 @@ def make_objective_function(train_data_an, test_data_an, batch_key=None, model=s
 
     def _objective_function(params):
         """
-        Objective function
+        Objective function.
 
         Returns a method which performs, for one set of hyperparameters, the training,
         the evaluation on test data and summing up all the results in a dictionary usable
@@ -192,7 +192,7 @@ def _objective_function(params):
 
 
 def split_dataset(data_an, test_size=0.1):
-    """Split between training and testing"""
+    """Split between training and testing."""
     train_data_df, test_data_df = train_test_split(data_an.to_df(), test_size=test_size)
     train_data_an = data_an[train_data_df.index,]
     test_data_an = data_an[test_data_df.index,]
diff --git a/sobolev_alignment/sobolev_alignment.py b/sobolev_alignment/sobolev_alignment.py
@@ -1,11 +1,12 @@
 """
-<h2>Sobolev Alignment</h2>
+Sobolev Alignment.
 
 @author: Soufiane Mourragui
 
 References
 ----------
-Mourragui et al, Identifying commonalities between cell lines and tumors at the single cell level using Sobolev Alignment of deep generative models, Biorxiv, 2022.
+Mourragui et al, Identifying commonalities between cell lines and tumors at the single cell level using
+Sobolev Alignment of deep generative models, Biorxiv, 2022.
 Lopez et al, Deep generative modeling for single-cell transcriptomics, Nature Methods, 2018.
 Meanti et al, Kernel methods through the roof: handling billions of points efficiently, NeurIPS, 2020.
 """
@@ -45,7 +46,7 @@
 
 class SobolevAlignment:
     """
-    Sobolev Alignment implementation
+    Sobolev Alignment implementation.
 
     Main class for Sobolev Alignment, which wraps all the different operations presented in Sobolev Alignment procedure:
     - Model selection (scVI and KRR)
@@ -66,8 +67,8 @@ def __init__(
         target_scvi_params: dict = None,
         source_krr_params: dict = None,
         target_krr_params: dict = None,
-        n_artificial_samples: int = int(10e5),
-        n_samples_per_sample_batch: int = 10**6,
+        n_artificial_samples: int = 10**5,
+        n_samples_per_sample_batch: int = 10**5,
         frac_save_artificial: float = 0.1,
         save_mmap: str = None,
         log_input: bool = True,
@@ -237,7 +238,7 @@ def fit(
         sample_artificial: bool = True,
     ):
         """
-        Runs the complete Sobolev Alignment workflow between a source (e.g. cell line) and a target (e.g. tumor) dataset.
+        Run complete Sobolev Alignment workflow between a source (e.g. cell line) and a target (e.g. tumor) dataset.
 
         Source and target data should be passed as AnnData and potential batch names
         (source_batch_name, target_batch_name) should be part of the "obs" element
@@ -593,7 +594,7 @@ def _compute_batch_library_size(self):
         }
 
     def _check_same_kernel(self):
-        """Same kernel has to be used for source and kernel KRR."""
+        """Verify that the same kernel is used for source and target KRR."""
         if "kernel" in self.krr_params["source"] or "kernel" in self.krr_params["target"]:
             assert self.krr_params["source"]["kernel"] == self.krr_params["target"]["kernel"]
         if "kernel_params" in self.krr_params["source"] or "kernel_params" in self.krr_params["target"]:
@@ -797,9 +798,12 @@ def _compute_cross_cosine_sim(self):
 
     def _compute_principal_vectors(self, all_PVs=False):
         """
-        All_PVs indicate whether the data source with the most PVs should be reduced to the number of PVs of the smallest data-source.
+        Compute principal vectors by SVD of cosine similarity.
 
-        Example: source has 10 factors, target 13. all_PVs=True would yield 13 target PVs, all_PVs=False would yield 10.
+        all_PVs indicates whether the data source with the most PVs should be reduced to the
+        number of PVs of the smallest data-source.
+        Example: source has 10 factors, target 13. all_PVs=True would yield 13 target PVs,
+        all_PVs=False would yield 10.
         """
         cosine_svd = np.linalg.svd(self.cosine_sim, full_matrices=all_PVs)
         self.principal_angles = cosine_svd[1]
@@ -815,8 +819,8 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool
         """
         Project data on interpolated consensus features.
 
-        Project the data on interpolated features, i.e., a linear combination of source and target SPVs which best balances the effect of source and target
-        data.
+        Project the data on interpolated features, i.e., a linear combination of source and target SPVs which
+        best balances the effect of source and target data.
 
         Parameters
         ----------
@@ -825,10 +829,11 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool
         n_similar_pv: int
             Number of top SPVs to project the data on.
         fit: bool, default to True
-            Whether the interpolated times must be computed. If False, will use previously computed times, but will return an error if not previously fitted.
+            Whether the interpolated times must be computed. If False, will use previously computed times,
+            but will return an error if not previously fitted.
         return_anndata: bool, default to False
-            Whether the projected consensus features must be formatted as an AnnData with overlapping indices in obs. This allows downstream analysis.
-            By default, return a DataFrame.
+            Whether the projected consensus features must be formatted as an AnnData with overlapping
+            indices in obs. This allows downstream analysis. By default, return a DataFrame.
 
         Returns
         -------
@@ -1066,7 +1071,7 @@ def krr_model_selection(
         return self.krr_params
 
     def save(self, folder: str = ".", with_krr: bool = True, with_model: bool = True):
-        """Save Sobolev Alignment model"""
+        """Save Sobolev Alignment model."""
         if not os.path.exists(folder) and not os.path.isdir(folder):
             os.mkdir(folder)
 
@@ -1217,7 +1222,7 @@ def plot_training_metrics(self, folder: str = "."):
                 plt.show()
 
     def plot_cosine_similarity(self, folder: str = ".", absolute_cos: bool = False):
-        """Plot cosine similarity"""
+        """Plot cosine similarity."""
         if absolute_cos:
             sns.heatmap(np.abs(self.cosine_sim), cmap="seismic_r", center=0)
         else:
@@ -1330,17 +1335,18 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None):
         """
         Launch feature analysis for a trained scVI model.
 
-        Computes the gene contributions (feature weights) associated with the KRRs which approximate the latent factors and the SPVs.
-        Technically, given the kernel machine which approximates a latent factor (KRR), this method computes the weights associated
-        with the orthonormal basis in the Gaussian-kernel associated Sobolev space.
+        Computes the gene contributions (feature weights) associated with the KRRs which approximate the
+        latent factors and the SPVs. Technically, given the kernel machine which approximates a latent factor
+        (KRR), this method computes the weights associated with the orthonormal basis in the Gaussian-kernel
+        associated Sobolev space.
 
         Parameters
         ----------
         max_order: int, default to 1
             Order of the features to compute. 1 corresponds to linear features (genes), two to interaction terms.
         gene_names: list of str, default to None
-            Names of the genes passed as input to Sobolev Alignment. <b>WARNING</b> Must be in the same order as the input to
-            SobolevAlignment.fit
+            Names of the genes passed as input to Sobolev Alignment. <b>WARNING</b> Must be in the same order as
+            the input to SobolevAlignment.fit
         """
         # Make kernel parameter
         if (
@@ -1405,7 +1411,7 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None):
         }
 
     def sample_random_vector_(self, data_source, K):
-        """Sample a vector randomly for either source or target"""
+        """Sample a vector randomly for either source or target."""
         n_samples = self.approximate_krr_regressions_[data_source].anchors().shape[0]
         n_factors = self.approximate_krr_regressions_[data_source].anchors().shape[1]
 
@@ -1435,7 +1441,7 @@ def sample_random_vector_(self, data_source, K):
         return coefficients
 
     def compute_random_direction_(self, K_X, K_Y, K_XY):
-        """Sample randomly two vectors and compute cosine similarity"""
+        """Sample randomly two vectors and compute cosine similarity."""
         # Random samples
         perm_source_sample_coef = self.sample_random_vector_("source", K_X)
         perm_target_sample_coef = self.sample_random_vector_("target", K_Y)