|
| 1 | +"""FEATURE_ANALYSIS""" |
| 2 | + |
import gc
import logging
import math
from functools import reduce
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
import scipy
from joblib import Parallel, delayed
| 12 | + |
| 13 | + |
def higher_order_contribution(
    d: int,
    data: np.array,
    sample_offset: np.array,
    gene_names: list,
    gamma: float,
    n_jobs: int = 1,
    return_matrix: bool = False,
):
    r"""
    Compute the features corresponding to the Taylor expansion of the kernel.

    Compute the features corresponding to the Taylor expansion of the kernel, i.e. $x_j exp^{-\gamma xx^T}$ for
    linear features. Returns a sparse pandas DataFrame containing all the features (columns) by samples (rows).
    We here critically rely on the sparsity of the data-matrix to speed up computations. The current
    implementation is relevant in two cases:
    <ul>
    <li> When dimensionality is small
    <li> When data is sparse.
    </ul>
    High-dimensional and dense data matrices would lead to a significant over-head without computational gains,
    and could benefit from another implementation strategy.

    Parameters
    ----------
    d: int
        Order of the features to compute, e.g. 1 for linear, 2 for interaction terms.

    data: np.array
        Data to compute features on, samples in the rows and genes (features) in the columns.

    sample_offset: np.array
        Offset of each sample from data.

    gene_names: list
        Names of each column in data ; corresponds to features naming.

    gamma: float
        Value of the gamma parameter for Matérn kernel.

    n_jobs: int, default to 1
        Number of concurrent threads to use. -1 will use all CPU cores possible.
        WARNING: for d >= 2 and a large number of genes, the routine can be memory-intensive and a high n_jobs
        could lead to crash.

    return_matrix: bool, default to False
        If True, then returns simply the feature-matrix without feature-naming. In cases when feature names are
        not relevant (e.g. computing the proportion of non-linearities), return_matrix=True can help speed-up
        the process.

    Returns
    -------
    pd.DataFrame
        Sparse dataframe with samples in the rows and named features in the columns.
        For instance, when d=1, returns each column of data scaled by RKHS normalisation factor and multiplied
        by offset value.
    """
    # Exploits sparsity of scRNA-seq data (even more handy when d >= 2)
    # Note to future user: this can be an issue if data is not sparse
    sparse_data = scipy.sparse.csc_matrix(data)

    # Compute one feature column per size-d combination (with replacement) of genes.
    # pre_dispatch is passed as a string so joblib evaluates it against the effective
    # worker count: the previous int(1.5 * n_jobs) was negative (hence invalid) when
    # n_jobs=-1 was used to request all cores.
    logging.info("\t START FEATURES")
    combinations_features = Parallel(n_jobs=n_jobs, verbose=1, max_nbytes=1e6, pre_dispatch="1.5*n_jobs")(
        delayed(combinatorial_product)(sparse_data, x, gamma)
        for x in combinations_with_replacement(np.arange(sparse_data.shape[1]), r=d)
    )
    gc.collect()

    # Combine features and multiply columns by offset.
    logging.info("\t START CONCATENATION")
    logging.info("\t\t START STACKING")
    combinations_features = scipy.sparse.hstack(combinations_features, format="csc")
    logging.info("\t\t START PRODUCT")
    # Row-wise scaling by each sample's offset, kept sparse via a diagonal product.
    combinations_features = scipy.sparse.diags(sample_offset).dot(combinations_features)
    gc.collect()
    if return_matrix:
        return combinations_features

    # Name each feature. Naming is lighter work, so cap the worker count at 5; the cap
    # is applied only to positive n_jobs so that n_jobs=-1 cannot leak a negative value
    # through min(5, n_jobs) and silently spawn all cores.
    logging.info("\t\t FIND NAMES")
    naming_jobs = n_jobs if 0 < n_jobs <= 5 else 5
    combinations_names = Parallel(
        n_jobs=naming_jobs, verbose=1, max_nbytes=1e4, pre_dispatch="1.5*n_jobs"
    )(delayed(_interaction_name)(x) for x in combinations_with_replacement(gene_names, r=d))

    return pd.DataFrame.sparse.from_spmatrix(data=combinations_features, columns=combinations_names)
| 98 | + |
| 99 | + |
| 100 | +def _combination_to_idx(idx, p): |
| 101 | + """ |
| 102 | + Transforms a combination (tuple of feature idx) into an indicative function. |
| 103 | +
|
| 104 | + Parameters |
| 105 | + ---------- |
| 106 | + idx: tuple |
| 107 | + Combination of features in the form of a tuple. <br/> |
| 108 | + E.g. for 6 genes, (5,1) corresponds to the product of 1 and 5 and returns |
| 109 | + (0,1,0,0,0,1), while (1,2,3,2) will yield (0,1,2,1,0,0). <br/> |
| 110 | + <b>WARNING:</b> start at 0. |
| 111 | +
|
| 112 | + p: int |
| 113 | + Number of genes (features) in the dataset. |
| 114 | +
|
| 115 | + Returns |
| 116 | + ------- |
| 117 | + np.array |
| 118 | + Indicative function of the combination |
| 119 | + """ |
| 120 | + return np.array([np.sum(np.array(idx) == i) for i in range(p)]) |
| 121 | + |
| 122 | + |
def basis(x, k, gamma):
    """
    Compute the basis function for a single gene, except offset term.

    Parameters
    ----------
    x: np.array
        Column vector (each row corresponds to a sample); expected to support the
        sparse `.multiply` element-wise product for k >= 2 (e.g. a csc_matrix column).

    k: int
        Order to compute.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    np.array
        Value of the higher order feature.
    """
    # Order 0 reduces to the constant term (dense vector of ones).
    if k == 0:
        return np.ones(x.shape[0])

    # x^k via repeated element-wise multiplication, staying sparse throughout.
    product = x
    for _ in range(1, k):
        product = x.multiply(product)

    # RKHS normalisation factor of the k-th Taylor feature: (2*gamma)^(k/2) / sqrt(k!).
    # math.factorial replaces scipy.math.factorial: scipy.math was a deprecated alias
    # of the stdlib math module and has been removed from recent SciPy releases.
    coef = np.power(2 * gamma, k / 2) / np.sqrt(math.factorial(k))

    return coef * product
| 152 | + |
| 153 | + |
def combinatorial_product(x, idx, gamma):
    """
    Compute the basis function for a single gene, except offset term.

    Parameters
    ----------
    x: np.array
        Data matrix with samples in the rows and genes in the columns

    idx: tuple
        Combinations, i.e. tuple of features to take into account.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    scipy.sparse.csc_matrix
        Values of the higher order feature.
    """
    # Turn the combination into per-gene exponents, then evaluate the basis
    # function for every gene that actually appears in the combination.
    exponents = _combination_to_idx(idx, x.shape[1])
    factors = [basis(x[:, gene], order, gamma) for gene, order in enumerate(exponents) if order > 0]

    # Empty combination: the feature degenerates to the constant 1.
    if not factors:
        return 1

    # Element-wise product of all per-gene factors, kept sparse throughout.
    result = factors[0]
    for factor in factors[1:]:
        result = result.multiply(factor)
    return result
| 180 | + |
| 181 | + |
| 182 | +def _interaction_name(gene_combi): |
| 183 | + |
| 184 | + combin_name = [f"{g}^{r}" for g, r in zip(*np.unique(gene_combi, return_counts=True))] |
| 185 | + return "*".join(combin_name) if len(combin_name) > 0 else "1" |
| 186 | + |
| 187 | + |
def _higher_order_interaction_wrapper(data, x, gamma, gene_names):
    """
    Compute one higher-order feature together with its human-readable name.

    Parameters
    ----------
    data: np.array
        Data matrix with samples in the rows and genes in the columns.

    x: tuple
        Combination of gene indexes (see combinatorial_product).

    gamma: float
        Parameter of Matérn kernel.

    gene_names: list
        Names of each column in data.

    Returns
    -------
    list
        Two elements: the feature values and the feature name.
    """
    # BUG FIX: _interaction_name takes a single tuple of gene names; the previous
    # call passed two arguments (gene_names and an indicator vector), which raised
    # a TypeError. Map the index combination to the corresponding gene names instead.
    feature = combinatorial_product(data, x, gamma)
    name = _interaction_name(tuple(gene_names[i] for i in x))
    return [feature, name]
| 190 | + |
| 191 | + |
| 192 | +def _compute_offset(data, gamma): |
| 193 | + r""" |
| 194 | + Compute the sample-level offset values, i.e. $\exp -\gamma xx^T$. |
| 195 | +
|
| 196 | + Parameters |
| 197 | + ---------- |
| 198 | + data: np.array |
| 199 | + Data to compute features on, samples in the rows and genes (features) in the columns. |
| 200 | +
|
| 201 | + gamma: float |
| 202 | + Value of the gamma parameter for Matérn kernel. |
| 203 | +
|
| 204 | + Returns |
| 205 | + ------- |
| 206 | + np.array |
| 207 | + One-dimensional vector with offset values of all samples. |
| 208 | + """ |
| 209 | + sample_offset = np.linalg.norm(data, axis=1) |
| 210 | + return np.exp(-gamma * np.power(sample_offset, 2)) |
0 commit comments