Skip to content

Commit d12b5cb

Browse files
committed
code
1 parent 6caaa3f commit d12b5cb

26 files changed

+3261
-107
lines changed

pytest.ini

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Pytest configuration for the package test-suite.
[pytest]
# --strict makes pytest treat unknown markers as errors instead of warnings,
# so every marker must be declared under [markers] below.
addopts = --strict
markers =
    slow: A test which takes more than a few seconds to run.

setup.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import setuptools

# The PyPI long description is taken verbatim from the README.
# Explicit encoding avoids relying on the platform default locale.
with open("README.md", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="sobolev_alignment",
    version="0.2.1",
    author="Soufiane Mourragui <soufiane.mourragui@gmail.com>, ",
    author_email="soufiane.mourragui@gmail.com",
    description="SOBOLEV ALIGNMENT",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # url="https://github.com/NKI-CCB/sobolev_alignment",
    packages=setuptools.find_packages(),
    install_requires=[
        "numpy",
        "scipy",
        "pandas",
        "matplotlib",
        "scikit-learn",
        "torch",
        "scvi-tools",
        "scanpy",
        "hyperopt",
    ],
    python_requires=">=3.8",
    # setuptools expects ``classifiers`` to be a list; passing a tuple
    # triggers a deprecation warning and fails on recent setuptools.
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Development Status :: 1 - Planning",
    ],
)

sobolev_alignment/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from sobolev_alignment.krr_approx import KRRApprox
2+
from sobolev_alignment.scvi_model_search import model_selection
3+
from sobolev_alignment.sobolev_alignment import SobolevAlignment
+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Default parameter values for the scvi-tools VAE used by Sobolev Alignment.
# NOTE(review): this dict mixes architecture settings (n_hidden, n_latent, ...)
# with training settings (train_size, batch_size, ...) — presumably they are
# dispatched to the appropriate scvi-tools calls downstream; verify against
# the consumer of these constants.
SCVI_DEFAULT_MODEL_PARAMS = {
    "use_gpu": None,
    "train_size": 0.9,
    "validation_size": None,
    "batch_size": 128,
    "latent_distribution": "normal",
    "n_hidden": 128,
    "n_latent": 10,
    "n_layers": 1,
    "dropout_rate": 0.1,
    "dispersion": "gene",
    "gene_likelihood": "zinb",
}
# Recognised model-parameter names, in the declaration order of the dict above.
SCVI_MODEL_PARAMS = list(SCVI_DEFAULT_MODEL_PARAMS.keys())

# Keyword names routed to the scvi-tools training plan
# (optimizer and learning-rate schedule settings — presumably forwarded
# as ``plan_kwargs``; confirm against the scvi-tools version in use).
SCVI_PLAN_PARAMS = [
    "lr",
    "weight_decay",
    "eps",
    "optimizer",
    "n_steps_kl_warmup",
    "n_epochs_kl_warmup",
    "reduce_lr_on_plateau",
    "lr_factor",
    "lr_patience",
    "lr_threshold",
    "lr_scheduler_metric",
    "lr_min",
]

# Keyword names routed to ``model.train`` / the underlying PyTorch-Lightning
# trainer (hardware, logging and early-stopping settings).
SCVI_TRAIN_PARAMS = [
    "gpus",
    "benchmark",
    "flush_logs_every_n_steps",
    "check_val_every_n_epoch",
    "max_epochs",
    "default_root_dir",
    "checkpoint_callback",
    "num_sanity_val_steps",
    "weights_summary",
    "early_stopping",
    "early_stopping_monitor",
    "early_stopping_min_delta",
    "early_stopping_patience",
    "early_stopping_mode",
    "progress_bar_refresh_rate",
    "simple_progress_bar",
    "logger",
]

sobolev_alignment/data_normalisation.py

Whitespace-only changes.

sobolev_alignment/example.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
<h2>Toy example for Sobolev Alignment</h2>
3+
4+
@author: Soufiane Mourragui
5+
"""
6+
7+
import numpy as np
8+
import pandas as pd
9+
from anndata import AnnData
10+
11+
from sobolev_alignment import SobolevAlignment
12+
13+
# Generate data
14+
n_source = 100
15+
n_target = 200
16+
n_features = 500
17+
18+
X_source = np.random.normal(size=(n_source, n_features))
19+
X_source = np.exp(X_source + np.random.randint(3, 10, n_features)).astype(int)
20+
X_source = AnnData(X_source, obs=pd.DataFrame(np.random.choice(["A", "B"], n_source).astype(str), columns=["pool"]))
21+
22+
X_target = np.random.normal(size=(n_target, n_features))
23+
X_target = np.exp(X_target + np.random.randint(3, 10, n_features)).astype(int)
24+
X_target = AnnData(X_target, obs=pd.DataFrame(np.random.choice(["A", "B"], n_target).astype(str), columns=["pool"]))
25+
26+
# Create a Sobolev Alignemnt instance
27+
sobolev_alignment_clf = SobolevAlignment(
28+
source_scvi_params={"train": {"early_stopping": True}, "model": {}, "plan": {}},
29+
target_scvi_params={"train": {"early_stopping": True}, "model": {}, "plan": {}},
30+
n_jobs=2,
31+
)
32+
33+
# Compute consensus features
34+
sobolev_alignment_clf.fit(X_source, X_target, source_batch_name="pool", target_batch_name="pool")

sobolev_alignment/feature_analysis.py

+210
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
"""FEATURE_ANALYSIS"""
2+
3+
import gc
4+
import logging
5+
from functools import reduce
6+
from itertools import combinations_with_replacement
7+
8+
import numpy as np
9+
import pandas as pd
10+
import scipy
11+
from joblib import Parallel, delayed
12+
13+
14+
def higher_order_contribution(
    d: int,
    data: np.array,
    sample_offset: np.array,
    gene_names: list,
    gamma: float,
    n_jobs: int = 1,
    return_matrix: bool = False,
):
    r"""
    Compute the features corresponding to the Taylor expansion of the kernel.

    Compute the features corresponding to the Taylor expansion of the kernel,
    i.e. $x_j \exp(-\gamma xx^T)$ for linear features. Returns a sparse pandas
    DataFrame containing all the features (columns) by samples (rows).
    We here critically rely on the sparsity of the data matrix to speed up
    computations. The current implementation is relevant in two cases:
    - when dimensionality is small;
    - when data is sparse.
    High-dimensional and dense data matrices would lead to a significant
    overhead without computational gains, and could benefit from another
    implementation strategy.

    Parameters
    ----------
    d: int
        Order of the features to compute, e.g. 1 for linear, 2 for interaction terms.

    data: np.array
        Data to compute features on, samples in the rows and genes (features) in the columns.

    sample_offset: np.array
        Offset of each sample from data.

    gene_names: list
        Names of each column in data; corresponds to feature naming.

    gamma: float
        Value of the gamma parameter for Matérn kernel.

    n_jobs: int, default to 1
        Number of concurrent threads to use. -1 will use all CPU cores possible.
        WARNING: for d >= 2 and a large number of genes, the routine can be
        memory-intensive and a high n_jobs could lead to crash.

    return_matrix: bool, default to False
        If True, then returns simply the feature-matrix without feature-naming.
        In cases when feature names are not relevant (e.g. computing the
        proportion of non-linearities), return_matrix=True can help speed-up
        the process.

    Returns
    -------
    pd.DataFrame
        Sparse dataframe with samples in the rows and named features in the columns.
        For instance, when d=1, returns each column of data scaled by RKHS
        normalisation factor and multiplied by offset value.
    """
    # Exploits sparsity of scRNA-seq data (even more handy when d >= 2).
    # Note to future user: this can be an issue if data is not sparse.
    sparse_data = scipy.sparse.csc_matrix(data)

    # One feature column per combination (with replacement) of d gene indices,
    # computed in parallel. pre_dispatch caps queued tasks to limit memory use.
    logging.info("\t START FEATURES")
    combinations_features = Parallel(n_jobs=n_jobs, verbose=1, max_nbytes=1e6, pre_dispatch=int(1.5 * n_jobs))(
        delayed(combinatorial_product)(sparse_data, x, gamma)
        for x in combinations_with_replacement(np.arange(sparse_data.shape[1]), r=d)
    )
    gc.collect()

    # Stack the feature columns, then scale each row by its sample offset
    # (left-multiplication by a diagonal matrix).
    logging.info("\t START CONCATENATION")
    logging.info("\t\t START STACKING")
    combinations_features = scipy.sparse.hstack(combinations_features, format="csc")
    logging.info("\t\t START PRODUCT")
    combinations_features = scipy.sparse.diags(sample_offset).dot(combinations_features)
    gc.collect()
    if return_matrix:
        return combinations_features

    # Name each feature; the generator iterates combinations of gene_names in
    # the same order as the index combinations used for the features above.
    logging.info("\t\t FIND NAMES")
    combinations_names = Parallel(
        n_jobs=min(5, n_jobs), verbose=1, max_nbytes=1e4, pre_dispatch=int(1.5 * min(5, n_jobs))
    )(delayed(_interaction_name)(x) for x in combinations_with_replacement(gene_names, r=d))

    return pd.DataFrame.sparse.from_spmatrix(data=combinations_features, columns=combinations_names)
98+
99+
100+
def _combination_to_idx(idx, p):
101+
"""
102+
Transforms a combination (tuple of feature idx) into an indicative function.
103+
104+
Parameters
105+
----------
106+
idx: tuple
107+
Combination of features in the form of a tuple. <br/>
108+
E.g. for 6 genes, (5,1) corresponds to the product of 1 and 5 and returns
109+
(0,1,0,0,0,1), while (1,2,3,2) will yield (0,1,2,1,0,0). <br/>
110+
<b>WARNING:</b> start at 0.
111+
112+
p: int
113+
Number of genes (features) in the dataset.
114+
115+
Returns
116+
-------
117+
np.array
118+
Indicative function of the combination
119+
"""
120+
return np.array([np.sum(np.array(idx) == i) for i in range(p)])
121+
122+
123+
def basis(x, k, gamma):
    """
    Compute the order-k basis function for a single gene, except offset term.

    Parameters
    ----------
    x: scipy.sparse matrix
        Column vector (each row corresponds to a sample); assumed sparse so
        that element-wise ``multiply`` is available — TODO confirm callers
        never pass a dense array here.

    k: int
        Order to compute.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    Value of the higher order feature, i.e. x^k * (2*gamma)^(k/2) / sqrt(k!).
    """
    # BUG FIX: ``scipy.math`` was a deprecated re-export of the stdlib math
    # module and was removed in SciPy 1.9; use math.factorial directly.
    from math import factorial

    if k == 0:
        # Order 0 is the constant feature (one per sample).
        return np.ones(x.shape[0])

    # Element-wise power x^k via repeated sparse multiplication, which
    # preserves sparsity (a dense power would densify the column).
    product = x
    for _ in range(1, k):
        product = x.multiply(product)
    # RKHS normalisation factor of the k-th Taylor-expansion term.
    coef = np.power(2 * gamma, k / 2) / np.sqrt(factorial(k))

    return coef * product
152+
153+
154+
def combinatorial_product(x, idx, gamma):
    """
    Evaluate the product of single-gene basis functions for one combination.

    Parameters
    ----------
    x: np.array
        Data matrix with samples in the rows and genes in the columns.

    idx: tuple
        Combination, i.e. tuple of the feature indices to take into account.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    scipy.sparse.csc_matrix
        Values of the higher order feature (the scalar 1 for the empty
        combination).
    """
    # Turn the combination into per-gene multiplicities, then evaluate the
    # basis function of every gene that actually appears in it.
    multiplicities = _combination_to_idx(idx, x.shape[1])
    factors = [
        basis(x[:, gene], order, gamma)
        for gene, order in enumerate(multiplicities)
        if order > 0
    ]

    # Empty combination: the feature is the constant 1.
    if not factors:
        return 1

    # Element-wise product of all single-gene factors.
    return reduce(scipy.sparse.csc_matrix.multiply, factors)
180+
181+
182+
def _interaction_name(gene_combi):
183+
184+
combin_name = [f"{g}^{r}" for g, r in zip(*np.unique(gene_combi, return_counts=True))]
185+
return "*".join(combin_name) if len(combin_name) > 0 else "1"
186+
187+
188+
def _higher_order_interaction_wrapper(data, x, gamma, gene_names):
    """
    Compute one higher-order feature together with its name.

    Parameters
    ----------
    data:
        Data matrix (samples in rows, genes in columns), passed through to
        combinatorial_product.
    x: tuple
        One combination of feature indices.
    gamma: float
        Parameter of Matérn kernel.
    gene_names: list
        Column names of data, indexed by the entries of x.

    Returns
    -------
    list
        [feature values, feature name].
    """
    # BUG FIX: _interaction_name takes a single iterable of gene names, but
    # the previous code called it with two arguments (gene_names and a count
    # vector), raising a TypeError whenever this wrapper ran. Map the index
    # combination to the corresponding gene names instead, mirroring the
    # naming scheme used in higher_order_contribution.
    return [
        combinatorial_product(data, x, gamma),
        _interaction_name(tuple(gene_names[i] for i in x)),
    ]
190+
191+
192+
def _compute_offset(data, gamma):
193+
r"""
194+
Compute the sample-level offset values, i.e. $\exp -\gamma xx^T$.
195+
196+
Parameters
197+
----------
198+
data: np.array
199+
Data to compute features on, samples in the rows and genes (features) in the columns.
200+
201+
gamma: float
202+
Value of the gamma parameter for Matérn kernel.
203+
204+
Returns
205+
-------
206+
np.array
207+
One-dimensional vector with offset values of all samples.
208+
"""
209+
sample_offset = np.linalg.norm(data, axis=1)
210+
return np.exp(-gamma * np.power(sample_offset, 2))

0 commit comments

Comments
 (0)