|
| 1 | +"""FEATURE_ANALYSIS""" |
| 2 | + |
import gc
import logging
import math
from functools import reduce
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
import scipy
from joblib import Parallel, delayed
| 12 | + |
| 13 | + |
def higher_order_contribution(
    d: int,
    data: np.array,
    sample_offset: np.array,
    gene_names: list,
    gamma: float,
    n_jobs: int = 1,
    return_matrix: bool = False,
):
    r"""
    Compute the features corresponding to the Taylor expansion of the kernel.

    Compute the features corresponding to the Taylor expansion of the kernel, i.e. $x_j exp^{-\gamma xx^T}$ for
    linear features. Returns a sparse pandas DataFrame containing all the features (columns) by samples (rows).
    We here critically rely on the sparsity of the data-matrix to speed up computations. The current
    implementation is relevant in two cases:
    <ul>
    <li> When dimensionality is small
    <li> When data is sparse.
    </ul>
    High-dimensional and dense data matrices would lead to a significant over-head without computational gains,
    and could benefit from another implementation strategy.

    Parameters
    ----------
    d: int
        Order of the features to compute, e.g. 1 for linear, 2 for interaction terms.

    data: np.array
        Data to compute features on, samples in the rows and genes (features) in the columns.

    sample_offset: np.array
        Offset of each sample from data.

    gene_names: list
        Names of each column in data ; corresponds to features naming.

    gamma: float
        Value of the gamma parameter for Matérn kernel.

    n_jobs: int, default to 1
        Number of concurrent threads to use. -1 will use all CPU cores possible.
        WARNING: for d >= 2 and a large number of genes, the routine can be memory-intensive and a high n_jobs
        could lead to crash.

    return_matrix: bool, default to False
        If True, then returns simply the feature-matrix without feature-naming. In cases when feature names are
        not relevant (e.g. computing the proportion of non-linearities), return_matrix=True can help speed-up
        the process.

    Returns
    -------
    pd.DataFrame
        Sparse dataframe with samples in the rows and named features in the columns.
        For instance, when d=1, returns each column of data scaled by RKHS normalisation factor and multiplied
        by offset value.
    """
    # Exploits sparsity of scRNA-seq data (even more handy when d >= 2)
    # Note to future user: this can be an issue if data is not sparse
    sparse_data = scipy.sparse.csc_matrix(data)

    # Compute one feature column per size-d combination (with replacement) of genes.
    # pre_dispatch is passed as a string so joblib evaluates it against the effective
    # worker count: the previous int(1.5 * n_jobs) was negative (hence invalid) when
    # n_jobs=-1 was used to request all cores.
    logging.info("\t START FEATURES")
    combinations_features = Parallel(n_jobs=n_jobs, verbose=1, max_nbytes=1e6, pre_dispatch="1.5*n_jobs")(
        delayed(combinatorial_product)(sparse_data, x, gamma)
        for x in combinations_with_replacement(np.arange(sparse_data.shape[1]), r=d)
    )
    gc.collect()

    # Combine features and multiply columns by offset.
    logging.info("\t START CONCATENATION")
    logging.info("\t\t START STACKING")
    combinations_features = scipy.sparse.hstack(combinations_features, format="csc")
    logging.info("\t\t START PRODUCT")
    # Row-wise scaling by each sample's offset, kept sparse via a diagonal product.
    combinations_features = scipy.sparse.diags(sample_offset).dot(combinations_features)
    gc.collect()
    if return_matrix:
        return combinations_features

    # Name each feature. Naming is lighter work, so cap the worker count at 5; the cap
    # is applied only to positive n_jobs so that n_jobs=-1 cannot leak a negative value
    # through min(5, n_jobs) and silently spawn all cores.
    logging.info("\t\t FIND NAMES")
    naming_jobs = n_jobs if 0 < n_jobs <= 5 else 5
    combinations_names = Parallel(
        n_jobs=naming_jobs, verbose=1, max_nbytes=1e4, pre_dispatch="1.5*n_jobs"
    )(delayed(_interaction_name)(x) for x in combinations_with_replacement(gene_names, r=d))

    return pd.DataFrame.sparse.from_spmatrix(data=combinations_features, columns=combinations_names)
| 98 | + |
| 99 | + |
| 100 | +def _combination_to_idx(idx, p): |
| 101 | + """ |
| 102 | + Transforms a combination (tuple of feature idx) into an indicative function. |
| 103 | +
|
| 104 | + Parameters |
| 105 | + ---------- |
| 106 | + idx: tuple |
| 107 | + Combination of features in the form of a tuple. <br/> |
| 108 | + E.g. for 6 genes, (5,1) corresponds to the product of 1 and 5 and returns |
| 109 | + (0,1,0,0,0,1), while (1,2,3,2) will yield (0,1,2,1,0,0). <br/> |
| 110 | + <b>WARNING:</b> start at 0. |
| 111 | +
|
| 112 | + p: int |
| 113 | + Number of genes (features) in the dataset. |
| 114 | +
|
| 115 | + Returns |
| 116 | + ------- |
| 117 | + np.array |
| 118 | + Indicative function of the combination |
| 119 | + """ |
| 120 | + return np.array([np.sum(np.array(idx) == i) for i in range(p)]) |
| 121 | + |
| 122 | + |
def basis(x, k, gamma):
    """
    Compute the basis function for a single gene, except offset term.

    Parameters
    ----------
    x: np.array
        Column vector (each row corresponds to a sample); expected to support the
        sparse `.multiply` element-wise product for k >= 2 (e.g. a csc_matrix column).

    k: int
        Order to compute.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    np.array
        Value of the higher order feature.
    """
    # Order 0 reduces to the constant term (dense vector of ones).
    if k == 0:
        return np.ones(x.shape[0])

    # x^k via repeated element-wise multiplication, staying sparse throughout.
    product = x
    for _ in range(1, k):
        product = x.multiply(product)

    # RKHS normalisation factor of the k-th Taylor feature: (2*gamma)^(k/2) / sqrt(k!).
    # math.factorial replaces scipy.math.factorial: scipy.math was a deprecated alias
    # of the stdlib math module and has been removed from recent SciPy releases.
    coef = np.power(2 * gamma, k / 2) / np.sqrt(math.factorial(k))

    return coef * product
| 152 | + |
| 153 | + |
def combinatorial_product(x, idx, gamma):
    """
    Compute the basis function for a single gene, except offset term.

    Parameters
    ----------
    x: np.array
        Data matrix with samples in the rows and genes in the columns

    idx: tuple
        Combinations, i.e. tuple of features to take into account.

    gamma: float
        Parameter of Matérn kernel.

    Returns
    -------
    scipy.sparse.csc_matrix
        Values of the higher order feature.
    """
    # Turn the combination into per-gene exponents, then evaluate the basis
    # function for every gene that actually appears in the combination.
    exponents = _combination_to_idx(idx, x.shape[1])
    factors = [basis(x[:, gene], order, gamma) for gene, order in enumerate(exponents) if order > 0]

    # Empty combination: the feature degenerates to the constant 1.
    if not factors:
        return 1

    # Element-wise product of all per-gene factors, kept sparse throughout.
    result = factors[0]
    for factor in factors[1:]:
        result = result.multiply(factor)
    return result
| 180 | + |
| 181 | + |
| 182 | +def _interaction_name(gene_combi): |
| 183 | + |
| 184 | + combin_name = [f"{g}^{r}" for g, r in zip(*np.unique(gene_combi, return_counts=True))] |
| 185 | + return "*".join(combin_name) if len(combin_name) > 0 else "1" |
| 186 | + |
| 187 | + |
def _higher_order_interaction_wrapper(data, x, gamma, gene_names):
    """
    Compute one higher-order feature together with its human-readable name.

    Parameters
    ----------
    data: np.array
        Data matrix with samples in the rows and genes in the columns.

    x: tuple
        Combination of gene indexes (see combinatorial_product).

    gamma: float
        Parameter of Matérn kernel.

    gene_names: list
        Names of each column in data.

    Returns
    -------
    list
        Two elements: the feature values and the feature name.
    """
    # BUG FIX: _interaction_name takes a single tuple of gene names; the previous
    # call passed two arguments (gene_names and an indicator vector), which raised
    # a TypeError. Map the index combination to the corresponding gene names instead.
    feature = combinatorial_product(data, x, gamma)
    name = _interaction_name(tuple(gene_names[i] for i in x))
    return [feature, name]
| 190 | + |
| 191 | + |
| 192 | +def _compute_offset(data, gamma): |
| 193 | + r""" |
| 194 | + Compute the sample-level offset values, i.e. $\exp -\gamma xx^T$. |
| 195 | +
|
| 196 | + Parameters |
| 197 | + ---------- |
| 198 | + data: np.array |
| 199 | + Data to compute features on, samples in the rows and genes (features) in the columns. |
| 200 | +
|
| 201 | + gamma: float |
| 202 | + Value of the gamma parameter for Matérn kernel. |
| 203 | +
|
| 204 | + Returns |
| 205 | + ------- |
| 206 | + np.array |
| 207 | + One-dimensional vector with offset values of all samples. |
| 208 | + """ |
| 209 | + sample_offset = np.linalg.norm(data, axis=1) |
| 210 | + return np.exp(-gamma * np.power(sample_offset, 2)) |
0 commit comments