Skip to content

Commit c17cd27

Browse files
authored
patch: scikit-learn 1.6 compatibility (#726)
* WIP: low hanging fix
* add sklearn-compat dependency
* preprocessing module
* decomposition module
* mixture and feature_selection modules
* meta module
* top level modules
* WIP: do not use validate_data
* check_X_y with changed check_array
* use validate_data
1 parent 13b20df commit c17cd27

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

48 files changed

+861
-244
lines changed

sklego/_sklearn_compat.py

+520
Large diffs are not rendered by default.

sklego/common.py

+7-8
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,9 @@
55
import numpy as np
66
import pandas as pd
77
from sklearn.base import BaseEstimator, TransformerMixin
8-
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
8+
from sklearn.utils.validation import check_is_fitted
9+
10+
from sklego._sklearn_compat import validate_data
911

1012

1113
class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator):
@@ -79,11 +81,11 @@ def fit(self, X, y=None):
7981
The fitted transformer.
8082
"""
8183
if y is None:
82-
check_array(X, estimator=self)
84+
validate_data(self, X=X, reset=True)
8385
else:
84-
check_X_y(X, y, estimator=self, multi_output=True)
86+
validate_data(self, X=X, y=y, multi_output=True, reset=True)
87+
8588
self.X_hash_ = self._hash(X)
86-
self.n_features_in_ = X.shape[1]
8789
return self
8890

8991
@staticmethod
@@ -145,10 +147,7 @@ def transform(self, X, y=None):
145147
If the input dimension does not match the training dimension.
146148
"""
147149
check_is_fitted(self, ["X_hash_", "n_features_in_"])
148-
check_array(X, estimator=self)
149-
150-
if X.shape[1] != self.n_features_in_:
151-
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
150+
validate_data(self, X=X, reset=False)
152151

153152
if self._hash(X) == self.X_hash_:
154153
return self.transform_train(X)

sklego/decomposition/pca_reconstruction.py

+7-5
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
11
import numpy as np
22
from sklearn.base import BaseEstimator, OutlierMixin
33
from sklearn.decomposition import PCA
4-
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
4+
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
5+
6+
from sklego._sklearn_compat import validate_data
57

68

79
class PCAOutlierDetection(OutlierMixin, BaseEstimator):
@@ -94,7 +96,7 @@ def fit(self, X, y=None):
9496
ValueError
9597
If `threshold` is `None`.
9698
"""
97-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
99+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
98100
if not self.threshold:
99101
raise ValueError("The `threshold` value cannot be `None`.")
100102

@@ -108,8 +110,6 @@ def fit(self, X, y=None):
108110
)
109111
self.pca_.fit(X, y)
110112
self.offset_ = -self.threshold
111-
112-
self.n_features_in_ = X.shape[1]
113113
return self
114114

115115
def difference(self, X):
@@ -126,6 +126,8 @@ def difference(self, X):
126126
The calculated difference.
127127
"""
128128
check_is_fitted(self, ["pca_", "offset_"])
129+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
130+
129131
reduced = self.pca_.transform(X)
130132
diff = np.sum(np.abs(self.pca_.inverse_transform(reduced) - X), axis=1)
131133
if self.variant == "relative":
@@ -157,8 +159,8 @@ def predict(self, X):
157159
array-like of shape (n_samples,)
158160
The predicted data. 1 for inliers, -1 for outliers.
159161
"""
160-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
161162
check_is_fitted(self, ["pca_", "offset_"])
163+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
162164
result = np.ones(X.shape[0])
163165
result[self.difference(X) > self.threshold] = -1
164166
return result.astype(int)

sklego/decomposition/umap_reconstruction.py

+14-5
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,9 @@
88

99
import numpy as np
1010
from sklearn.base import BaseEstimator, OutlierMixin
11-
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
11+
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
12+
13+
from sklego._sklearn_compat import validate_data
1214

1315

1416
class UMAPOutlierDetection(OutlierMixin, BaseEstimator):
@@ -100,9 +102,10 @@ def fit(self, X, y=None):
100102
- If `n_components` is less than 2.
101103
- If `threshold` is `None`.
102104
"""
103-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
104105
if y is not None:
105-
y = check_array(y, estimator=self, ensure_2d=False)
106+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
107+
else:
108+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
106109

107110
if not self.threshold:
108111
raise ValueError("The `threshold` value cannot be `None`.")
@@ -116,7 +119,6 @@ def fit(self, X, y=None):
116119
)
117120
self.umap_.fit(X, y)
118121
self.offset_ = -self.threshold
119-
self.n_features_in_ = X.shape[1]
120122
return self
121123

122124
def difference(self, X):
@@ -133,6 +135,8 @@ def difference(self, X):
133135
The calculated difference.
134136
"""
135137
check_is_fitted(self, ["umap_", "offset_"])
138+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
139+
136140
reduced = self.umap_.transform(X)
137141
diff = np.sum(np.abs(self.umap_.inverse_transform(reduced) - X), axis=1)
138142
if self.variant == "relative":
@@ -155,8 +159,8 @@ def predict(self, X):
155159
array-like of shape (n_samples,)
156160
The predicted data. 1 for inliers, -1 for outliers.
157161
"""
158-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
159162
check_is_fitted(self, ["umap_", "offset_"])
163+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
160164
result = np.ones(X.shape[0])
161165
result[self.difference(X) > self.threshold] = -1
162166
return result.astype(int)
@@ -172,3 +176,8 @@ def score_samples(self, X):
172176

173177
def _more_tags(self):
174178
return {"non_deterministic": True}
179+
180+
def __sklearn_tags__(self):
181+
tags = super().__sklearn_tags__()
182+
tags.non_deterministic = True
183+
return tags

sklego/dummy.py

+11-12
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,9 @@
22

33
import numpy as np
44
from sklearn.base import BaseEstimator, RegressorMixin
5-
from sklearn.utils import check_X_y
6-
from sklearn.utils.validation import (
7-
FLOAT_DTYPES,
8-
check_array,
9-
check_is_fitted,
10-
check_random_state,
11-
)
5+
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_random_state
6+
7+
from sklego._sklearn_compat import validate_data
128

139

1410
class RandomRegressor(RegressorMixin, BaseEstimator):
@@ -72,8 +68,7 @@ def fit(self, X: np.array, y: np.array) -> "RandomRegressor":
7268
"""
7369
if self.strategy not in self._ALLOWED_STRATEGIES:
7470
raise ValueError(f"strategy {self.strategy} is not in {self._ALLOWED_STRATEGIES}")
75-
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
76-
self.n_features_in_ = X.shape[1]
71+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
7772

7873
self.min_ = np.min(y)
7974
self.max_ = np.max(y)
@@ -99,9 +94,7 @@ def predict(self, X):
9994
rs = check_random_state(self.random_state)
10095
check_is_fitted(self, ["n_features_in_", "min_", "max_", "mu_", "sigma_"])
10196

102-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
103-
if X.shape[1] != self.n_features_in_:
104-
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}")
97+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
10598

10699
if self.strategy == "normal":
107100
return rs.normal(self.mu_, self.sigma_, X.shape[0])
@@ -127,3 +120,9 @@ def allowed_strategies(self):
127120

128121
def _more_tags(self):
129122
return {"poor_score": True, "non_deterministic": True}
123+
124+
def __sklearn_tags__(self):
125+
tags = super().__sklearn_tags__()
126+
tags.non_deterministic = True
127+
tags.regressor_tags.poor_score = True
128+
return tags

sklego/feature_selection/mrmr.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,9 @@
44
from sklearn.base import BaseEstimator
55
from sklearn.feature_selection import f_classif, f_regression
66
from sklearn.feature_selection._base import SelectorMixin
7-
from sklearn.utils.validation import check_is_fitted, check_X_y
7+
from sklearn.utils.validation import check_is_fitted
8+
9+
from sklego._sklearn_compat import validate_data
810

911

1012
def _redundancy_pearson(X, selected, left):
@@ -201,13 +203,12 @@ def fit(self, X, y):
201203
202204
k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1
203205
"""
204-
X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
206+
X, y = validate_data(self, X=X, y=y, dtype="numeric", y_numeric=True, reset=True)
205207
self._y_dtype = y.dtype
206208

207209
relevance = self._get_relevance
208210
redundancy = self._get_redundancy
209211

210-
self.n_features_in_ = X.shape[1]
211212
left_features = list(range(self.n_features_in_))
212213
selected_features = []
213214
selected_scores = []

sklego/linear_model.py

+17-13
Original file line number | Diff line number | Diff line change
@@ -21,11 +21,12 @@
2121
from sklearn.utils.validation import (
2222
FLOAT_DTYPES,
2323
_check_sample_weight,
24-
check_array,
2524
check_is_fitted,
2625
column_or_1d,
2726
)
2827

28+
from sklego._sklearn_compat import check_array, validate_data
29+
2930

3031
class LowessRegression(RegressorMixin, BaseEstimator):
3132
"""`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of
@@ -96,7 +97,7 @@ def fit(self, X, y):
9697
- If `span` is not between 0 and 1.
9798
- If `sigma` is negative.
9899
"""
99-
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
100+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
100101
if self.span is not None:
101102
if not 0 <= self.span <= 1:
102103
raise ValueError(f"Param `span` must be 0 <= span <= 1, got: {self.span}")
@@ -138,8 +139,8 @@ def predict(self, X):
138139
array-like of shape (n_samples,)
139140
The predicted values.
140141
"""
141-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
142142
check_is_fitted(self, ["X_", "y_"])
143+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
143144

144145
try:
145146
results = np.stack([np.average(self.y_, weights=self._calc_wts(x_i=x_i)) for x_i in X])
@@ -233,7 +234,7 @@ def fit(self, X, y):
233234
self : ProbWeightRegression
234235
The fitted estimator.
235236
"""
236-
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
237+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
237238

238239
# Construct the problem.
239240
betas = cp.Variable(X.shape[1])
@@ -263,8 +264,8 @@ def predict(self, X):
263264
array-like of shape (n_samples,)
264265
The predicted data.
265266
"""
266-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
267267
check_is_fitted(self, ["coef_"])
268+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
268269
return np.dot(X, self.coef_)
269270

270271
@property
@@ -345,8 +346,6 @@ class DeadZoneRegressor(RegressorMixin, BaseEstimator):
345346
346347
print(y_pred)
347348
```
348-
349-
350349
"""
351350

352351
_ALLOWED_EFFECTS = ("linear", "quadratic", "constant")
@@ -381,7 +380,8 @@ def fit(self, X, y):
381380
ValueError
382381
If `effect` is not one of "linear", "quadratic" or "constant".
383382
"""
384-
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
383+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
384+
385385
if self.effect not in self._ALLOWED_EFFECTS:
386386
raise ValueError(f"effect {self.effect} must be in {self._ALLOWED_EFFECTS}")
387387

@@ -458,8 +458,9 @@ def predict(self, X):
458458
array-like of shape (n_samples,)
459459
The predicted data.
460460
"""
461-
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
462461
check_is_fitted(self, ["coef_"])
462+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
463+
463464
return np.dot(X, self.coef_)
464465

465466
@property
@@ -970,8 +971,6 @@ def __init__(
970971
self.fit_intercept = fit_intercept
971972
self.copy_X = copy_X
972973
self.positive = positive
973-
if method not in ("SLSQP", "TNC", "L-BFGS-B"):
974-
raise ValueError(f'method should be one of "SLSQP", "TNC", "L-BFGS-B", ' f"got {method} instead")
975974
self.method = method
976975

977976
@abstractmethod
@@ -1021,6 +1020,10 @@ def fit(self, X, y, sample_weight=None):
10211020
self : BaseScipyMinimizeRegressor
10221021
Fitted linear model.
10231022
"""
1023+
if self.method not in {"SLSQP", "TNC", "L-BFGS-B"}:
1024+
msg = f"method should be one of 'SLSQP', 'TNC', 'L-BFGS-B', got {self.method} instead"
1025+
raise ValueError(msg)
1026+
10241027
X_, grad_loss, loss = self._prepare_inputs(X, sample_weight, y)
10251028

10261029
d = X_.shape[1] - self.n_features_in_ # This is either zero or one.
@@ -1051,7 +1054,8 @@ def _prepare_inputs(self, X, sample_weight, y):
10511054
This method is called by `fit` to prepare the inputs for the optimization problem. It adds an intercept column
10521055
to `X` if `fit_intercept=True`, and returns the loss function and its gradient.
10531056
"""
1054-
X, y = check_X_y(X, y, y_numeric=True)
1057+
X, y = validate_data(self, X=X, y=y, y_numeric=True, reset=True)
1058+
10551059
sample_weight = _check_sample_weight(sample_weight, X)
10561060
self.n_features_in_ = X.shape[1]
10571061

@@ -1081,7 +1085,7 @@ def predict(self, X):
10811085
The predicted data.
10821086
"""
10831087
check_is_fitted(self)
1084-
X = check_array(X)
1088+
X = validate_data(self, X=X, reset=False)
10851089

10861090
return X @ self.coef_ + self.intercept_
10871091

sklego/meta/_grouped_utils.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,10 @@
55
import narwhals.stable.v1 as nw
66
import pandas as pd
77
from scipy.sparse import issparse
8-
from sklearn.utils import check_array
98
from sklearn.utils.validation import _ensure_no_complex_data
109

10+
from sklego._sklearn_compat import check_array
11+
1112

1213
def parse_X_y(X, y, groups, check_X=True, **kwargs) -> nw.DataFrame:
1314
"""Converts X, y to narwhals dataframe.

sklego/meta/confusion_balancer.py

+6-5
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,9 @@
22
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
33
from sklearn.metrics import confusion_matrix
44
from sklearn.utils.multiclass import unique_labels
5-
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y
5+
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
66

7+
from sklego._sklearn_compat import validate_data
78
from sklego.base import ProbabilisticClassifier
89

910

@@ -63,7 +64,8 @@ def fit(self, X, y):
6364
If the underlying estimator does not have a `predict_proba` method.
6465
"""
6566

66-
X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
67+
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
68+
6769
if not isinstance(self.estimator, ProbabilisticClassifier):
6870
raise ValueError(
6971
"The ConfusionBalancer meta model only works on classification models with .predict_proba."
@@ -72,7 +74,6 @@ def fit(self, X, y):
7274
self.classes_ = unique_labels(y)
7375
cfm = confusion_matrix(y, self.estimator_.predict(X)).T + self.cfm_smooth
7476
self.cfm_ = cfm / cfm.sum(axis=1).reshape(-1, 1)
75-
self.n_features_in_ = X.shape[1]
7677
return self
7778

7879
def predict_proba(self, X):
@@ -90,7 +91,7 @@ def predict_proba(self, X):
9091
The predicted values.
9192
"""
9293
check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
93-
X = check_array(X, dtype=FLOAT_DTYPES)
94+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
9495
preds = self.estimator_.predict_proba(X)
9596
return (1 - self.alpha) * preds + self.alpha * preds @ self.cfm_
9697

@@ -108,5 +109,5 @@ def predict(self, X):
108109
The predicted values.
109110
"""
110111
check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
111-
X = check_array(X, dtype=FLOAT_DTYPES)
112+
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
112113
return self.classes_[self.predict_proba(X).argmax(axis=1)]

0 commit comments

Comments
 (0)