Skip to content

Commit b5a7009

Browse files
committed
update wasserstein and linting
1 parent b8b237f commit b5a7009

File tree

3 files changed

+62
-45
lines changed

3 files changed

+62
-45
lines changed

nannyml/drift/univariate/calculator.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ def __init__(
111111
chunker : Chunker
112112
The `Chunker` used to split the data sets into a list of chunks.
113113
thresholds: dict
114-
115114
Defaults to::
116115
117116
{
@@ -136,8 +135,7 @@ def __init__(
136135
The `chi2` method does not support custom thresholds for now. Additional research is required to determine
137136
how to transition from its current p-value based implementation.
138137
139-
computation_params : dict
140-
138+
computation_params: dict
141139
Defaults to::
142140
143141
{

nannyml/drift/univariate/methods.py

+54-38
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# License: Apache Software License 2.0
44

5-
""" This module contains the different drift detection method implementations.
5+
"""This module contains the different drift detection method implementations.
66
77
The :class:`~nannyml.drift.univariate.methods.MethodFactory` will convert the drift detection method names
88
into an instance of the base :class:`~nannyml.drift.univariate.methods.Method` class.
@@ -62,10 +62,8 @@ def __init__(
6262
computation_params : dict, default=None
6363
A dictionary specifying parameter names and values to be used in the computation of the
6464
drift method.
65-
upper_threshold : float, default=None
66-
An optional upper threshold for the data quality metric.
67-
lower_threshold : float, default=None
68-
An optional lower threshold for the data quality metric.
65+
threshold : Threshold
66+
Threshold class defining threshold strategy.
6967
upper_threshold_limit : float, default=None
7068
An optional upper threshold limit for the data quality metric.
7169
lower_threshold_limit : float, default=0
@@ -257,6 +255,7 @@ class JensenShannonDistance(Method):
257255
"""
258256

259257
def __init__(self, **kwargs) -> None:
258+
"""Initialize Jensen-Shannon method."""
260259
super().__init__(
261260
display_name='Jensen-Shannon distance',
262261
column_name='jensen_shannon',
@@ -339,6 +338,7 @@ class KolmogorovSmirnovStatistic(Method):
339338
"""
340339

341340
def __init__(self, **kwargs) -> None:
341+
"""Initialize Kolmogorov-Smirnov method."""
342342
super().__init__(
343343
display_name='Kolmogorov-Smirnov statistic',
344344
column_name='kolmogorov_smirnov',
@@ -405,7 +405,7 @@ def _calculate(self, data: pd.Series):
405405
chunk_rel_freqs = chunk_proba_in_qts / len(data)
406406
rel_freq_lower_than_edges = len(data[data < self._qts[0]]) / len(data)
407407
chunk_rel_freqs = rel_freq_lower_than_edges + np.cumsum(chunk_rel_freqs)
408-
stat = np.max(abs(self._ref_rel_freqs - chunk_rel_freqs))
408+
stat = np.max(abs(self._ref_rel_freqs - chunk_rel_freqs)) # type: ignore
409409
else:
410410
stat, _ = ks_2samp(self._reference_data, data)
411411

@@ -420,6 +420,7 @@ class Chi2Statistic(Method):
420420
"""
421421

422422
def __init__(self, **kwargs) -> None:
423+
"""Initialize Chi2-contingency method."""
423424
super().__init__(
424425
display_name='Chi2 statistic',
425426
column_name='chi2',
@@ -444,6 +445,16 @@ def __init__(self, **kwargs) -> None:
444445
self._fitted = False
445446

446447
def fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
448+
"""Fits Chi2 Method on reference data.
449+
450+
Parameters
451+
----------
452+
reference_data: pd.Series
453+
The reference data used for fitting a Method. Must have target data available.
454+
timestamps: Optional[pd.Series], default=None
455+
A series containing the reference data Timestamps
456+
457+
"""
447458
super().fit(reference_data, timestamps)
448459

449460
# Thresholding is based on p-values. Ignoring all custom thresholding and disable plotting a threshold
@@ -470,6 +481,16 @@ def _calculate(self, data: pd.Series):
470481
return stat
471482

472483
def alert(self, value: float):
484+
"""Evaluates if an alert has occurred for Chi2 on the current chunk data.
485+
486+
For Chi2 alerts are based on p-values rather than the actual method values like
487+
in all other Univariate drift methods.
488+
489+
Parameters
490+
----------
491+
value: float
492+
The method value for a given chunk
493+
"""
473494
return self._p_value < 0.05
474495

475496
def _calc_chi2(self, data: pd.Series):
@@ -491,6 +512,7 @@ class LInfinityDistance(Method):
491512
"""
492513

493514
def __init__(self, **kwargs) -> None:
515+
"""Initialize L-Infinity Distance method."""
494516
super().__init__(
495517
display_name='L-Infinity distance',
496518
column_name='l_infinity',
@@ -537,6 +559,7 @@ class WassersteinDistance(Method):
537559
"""
538560

539561
def __init__(self, **kwargs) -> None:
562+
"""Initialize Wasserstein Distance method."""
540563
super().__init__(
541564
display_name='Wasserstein distance',
542565
column_name='wasserstein',
@@ -579,6 +602,9 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
579602
reference_proba_in_bins, self._bin_edges = np.histogram(reference_data, bins=self.n_bins)
580603
self._ref_rel_freqs = reference_proba_in_bins / len(reference_data)
581604
self._bin_width = self._bin_edges[1] - self._bin_edges[0]
605+
self._ref_min = self._bin_edges[0]
606+
self._ref_max = self._bin_edges[-1]
607+
self._ref_cdf = np.cumsum(self._ref_rel_freqs)
582608

583609
self._fitted = True
584610
self._reference_size = len(reference_data)
@@ -596,42 +622,31 @@ def _calculate(self, data: pd.Series):
596622
if (
597623
self.calculation_method == 'auto' and self._reference_size >= 10_000
598624
) or self.calculation_method == 'estimated':
599-
min_chunk = np.min(data)
600-
601-
if min_chunk < self._bin_edges[0]:
602-
extra_bins_left = (min_chunk - self._bin_edges[0]) / self._bin_width
603-
extra_bins_left = np.ceil(extra_bins_left)
625+
data_histogram, _ = np.histogram(data, bins=self._bin_edges)
626+
data_histogram = data_histogram / len(data)
627+
data_smaller = data[data < self._ref_min]
628+
data_bigger = data[data > self._ref_max]
629+
sample_size = len(data)
630+
del data
631+
632+
if len(data_smaller) > 0:
633+
amount_smaller = len(data_smaller) / sample_size
634+
term_smaller = wasserstein_distance(data_smaller, np.full(len(data_smaller), self._ref_min))
635+
term_smaller = term_smaller * amount_smaller
604636
else:
605-
extra_bins_left = 0
637+
term_smaller, amount_smaller = 0, 0
606638

607-
max_chunk = np.max(data)
608-
609-
if max_chunk > self._bin_edges[-1]:
610-
extra_bins_right = (max_chunk - self._bin_edges[-1]) / self._bin_width
611-
extra_bins_right = np.ceil(extra_bins_right)
639+
if len(data_bigger) > 0:
640+
amount_bigger = len(data_bigger) / sample_size
641+
term_bigger = wasserstein_distance(data_bigger, np.full(len(data_bigger), self._ref_max))
642+
term_bigger = term_bigger * amount_bigger
612643
else:
613-
extra_bins_right = 0
614-
615-
left_edges_to_prepand = np.arange(
616-
min_chunk - self._bin_width, self._bin_edges[0] - self._bin_width, self._bin_width
617-
)
618-
right_edges_to_append = np.arange(
619-
self._bin_edges[-1] + self._bin_width, max_chunk + self._bin_width, self._bin_width
620-
)
621-
622-
updated_edges = np.concatenate([left_edges_to_prepand, self._bin_edges, right_edges_to_append])
623-
updated_ref_binned_pdf = np.concatenate(
624-
[np.zeros(len(left_edges_to_prepand)), self._ref_rel_freqs, np.zeros(len(right_edges_to_append))]
625-
)
626-
627-
chunk_histogram, _ = np.histogram(data, bins=updated_edges)
628-
629-
chunk_binned_pdf = chunk_histogram / len(data)
630-
631-
ref_binned_cdf = np.cumsum(updated_ref_binned_pdf)
632-
chunk_binned_cdf = np.cumsum(chunk_binned_pdf)
644+
term_bigger, amount_bigger = 0, 0
633645

634-
distance = np.sum(np.abs(ref_binned_cdf - chunk_binned_cdf) * self._bin_width)
646+
data_cdf = np.cumsum(data_histogram)
647+
data_cdf = data_cdf + amount_smaller # if there's some data on the left-hand side
648+
term_within = np.sum(np.abs(self._ref_cdf - data_cdf) * self._bin_width)
649+
distance = term_within + term_smaller + term_bigger
635650
else:
636651
distance = wasserstein_distance(self._reference_data, data)
637652

@@ -644,6 +659,7 @@ class HellingerDistance(Method):
644659
"""Calculates the Hellinger Distance between two distributions."""
645660

646661
def __init__(self, **kwargs) -> None:
662+
"""Initialize Hellinger Distance method."""
647663
super().__init__(
648664
display_name='Hellinger distance',
649665
column_name='hellinger',

nannyml/drift/univariate/result.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def __init__(
4444
analysis_data: pd.DataFrame = None,
4545
reference_data: pd.DataFrame = None,
4646
):
47-
"""
47+
"""Initialize results class.
48+
4849
Parameters
4950
----------
5051
results_data: pd.DataFrame
@@ -112,6 +113,7 @@ def __init__(
112113

113114
@property
114115
def methods(self) -> List[Method]:
116+
"""Methods used during calculation."""
115117
return cast(List[Method], self.metrics)
116118

117119
def _filter(
@@ -167,9 +169,9 @@ def _get_result_property(self, property_name: str) -> List[pd.Series]:
167169
return continuous_values + categorical_values
168170

169171
def keys(self) -> List[Key]:
170-
"""
171-
Creates a list of keys for continuos and categorial columns where each Key is a `namedtuple('Key',
172-
'properties display_names')`
172+
"""Creates a list of keys for continuous and categorical columns.
173+
174+
Each Key is a `namedtuple('Key', 'properties display_names')`
173175
"""
174176
continuous_keys = [
175177
Key(properties=(column, method.column_name), display_names=(column, method.display_name))
@@ -204,6 +206,7 @@ def plot(
204206
- 'distribution'
205207
plots feature distribution per :class:`~nannyml.chunk.Chunk`.
206208
Joyplot for continuous features, stacked bar charts for categorical features.
209+
207210
Returns
208211
-------
209212
fig: :class:`plotly.graph_objs._figure.Figure`

0 commit comments

Comments
 (0)