2
2
#
3
3
# License: Apache Software License 2.0
4
4
5
- """ This module contains the different drift detection method implementations.
5
+ """This module contains the different drift detection method implementations.
6
6
7
7
The :class:`~nannyml.drift.univariate.methods.MethodFactory` will convert the drift detection method names
8
8
into an instance of the base :class:`~nannyml.drift.univariate.methods.Method` class.
@@ -62,10 +62,8 @@ def __init__(
62
62
computation_params : dict, default=None
63
63
A dictionary specifying parameter names and values to be used in the computation of the
64
64
drift method.
65
- upper_threshold : float, default=None
66
- An optional upper threshold for the data quality metric.
67
- lower_threshold : float, default=None
68
- An optional lower threshold for the data quality metric.
65
+ threshold : Threshold
66
+ Threshold class defining threshold strategy.
69
67
upper_threshold_limit : float, default=None
70
68
An optional upper threshold limit for the data quality metric.
71
69
lower_threshold_limit : float, default=0
@@ -257,6 +255,7 @@ class JensenShannonDistance(Method):
257
255
"""
258
256
259
257
def __init__ (self , ** kwargs ) -> None :
258
+ """Initialize Jensen-Shannon method."""
260
259
super ().__init__ (
261
260
display_name = 'Jensen-Shannon distance' ,
262
261
column_name = 'jensen_shannon' ,
@@ -339,6 +338,7 @@ class KolmogorovSmirnovStatistic(Method):
339
338
"""
340
339
341
340
def __init__ (self , ** kwargs ) -> None :
341
+ """Initialize Kolmogorov-Smirnov method."""
342
342
super ().__init__ (
343
343
display_name = 'Kolmogorov-Smirnov statistic' ,
344
344
column_name = 'kolmogorov_smirnov' ,
@@ -405,7 +405,7 @@ def _calculate(self, data: pd.Series):
405
405
chunk_rel_freqs = chunk_proba_in_qts / len (data )
406
406
rel_freq_lower_than_edges = len (data [data < self ._qts [0 ]]) / len (data )
407
407
chunk_rel_freqs = rel_freq_lower_than_edges + np .cumsum (chunk_rel_freqs )
408
- stat = np .max (abs (self ._ref_rel_freqs - chunk_rel_freqs ))
408
+ stat = np .max (abs (self ._ref_rel_freqs - chunk_rel_freqs )) # type: ignore
409
409
else :
410
410
stat , _ = ks_2samp (self ._reference_data , data )
411
411
@@ -420,6 +420,7 @@ class Chi2Statistic(Method):
420
420
"""
421
421
422
422
def __init__ (self , ** kwargs ) -> None :
423
+ """Initialize Chi2-contingency method."""
423
424
super ().__init__ (
424
425
display_name = 'Chi2 statistic' ,
425
426
column_name = 'chi2' ,
@@ -444,6 +445,16 @@ def __init__(self, **kwargs) -> None:
444
445
self ._fitted = False
445
446
446
447
def fit (self , reference_data : pd .Series , timestamps : Optional [pd .Series ] = None ) -> Self :
448
+ """Fits Chi2 Method on reference data.
449
+
450
+ Parameters
451
+ ----------
452
+ reference_data: pd.Series
453
+ The reference data used for fitting a Method. Must have target data available.
454
+ timestamps: Optional[pd.Series], default=None
455
+ A series containing the reference data timestamps.
456
+
457
+ """
447
458
super ().fit (reference_data , timestamps )
448
459
449
460
# Thresholding is based on p-values. Ignoring all custom thresholding and disable plotting a threshold
@@ -470,6 +481,16 @@ def _calculate(self, data: pd.Series):
470
481
return stat
471
482
472
483
def alert (self , value : float ):
484
+ """Evaluates if an alert has occurred for Chi2 on the current chunk data.
485
+
486
+ For Chi2, alerts are based on p-values rather than on the method values themselves,
487
+ unlike all other univariate drift methods.
488
+
489
+ Parameters
490
+ ----------
491
+ value: float
492
+ The method value for a given chunk. It is not used here: the alert is decided from the p-value alone.
493
+ """
473
494
return self ._p_value < 0.05
474
495
475
496
def _calc_chi2 (self , data : pd .Series ):
@@ -491,6 +512,7 @@ class LInfinityDistance(Method):
491
512
"""
492
513
493
514
def __init__ (self , ** kwargs ) -> None :
515
+ """Initialize L-Infinity Distance method."""
494
516
super ().__init__ (
495
517
display_name = 'L-Infinity distance' ,
496
518
column_name = 'l_infinity' ,
@@ -537,6 +559,7 @@ class WassersteinDistance(Method):
537
559
"""
538
560
539
561
def __init__ (self , ** kwargs ) -> None :
562
+ """Initialize Wasserstein Distance method."""
540
563
super ().__init__ (
541
564
display_name = 'Wasserstein distance' ,
542
565
column_name = 'wasserstein' ,
@@ -579,6 +602,9 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None
579
602
reference_proba_in_bins , self ._bin_edges = np .histogram (reference_data , bins = self .n_bins )
580
603
self ._ref_rel_freqs = reference_proba_in_bins / len (reference_data )
581
604
self ._bin_width = self ._bin_edges [1 ] - self ._bin_edges [0 ]
605
+ self ._ref_min = self ._bin_edges [0 ]
606
+ self ._ref_max = self ._bin_edges [- 1 ]
607
+ self ._ref_cdf = np .cumsum (self ._ref_rel_freqs )
582
608
583
609
self ._fitted = True
584
610
self ._reference_size = len (reference_data )
@@ -596,42 +622,31 @@ def _calculate(self, data: pd.Series):
596
622
if (
597
623
self .calculation_method == 'auto' and self ._reference_size >= 10_000
598
624
) or self .calculation_method == 'estimated' :
599
- min_chunk = np .min (data )
600
-
601
- if min_chunk < self ._bin_edges [0 ]:
602
- extra_bins_left = (min_chunk - self ._bin_edges [0 ]) / self ._bin_width
603
- extra_bins_left = np .ceil (extra_bins_left )
625
+ data_histogram , _ = np .histogram (data , bins = self ._bin_edges )
626
+ data_histogram = data_histogram / len (data )
627
+ data_smaller = data [data < self ._ref_min ]
628
+ data_bigger = data [data > self ._ref_max ]
629
+ sample_size = len (data )
630
+ del data
631
+
632
+ if len (data_smaller ) > 0 :
633
+ amount_smaller = len (data_smaller ) / sample_size
634
+ term_smaller = wasserstein_distance (data_smaller , np .full (len (data_smaller ), self ._ref_min ))
635
+ term_smaller = term_smaller * amount_smaller
604
636
else :
605
- extra_bins_left = 0
637
+ term_smaller , amount_smaller = 0 , 0
606
638
607
- max_chunk = np .max (data )
608
-
609
- if max_chunk > self ._bin_edges [- 1 ]:
610
- extra_bins_right = (max_chunk - self ._bin_edges [- 1 ]) / self ._bin_width
611
- extra_bins_right = np .ceil (extra_bins_right )
639
+ if len (data_bigger ) > 0 :
640
+ amount_bigger = len (data_bigger ) / sample_size
641
+ term_bigger = wasserstein_distance (data_bigger , np .full (len (data_bigger ), self ._ref_max ))
642
+ term_bigger = term_bigger * amount_bigger
612
643
else :
613
- extra_bins_right = 0
614
-
615
- left_edges_to_prepand = np .arange (
616
- min_chunk - self ._bin_width , self ._bin_edges [0 ] - self ._bin_width , self ._bin_width
617
- )
618
- right_edges_to_append = np .arange (
619
- self ._bin_edges [- 1 ] + self ._bin_width , max_chunk + self ._bin_width , self ._bin_width
620
- )
621
-
622
- updated_edges = np .concatenate ([left_edges_to_prepand , self ._bin_edges , right_edges_to_append ])
623
- updated_ref_binned_pdf = np .concatenate (
624
- [np .zeros (len (left_edges_to_prepand )), self ._ref_rel_freqs , np .zeros (len (right_edges_to_append ))]
625
- )
626
-
627
- chunk_histogram , _ = np .histogram (data , bins = updated_edges )
628
-
629
- chunk_binned_pdf = chunk_histogram / len (data )
630
-
631
- ref_binned_cdf = np .cumsum (updated_ref_binned_pdf )
632
- chunk_binned_cdf = np .cumsum (chunk_binned_pdf )
644
+ term_bigger , amount_bigger = 0 , 0
633
645
634
- distance = np .sum (np .abs (ref_binned_cdf - chunk_binned_cdf ) * self ._bin_width )
646
+ data_cdf = np .cumsum (data_histogram )
647
+ data_cdf = data_cdf + amount_smaller # if there's some data on the left-hand side
648
+ term_within = np .sum (np .abs (self ._ref_cdf - data_cdf ) * self ._bin_width )
649
+ distance = term_within + term_smaller + term_bigger
635
650
else :
636
651
distance = wasserstein_distance (self ._reference_data , data )
637
652
@@ -644,6 +659,7 @@ class HellingerDistance(Method):
644
659
"""Calculates the Hellinger Distance between two distributions."""
645
660
646
661
def __init__ (self , ** kwargs ) -> None :
662
+ """Initialize Hellinger Distance method."""
647
663
super ().__init__ (
648
664
display_name = 'Hellinger distance' ,
649
665
column_name = 'hellinger' ,
0 commit comments