Skip to content

Commit

Permalink
Optimize calculation for L-Infinity method (#342)
Browse files (browse the repository at this point in the history)
* Use pandas `value_counts` for L-Infinity method

* Optimize further using `sub` function

* Fix mypy error
  • Loading branch information
michael-nml authored Nov 20, 2023
1 parent 93ac6e7 commit 0478b24
Showing 1 changed file with 4 additions and 12 deletions.
16 changes: 4 additions & 12 deletions nannyml/drift/univariate/methods.py
Original file line number Diff line number Diff line change
@@ -503,12 +503,11 @@ def __init__(self, **kwargs) -> None:
An optional lower threshold for the performance metric.
"""

self._reference_proba: Optional[dict] = None
self._reference_proba: Optional[pd.Series] = None

# NOTE(review): this is a rendered diff of `_fit`; the removed and added lines are shown
# together without +/- markers and without their original indentation.
# `_fit` stores per-label relative frequencies of the reference data on
# `self._reference_proba` and returns `self`. `timestamps` is not used in the lines shown.
def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
# Presumably drops missing values before counting (helper is defined elsewhere) - TODO confirm.
reference_data = _remove_nans(reference_data)
# Previous implementation (the two lines this commit appears to remove): build a dict of
# label -> share of rows, running one full `reference_data == label` comparison per unique label.
ref_labels = reference_data.unique()
self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels}
# Replacement (the line this commit appears to add): the same label -> share mapping as a
# pd.Series, computed by a single `value_counts(normalize=True)` call.
self._reference_proba = reference_data.value_counts(normalize=True)

return self

@@ -520,16 +519,9 @@ def _calculate(self, data: pd.Series):
data = _remove_nans(data)
if data.empty:
return np.nan
data_labels = data.unique()
data_ratios = {label: (data == label).sum() / len(data) for label in data_labels}

union_labels = set(self._reference_proba.keys()) | set(data_labels)

differences = {}
for label in union_labels:
differences[label] = np.abs(self._reference_proba.get(label, 0) - data_ratios.get(label, 0))

return max(differences.values())
analysis_data_ratio = data.value_counts(normalize=True)
return self._reference_proba.sub(analysis_data_ratio, fill_value=0).abs().max()


@MethodFactory.register(key='wasserstein', feature_type=FeatureType.CONTINUOUS)
Expand Down

0 comments on commit 0478b24

Please sign in to comment.