From 872c8846c978aa680afa74bce45eb25dfac72f7d Mon Sep 17 00:00:00 2001 From: JB Kurland <65054169+JustinKurland@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:29:24 -0400 Subject: [PATCH 1/4] Update datetime_helpers.py `floor_date` has been modified to have a `polars` engine now as well. --- src/pytimetk/utils/datetime_helpers.py | 68 +++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/src/pytimetk/utils/datetime_helpers.py b/src/pytimetk/utils/datetime_helpers.py index 29dba17d..cf749e14 100644 --- a/src/pytimetk/utils/datetime_helpers.py +++ b/src/pytimetk/utils/datetime_helpers.py @@ -10,6 +10,7 @@ from typing import Union, List from pytimetk.utils.checks import check_series_or_datetime +from pytimetk.utils.polars_helpers import pandas_to_polars_frequency try: import holidays @@ -21,17 +22,35 @@ def floor_date( idx: Union[pd.Series, pd.DatetimeIndex], unit: str = "D", -) -> pd.Series: - '''Robust date flooring. + engine: str = 'pandas', + ) -> pd.Series: + ''' + Robust date flooring. - The `floor_date` function takes a pandas Series of dates and returns a new Series with the dates rounded down to the specified unit. It's more robust than the pandas `floor` function, which does weird things with irregular frequencies like Month which are actually regular. + The `floor_date` function takes a pandas Series of dates and returns a new Series + with the dates rounded down to the specified unit. It's more robust than the + pandas `floor` function, which does weird things with irregular frequencies + like Month which are actually regular. Parameters ---------- idx : pd.Series or pd.DatetimeIndex - The `idx` parameter is a pandas Series or pandas DatetimeIndex object that contains datetime values. It represents the dates that you want to round down. + The `idx` parameter is a pandas Series or pandas DatetimeIndex object that + contains datetime values. It represents the dates that you want to round down. 
unit : str, optional - The `unit` parameter in the `floor_date` function is a string that specifies the time unit to which the dates in the `idx` series should be rounded down. It has a default value of "D", which stands for day. Other possible values for the `unit` parameter could be + The `unit` parameter in the `floor_date` function is a string that specifies + the time unit to which the dates in the `idx` series should be rounded down. + It has a default value of "D", which stands for day. Other possible values + for the `unit` parameter could be. + engine : str, optional + The `engine` parameter is used to specify the engine to use for calculating + the floor datetime. It can be either "pandas" or "polars". + + - The default value is "pandas". + + - When "polars", the function will internally use the `polars` library for + calculating the floor datetime. This can be faster than using "pandas" for + large datasets. Returns ------- @@ -53,11 +72,27 @@ def floor_date( # dates.floor("M") # ValueError: is a non-fixed frequency # floor_date works as expected - tk.floor_date(dates, unit="M") + tk.floor_date(dates, unit="M", engine='pandas') ``` ''' + # Common checks - check_series_or_datetime(idx) + check_series_or_datetime1(idx) + + if engine == 'pandas': + return _floor_date_pandas(idx, unit) + elif engine == 'polars': + return _floor_date_polars(idx, unit) + else: + raise ValueError("Invalid engine. Use 'pandas' or 'polars'.") + +def _floor_date_pandas( + idx: Union[pd.Series, pd.DatetimeIndex], + unit: str = "D", + ) -> pd.Series: + ''' + Robust date flooring. + ''' # If idx is a DatetimeIndex, convert to Series if isinstance(idx, pd.DatetimeIndex): @@ -80,6 +115,25 @@ def floor_date( return date +def _floor_date_polars( + idx: Union[pd.Series, pd.DatetimeIndex], + unit: str = "D", +) -> pd.Series: + ''' + Robust date flooring. 
+ ''' + + # If idx is a DatetimeIndex, convert to Series + if isinstance(idx, pd.DatetimeIndex): + idx = pl.Series(idx).alias('idx') + + date = (idx + .dt + .truncate(every=pandas_to_polars_frequency(unit)) + ).to_pandas() + + return date + @pf.register_series_method def ceil_date( idx: Union[pd.Series, pd.DatetimeIndex], From 3bfec545df4222f73fed073975351e072434e598 Mon Sep 17 00:00:00 2001 From: JB Kurland <65054169+JustinKurland@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:37:04 -0400 Subject: [PATCH 2/4] Create string_helpers.py Decouple circular import --- src/pytimetk/utils/string_helpers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/pytimetk/utils/string_helpers.py diff --git a/src/pytimetk/utils/string_helpers.py b/src/pytimetk/utils/string_helpers.py new file mode 100644 index 00000000..6bff4afd --- /dev/null +++ b/src/pytimetk/utils/string_helpers.py @@ -0,0 +1,13 @@ +import re + +def parse_freq_str1(freq_str): + match = re.match(r'(\d+)?([A-Z]+|min)', freq_str) + if not match: + raise ValueError(f"Invalid frequency string: {freq_str}") + + quantity, unit = match.groups() + + if quantity is None: + quantity = 1 + + return quantity, unit From 4ee73cb9ed95cce7bf2ea68e5ca06d7c3776ea40 Mon Sep 17 00:00:00 2001 From: JB Kurland <65054169+JustinKurland@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:37:34 -0400 Subject: [PATCH 3/4] Update string_helpers.py --- src/pytimetk/utils/string_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytimetk/utils/string_helpers.py b/src/pytimetk/utils/string_helpers.py index 6bff4afd..a931f275 100644 --- a/src/pytimetk/utils/string_helpers.py +++ b/src/pytimetk/utils/string_helpers.py @@ -1,6 +1,6 @@ import re -def parse_freq_str1(freq_str): +def parse_freq_str(freq_str): match = re.match(r'(\d+)?([A-Z]+|min)', freq_str) if not match: raise ValueError(f"Invalid frequency string: {freq_str}") From 04da89708c9d1730a4dec83eef7504c13a3d7214 Mon Sep 17 
00:00:00 2001 From: JB Kurland <65054169+JustinKurland@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:42:21 -0400 Subject: [PATCH 4/4] Update datetime_helpers.py - floor_date + string_helpers Added `floor_date`, but also, to avoid having a circular import, I have moved `parse_freq_str` to its own module --- src/pytimetk/utils/datetime_helpers.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/pytimetk/utils/datetime_helpers.py b/src/pytimetk/utils/datetime_helpers.py index cf749e14..d629ce38 100644 --- a/src/pytimetk/utils/datetime_helpers.py +++ b/src/pytimetk/utils/datetime_helpers.py @@ -11,6 +11,8 @@ from pytimetk.utils.checks import check_series_or_datetime from pytimetk.utils.polars_helpers import pandas_to_polars_frequency +from pytimetk.utils.string_helpers import parse_freq_str + try: import holidays @@ -77,7 +79,7 @@ def floor_date( ''' # Common checks - check_series_or_datetime1(idx) + check_series_or_datetime(idx) if engine == 'pandas': return _floor_date_pandas(idx, unit) @@ -185,18 +187,6 @@ def ceil_date( return date -def parse_freq_str(freq_str): - match = re.match(r'(\d+)?([A-Z]+|min)', freq_str) - if not match: - raise ValueError(f"Invalid frequency string: {freq_str}") - - quantity, unit = match.groups() - - if quantity is None: - quantity = 1 - - return quantity, unit - def freq_to_dateoffset(freq_str): # Adjusted regex to account for potential absence of numeric part quantity, unit = parse_freq_str(freq_str)