Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update datetime_helpers.py - floor_date + string_helpers #132

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 62 additions & 18 deletions src/pytimetk/utils/datetime_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from typing import Union, List

from pytimetk.utils.checks import check_series_or_datetime
from pytimetk.utils.polars_helpers import pandas_to_polars_frequency
from pytimetk.utils.string_helpers import parse_freq_str


try:
import holidays
Expand All @@ -21,17 +24,35 @@
def floor_date(
idx: Union[pd.Series, pd.DatetimeIndex],
unit: str = "D",
) -> pd.Series:
'''Robust date flooring.
engine: str = 'pandas',
) -> pd.Series:
'''
Robust date flooring.

The `floor_date` function takes a pandas Series of dates and returns a new Series with the dates rounded down to the specified unit. It's more robust than the pandas `floor` function, which does weird things with irregular frequencies like Month which are actually regular.
The `floor_date` function takes a pandas Series of dates and returns a new Series
with the dates rounded down to the specified unit. It is more robust than the
pandas `floor` method, which raises a `ValueError` for non-fixed frequencies
such as month ("M") even though flooring to them is well defined.

Parameters
----------
idx : pd.Series or pd.DatetimeIndex
The `idx` parameter is a pandas Series or pandas DatetimeIndex object that contains datetime values. It represents the dates that you want to round down.
The `idx` parameter is a pandas Series or pandas DatetimeIndex object that
contains datetime values. It represents the dates that you want to round down.
unit : str, optional
The `unit` parameter in the `floor_date` function is a string that specifies the time unit to which the dates in the `idx` series should be rounded down. It has a default value of "D", which stands for day. Other possible values for the `unit` parameter could be
The `unit` parameter in the `floor_date` function is a string that specifies
the time unit to which the dates in the `idx` series should be rounded down.
It has a default value of "D", which stands for day. Other pandas-style
frequency strings, for example "M" for month, can also be used.
engine : str, optional
The `engine` parameter is used to specify the engine to use for calculating
the floor datetime. It can be either "pandas" or "polars".

- The default value is "pandas".

- When "polars", the function will internally use the `polars` library for
calculating the floor datetime. This can be faster than using "pandas" for
large datasets.

Returns
-------
Expand All @@ -53,11 +74,27 @@ def floor_date(
# dates.floor("M") # ValueError: <MonthEnd> is a non-fixed frequency

# floor_date works as expected
tk.floor_date(dates, unit="M")
tk.floor_date(dates, unit="M", engine='pandas')
```
'''

# Common checks
check_series_or_datetime(idx)

if engine == 'pandas':
return _floor_date_pandas(idx, unit)
elif engine == 'polars':
return _floor_date_polars(idx, unit)
else:
raise ValueError("Invalid engine. Use 'pandas' or 'polars'.")

def _floor_date_pandas(
idx: Union[pd.Series, pd.DatetimeIndex],
unit: str = "D",
) -> pd.Series:
'''
Robust date flooring.
'''

# If idx is a DatetimeIndex, convert to Series
if isinstance(idx, pd.DatetimeIndex):
Expand All @@ -80,6 +117,25 @@ def floor_date(

return date

def _floor_date_polars(
    idx: Union[pd.Series, pd.DatetimeIndex],
    unit: str = "D",
) -> pd.Series:
    '''
    Robust date flooring using the polars engine.

    Parameters
    ----------
    idx : pd.Series or pd.DatetimeIndex
        Datetime values to round down.
    unit : str, optional
        Pandas-style frequency string. It is translated with
        `pandas_to_polars_frequency` before being passed to polars.
        Defaults to "D".

    Returns
    -------
    pd.Series
        The floored datetimes as a pandas Series. For `pd.Series` input the
        original index and name are restored; for `pd.DatetimeIndex` input
        the result is named "idx".
    '''

    is_series = isinstance(idx, pd.Series)

    # Always convert to polars: pandas' `.dt` accessor has no `truncate`, so
    # a pd.Series must not be passed through unchanged.
    pl_idx = pl.Series(idx).alias('idx')

    date = (
        pl_idx
        .dt
        .truncate(every=pandas_to_polars_frequency(unit))
    ).to_pandas()

    if is_series:
        # The polars round trip drops pandas metadata; put the caller's
        # index and name back so the result aligns with the input.
        date.index = idx.index
        date.name = idx.name

    return date

@pf.register_series_method
def ceil_date(
idx: Union[pd.Series, pd.DatetimeIndex],
Expand Down Expand Up @@ -131,18 +187,6 @@ def ceil_date(

return date

def parse_freq_str(freq_str):
    """Split a frequency string into its quantity and unit parts.

    The string is matched from its start against an optional run of digits
    followed by either upper-case letters or the literal ``min``.

    Returns a ``(quantity, unit)`` tuple. ``quantity`` is the digit string as
    written (e.g. ``"2"``) when present, or the integer ``1`` when the string
    has no leading digits. ``unit`` is the matched unit text.

    Raises ``ValueError`` when the string does not match the pattern.
    """
    parsed = re.match(r'(\d+)?([A-Z]+|min)', freq_str)
    if parsed is None:
        raise ValueError(f"Invalid frequency string: {freq_str}")

    count = parsed.group(1)
    unit = parsed.group(2)

    # A missing numeric prefix means a single unit.
    return (1 if count is None else count), unit

def freq_to_dateoffset(freq_str):
# Adjusted regex to account for potential absence of numeric part
quantity, unit = parse_freq_str(freq_str)
Expand Down
13 changes: 13 additions & 0 deletions src/pytimetk/utils/string_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import re

def parse_freq_str(freq_str):
    """Parse a frequency string such as ``"2D"`` or ``"min"``.

    The pattern accepts optional leading digits followed by upper-case
    letters or the literal ``min``, anchored at the start of the string.

    Returns
    -------
    tuple
        ``(quantity, unit)``. ``quantity`` is the digit string exactly as it
        appears in ``freq_str``, or the integer ``1`` if no digits lead the
        string. ``unit`` is the matched unit text.

    Raises
    ------
    ValueError
        If ``freq_str`` does not match the pattern.
    """
    found = re.match(r'(\d+)?([A-Z]+|min)', freq_str)
    if found is None:
        raise ValueError(f"Invalid frequency string: {freq_str}")

    digits, unit = found.groups()
    if digits is not None:
        return digits, unit

    # No numeric prefix: default to one unit.
    return 1, unit
Loading