-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: filter for most recent files #163
Changes from all commits
ffb4fce
f37f9b4
2790449
b54d02c
8bc3e91
fd4e6ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path: | |
try: | ||
# Define the keywords to filter relevant files | ||
keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"] | ||
|
||
files_containing_keywords = [ | ||
path for path in file_list if any(keyword in path.name.lower() for keyword in keywords) | ||
] | ||
# Find the most recent file based on keywords and date suffixes | ||
latest_file = max( | ||
(path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)), | ||
key=lambda path: ( | ||
int(path.stem.split("_")[-1]), # "gültig von" date | ||
int(path.stem.split("_")[-2]), # "gültig bis" date | ||
), | ||
) | ||
if any(files_containing_keywords): | ||
# Find the most recent file based on keywords and date suffixes | ||
latest_file = max( | ||
(path for path in files_containing_keywords), | ||
key=_get_sort_key, | ||
) | ||
else: # different versions but no konsolidierte Lesefassung or außerordentliche Veröffentlichung at all | ||
latest_file = max( | ||
(path for path in file_list), | ||
key=_get_sort_key, | ||
) | ||
|
||
except ValueError as e: | ||
logger.error("Error processing file list: {}", e) | ||
|
@@ -154,19 +160,58 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]: | |
|
||
|
||
_pattern = re.compile( | ||
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(.*?)" | ||
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(?P<version>(?P<major>\d+)\.(?P<minor>\d+)(?P<suffix>[a-z]?))" | ||
r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)", | ||
re.IGNORECASE, | ||
) | ||
|
||
|
||
def _extract_document_version(path: Path) -> str: | ||
document_str = str(path) | ||
def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]: | ||
""" | ||
Extracts the document version (major.minor+suffix) details from the given file path. | ||
|
||
Args: | ||
path (Path | str): The path to the file. | ||
Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx | ||
-> version: 1.1a, major: 1, minor: 1, suffix: a | ||
|
||
Returns: | ||
tuple: A tuple containing the document version (str), major version (int or None), | ||
minor version (int or None), and suffix (str). | ||
""" | ||
|
||
if isinstance(path, str): | ||
document_str = path | ||
else: | ||
document_str = str(path) | ||
matches = _pattern.search(document_str) | ||
if matches: | ||
document_version = matches.group(1) | ||
document_version = matches.group("version") | ||
major = matches.group("major") | ||
minor = matches.group("minor") | ||
suffix = matches.group("suffix") | ||
if document_version == "": | ||
logger.warning(f"❌ No document version found in {path}.", fg="red") | ||
return document_version | ||
return document_version or "", int(major) or 0, int(minor) or 0, suffix or "" | ||
logger.error(f"❌ Unexpected document name in {path}.", fg="red") | ||
return "" | ||
return "", None, None, "" | ||
|
||
|
||
def _get_sort_key(path: Path) -> tuple[int, int, int | None, int | None, str]: | ||
""" | ||
Extracts the sort key from the given path. | ||
|
||
Args: | ||
path (Path): The path object to extract the sort key from. | ||
Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx | ||
with gueltig_von_date: 20231001 and gueltig_bis_date: 99991231, major: 1, minor: 1, suffix: a | ||
|
||
Returns: | ||
tuple: A tuple containing the "gültig von" date (int), | ||
"gültig bis" date (int), major version (int or None), minor version (int or None), and suffix (str). | ||
""" | ||
parts = path.stem.split("_") | ||
gueltig_von_date = int(parts[-1]) | ||
gueltig_bis_date = int(parts[-2]) | ||
Comment on lines
+214
to
+215
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Es ist nur eine private Funktion, aber ich bin darüber gestolpert, dass es Date heißt, obwohl da ein int ist. Der Type Hint stimmt, und wenn man weiß, wie der Dateiname aussieht, dann ist auch klar, was passiert. Wenn man das zum ersten Mal sieht, könnte vielleicht ein Beispiel-Dateiname oder -Pfad im Docstring nicht schaden. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
_, major, minor, suffix = _extract_document_version(parts[-3]) | ||
return gueltig_von_date, gueltig_bis_date, major, minor, suffix |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,41 @@ | ||
# serializer version: 1 | ||
# name: TestParsing.test_extract_document_version[IFTSTA] | ||
'' | ||
tuple( | ||
'', | ||
None, | ||
None, | ||
'', | ||
) | ||
# --- | ||
# name: TestParsing.test_extract_document_version[REMADV] | ||
'2.9b' | ||
tuple( | ||
'2.9b', | ||
2, | ||
9, | ||
'b', | ||
) | ||
# --- | ||
# name: TestParsing.test_extract_document_version[REQOTE] | ||
'1.3' | ||
tuple( | ||
'1.3', | ||
1, | ||
3, | ||
'', | ||
) | ||
# --- | ||
# name: TestParsing.test_extract_document_version[UTILMDG] | ||
'G1.0a' | ||
tuple( | ||
'', | ||
None, | ||
None, | ||
'', | ||
) | ||
# --- | ||
# name: TestParsing.test_extract_document_version[UTILMDS] | ||
'S1.1' | ||
tuple( | ||
'', | ||
None, | ||
None, | ||
'', | ||
) | ||
# --- |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Du könntest auch die capturing groups im pattern direkt benennen.
https://docs.python.org/3/library/re.html#re.Match.groupdict
Dann hängt das assignment hier nicht implizit an der Reihenfolge, sondern ist direkt im pattern schon benannt. Named groups sind ein einfacher Weg, um Regex lesbarer zu machen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fd4e6ae