From 51f4f6d8742eadd4cc5ba2fdecc70c5c53d79f28 Mon Sep 17 00:00:00 2001 From: Daniel <139119540+DeltaDaniel@users.noreply.github.com> Date: Tue, 8 Oct 2024 08:46:27 +0200 Subject: [PATCH] fix: filter for most recent files (#163) * fix filter for most recent files * added private fct for sort key * changed sorting key including major and minor versions, adapted tests --------- Co-authored-by: konstantin --- src/migmose/__main__.py | 2 +- src/migmose/parsing.py | 73 +++++++++++++++---- unittests/__snapshots__/test_parsing.ambr | 35 +++++++-- ...test_reduced_nested_nachrichtenstruktur.py | 2 +- 4 files changed, 91 insertions(+), 21 deletions(-) diff --git a/src/migmose/__main__.py b/src/migmose/__main__.py index af5d94e..b0498eb 100644 --- a/src/migmose/__main__.py +++ b/src/migmose/__main__.py @@ -142,7 +142,7 @@ def main( format_version, output_dir_for_format, ) - document_version = _extract_document_version(file) + document_version, *_ = _extract_document_version(file) reduced_nested_nachrichtenstruktur.output_tree(m_format, output_dir_for_format, document_version) diff --git a/src/migmose/parsing.py b/src/migmose/parsing.py index 91ff223..4dc2866 100644 --- a/src/migmose/parsing.py +++ b/src/migmose/parsing.py @@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path: try: # Define the keywords to filter relevant files keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"] - + files_containing_keywords = [ + path for path in file_list if any(keyword in path.name.lower() for keyword in keywords) + ] # Find the most recent file based on keywords and date suffixes - latest_file = max( - (path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)), - key=lambda path: ( - int(path.stem.split("_")[-1]), # "gültig von" date - int(path.stem.split("_")[-2]), # "gültig bis" date - ), - ) + if any(files_containing_keywords): + # Find the most recent file based on keywords and date suffixes + 
latest_file = max( + (path for path in files_containing_keywords), + key=_get_sort_key, + ) + else: # different versions but no konsolidierte Lesefassung or außerordentliche Veröffentlichung at all + latest_file = max( + (path for path in file_list), + key=_get_sort_key, + ) except ValueError as e: logger.error("Error processing file list: {}", e) @@ -154,19 +160,58 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]: _pattern = re.compile( - r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(.*?)" + r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(?P<version>(?P<major>\d+)\.(?P<minor>\d+)(?P<suffix>[a-z]?))" r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)", re.IGNORECASE, ) -def _extract_document_version(path: Path) -> str: - document_str = str(path) +def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]: + """ + Extracts the document version (major.minor+suffix) details from the given file path. + + Args: + path (Path | str): The path to the file. + Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx + -> version: 1.1a, major: 1, minor: 1, suffix: a + + Returns: + tuple: A tuple containing the document version (str), major version (int or None), + minor version (int or None), and suffix (str). 
+ """ + + if isinstance(path, str): + document_str = path + else: + document_str = str(path) matches = _pattern.search(document_str) if matches: - document_version = matches.group(1) + document_version = matches.group("version") + major = matches.group("major") + minor = matches.group("minor") + suffix = matches.group("suffix") if document_version == "": logger.warning(f"❌ No document version found in {path}.", fg="red") - return document_version + return document_version or "", int(major) or 0, int(minor) or 0, suffix or "" logger.error(f"❌ Unexpected document name in {path}.", fg="red") - return "" + return "", None, None, "" + + +def _get_sort_key(path: Path) -> tuple[int, int, int | None, int | None, str]: + """ + Extracts the sort key from the given path. + + Args: + path (Path): The path object to extract the sort key from. + Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx + with gueltig_von_date: 20231001 and gueltig_bis_date: 99991231, major: 1, minor: 1, suffix: a + + Returns: + tuple: A tuple containing the "gültig von" date (int), + "gültig bis" date (int), major version (int or None), minor version (int or None), and suffix (str). 
+ """ + parts = path.stem.split("_") + gueltig_von_date = int(parts[-1]) + gueltig_bis_date = int(parts[-2]) + _, major, minor, suffix = _extract_document_version(parts[-3]) + return gueltig_von_date, gueltig_bis_date, major, minor, suffix diff --git a/unittests/__snapshots__/test_parsing.ambr b/unittests/__snapshots__/test_parsing.ambr index a3ea520..42f5164 100644 --- a/unittests/__snapshots__/test_parsing.ambr +++ b/unittests/__snapshots__/test_parsing.ambr @@ -1,16 +1,41 @@ # serializer version: 1 # name: TestParsing.test_extract_document_version[IFTSTA] - '' + tuple( + '', + None, + None, + '', + ) # --- # name: TestParsing.test_extract_document_version[REMADV] - '2.9b' + tuple( + '2.9b', + 2, + 9, + 'b', + ) # --- # name: TestParsing.test_extract_document_version[REQOTE] - '1.3' + tuple( + '1.3', + 1, + 3, + '', + ) # --- # name: TestParsing.test_extract_document_version[UTILMDG] - 'G1.0a' + tuple( + '', + None, + None, + '', + ) # --- # name: TestParsing.test_extract_document_version[UTILMDS] - 'S1.1' + tuple( + '', + None, + None, + '', + ) # --- diff --git a/unittests/test_reduced_nested_nachrichtenstruktur.py b/unittests/test_reduced_nested_nachrichtenstruktur.py index b11dcf5..3dec7c5 100644 --- a/unittests/test_reduced_nested_nachrichtenstruktur.py +++ b/unittests/test_reduced_nested_nachrichtenstruktur.py @@ -92,7 +92,7 @@ def test_output_tree(self, message_format: EdifactFormat, tmp_path, snapshot): reduced_nested_nachrichtenstruktur = ReducedNestedNachrichtenstruktur.create_reduced_nested_nachrichtenstruktur( nested_nachrichtenstruktur ) - document_version = _extract_document_version(file_path) + document_version, *_ = _extract_document_version(file_path) reduced_nested_nachrichtenstruktur.output_tree(message_format, tmp_path, document_version) with open(tmp_path / f"{message_format}{document_version}.tree", "r", encoding="utf-8") as actual_file: assert actual_file.read() == snapshot