From 51f4f6d8742eadd4cc5ba2fdecc70c5c53d79f28 Mon Sep 17 00:00:00 2001
From: Daniel <139119540+DeltaDaniel@users.noreply.github.com>
Date: Tue, 8 Oct 2024 08:46:27 +0200
Subject: [PATCH] fix: filter for most recent files (#163)

* fix filter for most recent files

* added private function for the sort key

* changed the sorting key to include major and minor versions; adapted tests

---------

Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de>
---
 src/migmose/__main__.py                       |  2 +-
 src/migmose/parsing.py                        | 73 +++++++++++++++----
 unittests/__snapshots__/test_parsing.ambr     | 35 +++++++--
 ...test_reduced_nested_nachrichtenstruktur.py |  2 +-
 4 files changed, 91 insertions(+), 21 deletions(-)
diff --git a/src/migmose/__main__.py b/src/migmose/__main__.py
index af5d94e..b0498eb 100644
--- a/src/migmose/__main__.py
+++ b/src/migmose/__main__.py
@@ -142,7 +142,7 @@ def main(
                 format_version,
                 output_dir_for_format,
             )
-            document_version = _extract_document_version(file)
+            document_version, *_ = _extract_document_version(file)
             reduced_nested_nachrichtenstruktur.output_tree(m_format, output_dir_for_format, document_version)
 
 
diff --git a/src/migmose/parsing.py b/src/migmose/parsing.py
index 91ff223..4dc2866 100644
--- a/src/migmose/parsing.py
+++ b/src/migmose/parsing.py
@@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path:
     try:
         # Define the keywords to filter relevant files
         keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]
-
+        files_containing_keywords = [
+            path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)
+        ]
         # Find the most recent file based on keywords and date suffixes
-        latest_file = max(
-            (path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)),
-            key=lambda path: (
-                int(path.stem.split("_")[-1]),  # "gültig von" date
-                int(path.stem.split("_")[-2]),  # "gültig bis" date
-            ),
-        )
+        if any(files_containing_keywords):
+            # Find the most recent file based on keywords and date suffixes
+            latest_file = max(
+                (path for path in files_containing_keywords),
+                key=_get_sort_key,
+            )
+        else:  # different versions but no konsolidierte Lesefassung or außerordentliche Veröffentlichung at all
+            latest_file = max(
+                (path for path in file_list),
+                key=_get_sort_key,
+            )
 
     except ValueError as e:
         logger.error("Error processing file list: {}", e)
@@ -154,19 +160,58 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]:
 
 
 _pattern = re.compile(
-    r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(.*?)"
+    r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(?P<version>(?P<major>\d+)\.(?P<minor>\d+)(?P<suffix>[a-z]?))"
     r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)",
     re.IGNORECASE,
 )
 
 
-def _extract_document_version(path: Path) -> str:
-    document_str = str(path)
+def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]:
+    """
+    Extract the document version (major.minor plus optional letter suffix) from a file path.
+
+    Args:
+        path (Path | str): The path (or file name) that is searched with the module-level ``_pattern``.
+        Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
+        -> version: 1.1a, major: 1, minor: 1, suffix: a
+
+    Returns:
+        tuple: (version, major, minor, suffix) when the pattern matches; otherwise an error is
+                logged and ("", None, None, "") is returned.
+    """
+
+    if isinstance(path, str):
+        document_str = path
+    else:
+        document_str = str(path)
     matches = _pattern.search(document_str)
     if matches:
-        document_version = matches.group(1)
+        document_version = matches.group("version")
+        major = matches.group("major")
+        minor = matches.group("minor")
+        suffix = matches.group("suffix")
         if document_version == "":
             logger.warning(f"❌ No document version found in {path}.", fg="red")
-        return document_version
+        return document_version or "", int(major) or 0, int(minor) or 0, suffix or ""
     logger.error(f"❌ Unexpected document name in {path}.", fg="red")
-    return ""
+    return "", None, None, ""
+
+
+def _get_sort_key(path: Path) -> tuple[int, int, int | None, int | None, str]:
+    """
+    Build the key used to rank files by validity dates and document version.
+
+    Args:
+        path (Path): File whose stem ends in ``_<gültig bis>_<gültig von>``.
+            Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
+            with gueltig_von_date: 20231001 and gueltig_bis_date: 99991231, major: 1, minor: 1, suffix: a
+
+    Returns:
+        tuple: ("gültig von" date, "gültig bis" date, major, minor, suffix); major and minor are -1 when
+                no version is found, so keys with and without a version stay comparable in ``max``.
+    """
+    *_prefix, gueltig_bis, gueltig_von = path.stem.split("_")
+    # Search the whole stem: the version pattern needs the "_" (or keyword) that follows the version.
+    _, major, minor, suffix = _extract_document_version(path.stem)
+    major, minor = (-1 if major is None else major), (-1 if minor is None else minor)
+    return int(gueltig_von), int(gueltig_bis), major, minor, suffix
diff --git a/unittests/__snapshots__/test_parsing.ambr b/unittests/__snapshots__/test_parsing.ambr
index a3ea520..42f5164 100644
--- a/unittests/__snapshots__/test_parsing.ambr
+++ b/unittests/__snapshots__/test_parsing.ambr
@@ -1,16 +1,41 @@
 # serializer version: 1
 # name: TestParsing.test_extract_document_version[IFTSTA]
-  ''
+  tuple(
+    '',
+    None,
+    None,
+    '',
+  )
 # ---
 # name: TestParsing.test_extract_document_version[REMADV]
-  '2.9b'
+  tuple(
+    '2.9b',
+    2,
+    9,
+    'b',
+  )
 # ---
 # name: TestParsing.test_extract_document_version[REQOTE]
-  '1.3'
+  tuple(
+    '1.3',
+    1,
+    3,
+    '',
+  )
 # ---
 # name: TestParsing.test_extract_document_version[UTILMDG]
-  'G1.0a'
+  tuple(
+    '',
+    None,
+    None,
+    '',
+  )
 # ---
 # name: TestParsing.test_extract_document_version[UTILMDS]
-  'S1.1'
+  tuple(
+    '',
+    None,
+    None,
+    '',
+  )
 # ---
diff --git a/unittests/test_reduced_nested_nachrichtenstruktur.py b/unittests/test_reduced_nested_nachrichtenstruktur.py
index b11dcf5..3dec7c5 100644
--- a/unittests/test_reduced_nested_nachrichtenstruktur.py
+++ b/unittests/test_reduced_nested_nachrichtenstruktur.py
@@ -92,7 +92,7 @@ def test_output_tree(self, message_format: EdifactFormat, tmp_path, snapshot):
         reduced_nested_nachrichtenstruktur = ReducedNestedNachrichtenstruktur.create_reduced_nested_nachrichtenstruktur(
             nested_nachrichtenstruktur
         )
-        document_version = _extract_document_version(file_path)
+        document_version, *_ = _extract_document_version(file_path)
         reduced_nested_nachrichtenstruktur.output_tree(message_format, tmp_path, document_version)
         with open(tmp_path / f"{message_format}{document_version}.tree", "r", encoding="utf-8") as actual_file:
             assert actual_file.read() == snapshot