Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: filter for most recent files #163

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/migmose/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def main(
format_version,
output_dir_for_format,
)
document_version = _extract_document_version(file)
document_version, *_ = _extract_document_version(file)
reduced_nested_nachrichtenstruktur.output_tree(m_format, output_dir_for_format, document_version)


Expand Down
73 changes: 59 additions & 14 deletions src/migmose/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path:
try:
# Define the keywords to filter relevant files
keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]

files_containing_keywords = [
path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)
]
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)),
key=lambda path: (
int(path.stem.split("_")[-1]), # "gültig von" date
int(path.stem.split("_")[-2]), # "gültig bis" date
),
)
if any(files_containing_keywords):
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in files_containing_keywords),
key=_get_sort_key,
)
else: # different versions but no konsolidierte Lesefassung or außerordentliche Veröffentlichung at all
latest_file = max(
(path for path in file_list),
key=_get_sort_key,
)

except ValueError as e:
logger.error("Error processing file list: {}", e)
Expand Down Expand Up @@ -154,19 +160,58 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]:


_pattern = re.compile(
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(.*?)"
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(?P<version>(?P<major>\d+)\.(?P<minor>\d+)(?P<suffix>[a-z]?))"
r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)",
re.IGNORECASE,
)


def _extract_document_version(path: Path) -> str:
document_str = str(path)
def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]:
"""
Extracts the document version (major.minor+suffix) details from the given file path.

Args:
path (Path | str): The path to the file.
Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
-> version: 1.1a, major: 1, minor: 1, suffix: a

Returns:
tuple: A tuple containing the document version (str), major version (int or None),
minor version (int or None), and suffix (str).
"""

if isinstance(path, str):
document_str = path
else:
document_str = str(path)
matches = _pattern.search(document_str)
if matches:
document_version = matches.group(1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Du könntest auch die capturing groups im pattern direkt benennen.

https://docs.python.org/3/library/re.html#re.Match.groupdict

Dann hängt das assignment hier nicht implizit an der Reihenfolge, sondern ist direkt im pattern schon benannt. Named groups sind ein einfacher Weg, um Regex lesbarer zu machen.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

document_version = matches.group("version")
major = matches.group("major")
minor = matches.group("minor")
suffix = matches.group("suffix")
if document_version == "":
logger.warning(f"❌ No document version found in {path}.", fg="red")
return document_version
return document_version or "", int(major) or 0, int(minor) or 0, suffix or ""
logger.error(f"❌ Unexpected document name in {path}.", fg="red")
return ""
return "", None, None, ""


def _get_sort_key(path: Path) -> tuple[int, int, int | None, int | None, str]:
"""
Extracts the sort key from the given path.

Args:
path (Path): The path object to extract the sort key from.
Example: path/to/file/ORDCHGMIG-informatorischeLesefassung1.1a_99991231_20231001.docx
with gueltig_von_date: 20231001 and gueltig_bis_date: 99991231, major: 1, minor: 1, suffix: a

Returns:
tuple: A tuple containing the "gültig von" date (int),
"gültig bis" date (int), major version (int or None), minor version (int or None), and suffix (str).
"""
parts = path.stem.split("_")
gueltig_von_date = int(parts[-1])
gueltig_bis_date = int(parts[-2])
Comment on lines +214 to +215
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Es ist nur eine private Funktion, aber ich bin darüber gestolpert, dass es Date heißt, obwohl da ein int ist. Der Type Hint stimmt, und wenn man weiß, wie der Dateiname aussieht, dann ist auch klar, was passiert. Wenn man das zum ersten Mal sieht, könnte vielleicht ein Beispiel-Dateiname oder -Pfad im Docstring nicht schaden.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_, major, minor, suffix = _extract_document_version(parts[-3])
return gueltig_von_date, gueltig_bis_date, major, minor, suffix
35 changes: 30 additions & 5 deletions unittests/__snapshots__/test_parsing.ambr
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
# serializer version: 1
# name: TestParsing.test_extract_document_version[IFTSTA]
''
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[REMADV]
'2.9b'
tuple(
'2.9b',
2,
9,
'b',
)
# ---
# name: TestParsing.test_extract_document_version[REQOTE]
'1.3'
tuple(
'1.3',
1,
3,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDG]
'G1.0a'
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDS]
'S1.1'
tuple(
'',
None,
None,
'',
)
# ---
2 changes: 1 addition & 1 deletion unittests/test_reduced_nested_nachrichtenstruktur.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_output_tree(self, message_format: EdifactFormat, tmp_path, snapshot):
reduced_nested_nachrichtenstruktur = ReducedNestedNachrichtenstruktur.create_reduced_nested_nachrichtenstruktur(
nested_nachrichtenstruktur
)
document_version = _extract_document_version(file_path)
document_version, *_ = _extract_document_version(file_path)
reduced_nested_nachrichtenstruktur.output_tree(message_format, tmp_path, document_version)
with open(tmp_path / f"{message_format}{document_version}.tree", "r", encoding="utf-8") as actual_file:
assert actual_file.read() == snapshot
Expand Down