Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: filter for most recent files #163

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/migmose/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def main(
format_version,
output_dir_for_format,
)
document_version = _extract_document_version(file)
document_version, *_ = _extract_document_version(file)
reduced_nested_nachrichtenstruktur.output_tree(m_format, output_dir_for_format, document_version)


Expand Down
55 changes: 41 additions & 14 deletions src/migmose/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,21 @@ def get_latest_file(file_list: list[Path]) -> Path:
try:
# Define the keywords to filter relevant files
keywords = ["konsolidiertelesefassungmitfehlerkorrekturen", "außerordentlicheveröffentlichung"]

files_containing_keywords = [
path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)
]
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in file_list if any(keyword in path.name.lower() for keyword in keywords)),
key=lambda path: (
int(path.stem.split("_")[-1]), # "gültig von" date
int(path.stem.split("_")[-2]), # "gültig bis" date
),
)
if any(files_containing_keywords):
# Find the most recent file based on keywords and date suffixes
latest_file = max(
(path for path in files_containing_keywords),
key=_get_sort_key,
)
else: # different versions but no konsolidierte Lesefassung or außerordentliche Veröffentlichung at all
latest_file = max(
(path for path in file_list),
key=_get_sort_key,
)

except ValueError as e:
logger.error("Error processing file list: {}", e)
Expand Down Expand Up @@ -154,19 +160,40 @@ def parse_raw_nachrichtenstrukturzeile(input_path: Path) -> list[str]:


_pattern = re.compile(
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?(.*?)"
r"MIG(?:Strom|Gas)?-?informatorischeLesefassung?((\d+)\.(\d+)([a-z]?))"
r"(?:_|KonsolidierteLesefassung|-AußerordentlicheVeröffentlichung)",
re.IGNORECASE,
)


def _extract_document_version(path: Path) -> str:
document_str = str(path)
def _extract_document_version(path: Path | str) -> tuple[str, int | None, int | None, str]:
"""Returns the document version, major, minor, and suffix from the given file path."""
if isinstance(path, str):
document_str = path
else:
document_str = str(path)
matches = _pattern.search(document_str)
if matches:
document_version = matches.group(1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Du könntest auch die capturing groups im pattern direkt benennen.

https://docs.python.org/3/library/re.html#re.Match.groupdict

Dann hängt das assignment hier nicht implizit an der Reihenfolge, sondern ist direkt im pattern schon benannt. Named groups sind ein einfacher Weg, um Regex lesbarer zu machen.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

document_version, major, minor, suffix = matches.groups()
if document_version == "":
logger.warning(f"❌ No document version found in {path}.", fg="red")
return document_version
return document_version or "", int(major) or 0, int(minor) or 0, suffix or ""
logger.error(f"❌ Unexpected document name in {path}.", fg="red")
return ""
return "", None, None, ""


def _get_sort_key(path: Path) -> tuple[int, int, int | None, int | None, str]:
"""
Extracts the sort key from the given path.

Parameters:
- path (Path): The path object to extract the sort key from.

Returns:
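- tuple: A tuple containing the "gültig von" date and the "gültig bis" date (both as ints parsed from the file name), followed by the major number, minor number, and suffix of the document version.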
"""
parts = path.stem.split("_")
gueltig_von_date = int(parts[-1])
gueltig_bis_date = int(parts[-2])
Comment on lines +214 to +215
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Es ist nur eine private Funktion, aber ich bin darüber gestolpert, dass es Date heißt, obwohl da ein int ist. Der Type hint stimmt, und wenn man weiß, wie der Dateiname aussieht, dann ist auch klar, was passiert. Wenn man das zum ersten Mal sieht, könnte vllt. ein Beispiel-Dateiname oder -Pfad im Docstring nicht schaden.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_, major, minor, suffix = _extract_document_version(parts[-3])
return gueltig_von_date, gueltig_bis_date, major, minor, suffix
35 changes: 30 additions & 5 deletions unittests/__snapshots__/test_parsing.ambr
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
# serializer version: 1
# name: TestParsing.test_extract_document_version[IFTSTA]
''
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[REMADV]
'2.9b'
tuple(
'2.9b',
2,
9,
'b',
)
# ---
# name: TestParsing.test_extract_document_version[REQOTE]
'1.3'
tuple(
'1.3',
1,
3,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDG]
'G1.0a'
tuple(
'',
None,
None,
'',
)
# ---
# name: TestParsing.test_extract_document_version[UTILMDS]
'S1.1'
tuple(
'',
None,
None,
'',
)
# ---
2 changes: 1 addition & 1 deletion unittests/test_reduced_nested_nachrichtenstruktur.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_output_tree(self, message_format: EdifactFormat, tmp_path, snapshot):
reduced_nested_nachrichtenstruktur = ReducedNestedNachrichtenstruktur.create_reduced_nested_nachrichtenstruktur(
nested_nachrichtenstruktur
)
document_version = _extract_document_version(file_path)
document_version, *_ = _extract_document_version(file_path)
reduced_nested_nachrichtenstruktur.output_tree(message_format, tmp_path, document_version)
with open(tmp_path / f"{message_format}{document_version}.tree", "r", encoding="utf-8") as actual_file:
assert actual_file.read() == snapshot
Expand Down