Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checking if there is more than one MIG docx file for a message_format #37

Merged
merged 10 commits into from
Apr 16, 2024
40 changes: 39 additions & 1 deletion src/migmose/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""

import json
import re
from datetime import datetime
from pathlib import Path
from typing import Generator, Union

Expand All @@ -28,7 +30,7 @@ def find_file_to_format(message_formats: list[EdifactFormat], input_dir: Path) -
if len(list_of_all_files) == 1:
file_dict[message_format] = list_of_all_files[0]
elif len(list_of_all_files) > 1:
logger.warning(f"⚠️ There are several files for {message_format}.", fg="red")
file_dict[message_format] = get_latest_file(list_of_all_files)
else:
logger.warning(f"⚠️ No file found for {message_format}.", fg="red")
if file_dict:
Expand All @@ -37,6 +39,42 @@ def find_file_to_format(message_formats: list[EdifactFormat], input_dir: Path) -
raise click.Abort()


def get_latest_file(file_list: list[Path]) -> Union[Path, None]:
    """
    Return the path of the most recent MIG docx file from a list of paths.

    The date is read from the file name: it is expected to be formatted as YYYYMMDD
    and located just before the ".docx" extension. Files whose name has no such suffix,
    or whose eight digits do not form a valid calendar date, are ignored.
    If several files share the latest date, the first of them in file_list is returned.

    Parameters:
    file_list (list of Path): A list containing file paths with timestamps.

    Returns:
    Path: The path of the latest file. Returns None if no valid date is found.
    """

    def extract_date(file_path: Path) -> Union[datetime, None]:
        # Regex to extract the date format YYYYMMDD from the filename as a string
        match = re.search(r"(\d{8})\.docx$", file_path.name)
        if match is None:
            return None
        try:
            # The resulting datetime is naive (no timezone). That is fine here because
            # it is only ever compared with other naive datetimes built the same way.
            return datetime.strptime(match.group(1), "%Y%m%d")
        except ValueError:
            # Eight digits that are not a real date (e.g. month 13): treat as "no date"
            # instead of aborting the whole lookup.
            return None

    # Keep track of the latest file and its date while scanning the list once
    latest_file = None
    latest_date = None

    for file_path in file_list:
        date = extract_date(file_path)
        if date is None:
            continue
        # Strict ">" keeps the first file when two files carry the same date
        if latest_date is None or date > latest_date:
            latest_file = file_path
            latest_date = date

    # Return the path of the file with the latest date
    return latest_file


def preliminary_output_as_json(table: list[str], message_format: EdifactFormat, output_dir: Path) -> None:
"""
Writes the preliminary output as json.
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
11 changes: 11 additions & 0 deletions unittests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ def test_find_only_one_file(self, caplog):
assert f"No file found for {EdifactFormat.ORDRSP}." in caplog.text
assert file_dict[EdifactFormat.ORDCHG] == input_dir / Path("ORDCHG_MIG_1_1_info_20230331_v2.docx")

def test_find_only_one_file_multiple_docx(self):
    """
    Tests that a single docx file is returned for a message_format
    for which multiple docx files are found.
    """
    # NOTE(review): this is a relative path, so it is resolved against the current
    # working directory; presumably the tests are run from the repository root -- confirm.
    test_data_dir = Path("unittests/test_data/")
    expected_file = test_data_dir / Path(
        "IFTSTAMIG-informatorischeLesefassung2.0emitFehlerkorrekturenStand11.03.2024_99991231_20240311.docx"
    )

    found_files = find_file_to_format([EdifactFormat.IFTSTA], test_data_dir)

    assert found_files[EdifactFormat.IFTSTA] == expected_file

def test_parse_raw_nachrichtenstrukturzeile(self):
"""
Test to parse the raw nachrichtenstrukturzeile from a docx file.
Expand Down