Correct RoW/RoE and add logging

cmutel · cmutel · commit 0e5dcbc4e5b8 · 2024-06-14T09:31:16.000+02:00
diff --git a/ecoinvent_migrate/data_io.py b/ecoinvent_migrate/data_io.py
@@ -10,11 +10,7 @@
 def get_change_report_filepath(version: str, release: EcoinventRelease) -> Path:
     """Get the filepath to the Excel change report file"""
     files = release.list_extra_files(version)
-    candidates = [
-        key
-        for key in files
-        if "change report" in key.lower() and "annex" in key.lower()
-    ]
+    candidates = [key for key in files if "change report" in key.lower() and "annex" in key.lower()]
     if not candidates:
         raise ValueError(
             "Can't find suitable change report filename from release files:\n\t{}".format(
diff --git a/ecoinvent_migrate/errors.py b/ecoinvent_migrate/errors.py
@@ -8,3 +8,7 @@ class Mismatch(Exception):
 
 class VersionJump(Exception):
     pass
+
+
+class MissingDatabase(Exception):
+    """Raised when an expected source or target database is not present in `bd.databases`."""
diff --git a/ecoinvent_migrate/main.py b/ecoinvent_migrate/main.py
@@ -3,11 +3,14 @@
 from pathlib import Path
 from typing import Optional
 
+import pandas as pd
 from ecoinvent_interface import EcoinventRelease, Settings
 from loguru import logger
 
+from .utils import configure_logs
 from .data_io import get_change_report_filepath, setup_project
 from .errors import VersionJump
+from .wrangling import resolve_glo_row_rer_roe, source_target_pair_as_bw_dict
 
 
 def get_change_report_context(
@@ -59,9 +62,7 @@ def get_change_report_context(
 {excel_filepath.name}
         """
         )
-    logger.info(
-        "Using change report annex file {filename}", filename=excel_filepath.name
-    )
+    logger.info("Using change report annex file {filename}", filename=excel_filepath.name)
 
     setup_project(
         source_version=source_version,
@@ -78,14 +79,49 @@ def generate_technosphere_mapping(
     source_version: str,
     target_version: str,
     project_name: str = "ecoinvent-migration",
+    system_model: str = "cutoff",
     ecoinvent_username: Optional[str] = None,
     ecoinvent_password: Optional[str] = None,
+    write_logs: bool = True
 ) -> Path:
     """Generate a Randonneur mapping file for technosphere edge attributes from source to target."""
+    configure_logs(write_logs=write_logs)
+
     excel_filepath = get_change_report_context(
         source_version=source_version,
         target_version=target_version,
         project_name=project_name,
         ecoinvent_username=ecoinvent_username,
         ecoinvent_password=ecoinvent_password,
     )
+
+    sheet_names = pd.ExcelFile(excel_filepath).sheet_names
+    candidates = [name for name in sheet_names if name.lower() == "qualitative changes"]
+    if not candidates:
+        raise ValueError(
+            "Can't find suitable sheet name in change report file. Looking for 'qualitative changes', found:\n\t{}".format(
+                "\n\t".join(sheet_names)
+            )
+        )
+    elif len(candidates) > 1:
+        raise ValueError(
+            "Found multiple sheet names like 'qualitative changes' for change report file:\n\t{}".format(
+                "\n\t".join(sheet_names)
+            )
+        )
+
+    data = [
+        pair
+        for row in pd.read_excel(io=excel_filepath, sheet_name=candidates[0]).to_dict(
+            orient="records"
+        )
+        for pair in source_target_pair_as_bw_dict(row, source_version, target_version)
+    ]
+    data = resolve_glo_row_rer_roe(
+        data=data,
+        source_version=source_version,
+        target_version=target_version,
+        system_model=system_model,
+    )
+    data = [ds for ds in data if ds['source'] != ds['target']]
+    return data
diff --git a/ecoinvent_migrate/utils.py b/ecoinvent_migrate/utils.py
@@ -0,0 +1,19 @@
+import datetime
+from loguru import logger
+import sys
+from pathlib import Path
+
+from platformdirs import user_log_dir
+
+
+def configure_logs(write_logs: bool = True) -> None:
+    """Reconfigure loguru: INFO and above to stderr, plus optional per-run log files."""
+    logger.remove()
+    logger.add(sys.stderr, level="INFO")
+    if write_logs:
+        # Second-resolution timestamp; ":" is swapped for "-" before use as a directory name.
+        run_stamp = datetime.datetime.now().isoformat()[:19].replace(":", "-")
+        logs_dir = Path(user_log_dir("ecoinvent_migrate", "pylca")) / run_stamp
+        logger.info("Writing logs to {path}", path=logs_dir)
+        logs_dir.mkdir(parents=True, exist_ok=True)
+        for filename, level in (("debug.log", "DEBUG"), ("info.log", "INFO")):
+            logger.add(logs_dir / filename, level=level)
diff --git a/ecoinvent_migrate/wrangling.py b/ecoinvent_migrate/wrangling.py
@@ -1,10 +1,21 @@
 import itertools
+import math
+from numbers import Number
+from typing import List
 
-from .errors import Mismatch, Uncombinable
+import bw2data as bd
+from loguru import logger
+
+from .errors import Mismatch, MissingDatabase, Uncombinable
 
 
 def split_by_semicolon(row: dict, version: str) -> list[dict]:
     """Possible split a data row into"""
+    if isinstance(row[f"Activity Name - {version}"], Number) and math.isnan(
+        row[f"Activity Name - {version}"]
+    ):
+        return []
+
     len_product = len(row[f"Reference Product - {version}"].split(";\n"))
     len_unit = len(row[f"Reference Product Unit - {version}"].split(";\n"))
     if len_product != len_unit:
@@ -113,9 +124,7 @@ def source_target_pair_as_bw_dict(
     ```
 
     """
-    versions = [
-        x.split(" - ")[-1].strip() for x in row if x.startswith("Activity Name")
-    ]
+    versions = [x.split(" - ")[-1].strip() for x in row if x.startswith("Activity Name")]
     if f"Activity Name - {source_version}" not in row:
         raise ValueError(
             f"""Can't find source version {source_version} in data row.
@@ -128,6 +137,10 @@ def source_target_pair_as_bw_dict(
         )
 
     sources = split_by_semicolon(row, source_version)
+    if not sources:
+        # New unit process dataset, no source objects
+        return []
+
     targets = split_by_semicolon(row, target_version)
     if len(sources) > 1 and len(targets) > 1 and len(sources) != len(targets):
         raise Uncombinable(
@@ -140,4 +153,94 @@ def source_target_pair_as_bw_dict(
     elif len(targets) == 1:
         targets = itertools.repeat(targets[0])
 
-    return [{"source": s, "target": t} for s, t in zip(sources, targets)]
+    return [
+        {"source": s, "target": t}
+        for s, t in zip(sources, targets)
+        if all(v.lower() != "nan" for v in itertools.chain(s.values(), t.values()))
+    ]
+
+
+def resolve_glo_row_rer_roe(
+    data: List[dict], source_version: str, target_version: str, system_model: str
+) -> List[dict]:
+    """Iterate through `data`, and change `location` attribute to `RoW` or `RoE` when needed.
+
+    Looks in actual database to get correct `location` attributes. Each element of `data` must
+    have `source` and `target` dicts with `name`, `location`, and `reference product` keys. A
+    `GLO` location is changed to `RoW`, and `RER` to `RoE`, when the combination as given is not
+    in the respective database but the changed one is. Elements are modified in place and the
+    same list is returned.
+
+    Processes which can't be found under either location are only logged:
+
+    * missing in the source database only: debug message
+    * missing in the target database only: warning
+    * missing in both: nothing is logged
+
+    Raises `MissingDatabase` if `ecoinvent-{version}-{system_model}` is not in `bd.databases`
+    for the source or the target version.
+    """
+    fields = ("name", "location", "reference product")
+    source_db_name = f"ecoinvent-{source_version}-{system_model}"
+    target_db_name = f"ecoinvent-{target_version}-{system_model}"
+    if source_db_name not in bd.databases:
+        raise MissingDatabase(f"Missing source database: {source_db_name}")
+    if target_db_name not in bd.databases:
+        raise MissingDatabase(f"Missing target database: {target_db_name}")
+
+    logger.info("Loading source database {db} to cache data attributes", db=source_db_name)
+    source_lookup = {tuple(o[attr] for attr in fields) for o in bd.Database(source_db_name)}
+    logger.info("Loading target database {db} to cache data attributes", db=target_db_name)
+    target_lookup = {tuple(o[attr] for attr in fields) for o in bd.Database(target_db_name)}
+
+    for obj in data:
+        # Set while checking "source"; consumed while checking "target" (order below matters).
+        source_missing = None
+        for kind, lookup, db_name in [
+            ("source", source_lookup, source_db_name),
+            ("target", target_lookup, target_db_name),
+        ]:
+            key = tuple(obj[kind][attr] for attr in fields)
+            if key in lookup:
+                continue
+            elif obj[kind]["location"] == "GLO" and (key[0], "RoW", key[2]) in lookup:
+                obj[kind]["location"] = "RoW"
+                logger.debug(
+                    "{kind} process {name} location corrected to 'RoW'",
+                    kind=kind,
+                    name=obj[kind]["name"],
+                )
+            elif obj[kind]["location"] == "RER" and (key[0], "RoE", key[2]) in lookup:
+                obj[kind]["location"] = "RoE"
+                logger.debug(
+                    "{kind} process {name} location corrected to 'RoE'",
+                    kind=kind,
+                    name=obj[kind]["name"],
+                )
+            elif kind == "source":
+                # Whether this gets reported depends on the target check in the next iteration.
+                source_missing = obj[kind]
+            elif source_missing:
+                # Missing in both source and target for this system model
+                source_missing = None
+            else:
+                # Only missing in target database - but this is a big problem, we don't have a
+                # suitable target for existing edges to relink to.
+                # The message is a `str.format` template, which can't call methods, so the
+                # capitalized label is computed here and passed in as a value.
+                logger.warning(
+                    "{kind} process given in change report but missing in {db_name} lookup: {ds}",
+                    kind=kind.title(),
+                    db_name=db_name,
+                    ds=obj[kind],
+                )
+        if source_missing:
+            # Only a debug message because this won't break anything - there is no process in the
+            # source database to miss a link from.
+            logger.debug(
+                "Source process given in change report but missing in {db_name} lookup: {ds}",
+                db_name=source_db_name,
+                ds=source_missing,
+            )
+
+    return data
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,10 +29,11 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "ecoinvent_interface",
     "bw2data>=4.0.dev41",
     "bw2io>=0.9.dev27",
+    "ecoinvent_interface",
     "loguru",
+    "platformdirs",
 ]
 
 [project.urls]
@@ -47,7 +48,7 @@ testing = [
     "ecoinvent_migrate",
     "pytest",
     "pytest-cov",
-    "python-coveralls"
+    "python-coveralls",
 ]
 dev = [
     "build",
@@ -78,7 +79,7 @@ testpaths = ["tests/*.py"]
 
 [tool.flake8]
 # Some sane defaults for the code style checker flake8
-max_line_length = 88
+max_line_length = 100
 extend_ignore = ["E203", "W503"]
 # ^  Black-compatible
 #    E203 and W503 have edge cases handled by black
@@ -91,11 +92,11 @@ exclude = [
 ]
 
 [tool.black]
-line-length = 88
+line-length = 100
 
 [tool.isort]
 profile = "black"
-line_length = 88
+line_length = 100
 multi_line_output = 3
 include_trailing_comma = true
 force_grid_wrap = 0
diff --git a/tests/test_wrangling.py b/tests/test_wrangling.py
@@ -212,3 +212,47 @@ def test_source_target_pair_as_bw_dict_valueerror():
         source_target_pair_as_bw_dict(given, "3.8", "3.10")
     with pytest.raises(ValueError):
         source_target_pair_as_bw_dict(given, "3.9.1", "3.11")
+
+
+def test_source_target_pair_as_bw_dict_new_dataset():
+    """A row whose source-version columns are all NaN produces no source/target pairs."""
+    row = {
+        "Activity Name - 3.9.1": float("NaN"),
+        "Geography - 3.9.1": float("NaN"),
+        "Reference Product - 3.9.1": float("NaN"),
+        "Reference Product Unit - 3.9.1": float("NaN"),
+        "Activity Name - 3.10": "baling",
+        "Geography - 3.10": "GLO",
+        "Reference Product - 3.10": "baling",
+        "Reference Product Unit - 3.10": "unit",
+    }
+    pairs = source_target_pair_as_bw_dict(row, "3.9.1", "3.10")
+    assert pairs == []
+
+
+def test_source_target_pair_as_bw_dict_multiple_some_missing():
+    """Of three `;\\n`-separated entries, only the one with no "nan" source values is paired."""
+    row = {
+        "Activity Name - 3.9.1": "p-nitrotoluene production",
+        "Geography - 3.9.1": "GLO",
+        "Reference Product - 3.9.1": "nan;\nnan;\np-nitrotoluene",
+        "Reference Product Unit - 3.9.1": "nan;\nnan;\nkg",
+        "Activity Name - 3.10": "nitrotoluenes production, toluene nitration",
+        "Geography - 3.10": "GLO",
+        "Reference Product - 3.10": "m-nitrotoluene;\no-nitrotoluene;\np-nitrotoluene",
+        "Reference Product Unit - 3.10": "kg;\nkg;\nkg",
+    }
+    kept_source = {
+        "name": "p-nitrotoluene production",
+        "location": "GLO",
+        "reference product": "p-nitrotoluene",
+        "unit": "kg",
+    }
+    kept_target = {
+        "name": "nitrotoluenes production, toluene nitration",
+        "location": "GLO",
+        "reference product": "p-nitrotoluene",
+        "unit": "kg",
+    }
+    pairs = source_target_pair_as_bw_dict(row, "3.9.1", "3.10")
+    assert pairs == [{"source": kept_source, "target": kept_target}]