Merge pull request #5 from Haitham-ghaida/fix-ei-nov24-rel

cmutel · web-flow · commit 50f71c80a42d · 2024-12-05T22:58:44.000+01:00
Fix ei nov24 rel
diff --git a/ecoinvent_migrate/main.py b/ecoinvent_migrate/main.py
@@ -205,7 +205,7 @@ def generate_biosphere_mapping(
     output_directory: Optional[Path] = None,
     output_version: str = "3.0.0",
     description: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
     """Generate a Randonneur mapping file for biosphere edge attributes from source to target."""
     configure_logs(write_logs=write_logs)
 
@@ -246,23 +246,52 @@ def generate_biosphere_mapping(
         description = f"Data migration file from {source_db_name} to {target_db_name} generated with `ecoinvent_migrate` version {__version__}"
 
     if not missing_sheet:
-        data = pd.read_excel(io=excel_filepath, sheet_name=candidates[0]).to_dict(orient="records")
-        data = source_target_biosphere_pair(
-            data=data,
-            source_version=source_version,
-            target_version=target_version,
-            keep_deletions=keep_deletions,
-        )
-        affected_uuids = {
-            o["source"]["uuid"]
-            for o in itertools.chain(data.get("replace", []), data.get("delete", []))
-        }
-        data = supplement_biosphere_changes_with_real_data_comparison(
-            data=data,
-            affected_uuids=affected_uuids,
-            source_version=source_version,
-            target_version=target_version,
-        )
+        # Try reading the sheet
+        df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0])
+
+        # Handle the multi-index case
+        if df.columns[0].startswith("**"):
+            logger.debug("Detected multi-index format, adjusting reading parameters")
+            df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0], skiprows=1)
+
+        # Handle the new format case
+        if "deleted exchanges" in df.columns:
+            logger.debug("Detected new exchange format, adjusting data structure")
+            # Get the actual column headers from the first row
+            new_headers = {col: val for col, val in df.iloc[0].items() if isinstance(val, str)}
+            df = df.rename(columns=new_headers).iloc[1:]
+
+        if df.empty:
+            logger.info(
+                "EE Deletions sheet is empty in change report for {source_v} to {target_v}. This likely means no biosphere changes.",
+                source_v=source_version,
+                target_v=target_version,
+            )
+            data = {"delete": [], "replace": []}
+        else:
+            data = df.to_dict(orient="records")
+            data = source_target_biosphere_pair(
+                data=data,
+                source_version=source_version,
+                target_version=target_version,
+                keep_deletions=keep_deletions,
+            )
+            # Ensure both keys exist
+            if "delete" not in data:
+                data["delete"] = []
+            if "replace" not in data:
+                data["replace"] = []
+
+            affected_uuids = {
+                o["source"]["uuid"]
+                for o in itertools.chain(data.get("replace", []), data.get("delete", []))
+            }
+            data = supplement_biosphere_changes_with_real_data_comparison(
+                data=data,
+                affected_uuids=affected_uuids,
+                source_version=source_version,
+                target_version=target_version,
+            )
     else:
         data = supplement_biosphere_changes_with_real_data_comparison(
             data={"delete": [], "replace": []},
@@ -271,16 +300,28 @@ def generate_biosphere_mapping(
             target_version=target_version,
         )
 
-    if not data["delete"] and not data["replace"]:
-        logger.info("It seems like there are no biosphere changes for this release. Doing nothing.")
-        return
+    # Ensure we have non-empty data before creating Datapackage
+    has_data = False
+    cleaned_data = {}
+    for key in ["delete", "replace"]:
+        if data.get(key) and len(data[key]) > 0:
+            cleaned_data[key] = data[key]
+            has_data = True
+
+    if not has_data:
+        logger.info("No valid biosphere changes found after processing. Doing nothing.")
+        return None
 
     dp = Datapackage(
         name=f"{source_db_name}-{target_db_name}",
         description=description,
         contributors=[
-            {"title": "ecoinvent association", "path": "https://ecoinvent.org/", "roles": ["author"]},
-            {"title": "Chris Mutel", "path": "https://chris.mutel.org/", "roles": ["wrangler"]},
+            {
+                "title": "ecoinvent association",
+                "path": "https://ecoinvent.org/",
+                "role": "author",
+            },
+            {"title": "Chris Mutel", "path": "https://chris.mutel.org/", "role": "wrangler"},
         ],
         mapping_source=MappingConstants.ECOSPOLD2_BIO,
         mapping_target=MappingConstants.ECOSPOLD2_BIO,
@@ -290,7 +331,9 @@ def generate_biosphere_mapping(
         target_id=target_db_name,
         licenses=licenses,
     )
-    for key, value in data.items():
+
+    # Only add non-empty data sections
+    for key, value in cleaned_data.items():
         dp.add_data(key, value)
 
     if write_file:
diff --git a/ecoinvent_migrate/wrangling.py b/ecoinvent_migrate/wrangling.py
@@ -311,21 +311,42 @@ def split_replace_disaggregate(data: List[dict], target_lookup: dict) -> dict:
 
 
 def get_column_labels(example: dict, version: str) -> dict:
-    """Guess column labels from Excel change report annex."""
+    """Guess column labels from Excel change report annex.
+
+    Now handles multiple formats:
+    - Standard format: "UUID/ID - version"
+    - New format: Where labels might be in values
+    """
     uuid_tries = [f"UUID - {version}", f"ID - {version}"]
+    name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
+
+    # Try standard format first
     for uuid_try in uuid_tries:
         if uuid_try in example:
             uuid = uuid_try
             break
     else:
-        raise ValueError(f"Can't find uuid field for database version {version} in {example}")
-    name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
+        # If standard format fails, check for new format
+        for key, value in example.items():
+            if isinstance(value, str) and any(try_pattern in value for try_pattern in uuid_tries):
+                uuid = key
+                break
+        else:
+            raise ValueError(f"Can't find uuid field for database version {version} in {example}")
+
+    # Same pattern for name
     for name_try in name_tries:
         if name_try in example:
             name = name_try
             break
     else:
-        raise ValueError(f"Can't find name field for database version {version} in {example}")
+        for key, value in example.items():
+            if isinstance(value, str) and any(try_pattern in value for try_pattern in name_tries):
+                name = key
+                break
+        else:
+            raise ValueError(f"Can't find name field for database version {version} in {example}")
+
     return {
         "uuid": uuid,
         "name": name,
@@ -335,33 +356,56 @@ def get_column_labels(example: dict, version: str) -> dict:
 def source_target_biosphere_pair(
     data: List[dict], source_version: str, target_version: str, keep_deletions: bool
 ) -> List[dict]:
-    """Turn pandas DataFrame rows into source/target pairs."""
+    """Turn pandas DataFrame rows into source/target pairs.
+
+    The function now handles both old and new EE Deletions formats:
+    - Old format: Direct source/target columns
+    - New format: Deletion/replacement columns with explicit relationships
+    """
+    # For empty data, return empty structure
+    if not data:
+        return {"replace": [], "delete": []}
+
+    # Resolve column labels; get_column_labels handles both the standard and the new format
     source_labels = get_column_labels(example=data[0], version=source_version)
     target_labels = get_column_labels(example=data[0], version=target_version)
 
-    formatted = {
-        "replace": [
-            {
-                "source": {k: row[v] for k, v in source_labels.items()},
-                "target": {k: row[v] for k, v in target_labels.items()},
-                "conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
-                "comment": row.get("Comment"),
-            }
-            for row in data
-            if not isnan(row[target_labels["uuid"]])
-        ]
-    }
-    if keep_deletions:
-        formatted["delete"] = [
-            {
-                "source": {k: row[v] for k, v in source_labels.items()},
-                "comment": row.get("Comment"),
-            }
-            for row in data
-            if isnan(row[target_labels["uuid"]])
-        ]
+    # Initialize the result structure
+    formatted = {"replace": [], "delete": [] if keep_deletions else None}
+
+    # Process each row
+    for row in data:
+        # Skip empty or invalid rows
+        if any(isnan(row.get(v)) for v in source_labels.values()):
+            continue
+
+        # Create source entry
+        source_entry = {k: row[v] for k, v in source_labels.items()}
+
+        # Check if there's a valid target
+        has_target = not any(isnan(row.get(v, float("nan"))) for v in target_labels.values())
+
+        if has_target:
+            formatted["replace"].append(
+                {
+                    "source": source_entry,
+                    "target": {k: row[v] for k, v in target_labels.items()},
+                    "conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
+                    "comment": row.get("Comment"),
+                }
+            )
+        elif keep_deletions:
+            formatted["delete"].append(
+                {
+                    "source": source_entry,
+                    "comment": row.get("Comment"),
+                }
+            )
 
+    # Clean up the formatted data
     for lst in formatted.values():
+        if lst is None:
+            continue
         for obj in lst:
             if "comment" in obj and (not obj["comment"] or isnan(obj["comment"])):
                 del obj["comment"]
@@ -370,4 +414,8 @@ def source_target_biosphere_pair(
             ):
                 del obj["conversion_factor"]
 
+    # Drop the None placeholder stored under "delete" when deletions are not kept
+    if not keep_deletions:
+        del formatted["delete"]
+
     return formatted