Skip to content

Commit 50f71c8

Browse files
authored
Merge pull request #5 from Haitham-ghaida/fix-ei-nov24-rel
Fix ei nov24 rel
2 parents 84f40f4 + 452769d commit 50f71c8

File tree

2 files changed

+141
-50
lines changed

2 files changed

+141
-50
lines changed

ecoinvent_migrate/main.py

+67 -24
Original file line number | Diff line number | Diff line change
@@ -205,7 +205,7 @@ def generate_biosphere_mapping(
205205
output_directory: Optional[Path] = None,
206206
output_version: str = "3.0.0",
207207
description: Optional[str] = None,
208-
) -> Path:
208+
) -> Optional[Path]:
209209
"""Generate a Randonneur mapping file for biosphere edge attributes from source to target."""
210210
configure_logs(write_logs=write_logs)
211211

@@ -246,23 +246,52 @@ def generate_biosphere_mapping(
246246
description = f"Data migration file from {source_db_name} to {target_db_name} generated with `ecoinvent_migrate` version {__version__}"
247247

248248
if not missing_sheet:
249-
data = pd.read_excel(io=excel_filepath, sheet_name=candidates[0]).to_dict(orient="records")
250-
data = source_target_biosphere_pair(
251-
data=data,
252-
source_version=source_version,
253-
target_version=target_version,
254-
keep_deletions=keep_deletions,
255-
)
256-
affected_uuids = {
257-
o["source"]["uuid"]
258-
for o in itertools.chain(data.get("replace", []), data.get("delete", []))
259-
}
260-
data = supplement_biosphere_changes_with_real_data_comparison(
261-
data=data,
262-
affected_uuids=affected_uuids,
263-
source_version=source_version,
264-
target_version=target_version,
265-
)
249+
# Try reading the sheet
250+
df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0])
251+
252+
# Handle the multi-index case
253+
if df.columns[0].startswith("**"):
254+
logger.debug("Detected multi-index format, adjusting reading parameters")
255+
df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0], skiprows=1)
256+
257+
# Handle the new format case
258+
if "deleted exchanges" in df.columns:
259+
logger.debug("Detected new exchange format, adjusting data structure")
260+
# Get the actual column headers from the first row
261+
new_headers = {col: val for col, val in df.iloc[0].items() if isinstance(val, str)}
262+
df = df.rename(columns=new_headers).iloc[1:]
263+
264+
if df.empty:
265+
logger.info(
266+
"EE Deletions sheet is empty in change report for {source_v} to {target_v}. This likely means no biosphere changes.",
267+
source_v=source_version,
268+
target_v=target_version,
269+
)
270+
data = {"delete": [], "replace": []}
271+
else:
272+
data = df.to_dict(orient="records")
273+
data = source_target_biosphere_pair(
274+
data=data,
275+
source_version=source_version,
276+
target_version=target_version,
277+
keep_deletions=keep_deletions,
278+
)
279+
# Ensure both keys exist
280+
if "delete" not in data:
281+
data["delete"] = []
282+
if "replace" not in data:
283+
data["replace"] = []
284+
285+
affected_uuids = {
286+
o["source"]["uuid"]
287+
for o in itertools.chain(data.get("replace", []), data.get("delete", []))
288+
}
289+
data = supplement_biosphere_changes_with_real_data_comparison(
290+
data=data,
291+
affected_uuids=affected_uuids,
292+
source_version=source_version,
293+
target_version=target_version,
294+
)
266295
else:
267296
data = supplement_biosphere_changes_with_real_data_comparison(
268297
data={"delete": [], "replace": []},
@@ -271,16 +300,28 @@ def generate_biosphere_mapping(
271300
target_version=target_version,
272301
)
273302

274-
if not data["delete"] and not data["replace"]:
275-
logger.info("It seems like there are no biosphere changes for this release. Doing nothing.")
276-
return
303+
# Ensure we have non-empty data before creating Datapackage
304+
has_data = False
305+
cleaned_data = {}
306+
for key in ["delete", "replace"]:
307+
if data.get(key) and len(data[key]) > 0:
308+
cleaned_data[key] = data[key]
309+
has_data = True
310+
311+
if not has_data:
312+
logger.info("No valid biosphere changes found after processing. Doing nothing.")
313+
return None
277314

278315
dp = Datapackage(
279316
name=f"{source_db_name}-{target_db_name}",
280317
description=description,
281318
contributors=[
282-
{"title": "ecoinvent association", "path": "https://ecoinvent.org/", "roles": ["author"]},
283-
{"title": "Chris Mutel", "path": "https://chris.mutel.org/", "roles": ["wrangler"]},
319+
{
320+
"title": "ecoinvent association",
321+
"path": "https://ecoinvent.org/",
322+
"role": "author",
323+
},
324+
{"title": "Chris Mutel", "path": "https://chris.mutel.org/", "role": "wrangler"},
284325
],
285326
mapping_source=MappingConstants.ECOSPOLD2_BIO,
286327
mapping_target=MappingConstants.ECOSPOLD2_BIO,
@@ -290,7 +331,9 @@ def generate_biosphere_mapping(
290331
target_id=target_db_name,
291332
licenses=licenses,
292333
)
293-
for key, value in data.items():
334+
335+
# Only add non-empty data sections
336+
for key, value in cleaned_data.items():
294337
dp.add_data(key, value)
295338

296339
if write_file:

ecoinvent_migrate/wrangling.py

+74 -26
Original file line number | Diff line number | Diff line change
@@ -311,21 +311,42 @@ def split_replace_disaggregate(data: List[dict], target_lookup: dict) -> dict:
311311

312312

313313
def get_column_labels(example: dict, version: str) -> dict:
314-
"""Guess column labels from Excel change report annex."""
314+
"""Guess column labels from Excel change report annex.
315+
316+
Now handles multiple formats:
317+
- Standard format: "UUID/ID - version"
318+
- New format: Where labels might be in values
319+
"""
315320
uuid_tries = [f"UUID - {version}", f"ID - {version}"]
321+
name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
322+
323+
# Try standard format first
316324
for uuid_try in uuid_tries:
317325
if uuid_try in example:
318326
uuid = uuid_try
319327
break
320328
else:
321-
raise ValueError(f"Can't find uuid field for database version {version} in {example}")
322-
name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
329+
# If standard format fails, check for new format
330+
for key, value in example.items():
331+
if isinstance(value, str) and any(try_pattern in value for try_pattern in uuid_tries):
332+
uuid = key
333+
break
334+
else:
335+
raise ValueError(f"Can't find uuid field for database version {version} in {example}")
336+
337+
# Same pattern for name
323338
for name_try in name_tries:
324339
if name_try in example:
325340
name = name_try
326341
break
327342
else:
328-
raise ValueError(f"Can't find name field for database version {version} in {example}")
343+
for key, value in example.items():
344+
if isinstance(value, str) and any(try_pattern in value for try_pattern in name_tries):
345+
name = key
346+
break
347+
else:
348+
raise ValueError(f"Can't find name field for database version {version} in {example}")
349+
329350
return {
330351
"uuid": uuid,
331352
"name": name,
@@ -335,33 +356,56 @@ def get_column_labels(example: dict, version: str) -> dict:
335356
def source_target_biosphere_pair(
336357
data: List[dict], source_version: str, target_version: str, keep_deletions: bool
337358
) -> List[dict]:
338-
"""Turn pandas DataFrame rows into source/target pairs."""
359+
"""Turn pandas DataFrame rows into source/target pairs.
360+
361+
The function now handles both old and new EE Deletions formats:
362+
- Old format: Direct source/target columns
363+
- New format: Deletion/replacement columns with explicit relationships
364+
"""
365+
# For empty data, return empty structure
366+
if not data:
367+
return {"replace": [], "delete": []}
368+
369+
# Try old format first
339370
source_labels = get_column_labels(example=data[0], version=source_version)
340371
target_labels = get_column_labels(example=data[0], version=target_version)
341372

342-
formatted = {
343-
"replace": [
344-
{
345-
"source": {k: row[v] for k, v in source_labels.items()},
346-
"target": {k: row[v] for k, v in target_labels.items()},
347-
"conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
348-
"comment": row.get("Comment"),
349-
}
350-
for row in data
351-
if not isnan(row[target_labels["uuid"]])
352-
]
353-
}
354-
if keep_deletions:
355-
formatted["delete"] = [
356-
{
357-
"source": {k: row[v] for k, v in source_labels.items()},
358-
"comment": row.get("Comment"),
359-
}
360-
for row in data
361-
if isnan(row[target_labels["uuid"]])
362-
]
373+
# Initialize the result structure
374+
formatted = {"replace": [], "delete": [] if keep_deletions else None}
375+
376+
# Process each row
377+
for row in data:
378+
# Skip empty or invalid rows
379+
if any(isnan(row.get(v)) for v in source_labels.values()):
380+
continue
381+
382+
# Create source entry
383+
source_entry = {k: row[v] for k, v in source_labels.items()}
384+
385+
# Check if there's a valid target
386+
has_target = not any(isnan(row.get(v, float("nan"))) for v in target_labels.values())
387+
388+
if has_target:
389+
formatted["replace"].append(
390+
{
391+
"source": source_entry,
392+
"target": {k: row[v] for k, v in target_labels.items()},
393+
"conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
394+
"comment": row.get("Comment"),
395+
}
396+
)
397+
elif keep_deletions:
398+
formatted["delete"].append(
399+
{
400+
"source": source_entry,
401+
"comment": row.get("Comment"),
402+
}
403+
)
363404

405+
# Clean up the formatted data
364406
for lst in formatted.values():
407+
if lst is None:
408+
continue
365409
for obj in lst:
366410
if "comment" in obj and (not obj["comment"] or isnan(obj["comment"])):
367411
del obj["comment"]
@@ -370,4 +414,8 @@ def source_target_biosphere_pair(
370414
):
371415
del obj["conversion_factor"]
372416

417+
# Remove empty delete list if not keeping deletions
418+
if not keep_deletions:
419+
del formatted["delete"]
420+
373421
return formatted

0 commit comments

Comments
 (0)