Skip to content

Commit 0e5dcbc

Browse files
committed
Correct RoW/RoE and add logging
1 parent 192a9a9 commit 0e5dcbc

File tree

7 files changed

+221
-18
lines changed

7 files changed

+221
-18
lines changed

ecoinvent_migrate/data_io.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@
1010
def get_change_report_filepath(version: str, release: EcoinventRelease) -> Path:
1111
"""Get the filepath to the Excel change report file"""
1212
files = release.list_extra_files(version)
13-
candidates = [
14-
key
15-
for key in files
16-
if "change report" in key.lower() and "annex" in key.lower()
17-
]
13+
candidates = [key for key in files if "change report" in key.lower() and "annex" in key.lower()]
1814
if not candidates:
1915
raise ValueError(
2016
"Can't find suitable change report filename from release files:\n\t{}".format(

ecoinvent_migrate/errors.py

+4
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ class Mismatch(Exception):
88

99
class VersionJump(Exception):
    """Package-specific error type imported by ``main``.

    NOTE(review): the raise sites are not visible here; presumably signals an
    unsupported jump between ecoinvent versions -- confirm against callers.
    """
11+
12+
13+
class MissingDatabase(Exception):
    """Raised when a database needed for the migration is not installed in the project."""

ecoinvent_migrate/main.py

+39-3
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
from pathlib import Path
44
from typing import Optional
55

6+
import pandas as pd
67
from ecoinvent_interface import EcoinventRelease, Settings
78
from loguru import logger
89

10+
from .utils import configure_logs
911
from .data_io import get_change_report_filepath, setup_project
1012
from .errors import VersionJump
13+
from .wrangling import resolve_glo_row_rer_roe, source_target_pair_as_bw_dict
1114

1215

1316
def get_change_report_context(
@@ -59,9 +62,7 @@ def get_change_report_context(
5962
{excel_filepath.name}
6063
"""
6164
)
62-
logger.info(
63-
"Using change report annex file {filename}", filename=excel_filepath.name
64-
)
65+
logger.info("Using change report annex file {filename}", filename=excel_filepath.name)
6566

6667
setup_project(
6768
source_version=source_version,
def generate_technosphere_mapping(
    source_version: str,
    target_version: str,
    project_name: str = "ecoinvent-migration",
    system_model: str = "cutoff",
    ecoinvent_username: Optional[str] = None,
    ecoinvent_password: Optional[str] = None,
    write_logs: bool = True,
) -> list[dict]:
    """Build the technosphere mapping data from `source_version` to `target_version`.

    Reads the "qualitative changes" sheet (matched case-insensitively) of the
    change report annex, converts every row to ``{"source": ..., "target": ...}``
    pairs, corrects ``GLO``/``RER`` locations to ``RoW``/``RoE`` where the
    installed databases require it, and drops pairs whose source and target
    are identical.

    Args:
        source_version: ecoinvent version to migrate from, e.g. ``"3.9.1"``.
        target_version: ecoinvent version to migrate to, e.g. ``"3.10"``.
        project_name: Name of the project passed on to the change report setup.
        system_model: System model used to build the database names.
        ecoinvent_username: Optional ecoinvent account name.
        ecoinvent_password: Optional ecoinvent account password.
        write_logs: If true, also write log files (see `configure_logs`).

    Returns:
        List of ``{"source": dict, "target": dict}`` mappings where source and
        target differ.

    Raises:
        ValueError: If the workbook has no, or more than one, sheet named
            "qualitative changes".
    """
    configure_logs(write_logs=write_logs)

    excel_filepath = get_change_report_context(
        source_version=source_version,
        target_version=target_version,
        project_name=project_name,
        ecoinvent_username=ecoinvent_username,
        ecoinvent_password=ecoinvent_password,
    )

    # Open the workbook once and make sure the file handle is released, even
    # when sheet detection fails.
    with pd.ExcelFile(excel_filepath) as workbook:
        sheet_names = workbook.sheet_names
        candidates = [name for name in sheet_names if name.lower() == "qualitative changes"]
        if not candidates:
            raise ValueError(
                "Can't find suitable sheet name in change report file. Looking for 'qualitative changes', found:\n\t{}".format(
                    "\n\t".join(sheet_names)
                )
            )
        elif len(candidates) > 1:
            raise ValueError(
                "Found multiple sheet names like 'qualitative changes' for change report file:\n\t{}".format(
                    "\n\t".join(sheet_names)
                )
            )
        rows = pd.read_excel(io=workbook, sheet_name=candidates[0]).to_dict(orient="records")

    data = [
        pair
        for row in rows
        for pair in source_target_pair_as_bw_dict(row, source_version, target_version)
    ]
    data = resolve_glo_row_rer_roe(
        data=data,
        source_version=source_version,
        target_version=target_version,
        system_model=system_model,
    )
    # A mapping from a dataset onto itself carries no information.
    return [ds for ds in data if ds["source"] != ds["target"]]

ecoinvent_migrate/utils.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import datetime
2+
from loguru import logger
3+
import sys
4+
from pathlib import Path
5+
6+
from platformdirs import user_log_dir
7+
8+
9+
def configure_logs(write_logs: bool = True) -> None:
    """Reset the loguru handlers and install the ones used by this package.

    Existing handlers are removed and an INFO-level handler on ``sys.stderr``
    is added. When `write_logs` is true, a timestamped directory is created
    under the user log directory and two file handlers are added to it:
    ``debug.log`` (DEBUG level) and ``info.log`` (INFO level).

    Args:
        write_logs: Whether to additionally write log files to disk.
    """
    logger.remove()
    logger.add(sys.stderr, level="INFO")
    if not write_logs:
        return

    base_dir = Path(user_log_dir("ecoinvent_migrate", "pylca"))
    # Second-resolution timestamp; colons are replaced to keep the directory name portable.
    run_stamp = datetime.datetime.now().isoformat()[:19].replace(":", "-")
    run_dir = base_dir / run_stamp

    logger.info("Writing logs to {path}", path=run_dir)
    run_dir.mkdir(parents=True, exist_ok=True)
    for filename, level in (("debug.log", "DEBUG"), ("info.log", "INFO")):
        logger.add(run_dir / filename, level=level)

ecoinvent_migrate/wrangling.py

+108-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
11
import itertools
2+
import math
3+
from numbers import Number
4+
from typing import List
25

3-
from .errors import Mismatch, Uncombinable
6+
import bw2data as bd
7+
from loguru import logger
8+
9+
from .errors import Mismatch, MissingDatabase, Uncombinable
410

511

612
def split_by_semicolon(row: dict, version: str) -> list[dict]:
713
"""Possible split a data row into"""
14+
if isinstance(row[f"Activity Name - {version}"], Number) and math.isnan(
15+
row[f"Activity Name - {version}"]
16+
):
17+
return []
18+
819
len_product = len(row[f"Reference Product - {version}"].split(";\n"))
920
len_unit = len(row[f"Reference Product Unit - {version}"].split(";\n"))
1021
if len_product != len_unit:
@@ -113,9 +124,7 @@ def source_target_pair_as_bw_dict(
113124
```
114125
115126
"""
116-
versions = [
117-
x.split(" - ")[-1].strip() for x in row if x.startswith("Activity Name")
118-
]
127+
versions = [x.split(" - ")[-1].strip() for x in row if x.startswith("Activity Name")]
119128
if f"Activity Name - {source_version}" not in row:
120129
raise ValueError(
121130
f"""Can't find source version {source_version} in data row.
@@ -128,6 +137,10 @@ def source_target_pair_as_bw_dict(
128137
)
129138

130139
sources = split_by_semicolon(row, source_version)
140+
if not sources:
141+
# New unit process dataset, no source objects
142+
return []
143+
131144
targets = split_by_semicolon(row, target_version)
132145
if len(sources) > 1 and len(targets) > 1 and len(sources) != len(targets):
133146
raise Uncombinable(
@@ -140,4 +153,94 @@ def source_target_pair_as_bw_dict(
140153
elif len(targets) == 1:
141154
targets = itertools.repeat(targets[0])
142155

143-
return [{"source": s, "target": t} for s, t in zip(sources, targets)]
156+
return [
157+
{"source": s, "target": t}
158+
for s, t in zip(sources, targets)
159+
if all(v.lower() != "nan" for v in itertools.chain(s.values(), t.values()))
160+
]
161+
162+
163+
def resolve_glo_row_rer_roe(
164+
data: List[dict], source_version: str, target_version: str, system_model: str
165+
) -> List[dict]:
166+
"""Iterate through `data`, and change `location` attribute to `RoW` or `RoE` when needed.
167+
168+
Looks in actual database to get correct `location` attributes."""
169+
source_db_name = f"ecoinvent-{source_version}-{system_model}"
170+
target_db_name = f"ecoinvent-{target_version}-{system_model}"
171+
if source_db_name not in bd.databases:
172+
raise MissingDatabase(f"Missing source database: {source_db_name}")
173+
if target_db_name not in bd.databases:
174+
raise MissingDatabase(f"Missing target database: {target_db_name}")
175+
176+
logger.info("Loading source database {db} to cache data attributes", db=source_db_name)
177+
source_lookup = {
178+
tuple([o[attr] for attr in ("name", "location", "reference product")])
179+
for o in bd.Database(source_db_name)
180+
}
181+
logger.info("Loading target database {db} to cache data attributes", db=target_db_name)
182+
target_lookup = {
183+
tuple([o[attr] for attr in ("name", "location", "reference product")])
184+
for o in bd.Database(target_db_name)
185+
}
186+
187+
for obj in data:
188+
source_missing = None
189+
for kind, lookup, db_name in [
190+
("source", source_lookup, source_db_name),
191+
("target", target_lookup, target_db_name),
192+
]:
193+
key = tuple([obj[kind][attr] for attr in ("name", "location", "reference product")])
194+
if key in lookup:
195+
continue
196+
elif (
197+
key not in lookup
198+
and obj[kind]["location"] == "GLO"
199+
and (key[0], "RoW", key[2]) in lookup
200+
):
201+
obj[kind]["location"] = "RoW"
202+
logger.debug(
203+
"{kind} process {name} location corrected to 'RoW'",
204+
kind=kind,
205+
name=obj[kind]['name'],
206+
)
207+
elif (
208+
key not in lookup
209+
and obj[kind]["location"] == "RER"
210+
and (key[0], "RoE", key[2]) in lookup
211+
):
212+
obj[kind]["location"] = "RoE"
213+
logger.debug(
214+
"{kind} process {name} location corrected to 'RoE'",
215+
kind=kind,
216+
name=obj[kind]['name'],
217+
)
218+
else:
219+
if kind == 'target' and source_missing:
220+
# Missing in both source and target for this system model
221+
source_missing = None
222+
continue
223+
elif kind == "source":
224+
source_missing = obj[kind]
225+
else:
226+
# Only missing in target database - but this is a big problem, we don't have a
227+
# suitable target for existing edges to relink to.
228+
logger.warning(
229+
"{kind.title()} process given in change report but missing in {db_name} lookup: {ds}",
230+
kind=kind,
231+
db_name=db_name,
232+
ds=obj[kind],
233+
)
234+
# raise KeyError(
235+
# f"""Can't find {kind} object in database {db_name}: {obj[kind]}"""
236+
# )
237+
if source_missing:
238+
# Only a debug message because this won't break anything - there is no process in the
239+
# source database to miss a link from.
240+
logger.debug(
241+
"Source process given in change report but missing in {db_name} lookup: {ds}",
242+
db_name=source_db_name,
243+
ds=source_missing,
244+
)
245+
246+
return data

pyproject.toml

+6-5
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,11 @@ classifiers = [
2929
]
3030
requires-python = ">=3.10"
3131
dependencies = [
32-
"ecoinvent_interface",
3332
"bw2data>=4.0.dev41",
3433
"bw2io>=0.9.dev27",
34+
"ecoinvent_interface",
3535
"loguru",
36+
"platformdirs",
3637
]
3738

3839
[project.urls]
@@ -47,7 +48,7 @@ testing = [
4748
"ecoinvent_migrate",
4849
"pytest",
4950
"pytest-cov",
50-
"python-coveralls"
51+
"python-coveralls",
5152
]
5253
dev = [
5354
"build",
@@ -78,7 +79,7 @@ testpaths = ["tests/*.py"]
7879

7980
[tool.flake8]
8081
# Some sane defaults for the code style checker flake8
81-
max_line_length = 88
82+
max_line_length = 100
8283
extend_ignore = ["E203", "W503"]
8384
# ^ Black-compatible
8485
# E203 and W503 have edge cases handled by black
@@ -91,11 +92,11 @@ exclude = [
9192
]
9293

9394
[tool.black]
94-
line-length = 88
95+
line-length = 100
9596

9697
[tool.isort]
9798
profile = "black"
98-
line_length = 88
99+
line_length = 100
99100
multi_line_output = 3
100101
include_trailing_comma = true
101102
force_grid_wrap = 0

tests/test_wrangling.py

+44
Original file line numberDiff line numberDiff line change
@@ -212,3 +212,47 @@ def test_source_target_pair_as_bw_dict_valueerror():
212212
source_target_pair_as_bw_dict(given, "3.8", "3.10")
213213
with pytest.raises(ValueError):
214214
source_target_pair_as_bw_dict(given, "3.9.1", "3.11")
215+
216+
217+
def test_source_target_pair_as_bw_dict_new_dataset():
218+
given = {
219+
"Activity Name - 3.9.1": float("NaN"),
220+
"Geography - 3.9.1": float("NaN"),
221+
"Reference Product - 3.9.1": float("NaN"),
222+
"Reference Product Unit - 3.9.1": float("NaN"),
223+
"Activity Name - 3.10": "baling",
224+
"Geography - 3.10": "GLO",
225+
"Reference Product - 3.10": "baling",
226+
"Reference Product Unit - 3.10": "unit",
227+
}
228+
assert source_target_pair_as_bw_dict(given, "3.9.1", "3.10") == []
229+
230+
231+
def test_source_target_pair_as_bw_dict_multiple_some_missing():
232+
given = {
233+
"Activity Name - 3.9.1": "p-nitrotoluene production",
234+
"Geography - 3.9.1": "GLO",
235+
"Reference Product - 3.9.1": "nan;\nnan;\np-nitrotoluene",
236+
"Reference Product Unit - 3.9.1": "nan;\nnan;\nkg",
237+
"Activity Name - 3.10": "nitrotoluenes production, toluene nitration",
238+
"Geography - 3.10": "GLO",
239+
"Reference Product - 3.10": "m-nitrotoluene;\no-nitrotoluene;\np-nitrotoluene",
240+
"Reference Product Unit - 3.10": "kg;\nkg;\nkg",
241+
}
242+
expected = [
243+
{
244+
"source": {
245+
"name": "p-nitrotoluene production",
246+
"location": "GLO",
247+
"reference product": "p-nitrotoluene",
248+
"unit": "kg",
249+
},
250+
"target": {
251+
"name": "nitrotoluenes production, toluene nitration",
252+
"location": "GLO",
253+
"reference product": "p-nitrotoluene",
254+
"unit": "kg",
255+
},
256+
},
257+
]
258+
assert source_target_pair_as_bw_dict(given, "3.9.1", "3.10") == expected

0 commit comments

Comments
 (0)