Skip to content

Commit aa41dce

Browse files
committed
Add geometry encoding and decoding functions.
These differ from `shapely_to_cf` and `cf_to_shapely` by returning all variables. Those function, simply encode and decode geometry-related variables.
1 parent 52160a5 commit aa41dce

File tree

6 files changed

+266
-28
lines changed

6 files changed

+266
-28
lines changed

cf_xarray/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,6 @@
99
from .options import set_options # noqa
1010
from .utils import _get_version
1111

12+
from . import geometry # noqa
13+
1214
__version__ = _get_version()

cf_xarray/geometry.py

+190-11
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,188 @@
11
from __future__ import annotations
22

3+
import copy
34
from collections.abc import Sequence
45

56
import numpy as np
67
import pandas as pd
78
import xarray as xr
89

10+
GEOMETRY_CONTAINER_NAME = "geometry_container"
11+
12+
__all__ = [
13+
"decode_geometries",
14+
"encode_geometries",
15+
"cf_to_shapely",
16+
"shapely_to_cf",
17+
]
18+
19+
20+
def decode_geometries(encoded: xr.Dataset) -> xr.Dataset:
21+
"""
22+
Decode CF encoded geometries to a numpy object array containing shapely geometries.
23+
24+
Parameters
25+
----------
26+
encoded : Dataset
27+
A Xarray Dataset containing encoded geometries.
28+
29+
Returns
30+
-------
31+
Dataset
32+
A Xarray Dataset containing decoded geometries.
33+
34+
See Also
35+
--------
36+
shapely_to_cf
37+
cf_to_shapely
38+
encode_geometries
39+
"""
40+
if GEOMETRY_CONTAINER_NAME not in encoded._variables:
41+
raise NotImplementedError(
42+
f"Currently only a single geometry variable named {GEOMETRY_CONTAINER_NAME!r} is supported."
43+
"A variable by this name is not present in the provided dataset."
44+
)
45+
46+
enc_geom_var = encoded[GEOMETRY_CONTAINER_NAME]
47+
geom_attrs = enc_geom_var.attrs
48+
# Grab the coordinates attribute
49+
geom_attrs.update(enc_geom_var.encoding)
50+
51+
geom_var = cf_to_shapely(encoded).variable
52+
53+
todrop = (GEOMETRY_CONTAINER_NAME,) + tuple(
54+
s
55+
for s in " ".join(
56+
geom_attrs.get(attr, "")
57+
for attr in [
58+
"interior_ring",
59+
"node_coordinates",
60+
"node_count",
61+
"part_node_count",
62+
"coordinates",
63+
]
64+
).split(" ")
65+
if s
66+
)
67+
decoded = encoded.drop_vars(todrop)
68+
69+
name = geom_attrs.get("variable_name", None)
70+
if name in decoded.dims:
71+
decoded = decoded.assign_coords(
72+
xr.Coordinates(coords={name: geom_var}, indexes={})
73+
)
74+
else:
75+
decoded[name] = geom_var
76+
77+
# Is this a good idea? We are deleting information.
78+
for var in decoded._variables.values():
79+
if var.attrs.get("geometry") == GEOMETRY_CONTAINER_NAME:
80+
var.attrs.pop("geometry")
81+
return decoded
82+
83+
84+
def encode_geometries(ds: xr.Dataset):
85+
"""
86+
Encode any discovered geometry variables using the CF conventions.
87+
88+
Practically speaking, geometry variables are numpy object arrays where the first
89+
element is a shapely geometry.
90+
91+
.. warning::
92+
93+
Only a single geometry variable is supported at present. Contributions to fix this
94+
are welcome.
95+
96+
Parameters
97+
----------
98+
ds : Dataset
99+
Dataset containing at least one geometry variable.
100+
101+
Returns
102+
-------
103+
Dataset
104+
Where all geometry variables are encoded.
105+
106+
See Also
107+
--------
108+
shapely_to_cf
109+
cf_to_shapely
110+
"""
111+
from shapely import (
112+
LineString,
113+
MultiLineString,
114+
MultiPoint,
115+
MultiPolygon,
116+
Point,
117+
Polygon,
118+
)
119+
120+
SHAPELY_TYPES = (
121+
Point,
122+
LineString,
123+
Polygon,
124+
MultiPoint,
125+
MultiLineString,
126+
MultiPolygon,
127+
)
128+
129+
geom_var_names = [
130+
name
131+
for name, var in ds._variables.items()
132+
if var.dtype == "O" and isinstance(var.data.flat[0], SHAPELY_TYPES)
133+
]
134+
if not geom_var_names:
135+
return ds
136+
137+
if to_drop := set(geom_var_names) & set(ds._indexes):
138+
# e.g. xvec GeometryIndex
139+
ds = ds.drop_indexes(to_drop)
140+
141+
if len(geom_var_names) > 1:
142+
raise NotImplementedError(
143+
"Multiple geometry variables are not supported at this time. "
144+
"Contributions to fix this are welcome. "
145+
f"Detected geometry variables are {geom_var_names!r}"
146+
)
147+
148+
(name,) = geom_var_names
149+
variables = {}
150+
# If `name` is a dimension name, then we need to drop it. Otherwise we don't
151+
# So set errors="ignore"
152+
variables.update(
153+
shapely_to_cf(ds[name]).drop_vars(name, errors="ignore")._variables
154+
)
155+
156+
geom_var = ds[name]
157+
158+
more_updates = {}
159+
for varname, var in ds._variables.items():
160+
if varname == name:
161+
continue
162+
if name in var.dims:
163+
var = var.copy()
164+
var._attrs = copy.deepcopy(var._attrs)
165+
var.attrs["geometry"] = GEOMETRY_CONTAINER_NAME
166+
# The grid_mapping and coordinates attributes can be carried by the geometry container
167+
# variable provided they are also carried by the data variables associated with the container.
168+
if to_add := geom_var.attrs.get("coordinates", ""):
169+
var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add
170+
more_updates[varname] = var
171+
variables.update(more_updates)
172+
173+
# WARNING: cf-xarray specific convention.
174+
# For vector data cubes, `name` is a dimension name.
175+
# By encoding to CF, we have
176+
# encoded the information in that variable across many different
177+
# variables (e.g. node_count) with `name` as a dimension.
178+
# We have to record `name` somewhere so that we reconstruct
179+
# a geometry variable of the right name at decode-time.
180+
variables[GEOMETRY_CONTAINER_NAME].attrs["variable_name"] = name
181+
182+
encoded = xr.Dataset(variables)
183+
184+
return encoded
185+
9186

10187
def reshape_unique_geometries(
11188
ds: xr.Dataset,
@@ -119,13 +296,15 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None
119296
f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}"
120297
)
121298

299+
ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="crd_x crd_y")
300+
122301
# Special treatment of selected grid mappings
123302
if grid_mapping == "longitude_latitude":
124303
# Special case for longitude_latitude grid mapping
125304
ds = ds.rename(crd_x="lon", crd_y="lat")
126305
ds.lon.attrs.update(units="degrees_east", standard_name="longitude")
127306
ds.lat.attrs.update(units="degrees_north", standard_name="latitude")
128-
ds.geometry_container.attrs.update(coordinates="lon lat")
307+
ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="lon lat")
129308
ds.x.attrs.update(units="degrees_east", standard_name="longitude")
130309
ds.y.attrs.update(units="degrees_north", standard_name="latitude")
131310
elif grid_mapping is not None:
@@ -157,7 +336,7 @@ def cf_to_shapely(ds: xr.Dataset):
157336
----------
158337
Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries
159338
"""
160-
geom_type = ds.geometry_container.attrs["geometry_type"]
339+
geom_type = ds[GEOMETRY_CONTAINER_NAME].attrs["geometry_type"]
161340
if geom_type == "point":
162341
geometries = cf_to_points(ds)
163342
elif geom_type == "line":
@@ -235,7 +414,7 @@ def points_to_cf(pts: xr.DataArray | Sequence):
235414
# Special case when we have no MultiPoints
236415
if (ds.node_count == 1).all():
237416
ds = ds.drop_vars("node_count")
238-
del ds.geometry_container.attrs["node_count"]
417+
del ds[GEOMETRY_CONTAINER_NAME].attrs["node_count"]
239418
return ds
240419

241420

@@ -259,18 +438,18 @@ def cf_to_points(ds: xr.Dataset):
259438
from shapely.geometry import MultiPoint, Point
260439

261440
# Shorthand for convenience
262-
geo = ds.geometry_container.attrs
441+
geo = ds[GEOMETRY_CONTAINER_NAME].attrs
263442

264443
# The features dimension name, defaults to the one of 'node_count' or the dimension of the coordinates, if present.
265444
feat_dim = None
266445
if "coordinates" in geo and feat_dim is None:
267446
xcoord_name, _ = geo["coordinates"].split(" ")
268447
(feat_dim,) = ds[xcoord_name].dims
269448

270-
x_name, y_name = ds.geometry_container.attrs["node_coordinates"].split(" ")
449+
x_name, y_name = ds[GEOMETRY_CONTAINER_NAME].attrs["node_coordinates"].split(" ")
271450
xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1)
272451

273-
node_count_name = ds.geometry_container.attrs.get("node_count")
452+
node_count_name = ds[GEOMETRY_CONTAINER_NAME].attrs.get("node_count")
274453
if node_count_name is None:
275454
# No node_count means all geometries are single points (node_count = 1)
276455
# And if we had no coordinates, then the dimension defaults to "features"
@@ -363,7 +542,7 @@ def lines_to_cf(lines: xr.DataArray | Sequence):
363542
# Special case when we have no MultiLines
364543
if len(ds.part_node_count) == len(ds.node_count):
365544
ds = ds.drop_vars("part_node_count")
366-
del ds.geometry_container.attrs["part_node_count"]
545+
del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"]
367546
return ds
368547

369548

@@ -387,7 +566,7 @@ def cf_to_lines(ds: xr.Dataset):
387566
from shapely import GeometryType, from_ragged_array
388567

389568
# Shorthand for convenience
390-
geo = ds.geometry_container.attrs
569+
geo = ds[GEOMETRY_CONTAINER_NAME].attrs
391570

392571
# The features dimension name, defaults to the one of 'node_count'
393572
# or the dimension of the coordinates, if present.
@@ -503,12 +682,12 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence):
503682
# Special case when we have no MultiPolygons and no holes
504683
if len(ds.part_node_count) == len(ds.node_count):
505684
ds = ds.drop_vars("part_node_count")
506-
del ds.geometry_container.attrs["part_node_count"]
685+
del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"]
507686

508687
# Special case when we have no holes
509688
if (ds.interior_ring == 0).all():
510689
ds = ds.drop_vars("interior_ring")
511-
del ds.geometry_container.attrs["interior_ring"]
690+
del ds[GEOMETRY_CONTAINER_NAME].attrs["interior_ring"]
512691
return ds
513692

514693

@@ -532,7 +711,7 @@ def cf_to_polygons(ds: xr.Dataset):
532711
from shapely import GeometryType, from_ragged_array
533712

534713
# Shorthand for convenience
535-
geo = ds.geometry_container.attrs
714+
geo = ds[GEOMETRY_CONTAINER_NAME].attrs
536715

537716
# The features dimension name, defaults to the one of 'part_node_count'
538717
# or the dimension of the coordinates, if present.

cf_xarray/tests/test_geometry.py

+32-11
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,23 @@
44

55
import cf_xarray as cfxr
66

7+
from ..geometry import decode_geometries, encode_geometries
78
from . import requires_shapely
89

910

11+
@pytest.fixture
12+
def polygon_geometry() -> xr.DataArray:
13+
from shapely.geometry import Polygon
14+
15+
# empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries.
16+
geoms = np.empty(2, dtype=object)
17+
geoms[:] = [
18+
Polygon(([50, 0], [40, 15], [30, 0])),
19+
Polygon(([70, 50], [60, 65], [50, 50])),
20+
]
21+
return xr.DataArray(geoms, dims=("index",), name="geometry")
22+
23+
1024
@pytest.fixture
1125
def geometry_ds():
1226
from shapely.geometry import MultiPoint, Point
@@ -127,18 +141,9 @@ def geometry_line_without_multilines_ds():
127141

128142

129143
@pytest.fixture
130-
def geometry_polygon_without_holes_ds():
131-
from shapely.geometry import Polygon
132-
133-
# empty/fill workaround to avoid numpy deprecation(warning) due to the array interface of shapely geometries.
134-
geoms = np.empty(2, dtype=object)
135-
geoms[:] = [
136-
Polygon(([50, 0], [40, 15], [30, 0])),
137-
Polygon(([70, 50], [60, 65], [50, 50])),
138-
]
139-
144+
def geometry_polygon_without_holes_ds(polygon_geometry):
145+
shp_da = polygon_geometry
140146
ds = xr.Dataset()
141-
shp_da = xr.DataArray(geoms, dims=("index",), name="geometry")
142147

143148
cf_ds = ds.assign(
144149
x=xr.DataArray(
@@ -521,3 +526,19 @@ def test_reshape_unique_geometries(geometry_ds):
521526
in_ds = in_ds.assign(geometry=geoms)
522527
with pytest.raises(ValueError, match="The geometry variable must be 1D"):
523528
cfxr.geometry.reshape_unique_geometries(in_ds)
529+
530+
531+
@requires_shapely
532+
def test_encode_decode(geometry_ds, polygon_geometry):
533+
534+
geom_dim_ds = xr.Dataset()
535+
geom_dim_ds = geom_dim_ds.assign_coords(
536+
xr.Coordinates(
537+
coords={"geoms": xr.Variable("geoms", polygon_geometry.variable)},
538+
indexes={},
539+
)
540+
).assign({"foo": ("geoms", [1, 2])})
541+
542+
for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds):
543+
roundtripped = decode_geometries(encode_geometries(ds))
544+
xr.testing.assert_identical(ds, roundtripped)

doc/api.rst

+10
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ Top-level API
1717
encode_multi_index_as_compress
1818
decode_compress_to_multi_index
1919

20+
Geometries
21+
----------
22+
.. autosummary::
23+
:toctree: generated/
24+
25+
geometry.decode_geometries
26+
geometry.encode_geometries
27+
geometry.shapely_to_cf
28+
geometry.cf_to_shapely
29+
2030
.. currentmodule:: xarray
2131

2232
DataArray

doc/coding.md

+4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ xr.set_options(display_expand_data=False)
2626

2727
`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray.
2828

29+
## Geometries
30+
31+
See [](./geometry.md) for more.
32+
2933
## Compression by gathering
3034

3135
The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering)

0 commit comments

Comments
 (0)