From 608e47966e7b134580faa65788fc3db66c1fd192 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 09:40:53 -0600 Subject: [PATCH 01/12] Allow encoding/decoding multiple geometries --- cf_xarray/geometry.py | 279 +++++++++++++++++++++++++++--------------- 1 file changed, 180 insertions(+), 99 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index febac8bc..9a78f446 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -1,7 +1,8 @@ from __future__ import annotations import copy -from collections.abc import Sequence +from collections import ChainMap +from collections.abc import Hashable, Sequence import numpy as np import pandas as pd @@ -30,6 +31,46 @@ # 1. node coordinates are exact; the 'normal' coordinates are a reasonable value to use, if you do not know how to interpret the nodes. +def _assert_single_geometry_container(ds: xr.Dataset) -> str: + container_names = _get_geometry_containers(ds) + if len(container_names) > 1: + raise ValueError( + "Only one geometry container is supported by cf_to_points. " + "To handle multiple geometries use `decode_geometries` instead." + ) + (container_name,) = container_names + return container_name + + +def _get_geometry_containers(obj: xr.DataArray | xr.Dataset) -> list[Hashable]: + """ + Translate from key (either CF key or variable name) to its bounds' variable names. + + This function interprets the ``geometry`` attribute on DataArrays. + + Parameters + ---------- + obj : DataArray, Dataset + DataArray belonging to the coordinate to be checked + + Returns + ------- + List[str] + Variable name(s) in parent xarray object that are bounds of `key` + """ + + if isinstance(obj, xr.DataArray): + obj = obj._to_temp_dataset() + variables = obj._variables + + results = set() + for name, var in variables.items(): + attrs_or_encoding = ChainMap(var.attrs, var.encoding) + if "geometry_type" in attrs_or_encoding: + results.update([name]) + return list(results) + + def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: """ Decode CF encoded geometries to a numpy object array containing shapely geometries. @@ -50,46 +91,56 @@ def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: cf_to_shapely encode_geometries """ - if GEOMETRY_CONTAINER_NAME not in encoded._variables: + + containers = _get_geometry_containers(encoded) + if not containers: raise NotImplementedError( - f"Currently only a single geometry variable named {GEOMETRY_CONTAINER_NAME!r} is supported." - "A variable by this name is not present in the provided dataset." + "No geometry container variables detected, none of the provided variables " + "have a `geometry_type` attribute." ) - enc_geom_var = encoded[GEOMETRY_CONTAINER_NAME] - geom_attrs = enc_geom_var.attrs - # Grab the coordinates attribute - geom_attrs.update(enc_geom_var.encoding) - - geom_var = cf_to_shapely(encoded).variable - - todrop = (GEOMETRY_CONTAINER_NAME,) + tuple( - s - for s in " ".join( - geom_attrs.get(attr, "") - for attr in [ - "interior_ring", - "node_coordinates", - "node_count", - "part_node_count", - "coordinates", - ] - ).split(" ") - if s - ) - decoded = encoded.drop_vars(todrop) - - name = geom_attrs.get("variable_name", None) - if name in decoded.dims: - decoded = decoded.assign_coords( - xr.Coordinates(coords={name: geom_var}, indexes={}) + todrop = [] + decoded = xr.Dataset() + for container_name in containers: + enc_geom_var = encoded[container_name] + geom_attrs = enc_geom_var.attrs + # Grab the coordinates attribute + geom_attrs.update(enc_geom_var.encoding) + + geom_var = cf_to_shapely(encoded, container=container_name).variable + + todrop.extend( + (container_name,) + + tuple( + s + for s in " ".join( + geom_attrs.get(attr, "") + for attr in [ + "interior_ring", + "node_coordinates", + "node_count", + "part_node_count", + "coordinates", + ] + ).split(" ") + if s + ) ) - else: - decoded[name] = geom_var + + name = geom_attrs.get("variable_name", None) + if name in encoded.dims: + decoded = decoded.assign_coords( + xr.Coordinates(coords={name: geom_var}, indexes={}) + ) + else: + decoded[name] = geom_var + + decoded.update(encoded.drop_vars(todrop)) # Is this a good idea? We are deleting information. + # OTOH we have decoded it to a useful in-memory representation for var in decoded._variables.values(): - if var.attrs.get("geometry") == GEOMETRY_CONTAINER_NAME: + if var.attrs.get("geometry") in containers: var.attrs.pop("geometry") return decoded @@ -158,43 +209,47 @@ def encode_geometries(ds: xr.Dataset): f"Detected geometry variables are {geom_var_names!r}" ) - (name,) = geom_var_names variables = {} - # If `name` is a dimension name, then we need to drop it. Otherwise we don't - # So set errors="ignore" - variables.update( - shapely_to_cf(ds[name]).drop_vars(name, errors="ignore")._variables - ) + for name in geom_var_names: + container_name = GEOMETRY_CONTAINER_NAME + "_" + name + # If `name` is a dimension name, then we need to drop it. Otherwise we don't + # So set errors="ignore" + variables.update( + shapely_to_cf(ds[name], container_name=container_name) + .drop_vars(name, errors="ignore") + ._variables + ) - geom_var = ds[name] - - more_updates = {} - for varname, var in ds._variables.items(): - if varname == name: - continue - # TODO: this is incomplete. It works for vector data cubes where one of the geometry vars - # is a dimension coordinate. - if name in var.dims: - var = var.copy() - var._attrs = copy.deepcopy(var._attrs) - var.attrs["geometry"] = GEOMETRY_CONTAINER_NAME - # The grid_mapping and coordinates attributes can be carried by the geometry container - # variable provided they are also carried by the data variables associated with the container. - if to_add := geom_var.attrs.get("coordinates", ""): - var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add - more_updates[varname] = var - variables.update(more_updates) - - # WARNING: cf-xarray specific convention. - # For vector data cubes, `name` is a dimension name. - # By encoding to CF, we have - # encoded the information in that variable across many different - # variables (e.g. node_count) with `name` as a dimension. - # We have to record `name` somewhere so that we reconstruct - # a geometry variable of the right name at decode-time. - variables[GEOMETRY_CONTAINER_NAME].attrs["variable_name"] = name - - encoded = xr.Dataset(variables) + geom_var = ds[name] + more_updates = {} + for varname, var in ds._variables.items(): + if varname == name: + continue + # TODO: this is incomplete. It works for vector data cubes where one of the geometry vars + # is a dimension coordinate. + if name in var.dims: + var = var.copy() + var._attrs = copy.deepcopy(var._attrs) + var.attrs["geometry"] = container_name + # The grid_mapping and coordinates attributes can be carried by the geometry container + # variable provided they are also carried by the data variables associated with the container. + if to_add := geom_var.attrs.get("coordinates", ""): + var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add + more_updates[varname] = var + variables.update(more_updates) + + # WARNING: cf-xarray specific convention. + # For vector data cubes, `name` is a dimension name. + # By encoding to CF, we have + # encoded the information in that variable across many different + # variables (e.g. node_count) with `name` as a dimension. + # We have to record `name` somewhere so that we reconstruct + # a geometry variable of the right name at decode-time. + variables[container_name].attrs["variable_name"] = name + + encoded = xr.Dataset(variables).set_coords( + set(ds._coord_names) - set(geom_var_names) + ) return encoded @@ -263,7 +318,11 @@ def reshape_unique_geometries( return out -def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None = None): +def shapely_to_cf( + geometries: xr.DataArray | Sequence, + grid_mapping: str | None = None, + container_name: str = GEOMETRY_CONTAINER_NAME, +): """ Convert a DataArray with shapely geometry objects into a CF-compliant dataset. @@ -277,8 +336,8 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None A CF grid mapping name. When given, coordinates and attributes are named and set accordingly. Defaults to None, in which case the coordinates are simply names "crd_x" and "crd_y". - .. warning:: - Only the `longitude_latitude` grid mapping is currently implemented. + container_name: str, optional + Name for the "geometry container" scalar variable in the encoded Dataset Returns ------- @@ -289,7 +348,7 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None - 'node_count' : The number of nodes per feature. Always present for Lines and Polygons. For Points: only present if there are multipart geometries. - 'part_node_count' : The number of nodes per individual geometry. Only for Lines with multipart geometries and for Polygons with multipart geometries or holes. - 'interior_ring' : Integer boolean indicating whether rings are interior or exterior. Only for Polygons with holes. - - 'geometry_container' : Empty variable with attributes describing the geometry type. + - container_name : Empty variable with attributes describing the geometry type. References ---------- @@ -309,17 +368,17 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None for geom in geometries } if types.issubset({"Point", "MultiPoint"}): - ds = points_to_cf(geometries) + ds = points_to_cf(geometries, container_name=container_name) elif types.issubset({"LineString", "MultiLineString"}): - ds = lines_to_cf(geometries) + ds = lines_to_cf(geometries, container_name=container_name) elif types.issubset({"Polygon", "MultiPolygon"}): - ds = polygons_to_cf(geometries) + ds = polygons_to_cf(geometries, container_name=container_name) else: raise ValueError( f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" ) - ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="crd_x crd_y") + ds[container_name].attrs.update(coordinates="crd_x crd_y") if ( grid_mapping is None @@ -347,7 +406,7 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None ds.x.attrs.update(units="degrees", standard_name="grid_longitude") ds.lat.attrs.update(units="degrees", standard_name="grid_latitude") ds.y.attrs.update(units="degrees", standard_name="grid_latitude") - ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="lon lat") + ds[container_name].attrs.update(coordinates="lon lat") elif grid_mapping is not None: ds.crd_x.attrs.update(standard_name="projection_x_coordinate") ds.x.attrs.update(standard_name="projection_x_coordinate") @@ -357,7 +416,7 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None return ds -def cf_to_shapely(ds: xr.Dataset): +def cf_to_shapely(ds: xr.Dataset, *, container: str = GEOMETRY_CONTAINER_NAME): """ Convert geometries stored in a CF-compliant way to shapely objects stored in a single variable. @@ -378,13 +437,24 @@ def cf_to_shapely(ds: xr.Dataset): ---------- Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries """ - geom_type = ds[GEOMETRY_CONTAINER_NAME].attrs["geometry_type"] + if container not in ds._variables: + raise ValueError( + f"{container!r} is not the name of a variable in the provided Dataset." + ) + if not (geom_type := ds[container].attrs.get("geometry_type", None)): + raise ValueError( + f"{container!r} is not the name of a valid geometry variable. " + "It does not have a `geometry_type` attribute." + ) + + # Extract all necessary geometry variables + subds = ds.cf[[container]] if geom_type == "point": - geometries = cf_to_points(ds) + geometries = cf_to_points(subds) elif geom_type == "line": - geometries = cf_to_lines(ds) + geometries = cf_to_lines(subds) elif geom_type == "polygon": - geometries = cf_to_polygons(ds) + geometries = cf_to_polygons(subds) else: raise ValueError( f"Valid CF geometry types are 'point', 'line' and 'polygon'. Got {geom_type}" @@ -393,7 +463,9 @@ def cf_to_shapely(ds: xr.Dataset): return geometries.rename("geometry") -def points_to_cf(pts: xr.DataArray | Sequence): +def points_to_cf( + pts: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME +): """Get a list of points (shapely.geometry.[Multi]Point) and return a CF-compliant geometry dataset. Parameters @@ -433,7 +505,7 @@ def points_to_cf(pts: xr.DataArray | Sequence): ds = xr.Dataset( data_vars={ "node_count": xr.DataArray(node_count, dims=(dim,)), - "geometry_container": xr.DataArray( + container_name: xr.DataArray( attrs={ "geometry_type": "point", "node_count": "node_count", @@ -456,7 +528,7 @@ def points_to_cf(pts: xr.DataArray | Sequence): # Special case when we have no MultiPoints if (ds.node_count == 1).all(): ds = ds.drop_vars("node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["node_count"] + del ds[container_name].attrs["node_count"] return ds @@ -467,7 +539,7 @@ def cf_to_points(ds: xr.Dataset): ---------- ds : xr.Dataset A dataset with CF-compliant point geometries. - Must have a "geometry_container" variable with at least a 'node_coordinates' attribute. + Must have a *single* "geometry container" variable with at least a 'node_coordinates' attribute. Must also have the two 1D variables listed by this attribute. Returns @@ -479,8 +551,9 @@ def cf_to_points(ds: xr.Dataset): """ from shapely.geometry import MultiPoint, Point + container_name = _assert_single_geometry_container(ds) # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'node_count' or the dimension of the coordinates, if present. feat_dim = None @@ -488,10 +561,10 @@ def cf_to_points(ds: xr.Dataset): xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims - x_name, y_name = ds[GEOMETRY_CONTAINER_NAME].attrs["node_coordinates"].split(" ") + x_name, y_name = ds[container_name].attrs["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) - node_count_name = ds[GEOMETRY_CONTAINER_NAME].attrs.get("node_count") + node_count_name = ds[container_name].attrs.get("node_count") if node_count_name is None: # No node_count means all geometries are single points (node_count = 1) # And if we had no coordinates, then the dimension defaults to "features" @@ -515,7 +588,9 @@ def cf_to_points(ds: xr.Dataset): return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) -def lines_to_cf(lines: xr.DataArray | Sequence): +def lines_to_cf( + lines: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME +): """Convert an iterable of lines (shapely.geometry.[Multi]Line) into a CF-compliant geometry dataset. Parameters @@ -560,7 +635,7 @@ def lines_to_cf(lines: xr.DataArray | Sequence): data_vars={ "node_count": xr.DataArray(node_count, dims=(dim,)), "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - "geometry_container": xr.DataArray( + container_name: xr.DataArray( attrs={ "geometry_type": "line", "node_count": "node_count", @@ -584,7 +659,7 @@ def lines_to_cf(lines: xr.DataArray | Sequence): # Special case when we have no MultiLines if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] + del ds[container_name].attrs["part_node_count"] return ds @@ -607,8 +682,10 @@ def cf_to_lines(ds: xr.Dataset): """ from shapely import GeometryType, from_ragged_array + container_name = _assert_single_geometry_container(ds) + # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'node_count' # or the dimension of the coordinates, if present. @@ -648,7 +725,9 @@ def cf_to_lines(ds: xr.Dataset): return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) -def polygons_to_cf(polygons: xr.DataArray | Sequence): +def polygons_to_cf( + polygons: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME +): """Convert an iterable of polygons (shapely.geometry.[Multi]Polygon) into a CF-compliant geometry dataset. Parameters @@ -699,7 +778,7 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): "node_count": xr.DataArray(node_count, dims=(dim,)), "interior_ring": xr.DataArray(interior_ring, dims=("part",)), "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - "geometry_container": xr.DataArray( + container_name: xr.DataArray( attrs={ "geometry_type": "polygon", "node_count": "node_count", @@ -724,12 +803,12 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): # Special case when we have no MultiPolygons and no holes if len(ds.part_node_count) == len(ds.node_count): ds = ds.drop_vars("part_node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] + del ds[container_name].attrs["part_node_count"] # Special case when we have no holes if (ds.interior_ring == 0).all(): ds = ds.drop_vars("interior_ring") - del ds[GEOMETRY_CONTAINER_NAME].attrs["interior_ring"] + del ds[container_name].attrs["interior_ring"] return ds @@ -752,8 +831,10 @@ def cf_to_polygons(ds: xr.Dataset): """ from shapely import GeometryType, from_ragged_array + container_name = _assert_single_geometry_container(ds) + # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'part_node_count' # or the dimension of the coordinates, if present. From 5c327a5aeb18c139a70d7ff15cafe3967c0cc71f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 12:04:09 -0600 Subject: [PATCH 02/12] Generalize naming --- cf_xarray/geometry.py | 253 +++++++++++++++++++++++------------------- 1 file changed, 138 insertions(+), 115 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 9a78f446..adfc38f6 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -3,6 +3,7 @@ import copy from collections import ChainMap from collections.abc import Hashable, Sequence +from dataclasses import dataclass import numpy as np import pandas as pd @@ -31,6 +32,70 @@ # 1. node coordinates are exact; the 'normal' coordinates are a reasonable value to use, if you do not know how to interpret the nodes. +@dataclass +class GeometryNames: + def __init__(self, suffix: str = "", grid_mapping: str | None = None): + self.container_name: str = GEOMETRY_CONTAINER_NAME + suffix + self.node_dim: str = "node" + suffix + self.node_count: str = "node_count" + suffix + self.node_coordinates_x: str = "x" + suffix + self.node_coordinates_y: str = "y" + suffix + self.coordinates_x: str = "crd_x" + suffix + self.coordinates_y: str = "crd_y" + suffix + self.part_node_count: str = "part_node_count" + suffix + self.part_dim: str = "part" + suffix + self.interior_ring: str = "interior_ring" + suffix + self.attrs_x: dict[str, str] = {} + self.attrs_y: dict[str, str] = {} + + # Special treatment of selected grid mappings + if grid_mapping in ["latitude_longitude", "rotated_latitude_longitude"]: + # Special case for longitude_latitude type grid mappings + self.coordinates_x = "lon" + self.coordinates_y = "lat" + if grid_mapping == "latitude_longitude": + self.attrs_x = dict(units="degrees_east", standard_name="longitude") + self.attrs_y = dict(units="degrees_north", standard_name="latitude") + elif grid_mapping == "rotated_latitude_longitude": + self.attrs_x = dict( + units="degrees_east", standard_name="grid_longitude" + ) + self.attrs_y = dict( + units="degrees_north", standard_name="grid_latitude" + ) + elif grid_mapping is not None: + self.attrs_x = dict(standard_name="projection_x_coordinate") + self.attrs_y = dict(standard_name="projection_y_coordinate") + + @property + def geometry_container_attrs(self) -> dict[str, str]: + return { + "node_count": self.node_count, + "node_coordinates": f"{self.node_coordinates_x} {self.node_coordinates_y}", + "coordinates": f"{self.coordinates_x} {self.coordinates_y}", + } + + def coords(self, *, dim: str, x, y, crdX, crdY) -> dict[str, xr.DataArray]: + return { + self.node_coordinates_x: xr.DataArray( + x, dims=self.node_dim, attrs={"axis": "X", **self.attrs_x} + ), + self.node_coordinates_y: xr.DataArray( + y, dims=self.node_dim, attrs={"axis": "Y", **self.attrs_y} + ), + self.coordinates_x: xr.DataArray( + crdX, + dims=(dim,), + attrs={"nodes": self.node_coordinates_x, **self.attrs_x}, + ), + self.coordinates_y: xr.DataArray( + crdY, + dims=(dim,), + attrs={"nodes": self.node_coordinates_y, **self.attrs_y}, + ), + } + + def _assert_single_geometry_container(ds: xr.Dataset) -> str: container_names = _get_geometry_containers(ds) if len(container_names) > 1: @@ -202,20 +267,13 @@ def encode_geometries(ds: xr.Dataset): # e.g. xvec GeometryIndex ds = ds.drop_indexes(to_drop) - if len(geom_var_names) > 1: - raise NotImplementedError( - "Multiple geometry variables are not supported at this time. " - "Contributions to fix this are welcome. " - f"Detected geometry variables are {geom_var_names!r}" - ) - variables = {} for name in geom_var_names: container_name = GEOMETRY_CONTAINER_NAME + "_" + name # If `name` is a dimension name, then we need to drop it. Otherwise we don't # So set errors="ignore" variables.update( - shapely_to_cf(ds[name], container_name=container_name) + shapely_to_cf(ds[name], suffix="_" + name) .drop_vars(name, errors="ignore") ._variables ) @@ -321,7 +379,8 @@ def reshape_unique_geometries( def shapely_to_cf( geometries: xr.DataArray | Sequence, grid_mapping: str | None = None, - container_name: str = GEOMETRY_CONTAINER_NAME, + *, + suffix: str = "", ): """ Convert a DataArray with shapely geometry objects into a CF-compliant dataset. @@ -367,19 +426,6 @@ def shapely_to_cf( geom.item().geom_type if isinstance(geom, xr.DataArray) else geom.geom_type for geom in geometries } - if types.issubset({"Point", "MultiPoint"}): - ds = points_to_cf(geometries, container_name=container_name) - elif types.issubset({"LineString", "MultiLineString"}): - ds = lines_to_cf(geometries, container_name=container_name) - elif types.issubset({"Polygon", "MultiPolygon"}): - ds = polygons_to_cf(geometries, container_name=container_name) - else: - raise ValueError( - f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" - ) - - ds[container_name].attrs.update(coordinates="crd_x crd_y") - if ( grid_mapping is None and isinstance(geometries, xr.DataArray) @@ -389,29 +435,22 @@ def shapely_to_cf( grid_mapping = geometries.coords[grid_mapping_varname].attrs[ "grid_mapping_name" ] - for name_ in ["x", "y", "crd_x", "crd_y"]: - ds[name_].attrs["grid_mapping"] = grid_mapping_varname - - # Special treatment of selected grid mappings - if grid_mapping in ["latitude_longitude", "rotated_latitude_longitude"]: - # Special case for longitude_latitude type grid mappings - ds = ds.rename(crd_x="lon", crd_y="lat") - if grid_mapping == "latitude_longitude": - ds.lon.attrs.update(units="degrees_east", standard_name="longitude") - ds.x.attrs.update(units="degrees_east", standard_name="longitude") - ds.lat.attrs.update(units="degrees_north", standard_name="latitude") - ds.y.attrs.update(units="degrees_north", standard_name="latitude") - elif grid_mapping == "rotated_latitude_longitude": - ds.lon.attrs.update(units="degrees", standard_name="grid_longitude") - ds.x.attrs.update(units="degrees", standard_name="grid_longitude") - ds.lat.attrs.update(units="degrees", standard_name="grid_latitude") - ds.y.attrs.update(units="degrees", standard_name="grid_latitude") - ds[container_name].attrs.update(coordinates="lon lat") - elif grid_mapping is not None: - ds.crd_x.attrs.update(standard_name="projection_x_coordinate") - ds.x.attrs.update(standard_name="projection_x_coordinate") - ds.crd_y.attrs.update(standard_name="projection_y_coordinate") - ds.y.attrs.update(standard_name="projection_y_coordinate") + + names = GeometryNames(suffix=suffix, grid_mapping=grid_mapping) + + if types.issubset({"Point", "MultiPoint"}): + ds = points_to_cf(geometries, names=names) + elif types.issubset({"LineString", "MultiLineString"}): + ds = lines_to_cf(geometries, names=names) + elif types.issubset({"Polygon", "MultiPolygon"}): + ds = polygons_to_cf(geometries, names=names) + else: + raise ValueError( + f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" + ) + + # for name_ in ["x", "y", "crd_x", "crd_y"]: + # ds[name_].attrs["grid_mapping"] = grid_mapping_varname return ds @@ -463,9 +502,7 @@ def cf_to_shapely(ds: xr.Dataset, *, container: str = GEOMETRY_CONTAINER_NAME): return geometries.rename("geometry") -def points_to_cf( - pts: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME -): +def points_to_cf(pts: xr.DataArray | Sequence, *, names: GeometryNames | None = None): """Get a list of points (shapely.geometry.[Multi]Point) and return a CF-compliant geometry dataset. Parameters @@ -482,6 +519,7 @@ def points_to_cf( from shapely.geometry import MultiPoint if isinstance(pts, xr.DataArray): + # TODO: Fix this hardcoding dim = pts.dims[0] coord = pts[dim] if dim in pts.coords else None pts_ = pts.values.tolist() @@ -502,33 +540,27 @@ def points_to_cf( crdX.append(xy[0, 0]) crdY.append(xy[0, 1]) + if names is None: + names = GeometryNames() + ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - container_name: xr.DataArray( - attrs={ - "geometry_type": "point", - "node_count": "node_count", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "point", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPoints - if (ds.node_count == 1).all(): - ds = ds.drop_vars("node_count") - del ds[container_name].attrs["node_count"] + if (ds[names.node_count] == 1).data.all(): + ds = ds.drop_vars(names.node_count) + del ds[names.container_name].attrs["node_count"] return ds @@ -585,12 +617,13 @@ def cf_to_points(ds: xr.Dataset): geoms[i] = MultiPoint(xy[j : j + n, :]) j += n - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + da = xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + if node_count_name: + del da[node_count_name] + return da -def lines_to_cf( - lines: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME -): +def lines_to_cf(lines: xr.DataArray | Sequence, *, names: GeometryNames | None = None): """Convert an iterable of lines (shapely.geometry.[Multi]Line) into a CF-compliant geometry dataset. Parameters @@ -615,6 +648,9 @@ def lines_to_cf( coord = None lines_ = np.array(lines) + if names is None: + names = GeometryNames() + _, arr, offsets = to_ragged_array(lines_) x = arr[:, 0] y = arr[:, 1] @@ -633,33 +669,23 @@ def lines_to_cf( ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - container_name: xr.DataArray( - attrs={ - "geometry_type": "line", - "node_count": "node_count", - "part_node_count": "part_node_count", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "line", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiLines - if len(ds.part_node_count) == len(ds.node_count): - ds = ds.drop_vars("part_node_count") - del ds[container_name].attrs["part_node_count"] + if len(part_node_count) != len(node_count): + ds[names.part_node_count] = xr.DataArray(part_node_count, dims=names.part_dim) + ds[names.container_name].attrs["part_node_count"] = names.part_node_count + return ds @@ -722,11 +748,13 @@ def cf_to_lines(ds: xr.Dataset): # get items from lines or multilines depending on number of parts geoms = np.where(np.diff(offset2) == 1, lines[offset2[:-1]], multilines) - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + return xr.DataArray( + geoms, dims=node_count.dims, coords=node_count.coords + ).drop_vars(node_count_name) def polygons_to_cf( - polygons: xr.DataArray | Sequence, *, container_name: str = GEOMETRY_CONTAINER_NAME + polygons: xr.DataArray | Sequence, *, names: GeometryNames | None = None ): """Convert an iterable of polygons (shapely.geometry.[Multi]Polygon) into a CF-compliant geometry dataset. @@ -735,6 +763,9 @@ def polygons_to_cf( polygons : sequence of shapely.geometry.Polygon or MultiPolygon The sequence of [multi]polygons to translate to a CF dataset. + names: GeometryNames, optional + Structure that helps manipulate geometry attrs. + Returns ------- xr.Dataset @@ -752,6 +783,9 @@ def polygons_to_cf( coord = None polygons_ = np.array(polygons) + if names is None: + names = GeometryNames() + _, arr, offsets = to_ragged_array(polygons_) x = arr[:, 0] y = arr[:, 1] @@ -775,40 +809,27 @@ def polygons_to_cf( ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - "interior_ring": xr.DataArray(interior_ring, dims=("part",)), - "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - container_name: xr.DataArray( - attrs={ - "geometry_type": "polygon", - "node_count": "node_count", - "part_node_count": "part_node_count", - "interior_ring": "interior_ring", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "polygon", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPolygons and no holes - if len(ds.part_node_count) == len(ds.node_count): - ds = ds.drop_vars("part_node_count") - del ds[container_name].attrs["part_node_count"] + if len(part_node_count) != len(node_count): + ds[names.part_node_count] = xr.DataArray(part_node_count, dims=names.part_dim) + ds[names.container_name].attrs["part_node_count"] = names.part_node_count # Special case when we have no holes - if (ds.interior_ring == 0).all(): - ds = ds.drop_vars("interior_ring") - del ds[container_name].attrs["interior_ring"] + if (interior_ring != 0).any(): + ds[names.interior_ring] = xr.DataArray(interior_ring, dims=names.part_dim) + ds[names.container_name].attrs["interior_ring"] = names.interior_ring return ds @@ -886,4 +907,6 @@ def cf_to_polygons(ds: xr.Dataset): # get items from polygons or multipolygons depending on number of parts geoms = np.where(np.diff(offset3) == 1, polygons[offset3[:-1]], multipolygons) - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + return xr.DataArray( + geoms, dims=node_count.dims, coords=node_count.coords + ).drop_vars(node_count_name) From 8e55a3fd34d4a493dfc989735d4bbb1154c07e58 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 12:51:41 -0600 Subject: [PATCH 03/12] cleanup --- cf_xarray/geometry.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index adfc38f6..4835b0a8 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -10,6 +10,7 @@ import xarray as xr GEOMETRY_CONTAINER_NAME = "geometry_container" +FEATURES_DIM_NAME = "features" __all__ = [ "decode_geometries", @@ -34,6 +35,8 @@ @dataclass class GeometryNames: + """Helper class to ease handling of all the variable names needed for CF geometries.""" + def __init__(self, suffix: str = "", grid_mapping: str | None = None): self.container_name: str = GEOMETRY_CONTAINER_NAME + suffix self.node_dim: str = "node" + suffix @@ -315,7 +318,7 @@ def encode_geometries(ds: xr.Dataset): def reshape_unique_geometries( ds: xr.Dataset, geom_var: str = "geometry", - new_dim: str = "features", + new_dim: str = FEATURES_DIM_NAME, ) -> xr.Dataset: """Reshape a dataset containing a geometry variable so that all unique features are identified along a new dimension. @@ -524,7 +527,7 @@ def points_to_cf(pts: xr.DataArray | Sequence, *, names: GeometryNames | None = coord = pts[dim] if dim in pts.coords else None pts_ = pts.values.tolist() else: - dim = "features" + dim = FEATURES_DIM_NAME coord = None pts_ = pts @@ -599,8 +602,8 @@ def cf_to_points(ds: xr.Dataset): node_count_name = ds[container_name].attrs.get("node_count") if node_count_name is None: # No node_count means all geometries are single points (node_count = 1) - # And if we had no coordinates, then the dimension defaults to "features" - feat_dim = feat_dim or "features" + # And if we had no coordinates, then the dimension defaults to FEATURES_DIM_NAME + feat_dim = feat_dim or FEATURES_DIM_NAME node_count = xr.DataArray([1] * xy.shape[0], dims=(feat_dim,)) if feat_dim in ds.coords: node_count = node_count.assign_coords({feat_dim: ds[feat_dim]}) From 8b1787191ef34f2fa49d6f38682ed37fad9e1c48 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 13:00:21 -0600 Subject: [PATCH 04/12] Some cleanup --- cf_xarray/geometry.py | 41 ++++++++++++++++++++------------ cf_xarray/tests/test_geometry.py | 2 ++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 4835b0a8..3ec6c6b0 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -37,7 +37,12 @@ class GeometryNames: """Helper class to ease handling of all the variable names needed for CF geometries.""" - def __init__(self, suffix: str = "", grid_mapping: str | None = None): + def __init__( + self, + suffix: str = "", + grid_mapping_name: str | None = None, + grid_mapping: str | None = None, + ): self.container_name: str = GEOMETRY_CONTAINER_NAME + suffix self.node_dim: str = "node" + suffix self.node_count: str = "node_count" + suffix @@ -51,24 +56,29 @@ def __init__(self, suffix: str = "", grid_mapping: str | None = None): self.attrs_x: dict[str, str] = {} self.attrs_y: dict[str, str] = {} + gmattr = {"grid_mapping": grid_mapping} if grid_mapping else {} # Special treatment of selected grid mappings - if grid_mapping in ["latitude_longitude", "rotated_latitude_longitude"]: + if grid_mapping_name in ["latitude_longitude", "rotated_latitude_longitude"]: # Special case for longitude_latitude type grid mappings self.coordinates_x = "lon" self.coordinates_y = "lat" - if grid_mapping == "latitude_longitude": - self.attrs_x = dict(units="degrees_east", standard_name="longitude") - self.attrs_y = dict(units="degrees_north", standard_name="latitude") - elif grid_mapping == "rotated_latitude_longitude": + if grid_mapping_name == "latitude_longitude": + self.attrs_x = dict( + units="degrees_east", standard_name="longitude", **gmattr + ) + self.attrs_y = dict( + units="degrees_north", standard_name="latitude", **gmattr + ) + elif grid_mapping_name == "rotated_latitude_longitude": self.attrs_x = dict( - units="degrees_east", standard_name="grid_longitude" + units="degrees_east", standard_name="grid_longitude", **gmattr ) self.attrs_y = dict( - units="degrees_north", standard_name="grid_latitude" + units="degrees_north", standard_name="grid_latitude", **gmattr ) - elif grid_mapping is not None: - self.attrs_x = dict(standard_name="projection_x_coordinate") - self.attrs_y = dict(standard_name="projection_y_coordinate") + elif grid_mapping_name is not None: + self.attrs_x = dict(standard_name="projection_x_coordinate", **gmattr) + self.attrs_y = dict(standard_name="projection_y_coordinate", **gmattr) @property def geometry_container_attrs(self) -> dict[str, str]: @@ -429,6 +439,8 @@ def shapely_to_cf( geom.item().geom_type if isinstance(geom, xr.DataArray) else geom.geom_type for geom in geometries } + + grid_mapping_varname = None if ( grid_mapping is None and isinstance(geometries, xr.DataArray) @@ -439,7 +451,9 @@ def shapely_to_cf( "grid_mapping_name" ] - names = GeometryNames(suffix=suffix, grid_mapping=grid_mapping) + names = GeometryNames( + suffix=suffix, grid_mapping_name=grid_mapping, grid_mapping=grid_mapping_varname + ) if types.issubset({"Point", "MultiPoint"}): ds = points_to_cf(geometries, names=names) @@ -452,9 +466,6 @@ def shapely_to_cf( f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" ) - # for name_ in ["x", "y", "crd_x", "crd_y"]: - # ds[name_].attrs["grid_mapping"] = grid_mapping_varname - return ds diff --git a/cf_xarray/tests/test_geometry.py b/cf_xarray/tests/test_geometry.py index c412f3c7..a9e680d7 100644 --- a/cf_xarray/tests/test_geometry.py +++ b/cf_xarray/tests/test_geometry.py @@ -360,6 +360,8 @@ def test_shapely_to_cf_errors(): ) assert encoded["x"].attrs["standard_name"] == "projection_x_coordinate" assert encoded["y"].attrs["standard_name"] == "projection_y_coordinate" + for name in ["x", "y", "crd_x", "crd_y"]: + assert "grid_mapping" not in encoded[name].attrs @requires_shapely From 742b5091011691898cf0cd3e9c520b8c200dc585 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 17:21:52 -0600 Subject: [PATCH 05/12] Updates --- cf_xarray/geometry.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 3ec6c6b0..2e46f115 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -55,30 +55,28 @@ def __init__( self.interior_ring: str = "interior_ring" + suffix self.attrs_x: dict[str, str] = {} self.attrs_y: dict[str, str] = {} + self.grid_mapping_attr = {"grid_mapping": grid_mapping} if grid_mapping else {} - gmattr = {"grid_mapping": grid_mapping} if grid_mapping else {} # Special treatment of selected grid mappings if grid_mapping_name in ["latitude_longitude", "rotated_latitude_longitude"]: # Special case for longitude_latitude type grid mappings self.coordinates_x = "lon" self.coordinates_y = "lat" if grid_mapping_name == "latitude_longitude": - self.attrs_x = dict( - units="degrees_east", standard_name="longitude", **gmattr - ) - self.attrs_y = dict( - units="degrees_north", standard_name="latitude", **gmattr - ) + self.attrs_x = dict(units="degrees_east", standard_name="longitude") + self.attrs_y = dict(units="degrees_north", standard_name="latitude") elif grid_mapping_name == "rotated_latitude_longitude": self.attrs_x = dict( - units="degrees_east", standard_name="grid_longitude", **gmattr + units="degrees_east", standard_name="grid_longitude" ) self.attrs_y = dict( - units="degrees_north", standard_name="grid_latitude", **gmattr + units="degrees_north", standard_name="grid_latitude" ) elif grid_mapping_name is not None: - self.attrs_x = dict(standard_name="projection_x_coordinate", **gmattr) - self.attrs_y = dict(standard_name="projection_y_coordinate", **gmattr) + self.attrs_x = dict(standard_name="projection_x_coordinate") + self.attrs_y = dict(standard_name="projection_y_coordinate") + self.attrs_x.update(self.grid_mapping_attr) + self.attrs_y.update(self.grid_mapping_attr) @property def geometry_container_attrs(self) -> dict[str, str]: @@ -86,6 +84,7 @@ def geometry_container_attrs(self) -> dict[str, str]: "node_count": self.node_count, "node_coordinates": f"{self.node_coordinates_x} {self.node_coordinates_y}", "coordinates": f"{self.coordinates_x} {self.coordinates_y}", + **self.grid_mapping_attr, } def coords(self, *, dim: str, x, y, crdX, crdY) -> dict[str, xr.DataArray]: @@ -182,6 +181,7 @@ def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: for container_name in containers: enc_geom_var = encoded[container_name] geom_attrs = enc_geom_var.attrs + # Grab the coordinates attribute geom_attrs.update(enc_geom_var.encoding) @@ -282,11 +282,13 @@ def encode_geometries(ds: xr.Dataset): variables = {} for name in geom_var_names: - container_name = GEOMETRY_CONTAINER_NAME + "_" + name + # TODO: do we prefer this choice be invariant to number of geometry variables + suffix = "_" + name if len(geom_var_names) > 1 else "" + container_name = GEOMETRY_CONTAINER_NAME + suffix # If `name` is a dimension name, then we need to drop it. Otherwise we don't # So set errors="ignore" variables.update( - shapely_to_cf(ds[name], suffix="_" + name) + shapely_to_cf(ds[name], suffix=suffix) .drop_vars(name, errors="ignore") ._variables ) @@ -447,9 +449,10 @@ def shapely_to_cf( and (grid_mapping_varname := geometries.attrs.get("grid_mapping")) ): if grid_mapping_varname in geometries.coords: - grid_mapping = geometries.coords[grid_mapping_varname].attrs[ - "grid_mapping_name" - ] + # Not all CRS can be encoded in CF + grid_mapping = geometries.coords[grid_mapping_varname].attrs.get( + "grid_mapping_name", None + ) names = GeometryNames( suffix=suffix, grid_mapping_name=grid_mapping, grid_mapping=grid_mapping_varname @@ -512,6 +515,8 @@ def cf_to_shapely(ds: xr.Dataset, *, container: str = GEOMETRY_CONTAINER_NAME): raise ValueError( f"Valid CF geometry types are 'point', 'line' and 'polygon'. Got {geom_type}" ) + if gm := ds[container].attrs.get("grid_mapping"): + geometries.attrs["grid_mapping"] = gm return geometries.rename("geometry") From dcc3a54f02589230c56c969c9cae0bf7f662a32c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 1 Jul 2024 22:21:36 -0600 Subject: [PATCH 06/12] fix typing --- cf_xarray/geometry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 2e46f115..e282de59 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -87,7 +87,7 @@ def geometry_container_attrs(self) -> dict[str, str]: **self.grid_mapping_attr, } - def coords(self, *, dim: str, x, y, crdX, crdY) -> dict[str, xr.DataArray]: + def coords(self, *, dim: Hashable, x, y, crdX, crdY) -> dict[str, xr.DataArray]: return { self.node_coordinates_x: xr.DataArray( x, dims=self.node_dim, attrs={"axis": "X", **self.attrs_x} @@ -108,7 +108,7 @@ def coords(self, *, dim: str, x, y, crdX, crdY) -> dict[str, xr.DataArray]: } -def _assert_single_geometry_container(ds: xr.Dataset) -> str: +def _assert_single_geometry_container(ds: xr.Dataset) -> Hashable: container_names = _get_geometry_containers(ds) if len(container_names) > 1: raise ValueError( @@ -176,7 +176,7 @@ def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: "have a `geometry_type` attribute." ) - todrop = [] + todrop: list[Hashable] = [] decoded = xr.Dataset() for container_name in containers: enc_geom_var = encoded[container_name] @@ -283,7 +283,7 @@ def encode_geometries(ds: xr.Dataset): variables = {} for name in geom_var_names: # TODO: do we prefer this choice be invariant to number of geometry variables - suffix = "_" + name if len(geom_var_names) > 1 else "" + suffix = "_" + str(name) if len(geom_var_names) > 1 else "" container_name = GEOMETRY_CONTAINER_NAME + suffix # If `name` is a dimension name, then we need to drop it. Otherwise we don't # So set errors="ignore" @@ -472,7 +472,7 @@ def shapely_to_cf( return ds -def cf_to_shapely(ds: xr.Dataset, *, container: str = GEOMETRY_CONTAINER_NAME): +def cf_to_shapely(ds: xr.Dataset, *, container: Hashable = GEOMETRY_CONTAINER_NAME): """ Convert geometries stored in a CF-compliant way to shapely objects stored in a single variable. From 9abf76c61c934fb2fd326566e3f4a75ba7a11207 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 13:21:37 +0530 Subject: [PATCH 07/12] Add test --- cf_xarray/tests/test_geometry.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cf_xarray/tests/test_geometry.py b/cf_xarray/tests/test_geometry.py index a9e680d7..31cb2de8 100644 --- a/cf_xarray/tests/test_geometry.py +++ b/cf_xarray/tests/test_geometry.py @@ -502,6 +502,10 @@ def test_encode_decode(geometry_ds, polygon_geometry): ) ).assign({"foo": ("geoms", [1, 2])}) - for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds): + polyds = ( + polygon_geometry.rename("polygons").rename({"index": "index2"}).to_dataset() + ) + multi_ds = xr.merge([polyds, geometry_ds[1]]) + for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds, multi_ds): roundtripped = decode_geometries(encode_geometries(ds)) xr.testing.assert_identical(ds, roundtripped) From cc2a31ce8004a70edf2eb1a4987595664699ad0e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 13:25:03 +0530 Subject: [PATCH 08/12] Cleanup --- cf_xarray/geometry.py | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index e282de59..46150c7d 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -87,25 +87,50 @@ def geometry_container_attrs(self) -> dict[str, str]: **self.grid_mapping_attr, } - def coords(self, *, dim: Hashable, x, y, crdX, crdY) -> dict[str, xr.DataArray]: - return { + def coords( + self, + *, + dim: Hashable, + x: np.ndarray, + y: np.ndarray, + crdX: np.ndarray | None = None, + crdY: np.ndarray | None = None, + ) -> dict[str, xr.DataArray]: + """ + Construct coordinate DataArrays for the numpy data (x, y, crdX, crdY) + + Parameters + ---------- + x: array + Node coordinates for X coordinate + y: array + Node coordinates for Y coordinate + crdX: array, optional + Nominal X coordinate + crdY: array, optional + Nominal X coordinate + """ + mapping = { self.node_coordinates_x: xr.DataArray( x, dims=self.node_dim, attrs={"axis": "X", **self.attrs_x} ), self.node_coordinates_y: xr.DataArray( y, dims=self.node_dim, attrs={"axis": "Y", **self.attrs_y} ), - self.coordinates_x: xr.DataArray( + } + if crdX is not None: + mapping[self.coordinates_x] = xr.DataArray( crdX, dims=(dim,), attrs={"nodes": self.node_coordinates_x, **self.attrs_x}, - ), - self.coordinates_y: xr.DataArray( + ) + if crdY is not None: + mapping[self.coordinates_y] = xr.DataArray( crdY, dims=(dim,), attrs={"nodes": self.node_coordinates_y, **self.attrs_y}, - ), - } + ) + return mapping def _assert_single_geometry_container(ds: xr.Dataset) -> Hashable: From f92f098802b81025ae69b31a8631e34e894ed672 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 13:26:48 +0530 Subject: [PATCH 09/12] Docs --- cf_xarray/geometry.py | 7 +------ doc/api.rst | 1 + doc/geometry.md | 5 ++--- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 46150c7d..88ab4798 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -175,7 +175,7 @@ def _get_geometry_containers(obj: xr.DataArray | xr.Dataset) -> list[Hashable]: def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: """ - Decode CF encoded geometries to a numpy object array containing shapely geometries. + Decode CF encoded geometries to numpy object arrays containing shapely geometries. Parameters ---------- @@ -255,11 +255,6 @@ def encode_geometries(ds: xr.Dataset): Practically speaking, geometry variables are numpy object arrays where the first element is a shapely geometry. - .. warning:: - - Only a single geometry variable is supported at present. Contributions to fix this - are welcome. - Parameters ---------- ds : Dataset diff --git a/doc/api.rst b/doc/api.rst index 4651f017..493a01a9 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -26,6 +26,7 @@ Geometries geometry.encode_geometries geometry.shapely_to_cf geometry.cf_to_shapely + geometry.GeometryNames .. currentmodule:: xarray diff --git a/doc/geometry.md b/doc/geometry.md index 3a2380f8..e105c88b 100644 --- a/doc/geometry.md +++ b/doc/geometry.md @@ -97,9 +97,8 @@ ds.identical(decoded) The following limitations can be relaxed in the future. PRs welcome! -1. cf-xarray uses `"geometry_container"` as the name for the geometry variable always -1. cf-xarray only supports decoding a single geometry in a Dataset. -1. CF xarray will not set the `"geometry"` attribute that links a variable to a geometry by default unless the geometry variable is a dimension coordiante for that variable. This heuristic works OK for vector data cubes (e.g. [xvec](https://xvec.readthedocs.io/en/stable/)). You should set the `"geometry"` attribute manually otherwise. Suggestions for better behaviour here are very welcome. +1. cf-xarray uses `"geometry_container"` as the name for the geometry variable always. If there are multiple geometry variables then `"geometry_N"`is used where N is an integer >= 0. cf-xarray behaves similarly for all associated geometry variables names: i.e. `"node"`, `"node_count"`, `"part_node_count"`, `"part"`, `"interior_ring"`. `"x"`, `"y"` (with suffixes if needed) are always the node coordinate variable names, and `"crd_x"`, `"crd_y"` are the nominal X, Y coordinate locations. None of this is configurable at the moment. +1. CF xarray will not set the `"geometry"` attribute that links a variable to a geometry by default unless the geometry variable is a dimension coordinate for that variable. This heuristic works OK for vector data cubes (e.g. [xvec](https://xvec.readthedocs.io/en/stable/)). You should set the `"geometry"` attribute manually otherwise. Suggestions for better behaviour here are very welcome. ## Lower-level conversions From a99ce571895171b2a0dc3ddee92559bc4fd32a5f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 13:31:05 +0530 Subject: [PATCH 10/12] nit --- cf_xarray/geometry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 88ab4798..3ef5f96c 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -559,6 +559,8 @@ def points_to_cf(pts: xr.DataArray | Sequence, *, names: GeometryNames | None = if isinstance(pts, xr.DataArray): # TODO: Fix this hardcoding + if pts.ndim != 1: + raise ValueError("Only 1D DataArrays are supported.") dim = pts.dims[0] coord = pts[dim] if dim in pts.coords else None pts_ = pts.values.tolist() From bcd96debaa37e6c392648a58c2078a111d343e6d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 13:32:01 +0530 Subject: [PATCH 11/12] comment --- cf_xarray/geometry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index 3ef5f96c..fe75f4ff 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -474,6 +474,7 @@ def shapely_to_cf( "grid_mapping_name", None ) + # TODO: consider accepting a GeometryNames instance from the user instead names = GeometryNames( suffix=suffix, grid_mapping_name=grid_mapping, grid_mapping=grid_mapping_varname ) From 5ab4a0155321b596b8c5f60c3c7fee808ecea971 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Jul 2024 14:54:59 +0530 Subject: [PATCH 12/12] fix typing --- cf_xarray/geometry.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index fe75f4ff..fe48d58a 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import xarray as xr +from numpy.typing import ArrayLike GEOMETRY_CONTAINER_NAME = "geometry_container" FEATURES_DIM_NAME = "features" @@ -91,10 +92,10 @@ def coords( self, *, dim: Hashable, - x: np.ndarray, - y: np.ndarray, - crdX: np.ndarray | None = None, - crdY: np.ndarray | None = None, + x: ArrayLike, + y: ArrayLike, + crdX: ArrayLike | None = None, + crdY: ArrayLike | None = None, ) -> dict[str, xr.DataArray]: """ Construct coordinate DataArrays for the numpy data (x, y, crdX, crdY) @@ -263,7 +264,9 @@ def encode_geometries(ds: xr.Dataset): Returns ------- Dataset - Where all geometry variables are encoded. + Where all geometry variables are encoded. The information in a single geometry + variable in the input is split across multiple variables in the returned Dataset + following the CF conventions. See Also --------