Implementation of polyfit and polyval #3733


Merged
merged 23 commits into from
Mar 25, 2020
Changes from 18 commits

Commits
23 commits
80936fd
[WIP] Implementation of polyfit and polyval - minimum testing - no docs
aulemahal Jan 30, 2020
05355af
Formatting with black, flake8
aulemahal Jan 30, 2020
7f26cb1
Fix failing test
aulemahal Jan 30, 2020
fdc1819
More intelligent skipna switching
aulemahal Jan 30, 2020
924f4eb
Add docs | Change coeff order to fit numpy | move polyval
aulemahal Jan 31, 2020
1f6030a
Move doc patching to class
aulemahal Jan 31, 2020
0f922e7
conditional doc patching
aulemahal Jan 31, 2020
bfec9c8
Fix windows fail - more efficient nan skipping
aulemahal Jan 31, 2020
2d3235a
Fix typo in least_squares
aulemahal Jan 31, 2020
31440ae
Move polyfit to dataset
aulemahal Feb 11, 2020
da831a5
Add more tests | fix some edge cases
aulemahal Feb 12, 2020
f13123f
Skip test without dask
aulemahal Feb 12, 2020
896313f
Fix 1D case | add docs
aulemahal Feb 12, 2020
5b70971
skip polyval test without dask
aulemahal Feb 12, 2020
5af3c64
Explicit docs | More restrictive polyval
aulemahal Feb 13, 2020
d731fd9
Merge branch 'master' into implement-polyfit-polyval
aulemahal Mar 13, 2020
3400dea
Small typo in polyfit docstrings
aulemahal Mar 13, 2020
198ff81
Apply suggestions from code review
aulemahal Mar 13, 2020
34726a1
Polyfit : fix style in docstring | add see also section
aulemahal Mar 13, 2020
860a892
Merge branch 'implement-polyfit-polyval' of github.com:aulemahal/xarr…
aulemahal Mar 13, 2020
3f7fe4c
Clean up docstrings and documentation.
aulemahal Mar 13, 2020
31e92bb
Merge master
aulemahal Mar 25, 2020
7eeba59
Move whats new entry to 0.16 | fix PEP8 issue in test_dataarray
aulemahal Mar 25, 2020
3 changes: 3 additions & 0 deletions doc/api.rst
@@ -30,6 +30,7 @@ Top-level functions
zeros_like
ones_like
dot
polyval
map_blocks
show_versions
set_options
@@ -171,6 +172,7 @@ Computation
Dataset.quantile
Dataset.differentiate
Dataset.integrate
Dataset.polyfit

**Aggregation**:
:py:attr:`~Dataset.all`
@@ -349,6 +351,7 @@ Computation
DataArray.quantile
DataArray.differentiate
DataArray.integrate
DataArray.polyfit
DataArray.str

**Aggregation**:
26 changes: 26 additions & 0 deletions doc/computation.rst
@@ -317,6 +317,32 @@ trapezoidal rule using their coordinates,
and integration along multidimensional coordinate are not supported.


.. _compute.polyfit:

Fitting polynomials
===================

Xarray objects provide an interface for performing linear or polynomial regressions
using the least-squares method. :py:meth:`~xarray.DataArray.polyfit` computes the
best fitting coefficients along a given dimension and for a given order,

.. ipython:: python

    x = xr.DataArray(np.arange(10), dims=['x'], name='x')
    a = xr.DataArray(3 + 4 * x, dims=['x'], coords={'x': x})
out = a.polyfit(dim='x', deg=1, full=True)
out

The method outputs a dataset containing the coefficients (and more if `full=True`).
The inverse operation is done with :py:meth:`~xarray.polyval`,

.. ipython:: python

xr.polyval(coord=x, coeffs=out.polyfit_coefficients)

.. note::
These methods replicate the behaviour of :py:func:`numpy.polyfit` and :py:func:`numpy.polyval`.

.. _compute.broadcasting:

Broadcasting by dimension name
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -149,6 +149,8 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Added :py:meth:`DataArray.polyfit` and :py:func:`xarray.polyval` for fitting polynomials. (:issue:`3349`)
By `Pascal Bourgault <https://github.com/aulemahal>`_.
- :py:meth:`DataArray.sel` and :py:meth:`Dataset.sel` now support :py:class:`pandas.CategoricalIndex`. (:issue:`3669`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Support using an existing, opened h5netcdf ``File`` with
3 changes: 2 additions & 1 deletion xarray/__init__.py
@@ -17,7 +17,7 @@
from .core.alignment import align, broadcast
from .core.combine import auto_combine, combine_by_coords, combine_nested
from .core.common import ALL_DIMS, full_like, ones_like, zeros_like
from .core.computation import apply_ufunc, dot, where
from .core.computation import apply_ufunc, dot, polyval, where
from .core.concat import concat
from .core.dataarray import DataArray
from .core.dataset import Dataset
@@ -65,6 +65,7 @@
"open_mfdataset",
"open_rasterio",
"open_zarr",
"polyval",
"register_dataarray_accessor",
"register_dataset_accessor",
"save_mfdataset",
32 changes: 32 additions & 0 deletions xarray/core/computation.py
@@ -1299,3 +1299,35 @@ def where(cond, x, y):
dataset_join="exact",
dask="allowed",
)


def polyval(coord, coeffs, degree_dim="degree"):
"""Evaluate a polynomial at specific values

Parameters
----------
coord : DataArray
The 1D coordinate along which to evaluate the polynomial.
coeffs : DataArray
Coefficients of the polynomials.
degree_dim : str, default "degree"
Name of the polynomial degree dimension in `coeffs`.

See also
--------
xarray.DataArray.polyfit
numpy.polyval
"""
from .dataarray import DataArray
from .missing import get_clean_interp_index

x = get_clean_interp_index(coord, coord.name)

deg_coord = coeffs[degree_dim]

lhs = DataArray(
np.vander(x, int(deg_coord.max()) + 1),
dims=(coord.name, degree_dim),
coords={coord.name: coord, degree_dim: np.arange(deg_coord.max() + 1)[::-1]},
)
return (lhs * coeffs).sum(degree_dim)
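
For reference, the Vandermonde-based evaluation above can be sketched in plain NumPy; the `vander_polyval` helper below is illustrative only, not part of this PR:

```python
import numpy as np

def vander_polyval(x, coeffs):
    """Evaluate a polynomial at x; coefficients ordered highest degree first."""
    deg = len(coeffs) - 1
    # np.vander puts the highest power in the first column, which is why the
    # degree coordinate above is built with np.arange(...)[::-1].
    lhs = np.vander(x, deg + 1)          # shape: (len(x), deg + 1)
    return lhs @ np.asarray(coeffs)      # contract over the degree axis

x = np.arange(4)
print(vander_polyval(x, [3, 2]))  # 2 + 3*x -> [ 2  5  8 11]
```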
27 changes: 27 additions & 0 deletions xarray/core/dask_array_ops.py
@@ -95,3 +95,30 @@ def func(x, window, axis=-1):
# crop boundary.
index = (slice(None),) * axis + (slice(drop_size, drop_size + orig_shape[axis]),)
return out[index]


def least_squares(lhs, rhs, rcond=None, skipna=False):
import dask.array as da

lhs_da = da.from_array(lhs, chunks=(rhs.chunks[0], lhs.shape[1]))
if skipna:
added_dim = rhs.ndim == 1
if added_dim:
rhs = rhs.reshape(rhs.shape[0], 1)
results = da.apply_along_axis(
nputils._nanpolyfit_1d,
0,
rhs,
lhs_da,
dtype=float,
shape=(lhs.shape[1] + 1,),
rcond=rcond,
)
coeffs = results[:-1, ...]
residuals = results[-1, ...]
if added_dim:
coeffs = coeffs.reshape(coeffs.shape[0])
residuals = residuals.reshape(residuals.shape[0])
else:
coeffs, residuals, _, _ = da.linalg.lstsq(lhs_da, rhs)
return coeffs, residuals
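
For context, here is a plain-NumPy sketch of the per-column contract that `nputils._nanpolyfit_1d` is expected to satisfy here: coefficients plus the residual packed into one 1D array, so that `da.apply_along_axis` sees a uniform output shape of `(lhs.shape[1] + 1,)`. The helper name below is illustrative:

```python
import numpy as np

def nan_least_squares_1d(rhs, lhs, rcond=None):
    # Pack (coeffs..., residual) into one array so apply_along_axis gets a
    # uniform output shape, matching shape=(lhs.shape[1] + 1,) above.
    out = np.full(lhs.shape[1] + 1, np.nan)
    mask = np.isfinite(rhs)
    if mask.sum() >= lhs.shape[1]:       # enough valid points to fit
        coeffs, resid, _, _ = np.linalg.lstsq(lhs[mask], rhs[mask], rcond=rcond)
        out[:-1] = coeffs
        if resid.size:
            out[-1] = resid[0]
    return out

x = np.arange(6, dtype=float)
lhs = np.vander(x, 2)                    # linear model: columns [x, 1]
rhs = 3.0 * x + 1.0
rhs[2] = np.nan                          # invalid sample to be skipped
coeffs_and_resid = nan_least_squares_1d(rhs, lhs)
print(coeffs_and_resid[:2])              # slope and intercept, ~[3. 1.]
```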
53 changes: 53 additions & 0 deletions xarray/core/dataarray.py
@@ -3245,6 +3245,59 @@ def map_blocks(

return map_blocks(func, self, args, kwargs)

def polyfit(
self,
dim: Hashable,
deg: int,
skipna: bool = None,
rcond: float = None,
w: Union[Hashable, Any] = None,
full: bool = False,
cov: Union[bool, str] = False,
):
"""
Least squares polynomial fit.

This replicates the behaviour of `numpy.polyfit` but differs by skipping
invalid values when `skipna = True`.

Parameters
----------
dim : hashable
Coordinate along which to fit the polynomials.
deg : int
Degree of the fitting polynomial.
skipna : bool, optional
If True, removes all invalid values before fitting each 1D slice of the array.
Default is True if data is stored in a dask.array or if there are any
invalid values, False otherwise.
rcond : float, optional
Relative condition number of the fit.
w : Union[Hashable, Any], optional
Weights to apply to the y-coordinate of the sample points.
Can be an array-like object or the name of a coordinate in the dataset.
full : bool, optional
Whether to return the residuals, matrix rank and singular values in addition
to the coefficients.
cov : Union[bool, str], optional
Whether to return the covariance matrix in addition to the coefficients.
The matrix is not scaled if `cov = 'unscaled'`.

See documentation of `numpy.polyfit`.

Returns
-------
A single dataset which contains:
polyfit_coefficients : The coefficients of the best fit.
polyfit_residuals : The residuals of the least-squares computation (only included if `full=True`)
[dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
[dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
"""
return self._to_temp_dataset().polyfit(
dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov
)
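
Since this delegates to `Dataset.polyfit` and mirrors `numpy.polyfit`, the coefficient ordering (highest degree first along the `degree` coordinate) can be sanity-checked against NumPy directly; a minimal sketch:

```python
import numpy as np

# numpy.polyfit returns coefficients with the highest degree first, the same
# ordering DataArray.polyfit uses for its "degree" coordinate.
x = np.arange(10, dtype=float)
y = 3.0 + 4.0 * x
coeffs = np.polyfit(x, y, deg=1)
print(np.round(coeffs, 6))  # [4. 3.] -> degree 1 first, then degree 0
```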

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = property(StringAccessor)
169 changes: 169 additions & 0 deletions xarray/core/dataset.py
@@ -75,6 +75,7 @@
merge_coordinates_without_align,
merge_data_and_coords,
)
from .missing import get_clean_interp_index
from .options import OPTIONS, _get_keep_attrs
from .pycompat import dask_array_type
from .utils import (
@@ -5724,5 +5725,173 @@ def map_blocks(

return map_blocks(func, self, args, kwargs)

def polyfit(
self,
dim: Hashable,
deg: int,
skipna: bool = None,
rcond: float = None,
w: Union[Hashable, Any] = None,
full: bool = False,
cov: Union[bool, str] = False,
):
"""
Least squares polynomial fit.

This replicates the behaviour of `numpy.polyfit` but differs by skipping
invalid values when `skipna = True`.

Parameters
----------
dim : hashable
Coordinate along which to fit the polynomials.
deg : int
Degree of the fitting polynomial.
skipna : bool, optional
If True, removes all invalid values before fitting each 1D slice of the array.
Default is True if data is stored in a dask.array or if there are any
invalid values, False otherwise.
rcond : float, optional
Relative condition number of the fit.
w : Union[Hashable, Any], optional
Weights to apply to the y-coordinate of the sample points.
Can be an array-like object or the name of a coordinate in the dataset.
full : bool, optional
Whether to return the residuals, matrix rank and singular values in addition
to the coefficients.
cov : Union[bool, str], optional
Whether to return the covariance matrix in addition to the coefficients.
The matrix is not scaled if `cov = 'unscaled'`.

See documentation of `numpy.polyfit`.

Returns
-------
A single dataset which contains:
[var_]polyfit_coefficients : The coefficients of the best fit for each variable in this dataset.
[var_]polyfit_residuals : The residuals of the least-squares computation for each variable (only included if `full=True`)
[dim]_matrix_rank : The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
[dim]_singular_values : The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
[var_]polyfit_covariance : The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)
"""
variables = {}
skipna_da = skipna

x = get_clean_interp_index(self, dim)
xname = "{}_".format(self[dim].name)
order = int(deg) + 1
lhs = np.vander(x, order)

if rcond is None:
rcond = x.shape[0] * np.core.finfo(x.dtype).eps

# Weights:
if w is not None:
if isinstance(w, Hashable):
w = self.coords[w]
w = np.asarray(w)
if w.ndim != 1:
raise TypeError("Expected a 1-d array for weights.")
if w.shape[0] != lhs.shape[0]:
raise TypeError("Expected w and {} to have the same length".format(dim))
lhs *= w[:, np.newaxis]

# Scaling
scale = np.sqrt((lhs * lhs).sum(axis=0))
lhs /= scale

degree_dim = utils.get_temp_dimname(self.dims, "degree")

rank = np.linalg.matrix_rank(lhs)
if rank != order and not full:
warnings.warn(
"Polyfit may be poorly conditioned", np.RankWarning, stacklevel=4
)

if full:
rank = xr.DataArray(rank, name=xname + "matrix_rank")
variables[rank.name] = rank
sing = np.linalg.svd(lhs, compute_uv=False)
sing = xr.DataArray(
sing,
dims=(degree_dim,),
coords={degree_dim: np.arange(order)[::-1]},
name=xname + "singular_values",
)
variables[sing.name] = sing

for name, da in self.data_vars.items():
if dim not in da.dims:
continue

if skipna is None:
if isinstance(da.data, dask_array_type):
skipna_da = True
else:
skipna_da = np.any(da.isnull())

dims_to_stack = [dimname for dimname in da.dims if dimname != dim]
stacked_coords = {}
if dims_to_stack:
stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked")
rhs = da.transpose(dim, *dims_to_stack).stack(
{stacked_dim: dims_to_stack}
)
stacked_coords = {stacked_dim: rhs[stacked_dim]}
scale_da = scale[:, np.newaxis]
else:
rhs = da
scale_da = scale

if w is not None:
rhs *= w[:, np.newaxis]

coeffs, residuals = duck_array_ops.least_squares(
lhs, rhs.data, rcond=rcond, skipna=skipna_da
)

if isinstance(name, str):
name = "{}_".format(name)
else:
# Thus a ReprObject => polyfit was called on a DataArray
name = ""

coeffs = xr.DataArray(
coeffs / scale_da,
dims=[degree_dim] + list(stacked_coords.keys()),
coords={degree_dim: np.arange(order)[::-1], **stacked_coords},
name=name + "polyfit_coefficients",
)
if dims_to_stack:
coeffs = coeffs.unstack(stacked_dim)
variables[coeffs.name] = coeffs

if full or (cov is True):
residuals = xr.DataArray(
residuals if dims_to_stack else residuals.squeeze(),
dims=list(stacked_coords.keys()),
coords=stacked_coords,
name=name + "polyfit_residuals",
)
if dims_to_stack:
residuals = residuals.unstack(stacked_dim)
variables[residuals.name] = residuals

if cov:
Vbase = np.linalg.inv(np.dot(lhs.T, lhs))
Vbase /= np.outer(scale, scale)
if cov == "unscaled":
fac = 1
else:
if x.shape[0] <= order:
raise ValueError(
"The number of data points must exceed order to scale the covariance matrix."
)
fac = residuals / (x.shape[0] - order)
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j"),) * fac
variables[name + "polyfit_covariance"] = covariance

return Dataset(data_vars=variables, attrs=self.attrs.copy())
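
The normalisation step above (dividing each Vandermonde column by its Euclidean norm before the solve, then rescaling the fitted coefficients) can be reproduced in plain NumPy; a minimal sketch of that conditioning trick:

```python
import numpy as np

x = np.linspace(0.0, 100.0, 50)
y = 2.0 * x ** 2 + 5.0
lhs = np.vander(x, 3)                        # columns: x**2, x, 1

# Normalise each column to unit Euclidean norm to improve the conditioning
# of the least squares problem, solve, then divide the coefficients by the
# same scale to undo the normalisation.
scale = np.sqrt((lhs * lhs).sum(axis=0))
coeffs_scaled, _, _, _ = np.linalg.lstsq(lhs / scale, y, rcond=None)
coeffs = coeffs_scaled / scale               # coefficients ~ [2, 0, 5]
```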


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)