diff --git a/.gitignore b/.gitignore index adaafc7..110bb27 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,11 @@ +.vscode/ .idea/ data/ docs/modules/ docs/_build/ docs/auto_examples/ coverage/ +scratch # So far, all html files are auto-generated *.html diff --git a/docs/content/target.rst b/docs/content/target.rst index c3ae0ae..81599b0 100644 --- a/docs/content/target.rst +++ b/docs/content/target.rst @@ -11,7 +11,7 @@ for supervised learning. This is achieved with a :py:class:`Target` object: >>> from sklearn_xarray import wrap, Target >>> from sklearn_xarray.datasets import load_digits_dataarray - >>> from sklearn.linear_model.logistic import LogisticRegression + >>> from sklearn.linear_model import LogisticRegression >>> >>> X = load_digits_dataarray() >>> y = Target(coord='digit')(X) @@ -61,17 +61,52 @@ Pre-processing -------------- In some cases, it is necessary to pre-process the coordinate before it can be -used as a target. For this, the constructor takes a ``transform_func`` parameter -which can be used with the ``fit_transform`` method of transformers in -``sklearn.preprocessing`` (and also any other object implementing the sklearn -transformer interface): +used as a target. For this, the constructor takes a ``transformer`` parameter +which can be used with transformers in ``sklearn.preprocessing`` (and also any +other object implementing the sklearn transformer interface): .. doctest:: >>> from sklearn.neural_network import MLPClassifier >>> from sklearn.preprocessing import LabelBinarizer >>> - >>> y = Target(coord='digit', transform_func=LabelBinarizer().fit_transform)(X) + >>> y = Target(coord='digit', transformer=LabelBinarizer(), reshapes="feature") + >>> wrapper = wrap(MLPClassifier(), reshapes="feature") + >>> wrapper.fit(X, y) # doctest:+ELLIPSIS + EstimatorWrapper(...) + +This approach makes it possible to reverse the pre-processing, e.g. after +calling ``wrapper.predict``: + +.. 
doctest:: + + >>> yp = wrapper.predict(X) + >>> yp + + array([[1, 0, 0, ..., 0, 0, 0], + [0, 1, 0, ..., 0, 0, 0], + [0, 0, 1, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 1, 0], + [0, 0, 0, ..., 0, 0, 1], + [0, 0, 0, ..., 0, 1, 0]]) + Coordinates: + * sample (sample) int64 0 1 2 3 4 5 6 ... 1790 1791 1792 1793 1794 1795 1796 + digit (sample) int64 0 1 2 3 4 5 6 7 8 9 0 1 ... 7 9 5 4 8 8 4 9 0 8 9 8 + Dimensions without coordinates: feature + >>> y.inverse_transform(yp) + + array([0, 1, 2, ..., 8, 9, 8]) + Coordinates: + * sample (sample) int64 0 1 2 3 4 5 6 ... 1790 1791 1792 1793 1794 1795 1796 + digit (sample) int64 0 1 2 3 4 5 6 7 8 9 0 1 ... 7 9 5 4 8 8 4 9 0 8 9 8 + + +Alternatively, the constructor also accepts a ``transform_func`` parameter: + +.. doctest:: + + >>> y = Target(coord='digit', transform_func=LabelBinarizer().fit_transform) >>> wrapper = wrap(MLPClassifier()) >>> wrapper.fit(X, y) # doctest:+ELLIPSIS EstimatorWrapper(...) @@ -81,13 +116,13 @@ Indexing A :py:class:`Target` object can be indexed in the same way as the underlying coordinate and interfaces with ``numpy`` by providing an ``__array__`` -attribute which returns ``numpy.array()`` of the (transformed) coordinate. +attribute which returns ``numpy.array()`` of the (transformed) data. Multi-dimensional coordinates ----------------------------- -In some cases, the target coordinates span multiple dimensions, but the +In some cases, the target data spans multiple dimensions, but the transformer expects a lower-dimensional input. With the ``dim`` parameter of the :py:class:`Target` class you can specify which of the dimensions to keep. You can also specify the callable ``reduce_func`` to perform the reduction of diff --git a/docs/content/transformers.rst b/docs/content/transformers.rst index 103d333..8df8b2a 100644 --- a/docs/content/transformers.rst +++ b/docs/content/transformers.rst @@ -6,14 +6,83 @@ xarray's powerful array manipulation syntax. 
Refer to :ref:`API/Pre-processing` for a full list. +Combining dimensions +-------------------- + +scikit-learn's estimators generally assume that data is two-dimensional: +the first dimension represents the samples, the second dimension the features +of your data. Since xarray is generally used for higher-dimensional data, it is +often necessary to perform pre-processing steps that combine multiple +dimensions into a sample and/or feature dimension, or even combine multiple +variables of a ``Dataset`` into a single ``DataArray``. + +.. py:currentmodule:: sklearn_xarray.datasets + +For example, the :py:func:`load_digits_dataarray` function loads a +three-dimensional array of 8-by-8-pixel grayscale images: + +.. doctest:: + + >>> from sklearn_xarray.datasets import load_digits_dataarray + >>> X = load_digits_dataarray(load_images=True) + >>> X # doctest:+ELLIPSIS + + array([[[ 0., 0., 5., ..., 1., 0., 0.], + [ 0., 0., 13., ..., 15., 5., 0.], + [ 0., 3., 15., ..., 11., 8., 0.], + ..., + [ 0., 4., 16., ..., 16., 6., 0.], + [ 0., 8., 16., ..., 16., 8., 0.], + [ 0., 1., 8., ..., 12., 1., 0.]]]) + Coordinates: + * sample (sample) int64 0 1 2 3 4 5 6 ... 1790 1791 1792 1793 1794 1795 1796 + * row (row) int64 0 1 2 3 4 5 6 7 + * col (col) int64 0 1 2 3 4 5 6 7 + digit (sample) int64 0 1 2 3 4 5 6 7 8 9 0 1 ... 7 9 5 4 8 8 4 9 0 8 9 8 + +.. py:currentmodule:: sklearn_xarray.preprocessing + +In order to use the individual images as samples to fit an estimator, we need +to vectorize them first. The :py:class:`Featurizer` combines all dimensions +of the array except for the sample dimension: + +.. doctest:: + + >>> from sklearn_xarray.preprocessing import Featurizer + >>> Featurizer().fit_transform(X) + + array([[0., 0., 0., ..., 0., 0., 0.], + [0., 0., 0., ..., 0., 0., 0.], + [0., 0., 0., ..., 0., 0., 0.], + ..., + [0., 0., 0., ..., 0., 0., 0.], + [0., 0., 0., ..., 0., 0., 0.], + [0., 0., 0., ..., 0., 0., 0.]]) + Coordinates: + * sample (sample) int64 0 1 2 3 4 5 6 ... 
1790 1791 1792 1793 1794 1795 1796 + digit (sample) int64 0 1 2 3 4 5 6 7 8 9 0 1 ... 7 9 5 4 8 8 4 9 0 8 9 8 + * feature (feature) MultiIndex + - col (feature) int64 0 0 0 0 0 0 0 0 1 1 1 1 ... 6 6 6 6 7 7 7 7 7 7 7 7 + - row (feature) int64 0 1 2 3 4 5 6 7 0 1 2 3 ... 4 5 6 7 0 1 2 3 4 5 6 7 + +Other transformers for combining dimensions are: + +.. autosummary:: + :nosignatures: + + Concatenator + Featurizer + Stacker + +Check out the :ref:`examples` for more use cases. + + Transformers changing the number of samples ------------------------------------------- There are several transformers that change the number of samples in the data, namely: -.. py:currentmodule:: sklearn_xarray.preprocessing - .. autosummary:: :nosignatures: @@ -21,6 +90,7 @@ namely: Sanitizer Segmenter Splitter + Stacker These kinds of transformer are usually disallowed by sklearn, because the package does not provide any mechanism of also changing the number of samples @@ -83,25 +153,25 @@ specify the ``groupby`` parameter: >>> >>> X = load_wisdm_dataarray() >>> Xt = segmenter.fit_transform(X) - >>> Xt # doctest:+ELLIPSIS doctest:+NORMALIZE_WHITESPACE + >>> Xt # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - array([[[ -0.15 , 0.11 , ..., -2.26 , -1.46 ], - [ 9.15 , 9.19 , ..., 9.72 , 9.81 ], - [ -0.34 , 2.76 , ..., 2.03 , 2.15 ]], - [[ 0.27 , -3.06 , ..., -2.56 , -2.6 ], - [ 12.57 , 13.18 , ..., 14.56 , 8.96 ], - [ 5.37 , 6.47 , ..., 0.31 , -3.3 ]], - ..., - [[ -0.3 , 0.27 , ..., 0.42 , 3.17 ], - [ 8.08 , 6.63 , ..., 10.5 , 9.23 ], - [ 0.994285, 0.994285, ..., -5.175732, -4.671779]], - [[ 5.33 , 6.44 , ..., -4.14 , -4.9 ], - [ 8.39 , 9.04 , ..., 6.21 , 6.55 ], - [ -4.794363, -2.179256, ..., 5.938472, 3.827318]]]) + array([[[-0.15 , 0.11 , ..., -2.26 , -1.46 ], + [ 9.15 , 9.19 , ..., 9.72 , 9.81 ], + [-0.34 , 2.76 , ..., 2.03 , 2.15 ]], + [[ 0.27 , -3.06 , ..., -2.56 , -2.6 ], + [12.57 , 13.18 , ..., 14.56 , 8.96 ], + [ 5.37 , 6.47 , ..., 0.31 , -3.3 ]], + ... 
+ [[-0.3 , 0.27 , ..., 0.42 , 3.17 ], + [ 8.08 , 6.63 , ..., 10.5 , 9.23 ], + [ 0.99... , 0.99... , ..., -5.17... , -4.67... ]], + [[ 5.33 , 6.44 , ..., -4.14 , -4.9 ], + [ 8.39 , 9.04 , ..., 6.21 , 6.55 ], + [-4.79... , -2.17... , ..., 5.93... , 3.82... ]]]) Coordinates: * axis (axis)