From c1bb19f5b6ff660f1523d7771a06e06b42e0c038 Mon Sep 17 00:00:00 2001
From: Yannick DAYER <yannick.dayer@idiap.ch>
Date: Wed, 22 Mar 2023 20:45:55 +0100
Subject: [PATCH] doc [sample]: Added info on estimator and pipeline

---
 doc/estimator.rst | 63 ++++++++++++++++++++++++++++++++++++++++++++
 doc/index.rst     |  2 ++
 doc/pipeline.rst  | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 doc/sample.rst    | 60 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 188 insertions(+), 3 deletions(-)
 create mode 100644 doc/estimator.rst
 create mode 100644 doc/pipeline.rst

diff --git a/doc/estimator.rst b/doc/estimator.rst
new file mode 100644
index 0000000..e935604
--- /dev/null
+++ b/doc/estimator.rst
@@ -0,0 +1,63 @@
+.. _bob.pipelines.estimator:
+
+Scikit-learn Estimators
+=======================
+
+This short example shows the difference between an estimator whose ``transform``
+method takes a single parameter and one that additionally takes metadata for each
+sample.
+
+Example of a custom Estimator
+-----------------------------
+
+Let us make an Estimator that takes batches of arrays as input and applies a simple
+function to each of them:
+
+.. doctest:: custom_estimator
+
+    >>> import numpy as np
+    >>> from sklearn.base import BaseEstimator
+    >>> class OffsetTransformer(BaseEstimator):
+    ...     """Demo Estimator that adds an offset to arrays."""
+    ...     def __init__(self, offset=1):
+    ...         self.offset = offset
+    ...
+    ...     def transform(self, arrays: list[np.ndarray]) -> list[np.ndarray]:
+    ...         """Adds the offset to each array."""
+    ...         return [arr + self.offset for arr in arrays]
+
+    >>> transformer = OffsetTransformer(offset=2)
+    >>> transformer.transform([np.array([1, 2]), np.array([2, 3])])
+    [array([3, 4]), array([4, 5])]
+
+.. note::
+
+    The ``transform`` method accepts a series of data samples (it works on batches).
+    If you work with 2D numpy arrays, your ``transform`` needs to handle 3D arrays
+    (or to loop over the first dimension and handle each sample individually).
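+
+For instance, here is a sketch of a hypothetical ``FlattenTransformer`` (written just
+for this illustration) that receives one 2D array per sample and loops over the batch
+explicitly:
+
+.. doctest:: custom_estimator
+
+    >>> class FlattenTransformer(BaseEstimator):
+    ...     """Demo Estimator that flattens each 2D sample array."""
+    ...     def transform(self, arrays: list[np.ndarray]) -> list[np.ndarray]:
+    ...         # ``arrays`` is a batch; process each sample individually.
+    ...         return [arr.flatten() for arr in arrays]
+
+    >>> FlattenTransformer().transform([np.ones((2, 2)), np.zeros((1, 3))])
+    [array([1., 1., 1., 1.]), array([0., 0., 0.])]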
+
+Now, let's edit the ``OffsetTransformer`` so that each sample can be offset by a
+different value. This can happen when preprocessing data with annotations, which
+differ from one sample to the next.
+Here, we want to pass one offset for each sample given to ``transform``:
+
+.. doctest:: custom_estimator
+
+    >>> class OffsetTransformerPerSample(BaseEstimator):
+    ...     """Demo Estimator that adds a different offset to each sample array."""
+    ...     def transform(self, arrays: list[np.ndarray], offsets: list) -> list[np.ndarray]:
+    ...         """Adds its own offset to each array."""
+    ...         return [arr + o for arr, o in zip(arrays, offsets)]
+
+    >>> transformer_2 = OffsetTransformerPerSample()
+
+We need to pass two sequences to ``transform``: one containing the sample data and one
+containing the offsets:
+
+.. doctest:: custom_estimator
+
+    >>> transformer_2.transform(arrays=[np.array([3, 4, 5]), np.array([1, 2, 3])], offsets=[1, 2])
+    [array([4, 5, 6]), array([3, 4, 5])]
+
+
+On the next page, we will see how this second estimator (with multiple parameters in
+``transform``) causes problems with the pipeline API.
diff --git a/doc/index.rst b/doc/index.rst
index 1177c76..eb2e548 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -31,6 +31,8 @@ User Guide
 .. toctree::
    :maxdepth: 2
 
+   estimator
+   pipeline
    sample
    checkpoint
    dask
diff --git a/doc/pipeline.rst b/doc/pipeline.rst
new file mode 100644
index 0000000..bb7eae5
--- /dev/null
+++ b/doc/pipeline.rst
@@ -0,0 +1,66 @@
+.. _bob.pipelines.pipeline:
+
+Making pipelines
+================
+
+We rely on the :ref:`scikit-learn pipeline API <scikit-learn:pipeline>` to run our
+pipelines.
+A pipeline is essentially a series of :py:class:`sklearn.base.BaseEstimator` objects.
+When data is fed to the pipeline, the first Estimator receives it, processes it, and
+passes its result to the second Estimator, and so on until the last step. The output
+of the last Estimator is the output of the pipeline.
+
+To make a pipeline, you can call :py:func:`sklearn.pipeline.make_pipeline` or directly
+instantiate a Pipeline object with :py:class:`sklearn.pipeline.Pipeline`.
+
+Below is a quick example of how to make a pipeline out of two Estimators:
+
+.. doctest:: pipeline
+
+    >>> from sklearn.pipeline import make_pipeline
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> from sklearn.decomposition import PCA
+
+    >>> pipeline = make_pipeline(StandardScaler(), PCA())
+    >>> pipeline = pipeline.fit([[0, 0], [0, 0], [2, 2]])
+    >>> pipeline.transform([[0, 1], [2, 1]])
+    array([[-0.25,  0.75],
+           [ 1.25, -0.75]])
+
+The issue with scikit-learn pipelines
+-------------------------------------
+
+The previous example works fine as long as only one parameter is given to the
+``transform`` or ``fit`` method of each Estimator in the pipeline.
+
+If an Estimator needs some extra metadata (like the ``offsets`` parameter from the
+previous page), we cannot include it in a pipeline as is.
+
+This example takes the transformer with an ``offsets`` parameter defined on the
+previous page and tries to make a pipeline out of it:
+
+.. testsetup:: custom_estimator
+
+    import numpy as np
+    from sklearn.base import BaseEstimator
+
+    class OffsetTransformerPerSample(BaseEstimator):
+        """Demo Estimator that adds a different offset to each sample array."""
+        def transform(self, arrays: list[np.ndarray], offsets: list) -> list[np.ndarray]:
+            """Adds its own offset to each array."""
+            return [arr + o for arr, o in zip(arrays, offsets)]
+
+    transformer_2 = OffsetTransformerPerSample()
+
+.. doctest:: custom_estimator
+
+    >>> from sklearn.pipeline import make_pipeline
+    >>> pipeline = make_pipeline(transformer_2)
+    >>> pipeline.transform([np.array([3, 4, 5]), np.array([1, 2, 3])], [1, 2])
+    Traceback (most recent call last):
+    ...
+    TypeError: _transform() takes 2 positional arguments but 3 were given
+
+In order to include such an Estimator in a pipeline, we must add some logic that
+routes data and metadata through the pipeline. This is the goal of the :any:`Sample`
+and :any:`SampleWrapper` classes presented on the next page.
diff --git a/doc/sample.rst b/doc/sample.rst
index c4d9b30..830d9b7 100644
--- a/doc/sample.rst
+++ b/doc/sample.rst
@@ -6,7 +6,7 @@ Samples, a way to enhance scikit pipelines with metadata
 
-Some tasks in pattern recognition demands the usage of metadata to support some
-processing (e.g. face cropping, audio segmentation). To support scikit-learn based
-estimators with such requirement task, this package provides two mechanisms that:
+Some tasks in pattern recognition demand the use of metadata to support some
+processing (e.g. face cropping, audio segmentation). To support scikit-learn
+estimators with such requirements, this package provides two mechanisms:
 
-  1. Wraps input data in a layer called :any:`Sample` that allows you to append some metadata to your original input data.
+  1. A container called :any:`Sample` that allows you to attach metadata to your original input data (see the short example below).
-  2. A wrapper class (:any:`SampleWrapper`) that interplay between :any:`Sample` and your estimator.
+  2. A wrapper class (:any:`SampleWrapper`) that mediates between :any:`Sample` objects and your estimator.
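+
+For example, here is a minimal sketch of that idea (the ``offset`` metadata name is
+arbitrary, chosen just for this illustration): a :any:`Sample` carries your data
+together with any metadata attributes you attach to it:
+
+.. doctest::
+
+    >>> import numpy as np
+    >>> import bob.pipelines
+    >>> # ``offset`` is a hypothetical metadata field used for this example only
+    >>> sample = bob.pipelines.Sample(np.array([1, 2]), offset=1)
+    >>> sample.data
+    array([1, 2])
+    >>> sample.offset
+    1
+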
@@ -87,8 +87,8 @@ While this transformer works well by itself, it can't be used by
     TypeError: _transform() takes 2 positional arguments but 3 were given
 
 To approach this issue, :any:`SampleWrapper` can be used. This class wraps
-other estimators and accepts as input samples and passes the data with metadata inside
-samples to the wrapped estimator:
+other estimators, accepts :any:`Sample` objects as input, and passes the data and
+metadata contained in those samples to the wrapped estimator:
 
 .. doctest::
 
@@ -215,3 +215,57 @@ transform each sample inside and returns the same SampleSets with new data.
     DelayedSample(offset=array([1]))
     >>> transformed_sample_sets[0].samples[1].data
     array([1., 1.])
+
+
+Using Tags
+----------
+
+If an estimator always needs the same attributes from the :any:`Sample` objects, you
+can define which attributes it takes by setting tags at the class level, instead of
+requiring the user to specify them when instantiating the :any:`SampleWrapper`:
+
+.. testsetup::
+
+    from typing import Any
+
+.. doctest::
+
+    >>> class TaggedTransformer(BaseEstimator):
+    ...     """Transforms samples with minimal user configuration."""
+    ...     def transform(self, X: np.ndarray, offsets_kwarg: np.ndarray) -> np.ndarray:
+    ...         """Adds an offset to each sample."""
+    ...         return np.array(X) + np.array(offsets_kwarg)
+    ...
+    ...     def _more_tags(self) -> dict[str, Any]:
+    ...         return {
+    ...             "requires_fit": False,  # scikit-learn Estimator tag
+    ...             "bob_input": "data",  # Optional ("data" is the default)
+    ...             "bob_transform_extra_input": [("offsets_kwarg", "offset")],
+    ...             "bob_output": "data",  # Optional ("data" is the default)
+    ...         }
+
+With these tags defined, you don't need to specify any parameters when wrapping the
+transformer with :any:`SampleWrapper` (unless you want to override the tags):
+
+.. doctest::
+
+    >>> my_transformer = TaggedTransformer()
+    >>> my_wrapped_transformer = bob.pipelines.SampleWrapper(my_transformer)
+    >>> pipe = make_pipeline(my_wrapped_transformer)
+    >>> result_samples = pipe.transform(samples)
+    >>> np.array([s.data for s in result_samples])
+    array([[0., 0.],
+           [1., 1.],
+           [2., 2.]])
+
+The tags related to the :any:`SampleWrapper` are the following:
+
+- ``bob_input``: Name of the :any:`Sample` attribute that will be passed as the first
+  parameter of the ``transform`` or ``fit`` method.
+- ``bob_transform_extra_input``: The additional :any:`Sample` attributes to pass as
+  parameters to ``transform``. The format is a list of tuples, each containing a
+  parameter name of ``transform`` and the corresponding :any:`Sample` attribute to
+  load.
+- ``bob_fit_extra_input``: The additional :any:`Sample` attributes to pass as
+  parameters to ``fit``, in the same format as ``bob_transform_extra_input``.
+- ``bob_output``: The :any:`Sample` attribute that will receive the result of
+  ``transform``.
-- 
GitLab