From c1bb19f5b6ff660f1523d7771a06e06b42e0c038 Mon Sep 17 00:00:00 2001
From: Yannick DAYER <yannick.dayer@idiap.ch>
Date: Wed, 22 Mar 2023 20:45:55 +0100
Subject: [PATCH] doc [sample]: Added info on estimator and pipeline

---
 doc/estimator.rst | 63 ++++++++++++++++++++++++++++++++++++++++++++
 doc/index.rst     |  2 ++
 doc/pipeline.rst  | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 doc/sample.rst    | 60 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 188 insertions(+), 3 deletions(-)
 create mode 100644 doc/estimator.rst
 create mode 100644 doc/pipeline.rst

diff --git a/doc/estimator.rst b/doc/estimator.rst
new file mode 100644
index 0000000..e935604
--- /dev/null
+++ b/doc/estimator.rst
@@ -0,0 +1,63 @@
+.. _bob.pipelines.estimator:
+
+Scikit-learn Estimators
+=======================
+
+This is a short example showing the difference between an estimator taking one parameter
+and one taking metadata for each sample in addition.
+
+Example of a custom Estimator
+-----------------------------
+
+Let us make an Estimator that takes batches of arrays as input and applies a simple
+function to each of them:
+
+.. doctest:: custom_estimator
+
+    >>> import numpy as np
+    >>> from sklearn.base import BaseEstimator
+    >>> class OffsetTransformer(BaseEstimator):
+    ...     """Demo Estimator to add an offset to arrays."""
+    ...     def __init__(self, offset=1):
+    ...         self.offset = offset
+    ...
+    ...     def transform(self, arrays: np.ndarray) -> np.ndarray:
+    ...         """Add the offset to each array"""
+    ...         return [arr + self.offset for arr in arrays]
+
+    >>> transformer = OffsetTransformer(offset=2)
+    >>> transformer.transform([np.array([1, 2]), np.array([2, 3])])
+    [array([3, 4]), array([4, 5])]
+
+.. note::
+
+    The ``transform`` method accepts a series of data samples (it works on batches). If
+    you work with 2D numpy arrays, your ``transform`` needs to handle 3D arrays (or you
+    need to loop over the first dimension to handle each sample individually).
+
+Now let's edit the ``OffsetTransformer`` so that each sample can be offset by a
+different value. This may be the case when applying preprocessing on some data with
+annotations. The annotations will be different for each sample.
+Here, we want to pass an offset for each sample given to ``transform``:
+
+.. doctest:: custom_estimator
+
+    >>> class OffsetTransformerPerSample(BaseEstimator):
+    ...     """Demo Estimator to add a different offset to each sample array."""
+    ...     def transform(self, arrays: np.ndarray, offsets: np.ndarray) -> np.ndarray:
+    ...         """Add its offset to each array"""
+    ...         return [arr + o for arr, o in zip(arrays, offsets)]
+
+    >>> transformer_2 = OffsetTransformerPerSample()
+
+We need to pass two series of arrays to ``transform``, one for the samples data and one
+containing the offsets:
+
+.. doctest:: custom_estimator
+
+    >>> transformer_2.transform(arrays=[np.array([3,4,5]), np.array([1,2,3])], offsets=[1,2])
+    [array([4, 5, 6]), array([3, 4, 5])]
+
+
+We will see how the second estimator (with multiple parameters in ``transform``) can
+cause problems with the pipeline API in the next page.
diff --git a/doc/index.rst b/doc/index.rst
index 1177c76..eb2e548 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -31,6 +31,8 @@ User Guide
 .. toctree::
    :maxdepth: 2
 
+   estimator
+   pipeline
    sample
    checkpoint
    dask
diff --git a/doc/pipeline.rst b/doc/pipeline.rst
new file mode 100644
index 0000000..bb7eae5
--- /dev/null
+++ b/doc/pipeline.rst
@@ -0,0 +1,66 @@
+.. _bob.pipelines.pipeline:
+
+Making pipelines
+================
+
+We rely on the :ref:`scikit-learn pipeline API<scikit-learn:pipeline>` to run our
+pipelines.
+Basically, a pipeline is a series of :py:class:`sklearn.base.BaseEstimator` objects.
+When data is fed to the pipeline, the first Estimator receives it, processes it and what
+is returned is then fed to the second Estimator in the pipeline, and so on until the
+end of the steps. The result of the last Estimator is the result of the pipeline.
+
+To make a pipeline, you can call :py:func:`sklearn.pipeline.make_pipeline` or directly
+instantiate a Pipeline object with :py:class:`sklearn.pipeline.Pipeline`.
+
+Below is a quick example on how to make a pipeline out of two Estimators:
+
+.. doctest:: pipeline
+
+    >>> import numpy
+    >>> from sklearn.pipeline import make_pipeline
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> from sklearn.decomposition import PCA
+
+    >>> pipeline = make_pipeline(StandardScaler(), PCA())
+    >>> pipeline = pipeline.fit([[0, 0], [0, 0], [2, 2]])
+    >>> pipeline.transform([[0, 1], [2, 1]])
+    array([[-0.25,  0.75],
+           [ 1.25, -0.75]])
+
+The issue with scikit-learn pipelines
+-------------------------------------
+
+The previous example works fine as long as we only have one parameter to give to the
+``transform`` or ``fit`` method of all the Estimators in the pipeline.
+
+If we want to pass some metadata to the Estimators in a pipeline (like the ``offsets``
+parameter from the previous page), we cannot include the Estimator in a pipeline.
+
+This example takes the transformer with an ``offsets`` parameter defined in the
+previous page and tries to make a pipeline out of it:
+
+.. testsetup:: custom_estimator
+
+    from sklearn.base import BaseEstimator
+    import numpy as np
+    class OffsetTransformerPerSample(BaseEstimator):
+        """Demo Estimator to add a different offset to each sample array."""
+        def transform(self, arrays: np.ndarray, offsets: np.ndarray) -> np.ndarray:
+            """Add its offset to each array"""
+            return [arr + o for arr, o in zip(arrays, offsets)]
+
+    transformer_2 = OffsetTransformerPerSample()
+
+.. doctest:: custom_estimator
+
+   >>> from sklearn.pipeline import make_pipeline
+   >>> pipeline = make_pipeline(transformer_2)
+   >>> pipeline.transform([np.array([3,4,5]), np.array([1,2,3])], [1,2])
+   Traceback (most recent call last):
+      ...
+   TypeError: _transform() takes 2 positional arguments but 3 were given
+
+In order to include such an Estimator in a pipeline, we must add some logic to redirect
+data and metadata through the pipeline. This is the goal of the :any:`Sample` and
+:any:`SampleWrapper` presented in the next page.
diff --git a/doc/sample.rst b/doc/sample.rst
index c4d9b30..830d9b7 100644
--- a/doc/sample.rst
+++ b/doc/sample.rst
@@ -6,7 +6,7 @@ Samples, a way to enhance scikit pipelines with metadata
 Some tasks in pattern recognition demands the usage of metadata to support some processing (e.g. face cropping, audio segmentation).
 To support scikit-learn based estimators with such requirement task, this package provides two mechanisms that:
 
-    1. Wraps input data in a layer called :any:`Sample` that allows you to append some metadata to your original input data.
+    1. Wraps input data in a container called :any:`Sample` that allows you to append some metadata to your original input data.
 
     2. A wrapper class (:any:`SampleWrapper`) that interplay between :any:`Sample` and your estimator.
 
@@ -87,8 +87,8 @@ While this transformer works well by itself, it can't be used by
    TypeError: _transform() takes 2 positional arguments but 3 were given
 
 To approach this issue, :any:`SampleWrapper` can be used. This class wraps
-other estimators and accepts as input samples and passes the data with metadata inside
-samples to the wrapped estimator:
+other estimators and accepts as input :any:`Sample` objects and passes the data with
+metadata inside samples to the wrapped estimator:
 
 .. doctest::
 
@@ -215,3 +215,57 @@ transform each sample inside and returns the same SampleSets with new data.
    DelayedSample(offset=array([1]))
    >>> transformed_sample_sets[0].samples[1].data
    array([1., 1.])
+
+
+Using Tags
+----------
+
+If an estimator always needs the same elements from the :any:`Sample` objects, you can
+define which attribute it takes by setting tags at the class level, instead of requiring
+the user to define this when instantiating:
+
+.. testsetup::
+
+   from typing import Any
+
+.. doctest::
+
+   >>> class TaggedTransformer(BaseEstimator):
+   ...      """Transforms samples with minimal user configuration"""
+   ...      def transform(self, X: np.ndarray, offsets_kwarg: np.ndarray) -> np.ndarray:
+   ...          """Adds an offset to each sample."""
+   ...          return np.array(X) + np.array(offsets_kwarg)
+   ...
+   ...      def _more_tags(self) -> dict[str, Any]:
+   ...          return {
+   ...              "requires_fit": False,  # sklearn Estimator tag
+   ...              "bob_input": "data",  # Optional (data is the default)
+   ...              "bob_transform_extra_input": [("offsets_kwarg", "offset")],
+   ...              "bob_output": "data",  # Optional (data is the default)
+   ...          }
+
+With these tags defined, when wrapping the transformer with :any:`SampleWrapper` you
+don't need to specify any parameters (unless you want to override the tags):
+
+.. doctest::
+
+   >>> my_transformer = TaggedTransformer()
+   >>> my_wrapped_transformer = bob.pipelines.SampleWrapper(my_transformer)
+   >>> pipe = make_pipeline(my_wrapped_transformer)
+   >>> result_samples = pipe.transform(samples)
+   >>> np.array([s.data for s in result_samples])
+   array([[0., 0.],
+          [1., 1.],
+          [2., 2.]])
+
+The tags related to the :any:`SampleWrapper` are the following:
+
+- ``bob_input``: Name of an attribute in :any:`Sample` that will be passed as first
+    parameter of the ``transform`` or ``fit`` method.
+- ``bob_transform_extra_input``: The additional :any:`Sample` attributes to pass as
+    parameters to ``transform``. The format is a list of tuples, each containing the
+    parameter name in ``transform`` and the corresponding :any:`Sample` attribute to
+    load.
+- ``bob_fit_extra_input``: Additional :any:`Sample` attributes to pass to the fit method.
+- ``bob_output``: The :any:`Sample` attribute that will receive the result of
+    ``transform``.
-- 
GitLab