# Reconstructed from a flattened git diff.  Files touched by this part of
# the patch:
#   bob/bio/base/__init__.py            adds ``from . import processor``
#   bob/bio/base/processor/__init__.py  new file containing:
#       from .linearize import Linearize, SampleLinearize, CheckpointSampleLinearize
#       from .pca import CheckpointSamplePCA, SamplePCA
#   bob/bio/base/processor/linearize.py new file (section below)
#   bob/bio/base/processor/pca.py       new file (section below)

import numpy
from bob.pipelines.processor import CheckpointMixin, SampleMixin
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_array, check_is_fitted


# ---------------------------------------------------------------------------
# bob/bio/base/processor/linearize.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


class Linearize(TransformerMixin):
    """Stateless transformer that flattens array data.

    ``fit`` learns nothing and ``transform`` only reshapes its input, so an
    instance can be used without being fitted first.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Flatten ``X`` after validating it.

        Parameters
        ----------
        X : array-like
            Data to flatten.  It is validated with
            :py:func:`sklearn.utils.validation.check_array` called with
            ``allow_nd=True``; whatever that call rejects is rejected here.

        Returns
        -------
        numpy.ndarray
            * 2D input: a 1D array with all ``X.size`` elements.
            * Any other accepted input: a 2D array of shape
              ``(X.shape[0], prod(X.shape[1:]))``, i.e. the first axis is
              assumed to index the samples and is preserved.
        """
        X = check_array(X, allow_nd=True)

        if X.ndim == 2:
            # NOTE(review): a 2D input loses its first axis here, unlike the
            # n-D branch below.  Kept as is because the accompanying test
            # expects a (10, 10) array to become a (100,) vector.
            return X.reshape(-1)

        # First axis is taken to be the sample axis; NumPy infers the
        # flattened feature length.
        return X.reshape(X.shape[0], -1)


class SampleLinearize(SampleMixin, Linearize):
    """``Linearize`` combined with ``SampleMixin`` from ``bob.pipelines``.

    All sample handling comes from the mixin, which is not defined in this
    file.
    """


class CheckpointSampleLinearize(CheckpointMixin, SampleMixin, Linearize):
    """``Linearize`` combined with ``CheckpointMixin`` and ``SampleMixin``.

    All checkpoint and sample handling comes from the mixins, which are not
    defined in this file.
    """


# ---------------------------------------------------------------------------
# bob/bio/base/processor/pca.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>

# TODO: This should be deployed in bob.pipelines


class SamplePCA(SampleMixin, PCA):
    """
    Enables SAMPLE handling for https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    """


class CheckpointSamplePCA(CheckpointMixin, SampleMixin, PCA):
    """
    Enables SAMPLE and CHECKPOINTING handling for https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    """
# ---------------------------------------------------------------------------
# bob/bio/base/test/test_processor.py (reconstructed from a flattened git diff)
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>

import os
import tempfile

import numpy
from bob.pipelines.sample import DelayedSample, Sample, SampleSet
from sklearn.utils.validation import check_is_fitted

from bob.bio.base.processor import (
    CheckpointSampleLinearize,
    CheckpointSamplePCA,
    Linearize,
    SampleLinearize,
    SamplePCA,
)


def test_linearize_processor():
    """Exercise ``Linearize`` bare, on samples, and with checkpointing."""

    # Bare transformer: a 10x10 array comes back as one 100-element vector.
    X = numpy.zeros(shape=(10, 10))
    transformer = Linearize()
    X_tr = transformer.transform(X)
    assert X_tr.shape == (100,)

    # Wrapped to operate on Sample objects.
    sample = Sample(X, key="1")
    transformer = SampleLinearize()
    X_tr = transformer.transform([sample])
    assert X_tr[0].data.shape == (100,)

    # Checkpointing: the transformed sample must also be written to disk
    # under its key.
    with tempfile.TemporaryDirectory() as d:
        transformer = CheckpointSampleLinearize(features_dir=d)
        X_tr = transformer.transform([sample])
        assert X_tr[0].data.shape == (100,)
        assert os.path.exists(os.path.join(d, "1.h5"))


def test_pca_processor():
    """Exercise the PCA wrappers: fit, transform, and checkpointing."""

    # Seeded generator so every run sees the same data.
    rng = numpy.random.RandomState(0)
    X = rng.rand(100, 10)
    samples = [Sample(data, key=str(i)) for i, data in enumerate(X)]

    # fit
    n_components = 2
    estimator = SamplePCA(n_components=n_components)
    estimator = estimator.fit(samples)

    # check_is_fitted returns None on success and raises otherwise, see
    # https://scikit-learn.org/stable/modules/generated/sklearn.utils.validation.check_is_fitted.html
    assert check_is_fitted(estimator, "n_components_") is None

    # transform
    samples_tr = estimator.transform(samples)
    assert samples_tr[0].data.shape == (n_components,)

    # Checkpointing: both the fitted model and the features must hit disk.
    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, "model.pkl")
        estimator = CheckpointSamplePCA(
            n_components=n_components, features_dir=d, model_path=model_path
        )

        # fit
        estimator = estimator.fit(samples)
        assert check_is_fitted(estimator, "n_components_") is None
        assert os.path.exists(model_path)

        # transform
        samples_tr = estimator.transform(samples)
        assert samples_tr[0].data.shape == (n_components,)
        assert os.path.exists(os.path.join(d, samples_tr[0].key + ".h5"))