diff --git a/bob/bio/base/pipelines/vanilla_biometrics/__init__.py b/bob/bio/base/pipelines/vanilla_biometrics/__init__.py
index 2e1e955fdcd6ee68ba532cf8bde494e47a00a8d4..40b552d2711475e928595fdd8876b41179a6fee6 100644
--- a/bob/bio/base/pipelines/vanilla_biometrics/__init__.py
+++ b/bob/bio/base/pipelines/vanilla_biometrics/__init__.py
@@ -6,5 +6,6 @@ from .biometric_algorithms import Distance
 from .score_writers import FourColumnsScoreWriter, CSVScoreWriter
 from .wrappers import BioAlgorithmCheckpointWrapper, BioAlgorithmDaskWrapper, dask_vanilla_biometrics
+from .legacy import BioAlgorithmLegacy, DatabaseConnector
 
 __path__ = extend_path(__path__, __name__)
diff --git a/bob/bio/base/pipelines/vanilla_biometrics/legacy.py b/bob/bio/base/pipelines/vanilla_biometrics/legacy.py
index 6b44b591046c6964c1d7fb37bc5ff51a7f8fe59f..40bb6fc300dd57e25638a5e418fda1a5a7246973 100644
--- a/bob/bio/base/pipelines/vanilla_biometrics/legacy.py
+++ b/bob/bio/base/pipelines/vanilla_biometrics/legacy.py
@@ -11,15 +11,15 @@ from bob.bio.base import utils
 from .abstract_classes import (
     BioAlgorithm,
     Database,
-    create_score_delayed_sample,
-    make_four_colums_score,
 )
 from bob.io.base import HDF5File
-from bob.pipelines.mixins import SampleMixin, CheckpointMixin
-from bob.pipelines.sample import DelayedSample, SampleSet, Sample
+from bob.pipelines import DelayedSample, SampleSet, Sample
 import logging
 import copy
+from .score_writers import FourColumnsScoreWriter
+
+from bob.bio.base.algorithm import Algorithm
 
 logger = logging.getLogger("bob.bio.base")
 
@@ -156,175 +156,121 @@ class DatabaseConnector(Database):
 
         return list(probes.values())
 
-class AlgorithmAsBioAlg(_NonPickableWrapper, BioAlgorithm):
-    """Biometric Algorithm that handles legacy :py:class:`bob.bio.base.algorithm.Algorithm`
+class BioAlgorithmLegacy(BioAlgorithm):
+    """Biometric Algorithm that handles legacy :any:`bob.bio.base.algorithm.Algorithm`
+    In this design, :any:`BioAlgorithm.enroll` maps to :any:`bob.bio.base.algorithm.Algorithm.enroll` and
+    :any:`BioAlgorithm.score` maps to :any:`bob.bio.base.algorithm.Algorithm.score`
+
+    .. 
note:: + Legacy algorithms are always checkpointable - :py:method:`BioAlgorithm.enroll` maps to :py:method:`bob.bio.base.algorithm.Algoritm.enroll` - :py:method:`BioAlgorithm.score` maps :py:method:`bob.bio.base.algorithm.Algoritm.score` + Parameters + ---------- + callable: ``collection.callable`` + Callable function that instantiates the :any:`bob.bio.base.algorithm.Algorithm` Example ------- - - - Parameters - ---------- - callable: callable - Calleble function that instantiates the bob.bio.base.algorithm.Algorithm + >>> from bob.bio.base.pipelines.vanilla_biometrics import BioAlgorithmLegacy + >>> from bob.bio.base.algorithm import PCA + >>> biometric_algorithm = BioAlgorithmLegacy(PCA()) """ def __init__( - self, callable, features_dir, extension=".hdf5", model_path=None, **kwargs - ): - super().__init__(callable, **kwargs) - self.features_dir = features_dir - self.biometric_reference_dir = os.path.join( - self.features_dir, "biometric_references" - ) - self.score_dir = os.path.join(self.features_dir, "scores") - self.extension = extension - self.model_path = model_path - self.is_projector_loaded = False - - def _enroll_sample_set(self, sampleset): - # Enroll - return self.enroll(sampleset) - - def _load_projector(self): - """ - Run :py:meth:`bob.bio.base.algorithm.Algorithm.load_projector` if necessary by - :py:class:`bob.bio.base.algorithm.Algorithm` - """ - if self.instance.performs_projection and not self.is_projector_loaded: - if self.model_path is None: - raise ValueError( - "Algorithm " + f"{self. instance} performs_projection. Hence, " - "`model_path` needs to passed in `AlgorithmAsBioAlg.__init__`" - ) - else: - # Loading model - self.instance.load_projector(self.model_path) - self.is_projector_loaded = True - - def _restore_state_of_ref(self, ref): - """ - There are some algorithms that :py:meth:`bob.bio.base.algorithm.Algorithm.read_model` or - :py:meth:`bob.bio.base.algorithm.Algorithm.read_feature` depends - on the state of `self` to be properly loaded. - In these cases, it's not possible to rely only in the unbounded method extracted by - :py:func:`_get_pickable_method`. - - This function replaces the current state of these objects (that are not) - by bounding them with `self.instance` - """ - - if isinstance(ref, DelayedSample): - new_ref = copy.copy(ref) - new_ref.load = functools.partial(ref.load.func, self.instance, ref.load.args[1]) - #new_ref.load = functools.partial(ref.load.func, self.instance, ref.load.args[1]) - return new_ref - else: - return ref - - def _score_sample_set( self, - sampleset, - biometric_references, - allow_scoring_with_all_biometric_references=False, + callable, + base_dir, + force=False, + projector_file=None, + score_writer=FourColumnsScoreWriter(), + **kwargs, ): - """Given a sampleset for probing, compute the scores and retures a sample set with the scores - """ - - # Compute scores for each sample inside of the sample set - # TODO: In some cases we want to compute 1 score per sampleset (IJB-C) - # We should add an agregator function here so we can properlly agregate samples from - # a sampleset either after or before scoring. 
- # To be honest, this should be the default behaviour + super().__init__(**kwargs) - def _write_sample(ref, probe, score): - data = make_four_colums_score(ref.subject, probe.subject, probe.path, score) - return Sample(data, parent=ref) - - self._load_projector() - retval = [] + if not isinstance(callable, Algorithm): + raise ValueError( + f"Only `bob.bio.base.Algorithm` supported, not `{callable}`" + ) + logger.info(f"Using `bob.bio.base` legacy algorithm {callable}") - for subprobe_id, s in enumerate(sampleset.samples): - # Creating one sample per comparison - subprobe_scores = [] + if callable.requires_projector_training and projector_file is None: + raise ValueError(f"{callable} requires a `projector_file` to be set") - if allow_scoring_with_all_biometric_references: - if self.stacked_biometric_references is None: - self.stacked_biometric_references = [ - ref.data for ref in biometric_references - ] + self.callable = callable + self.is_background_model_loaded = False - #s = self._restore_state_of_ref(s) - scores = self.score_multiple_biometric_references( - self.stacked_biometric_references, s.data - ) - # Wrapping the scores in samples - for ref, score in zip(biometric_references, scores): - subprobe_scores.append(_write_sample(ref, sampleset, score)) + self.projector_file = projector_file + self.biometric_reference_dir = os.path.join(base_dir, "biometric_references") + self._biometric_reference_extension = ".hdf5" + self.score_dir = os.path.join(base_dir, "scores") + self.score_writer = score_writer + self.force = force - else: - for ref in [ - r for r in biometric_references if r.key in sampleset.references - ]: - - score = self.score(ref.data, s.data) - subprobe_scores.append(_write_sample(ref, sampleset, score)) + def load_legacy_background_model(self): + # Loading background model + if not self.is_background_model_loaded: + self.callable.load_projector(self.projector_file) + self.is_background_model_loaded = True - # Creating one sampleset per probe - subprobe = SampleSet(subprobe_scores, parent=sampleset) - subprobe.subprobe_id = subprobe_id + def enroll(self, enroll_features, **kwargs): + self.load_legacy_background_model() + return self.callable.enroll(enroll_features) - # Checkpointing score MANDATORY FOR LEGACY - path = os.path.join(self.score_dir, str(subprobe.path) + ".txt") - os.makedirs(os.path.dirname(path), exist_ok=True) + def score(self, biometric_reference, data, **kwargs): + self.load_legacy_background_model() + scores = self.callable.score(biometric_reference, data) + if isinstance(scores, list): + scores = self.callable.probe_fusion_function(scores) + return scores - delayed_scored_sample = create_score_delayed_sample(path, subprobe) - subprobe.samples = [delayed_scored_sample] + def score_multiple_biometric_references(self, biometric_references, data, **kwargs): + scores = self.callable.score_for_multiple_models(biometric_references, data) + return scores - retval.append(subprobe) + def write_biometric_reference(self, sample, path): + os.makedirs(os.path.dirname(path), exist_ok=True) + self.callable.write_model(sample.data, path) - return retval + def _enroll_sample_set(self, sampleset): + """ + Enroll a sample set with checkpointing + """ + # Amending `models` directory + path = os.path.join( + self.biometric_reference_dir, + str(sampleset.key) + self._biometric_reference_extension, + ) - def enroll(self, enroll_features, **kwargs): + if self.force or not os.path.exists(path): + enrolled_sample = super()._enroll_sample_set(sampleset) - if not 
isinstance(enroll_features, SampleSet): - raise ValueError( - f"`enroll_features` should be the type SampleSet, not {enroll_features}" - ) + # saving the new sample + self.write_biometric_reference(enrolled_sample, path) - path = os.path.join( - self.biometric_reference_dir, str(enroll_features.key) + self.extension + delayed_enrolled_sample = DelayedSample( + functools.partial(self.callable.read_model, path), parent=sampleset ) - self._load_projector() - if path is None or not os.path.isfile(path): - # Enrolling - data = [s.data for s in enroll_features.samples] - model = self.instance.enroll(data) - - # Checkpointing - os.makedirs(os.path.dirname(path), exist_ok=True) - self.instance.write_model(model, path) - - reader = self.instance.read_model - return DelayedSample(functools.partial(reader, path), parent=enroll_features) - def score(self, biometric_reference, data, **kwargs): - return self.instance.score(biometric_reference, data) + return delayed_enrolled_sample - def score_multiple_biometric_references(self, biometric_references, data, **kwargs): - """ - It handles the score computation of one probe against multiple biometric references using legacy - `bob.bio.base` + def _score_sample_set( + self, + sampleset, + biometric_references, + allow_scoring_with_all_biometric_references=False, + ): + path = os.path.join(self.score_dir, str(sampleset.key)) + # Computing score + scored_sample_set = super()._score_sample_set( + sampleset, + biometric_references, + allow_scoring_with_all_biometric_references=allow_scoring_with_all_biometric_references, + ) - Basically it wraps :py:meth:`bob.bio.base.algorithm.Algorithm.score_for_multiple_models`. + scored_sample_set = self.score_writer.write(scored_sample_set, path) - """ - scores = self.instance.score_for_multiple_models(biometric_references, data) - return scores + return scored_sample_set diff --git a/bob/bio/base/pipelines/vanilla_biometrics/wrappers.py b/bob/bio/base/pipelines/vanilla_biometrics/wrappers.py index a012ce4a038d440a32637c379694204706eb9414..2c2688cc5b094e9d091e84ce5320575aed5819cb 100644 --- a/bob/bio/base/pipelines/vanilla_biometrics/wrappers.py +++ b/bob/bio/base/pipelines/vanilla_biometrics/wrappers.py @@ -103,7 +103,7 @@ class BioAlgorithmCheckpointWrapper(BioAlgorithm): """ # TODO: WE CAN'T REUSE THE ALREADY WRITTEN SCORE FILE FOR LOADING - # UNLESS WE SAVE THE PICKLED THE SAMPLESET WITH THE SCORES + # UNLESS WE SAVE THE PICKLED SAMPLESET WITH THE SCORES path = os.path.join(self.score_dir, str(sampleset.key)) diff --git a/bob/bio/base/test/test_algorithms.py b/bob/bio/base/test/test_algorithms.py index 8b03c5e32599244677fe5e2ed5db437680f7a3b7..c023812e3ba32bf8d9d58e4cd2dae4b81b331a44 100644 --- a/bob/bio/base/test/test_algorithms.py +++ b/bob/bio/base/test/test_algorithms.py @@ -111,7 +111,7 @@ def test_pca(): # compare model with probe probe = pca1.read_feature(pkg_resources.resource_filename('bob.bio.base.test', 'data/pca_projected.hdf5')) reference_score = -251.53563107 - assert abs(pca1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (pca1.score(model, probe), reference_score) + assert abs(numpy.mean(pca1.score(model, probe)) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (pca1.score(model, probe), reference_score) assert abs(pca1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5 # test the calculation of the subspace dimension based on percentage of variance @@ -180,12 +180,11 @@ def test_lda(): # enroll model from random features enroll = 
utils.random_training_set(5, 5, 0., 255., seed=21) model = lda1.enroll(enroll) - _compare(model, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_model.hdf5'), lda1.write_model, lda1.read_model) - + _compare(model, pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_model.hdf5'), lda1.write_model, lda1.read_model) # compare model with probe probe = lda1.read_feature(pkg_resources.resource_filename('bob.bio.base.test', 'data/lda_projected.hdf5')) reference_score = -233.30450012 - assert abs(lda1.score(model, probe) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (lda1.score(model, probe), reference_score) + assert abs(numpy.mean(lda1.score(model, probe)) - reference_score) < 1e-5, "The scores differ: %3.8f, %3.8f" % (lda1.score(model, probe), reference_score) assert abs(lda1.score_for_multiple_probes(model, [probe, probe]) - reference_score) < 1e-5 # test the calculation of the subspace dimension based on percentage of variance diff --git a/bob/bio/base/test/test_transformers.py b/bob/bio/base/test/test_transformers.py index 3360d18428c9976b0b41b2d826ced8cfff28eaac..658f448ba840c1fdc1a75b635d3797f18cd18728 100644 --- a/bob/bio/base/test/test_transformers.py +++ b/bob/bio/base/test/test_transformers.py @@ -4,6 +4,7 @@ from bob.bio.base.preprocessor import Preprocessor from bob.bio.base.extractor import Extractor from bob.bio.base.algorithm import Algorithm +import scipy from bob.bio.base.transformers import ( PreprocessorTransformer, ExtractorTransformer, @@ -40,17 +41,18 @@ class FakeExtractorFittable(Extractor): self.model = None def __call__(self, data, metadata=None): - return data @ self.model + model = self.model + return data @ model def train(self, training_data, extractor_file): - self.model = training_data + self.model = np.vstack(training_data) bob.io.base.save(self.model, extractor_file) class FakeAlgorithm(Algorithm): def __init__(self, **kwargs): super().__init__(**kwargs) - self.requires_training = True + self.requires_projector_training = True self.split_training_features_by_client = True self.model = None @@ -64,6 +66,12 @@ class FakeAlgorithm(Algorithm): def load_projector(self, projector_file): self.model = bob.io.base.load(projector_file) + def enroll(self, enroll_features): + return np.mean(enroll_features, axis=0) + + def score(self, model, data): + return scipy.spatial.distance.euclidean(model, data) + def generate_samples(n_subjects, n_samples_per_subject, shape=(2, 2), annotations=1): """ @@ -175,7 +183,6 @@ def test_extractor_fittable(): # Testing sample sample_transformer = mario.SampleWrapper(extractor_transformer) - # Fitting training_data = np.arange(4).reshape(2, 2) training_samples = [mario.Sample(training_data, key="1")] @@ -252,7 +259,6 @@ def test_algorithm(): def test_wrap_bob_pipeline(): - def run_pipeline(with_dask): with tempfile.TemporaryDirectory() as dir_name: @@ -263,14 +269,12 @@ def test_wrap_bob_pipeline(): transform_extra_arguments=(("annotations", "annotations"),), ), wrap_transform_bob(FakeExtractor(), dir_name,), - wrap_transform_bob( - FakeAlgorithm(), dir_name - ), + wrap_transform_bob(FakeAlgorithm(), dir_name), ) oracle = [7.0, 7.0, 7.0, 7.0] training_samples = generate_samples(n_subjects=2, n_samples_per_subject=2) test_samples = generate_samples(n_subjects=1, n_samples_per_subject=1) - if with_dask: + if with_dask: pipeline = mario.wrap(["dask"], pipeline) transformed_samples = ( pipeline.fit(training_samples).transform(test_samples).compute() diff --git 
a/bob/bio/base/test/test_vanilla_biometrics.py b/bob/bio/base/test/test_vanilla_biometrics.py index 9b8d02afdc9522e2c1d8d5c7037c157b4bec3527..d36ae1659e0d06202edcf09a1018c6066601a055 100644 --- a/bob/bio/base/test/test_vanilla_biometrics.py +++ b/bob/bio/base/test/test_vanilla_biometrics.py @@ -4,11 +4,11 @@ from bob.pipelines import Sample, SampleSet, DelayedSample import os -import numpy +import numpy as np import tempfile from sklearn.pipeline import make_pipeline from bob.bio.base.wrappers import wrap_transform_bob -from bob.bio.base.test.test_transformers import FakePreprocesor, FakeExtractor +from bob.bio.base.test.test_transformers import FakePreprocesor, FakeExtractor, FakeAlgorithm from bob.bio.base.pipelines.vanilla_biometrics import ( Distance, VanillaBiometricsPipeline, @@ -16,12 +16,13 @@ from bob.bio.base.pipelines.vanilla_biometrics import ( dask_vanilla_biometrics, FourColumnsScoreWriter, CSVScoreWriter, + BioAlgorithmLegacy ) import bob.pipelines as mario import uuid import shutil - +import itertools class DummyDatabase: def __init__(self, delayed=False, n_references=10, n_probes=10, dim=10, one_d=True): @@ -35,27 +36,27 @@ class DummyDatabase: def _create_random_1dsamples(self, n_samples, offset, dim): return [ - Sample(numpy.random.rand(dim), key=str(uuid.uuid4()), annotations=1) + Sample(np.random.rand(dim), key=str(uuid.uuid4()), annotations=1, subject=str(i)) for i in range(offset, offset + n_samples) ] def _create_random_2dsamples(self, n_samples, offset, dim): return [ - Sample(numpy.random.rand(dim, dim), key=str(uuid.uuid4()), annotations=1) + Sample(np.random.rand(dim, dim), key=str(uuid.uuid4()), annotations=1, subject=str(i)) for i in range(offset, offset + n_samples) ] def _create_random_sample_set(self, n_sample_set=10, n_samples=2): # Just generate random samples - numpy.random.seed(10) + np.random.seed(10) sample_set = [ SampleSet( samples=[], key=str(i), subject=str(i), - gender=numpy.random.choice(self.gender_choices), - metadata_1=numpy.random.choice(self.metadata_1_choices), + gender=np.random.choice(self.gender_choices), + metadata_1=np.random.choice(self.metadata_1_choices), ) for i in range(n_sample_set) ] @@ -73,7 +74,8 @@ class DummyDatabase: return sample_set def background_model_samples(self): - return self._create_random_sample_set() + samples = [sset.samples for sset in self._create_random_sample_set()] + return list(itertools.chain(*samples)) def references(self): return self._create_random_sample_set(self.n_references, self.dim) @@ -99,7 +101,20 @@ def _make_transformer(dir_name): dir_name, transform_extra_arguments=(("annotations", "annotations"),), ), - wrap_transform_bob(FakeExtractor(), dir_name,), + wrap_transform_bob(FakeExtractor(), dir_name,) + ) + + return pipeline + +def _make_transformer_with_algorithm(dir_name): + pipeline = make_pipeline( + wrap_transform_bob( + FakePreprocesor(), + dir_name, + transform_extra_arguments=(("annotations", "annotations"),), + ), + wrap_transform_bob(FakeExtractor(), dir_name), + wrap_transform_bob(FakeAlgorithm(), dir_name) ) return pipeline @@ -148,7 +163,7 @@ def test_on_memory(): run_pipeline(True) # Testing checkpoint -def test_checkpoint_bioalg(): +def test_checkpoint_bioalg_as_transformer(): with tempfile.TemporaryDirectory() as dir_name: @@ -205,3 +220,53 @@ def test_checkpoint_bioalg(): # CSVWriter + Dask run_pipeline(True, CSVScoreWriter()) run_pipeline(True, CSVScoreWriter()) # Checking if the checkpointng works + + +def test_checkpoint_bioalg_as_bioalg(): + + with 
tempfile.TemporaryDirectory() as dir_name:
+
+        def run_pipeline(with_dask, score_writer=FourColumnsScoreWriter()):
+            database = DummyDatabase()
+
+            transformer = _make_transformer_with_algorithm(dir_name)
+            projector_file = transformer[2].estimator.estimator.projector_file
+
+            biometric_algorithm = BioAlgorithmLegacy(
+                FakeAlgorithm(), base_dir=dir_name, score_writer=score_writer, projector_file=projector_file
+            )
+
+            vanilla_biometrics_pipeline = VanillaBiometricsPipeline(
+                transformer, biometric_algorithm
+            )
+
+            if with_dask:
+                vanilla_biometrics_pipeline = dask_vanilla_biometrics(
+                    vanilla_biometrics_pipeline, npartitions=2
+                )
+
+            scores = vanilla_biometrics_pipeline(
+                database.background_model_samples(),
+                database.references(),
+                database.probes(),
+                allow_scoring_with_all_biometric_references=database.allow_scoring_with_all_biometric_references,
+            )
+
+            filename = os.path.join(dir_name, "concatenated_scores.txt")
+            score_writer.concatenate_write_scores(scores, filename)
+
+            if isinstance(score_writer, CSVScoreWriter):
+                assert len(open(filename).readlines()) == 101
+            else:
+                assert len(open(filename).readlines()) == 100
+
+        run_pipeline(False)
+        run_pipeline(False)  # Checking if the checkpointing works
+        shutil.rmtree(dir_name)  # Deleting the cache so it runs again from scratch
+        os.makedirs(dir_name, exist_ok=True)
+
+        # Dask
+        run_pipeline(True)
+        run_pipeline(True)  # Checking if the checkpointing works
+        shutil.rmtree(dir_name)  # Deleting the cache so it runs again from scratch
+        os.makedirs(dir_name, exist_ok=True)
diff --git a/bob/bio/base/transformers/algorithm.py b/bob/bio/base/transformers/algorithm.py
index 6e74a3068ff8a123e29ca490dab75b12268e08a1..16bcf1c94111f3158827522d411f0f59e4c092e2 100644
--- a/bob/bio/base/transformers/algorithm.py
+++ b/bob/bio/base/transformers/algorithm.py
@@ -19,10 +19,9 @@ class AlgorithmTransformer(TransformerMixin, BaseEstimator):
     -------
         Wrapping LDA algorithm with functtools
 
-    >>> from bob.bio.base.pipelines.vanilla_biometrics.legacy import LegacyAlgorithmAsTransformer
+    >>> from bob.bio.base.pipelines.vanilla_biometrics import AlgorithmTransformer
     >>> from bob.bio.base.algorithm import LDA
-    >>> import functools
-    >>> transformer = LegacyAlgorithmAsTransformer(functools.partial(LDA, use_pinv=True, pca_subspace_dimension=0.90))
+    >>> transformer = AlgorithmTransformer(LDA(use_pinv=True, pca_subspace_dimension=0.90))
 
 
     Parameters
@@ -41,7 +40,7 @@ class AlgorithmTransformer(TransformerMixin, BaseEstimator):
             "`callable` should be an instance of `bob.bio.base.extractor.Algorithm`"
         )
 
-        if callable.requires_training and (
+        if callable.requires_projector_training and (
             projector_file is None or projector_file == ""
         ):
             raise ValueError(
@@ -56,7 +55,7 @@ class AlgorithmTransformer(TransformerMixin, BaseEstimator):
         super().__init__(**kwargs)
 
     def fit(self, X, y=None):
-        if not self.callable.requires_training:
+        if not self.callable.requires_projector_training:
             return self
         training_data = X
         if self.callable.split_training_features_by_client:
@@ -77,6 +76,6 @@ class AlgorithmTransformer(TransformerMixin, BaseEstimator):
 
     def _more_tags(self):
         return {
-            "stateless": not self.callable.requires_training,
-            "requires_fit": self.callable.requires_training,
+            "stateless": not self.callable.requires_projector_training,
+            "requires_fit": self.callable.requires_projector_training,
         }
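
Usage note: the new `BioAlgorithmLegacy` wrapper is exercised end-to-end by `test_checkpoint_bioalg_as_bioalg` above. Below is a minimal sketch of the same wiring outside the test; `legacy_algorithm`, `transformer` and `database` are placeholders for any `bob.bio.base.algorithm.Algorithm` instance, the sklearn transformer pipeline that feeds it, and a `DatabaseConnector`-style database, and the paths are illustrative.

```python
from bob.bio.base.pipelines.vanilla_biometrics import (
    BioAlgorithmLegacy,
    FourColumnsScoreWriter,
    VanillaBiometricsPipeline,
    dask_vanilla_biometrics,
)

# Placeholders (see note above): legacy_algorithm, transformer, database.
biometric_algorithm = BioAlgorithmLegacy(
    legacy_algorithm,                                # any bob.bio.base.algorithm.Algorithm
    base_dir="./checkpoints",                        # models land in ./checkpoints/biometric_references
    projector_file="./checkpoints/Projector.hdf5",   # required when requires_projector_training is True
    score_writer=FourColumnsScoreWriter(),
)

pipeline = VanillaBiometricsPipeline(transformer, biometric_algorithm)
pipeline = dask_vanilla_biometrics(pipeline, npartitions=2)  # optional: run with Dask

scores = pipeline(
    database.background_model_samples(),
    database.references(),
    database.probes(),
    allow_scoring_with_all_biometric_references=database.allow_scoring_with_all_biometric_references,
)
```

Because legacy algorithms are always checkpointed, a second run with the same `base_dir` skips enrollment for every reference whose `.hdf5` file already exists, unless `force=True` is passed; this is what the repeated `run_pipeline` calls in the test exercise.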
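Related doc note: the updated `AlgorithmTransformer` docstring example still omits `projector_file`, while the constructor now raises for algorithms that require projector training (as LDA does). A hedged sketch of a complete call, assuming the class is exported from `bob.bio.base.transformers` like the other transformer wrappers; the path is illustrative.

```python
from bob.bio.base.algorithm import LDA
from bob.bio.base.transformers import AlgorithmTransformer

# "Projector.hdf5" is just an illustrative location for the trained projector.
transformer = AlgorithmTransformer(
    LDA(use_pinv=True, pca_subspace_dimension=0.90),
    projector_file="./checkpoints/Projector.hdf5",
)
```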
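For context on the `FakeAlgorithm` used by the new tests: it enrolls by averaging the enrollment features and scores with a Euclidean distance. The snippet below restates that contract with plain NumPy/SciPy, which may help when checking the oracle values in the tests.

```python
import numpy as np
import scipy.spatial.distance

# Mirrors FakeAlgorithm.enroll: the model is the mean of the enrollment features.
enroll_features = np.array([[1.0, 2.0], [3.0, 4.0]])
model = np.mean(enroll_features, axis=0)  # -> array([2., 3.])

# Mirrors FakeAlgorithm.score: Euclidean distance between model and probe.
probe = np.array([2.0, 3.0])
score = scipy.spatial.distance.euclidean(model, probe)  # -> 0.0
```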