Commit 9b7a8633 authored by Tiago de Freitas Pereira

Cleaning up vanilla biometrics

parent 5c9dd28f
Merge requests: !192 Redoing baselines, !180 [dask] Preparing bob.bio.base for dask pipelines
Pipeline #40540 passed
@@ -25,7 +25,7 @@ from bob.bio.base.pipelines.vanilla_biometrics import (
     dask_vanilla_biometrics,
     dask_get_partition_size,
     FourColumnsScoreWriter,
-    CSVScoreWriter
+    CSVScoreWriter,
 )
 from dask.delayed import Delayed
 import pkg_resources
@@ -44,31 +44,18 @@ EPILOG = """\b
 -----------------------
-$ bob pipelines vanilla-biometrics my_experiment.py -vv
+$ bob pipelines vanilla-biometrics -p my_experiment.py -vv
 my_experiment.py must contain the following elements:
->>> preprocessor = my_preprocessor() \n
->>> extractor = my_extractor() \n
->>> algorithm = my_algorithm() \n
->>> checkpoints = EXPLAIN CHECKPOINTING \n
+>>> transformer = ...  # A scikit-learn pipeline
+>>> algorithm = ...  # A `BioAlgorithm`
+>>> pipeline = VanillaBiometricsPipeline(transformer, algorithm)
+>>> database = ...  # Biometric database connector (a class that implements the methods `background_model_samples`, `references` and `probes`)
 \b
 Look at the following example:
 $ bob pipelines vanilla-biometrics ./bob/pipelines/config/distributed/sge_iobig_16cores.py \
 ./bob/pipelines/config/database/mobio_male.py \
 ./bob/pipelines/config/baselines/facecrop_pca.py
 \b
 TODO: Work out this help
 """
@@ -122,24 +109,49 @@ def post_process_scores(pipeline, scores, path):
     help="Name of output directory",
 )
 @click.option(
-    "--write-metadata-scores", "-m",
+    "--write-metadata-scores",
+    "-m",
     is_flag=True,
-    help="If set, all the scores will be written with all its metadata",
+    help="If set, all the scores will be written with all their metadata using the `CSVScoreWriter`",
 )
+@click.option(
+    "--checkpoint",
+    "-c",
+    is_flag=True,
+    help="If set, it will checkpoint all steps of the pipeline",
+)
 @verbosity_option(cls=ResourceOption)
-def vanilla_biometrics(pipeline, database, dask_client, groups, output, write_metadata_scores, **kwargs):
+def vanilla_biometrics(
+    pipeline,
+    database,
+    dask_client,
+    groups,
+    output,
+    write_metadata_scores,
+    checkpoint,
+    **kwargs,
+):
"""Runs the simplest biometrics pipeline.
Such pipeline consists into three sub-pipelines.
In all of them, given raw data as input it does the following steps:
Such pipeline consists into two major components.
The first component consists of a scikit-learn `Pipeline`,
where a sequence of transformations of the input data
is defined.
The second component is a `BioAlgorithm` that defines the primitives
`enroll` and `score`
With those two components any Biometric Experiment can be done.
A Biometric experiment consists of three sub-pipelines and
they are defined below:
Sub-pipeline 1:\n
---------------
Training background model. Some biometric algorithms demands the training of background model, for instance, PCA/LDA matrix or a Neural networks. This sub-pipeline handles that and it consists of 3 steps:
Training background model.
Some biometric algorithms demands the training of background model, for instance, PCA/LDA matrix or a Neural networks.
\b
raw_data --> preprocessing >> feature extraction >> train background model --> background_model
This pipeline runs: `Pipeline.fit(DATA_FOR_FIT)`
@@ -149,26 +161,24 @@ def vanilla_biometrics(pipeline, database, dask_client, groups, output, write_metadata_scores, **kwargs):
     ---------------
     Creation of biometric references: This is a standard step in biometric pipelines.
-    Given a set of samples of one identity, create a biometric reference (a.k.a template) for sub identity. This sub-pipeline handles that in 3 steps and they are the following:
+    Given a set of samples of one identity, create a biometric reference (a.k.a. template) for each identity.
     \b
     raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference
-    Note that this sub-pipeline depends on the previous one
+    This pipeline runs: `BioAlgorithm.enroll(Pipeline.transform(DATA_ENROLL))` >> biometric_references
     Sub-pipeline 3:\n
     ---------------
-    Probing: This is another standard step in biometric pipelines.
-    Given one sample and one biometric reference, computes a score.
-    Such score has different meanings depending on the scoring method your biometric algorithm uses.
-    It's out of scope to explain in a help message to explain what scoring is for different biometric algorithms.
+    Probing: This is another standard step in biometric pipelines. Given one sample and one biometric reference, a score is computed. Such a score has a different meaning depending on the scoring method your biometric algorithm uses. It is out of the scope of this help message to explain what scoring means for different biometric algorithms.
-    raw_data --> preprocessing >> feature extraction >> probe(biometric_reference, background_model) --> score
-    Note that this sub-pipeline depends on the two previous ones
+    This pipeline runs: `BioAlgorithm.score(Pipeline.transform(DATA_SCORE, biometric_references))` >> scores
     """
@@ -185,12 +195,12 @@ def vanilla_biometrics(pipeline, database, dask_client, groups, output, write_metadata_scores, **kwargs):
     database = vanilla_pipeline.database
     pipeline = vanilla_pipeline.pipeline
     if write_metadata_scores:
-        pipeline.score_writer = CSVScoreWriter(os.path.join(output,"./tmp"))
+        pipeline.score_writer = CSVScoreWriter(os.path.join(output, "./tmp"))
     else:
-        pipeline.score_writer = FourColumnsScoreWriter(os.path.join(output,"./tmp"))
+        pipeline.score_writer = FourColumnsScoreWriter(os.path.join(output, "./tmp"))
     # Check if it's already checkpointed
-    if not isinstance_nested(
+    if checkpoint and not isinstance_nested(
         pipeline.biometric_algorithm,
         "biometric_algorithm",
         BioAlgorithmCheckpointWrapper,
@@ -206,7 +216,7 @@ def vanilla_biometrics(pipeline, database, dask_client, groups, output, write_metadata_scores, **kwargs):
     if dask_client is not None and not isinstance_nested(
         pipeline.biometric_algorithm, "biometric_algorithm", BioAlgorithmDaskWrapper
-        ):
+    ):
         n_objects = (
             len(background_model_samples) + len(biometric_references) + len(probes)
         )
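
The dask branch above sizes partitions from `n_objects`; a rough sketch of how those helpers could fit together is shown below. The exact signatures of `dask_get_partition_size` and `dask_vanilla_biometrics` are assumptions here:

    # Wrap the pipeline for distributed execution, choosing a partition
    # size proportional to the total number of samples to process
    partition_size = dask_get_partition_size(dask_client.cluster, n_objects)
    pipeline = dask_vanilla_biometrics(pipeline, partition_size=partition_size)
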
...
@@ -28,7 +28,7 @@ from bob.bio.base.pipelines.vanilla_biometrics import (
     dask_vanilla_biometrics,
     dask_get_partition_size,
     FourColumnsScoreWriter,
-    CSVScoreWriter
+    CSVScoreWriter,
 )
 from dask.delayed import Delayed
 from bob.bio.base.utils import get_resource_filename
@@ -47,31 +47,19 @@ EPILOG = """\b
 -----------------------
-$ bob pipelines vanilla-biometrics my_experiment.py -vv
+$ bob pipelines vanilla-biometrics -p my_experiment.py -vv
 my_experiment.py must contain the following elements:
->>> preprocessor = my_preprocessor() \n
->>> extractor = my_extractor() \n
->>> algorithm = my_algorithm() \n
->>> checkpoints = EXPLAIN CHECKPOINTING \n
+>>> transformer = ...  # A scikit-learn pipeline
+>>> algorithm = ...  # A `BioAlgorithm`
+>>> pipeline = VanillaBiometricsPipeline(transformer, algorithm)
+>>> database = ...  # Biometric database connector (a class that implements the methods `background_model_samples`, `references` and `probes`)
 \b
 Look at the following example:
 $ bob pipelines vanilla-biometrics ./bob/pipelines/config/distributed/sge_iobig_16cores.py \
 ./bob/pipelines/config/database/mobio_male.py \
 ./bob/pipelines/config/baselines/facecrop_pca.py
 \b
 TODO: Work out this help
 """
@@ -79,12 +67,11 @@ TODO: Work out this help
     entry_point_group="bob.pipelines.config", cls=ConfigCommand, epilog=EPILOG,
 )
 @click.option(
-    "--pipeline", "-p", required=True, help="Vanilla biometrics pipeline",
+    "--pipeline", "-p", required=True, help="An entry point or a configuration file containing a `VanillaBiometricsPipeline`.",
 )
 @click.option(
     "--database",
-    "-d",
-    required=True,
+    "-d",
     help="Biometric database connector (a class that implements the methods `background_model_samples`, `references` and `probes`)",
 )
 @click.option(
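
A minimal connector satisfying the interface described by --database could be sketched as follows; the class name and the `group` argument are illustrative assumptions:

    class MyDatabaseConnector:
        def background_model_samples(self):
            # Samples used to train the background model (sub-pipeline 1)
            return [...]

        def references(self, group="dev"):
            # Sample sets, one per identity to be enrolled (sub-pipeline 2)
            return [...]

        def probes(self, group="dev"):
            # Probe samples to be scored against references (sub-pipeline 3)
            return [...]
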
@@ -115,17 +102,84 @@ TODO: Work out this help
     help="If set, will consider genuine scores in the ZT score normalization",
 )
 @click.option(
-    "--write-metadata-scores", "-m",
+    "--write-metadata-scores",
+    "-m",
     is_flag=True,
     help="If set, all the scores will be written with all their metadata",
 )
-@click.option("--ztnorm-cohort-proportion", default=1., type=float,
-    help="Sets the percentage of samples used for t-norm and z-norm. Sometimes you don't want to use all the t/z samples for normalization")
+@click.option(
+    "--ztnorm-cohort-proportion",
+    default=1.0,
+    type=float,
+    help="Sets the proportion of samples used for t-norm and z-norm. Sometimes you don't want to use all the t/z samples for normalization",
+)
+@click.option(
+    "--checkpoint",
+    "-c",
+    is_flag=True,
+    help="If set, it will checkpoint all steps of the pipeline",
+)
 @verbosity_option(cls=ResourceOption)
 def vanilla_biometrics_ztnorm(
-    pipeline, database, dask_client, groups, output, consider_genuines, write_metadata_scores, ztnorm_cohort_proportion, **kwargs
+    pipeline,
+    database,
+    dask_client,
+    groups,
+    output,
+    consider_genuines,
+    write_metadata_scores,
+    ztnorm_cohort_proportion,
+    checkpoint,
+    **kwargs,
 ):
"""Runs the simplest biometrics pipeline under ZTNorm.
"""Runs the the vanilla-biometrics with ZT-Norm like score normalizations.
Such pipeline consists into two major components.
The first component consists of a scikit-learn `Pipeline`,
where a sequence of transformations of the input data
is defined.
The second component is a `BioAlgorithm` that defines the primitives
`enroll` and `score`
With those two components any Biometric Experiment can be done.
A Biometric experiment consists of three sub-pipelines and
they are defined below:
Sub-pipeline 1:\n
---------------
Training background model.
Some biometric algorithms demands the training of background model, for instance, PCA/LDA matrix or a Neural networks.
\b
This pipeline runs: `Pipeline.fit(DATA_FOR_FIT)`
\b
Sub-pipeline 2:\n
---------------
Creation of biometric references: This is a standard step in a biometric pipelines.
Given a set of samples of one identity, create a biometric reference (a.k.a template) for sub identity.
\b
raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference
This pipeline runs: `BioAlgorithm.enroll(Pipeline.transform(DATA_ENROLL))` >> biometric_references
Sub-pipeline 3:\n
---------------
Probing: This is another standard step in biometric pipelines.
Given one sample and one biometric reference, computes a score.
Such score has different meanings depending on the scoring method your biometric algorithm uses.
It's out of scope to explain in a help message to explain what scoring is for different biometric algorithms.
This pipeline runs: `BioAlgorithm.score(Pipeline.transform(DATA_SCORE, biometric_references))` >> biometric_references
"""
@@ -144,7 +198,6 @@ def vanilla_biometrics_ztnorm(
     if not os.path.exists(output):
         os.makedirs(output, exist_ok=True)
-    # It's necessary to chain load 2 resources together
     pipeline_config = get_resource_filename(pipeline, "bob.bio.pipeline")
     database_config = get_resource_filename(database, "bob.bio.database")
@@ -157,20 +210,18 @@ def vanilla_biometrics_ztnorm(
     pipeline = vanilla_pipeline.pipeline
     if write_metadata_scores:
-        pipeline.score_writer = CSVScoreWriter(os.path.join(output,"./tmp"))
+        pipeline.score_writer = CSVScoreWriter(os.path.join(output, "./tmp"))
     else:
-        pipeline.score_writer = FourColumnsScoreWriter(os.path.join(output,"./tmp"))
+        pipeline.score_writer = FourColumnsScoreWriter(os.path.join(output, "./tmp"))
     # Check if it's already checkpointed
-    if not isinstance_nested(
+    if checkpoint and not isinstance_nested(
         pipeline.biometric_algorithm,
         "biometric_algorithm",
         BioAlgorithmCheckpointWrapper,
     ):
         pipeline = checkpoint_vanilla_biometrics(pipeline, output)
     # Patching the pipeline in case of ZNorm and checkpointing it
     pipeline = ZTNormPipeline(pipeline)
     pipeline.ztnorm_solver = ZTNormCheckpointWrapper(
@@ -213,8 +264,14 @@ def vanilla_biometrics_ztnorm(
         probes, zprobes = _merge_references_ztnorm(
             biometric_references, probes, zprobes, treferences
         )
-    raw_scores, z_normed_scores, t_normed_scores, zt_normed_scores, s_normed_scores = pipeline(
+    (
+        raw_scores,
+        z_normed_scores,
+        t_normed_scores,
+        zt_normed_scores,
+        s_normed_scores,
+    ) = pipeline(
         background_model_samples,
         biometric_references,
         probes,
@@ -225,7 +282,7 @@ def vanilla_biometrics_ztnorm(
     def _build_filename(score_file_name, suffix):
         return os.path.join(score_file_name, suffix)
     # Running RAW_SCORES
     raw_scores = post_process_scores(
         pipeline, raw_scores, _build_filename(score_file_name, "raw_scores")
...