Commit b22ac980 authored by Amir MOHAMMADI's avatar Amir MOHAMMADI
Browse files

Merge branch 'dask-pipelines' into 'master'

Dask pipelines Improvements

See merge request !78
parents 8dcd5aac 3ad7b6c9
Pipeline #46074 passed with stages
in 5 minutes and 33 seconds
......@@ -21,11 +21,6 @@ def __appropriate__(*args):
PadFile, PadDatabase, FileListPadDatabase, Client, HighBioDatabase, HighPadDatabase
__all__ = [_ for _ in dir() if not _.startswith('_')]
__all__ = [_ for _ in dir() if not _.startswith("_")]
from import BioFile
class PadFile(
class PadFile(BioFile):
"""A simple base class that defines basic properties of File object for the use in PAD experiments"""
def __init__(
from .abstract_classes import Database
from .legacy import DatabaseConnector
from .implemented import FrameContainersToFrames
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is shortened.
*args: An iterable of objects to modify
Resolves `Sphinx referencing issues
for obj in args:
obj.__module__ = __name__
__all__ = [_ for _ in dir() if not _.startswith("_")]
......@@ -48,3 +48,10 @@ class DatabaseConnector(Database):
def predict_samples(self, group="dev"):
objects = self.database.all_files(groups=group, flat=True)
return [_padfile_to_delayed_sample(k, self.database) for k in objects]
def __repr__(self) -> str:
return f"""{self.__class__.__name__}(
"""Finalizes the scores that are produced by
import click
import numpy
import logging
from bob.extension.scripts.click_helper import (
verbosity_option, log_parameters)
from bob.extension.scripts.click_helper import log_parameters
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
@click.command(name='finalize-scores', epilog='''\b
$ bin/bob pad finalize_scores /path/to/scores-dev
$ bin/bob pad finalize_scores /path/to/scores-{dev,eval}
@click.argument('scores', type=click.Path(exists=True, dir_okay=False),
@click.option('-m', '--method', default='mean',
type=click.Choice(['mean', 'min', 'max']), show_default=True,
help='The method to use when finalizing the scores.')
@click.argument("scores", type=click.Path(exists=True, dir_okay=False), nargs=-1)
type=click.Choice(["mean", "min", "max"]),
help="The method to use when finalizing the scores.",
@click.option("--backup/--no-backup", default=True, help="Whether to backup scores.")
def finalize_scores(scores, method, **kwargs):
"""Finalizes the scores given by
def finalize_scores(scores, method, backup, verbose):
"""Finalizes the scores given by bob pad vanilla-pad
When using bob.pad.base, Algorithms can produce several score values for
each unique sample. You can use this script to average (or min/max) these
scores to have one final score per sample.
The conversion is done in-place. The order of scores will change.
The conversion is done in-place (original files will be backed up).
The order of scores will change.
import logging
import numpy
logger = logging.getLogger(__name__)
mean = {'mean': numpy.nanmean, 'max': numpy.nanmax, 'min': numpy.nanmin}[method]
mean = {"mean": numpy.nanmean, "max": numpy.nanmax, "min": numpy.nanmin}[method]
for path in scores:
new_lines = []
with open(path) as f:
old_lines = f.readlines()
if backup:
with open(f"{path}.bak", "w") as f:
for i, line in enumerate(old_lines):
uniq, s = line.strip().rsplit(maxsplit=1)
s = float(s)
......@@ -47,14 +63,13 @@ def finalize_scores(scores, method, **kwargs):
if uniq == last_line:
new_lines.append('{} {}\n'.format(
last_line, mean(last_scores)))
new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
last_scores = [s]
last_line = uniq
else: # this else is for the for loop
new_lines.append('{} {}\n'.format(last_line, mean(last_scores)))
new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
with open(path, 'w') as f:
with open(path, "w") as f:
"""Executes PAD pipeline"""
import logging
import bob.pipelines as mario
import click
import joblib
from bob.extension.scripts.click_helper import ConfigCommand
from bob.extension.scripts.click_helper import ResourceOption
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
EPILOG = """\b
Command line examples\n
$ bob pad vanilla-pad -vv must contain the following elements:
>>> preprocessor = my_preprocessor() \n
>>> extractor = my_extractor() \n
>>> algorithm = my_algorithm() \n
>>> checkpoints = EXPLAIN CHECKPOINTING \n
Look at the following example
$ bob pipelines vanilla-biometrics ./bob/pipelines/config/distributed/ \
./bob/pipelines/config/database/ \
TODO: Work out this help
......@@ -81,7 +47,7 @@ TODO: Work out this help
type=click.Choice(["dev", "eval"]),
default=("dev", "eval"),
help="If given, this value will limit the experiments belonging to a particular group",
......@@ -89,7 +55,7 @@ TODO: Work out this help
help="Name of output directory",
help="Saves scores (and checkpoints) in this folder.",
......@@ -99,56 +65,22 @@ TODO: Work out this help
def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **kwargs):
"""Runs the simplest PAD pipeline.
Such a pipeline consists of three sub-pipelines.
In all of them, given raw data as input it does the following steps:
Sub-pipeline 1:\n
Training the background model. Some biometric algorithms demand the training of a background model, for instance, a PCA/LDA matrix or a neural network. This sub-pipeline handles that and it consists of 3 steps:
raw_data --> preprocessing >> feature extraction >> train background model --> background_model
Sub-pipeline 2:\n
Creation of biometric references: This is a standard step in biometric pipelines.
Given a set of samples of one identity, create a biometric reference (a.k.a. template) for that identity. This sub-pipeline handles that in the following 3 steps:
raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference
Note that this sub-pipeline depends on the previous one
Sub-pipeline 3:\n
Probing: This is another standard step in biometric pipelines. Given one sample and one biometric reference, compute a score. Such a score has different meanings depending on the scoring method your biometric algorithm uses. Explaining what scoring means for different biometric algorithms is out of scope for a help message.
raw_data --> preprocessing >> feature extraction >> probe(biometric_reference, background_model) --> score
Note that this sub-pipeline depends on the two previous ones
def vanilla_pad(ctx, pipeline, database, dask_client, groups, output, checkpoint, **kwargs):
"""Runs the simplest PAD pipeline."""
import gzip
import logging
import os
import sys
from glob import glob
import bob.pipelines as mario
import dask.bag
from bob.extension.scripts.click_helper import log_parameters
logger = logging.getLogger(__name__)
os.makedirs(output, exist_ok=True)
......@@ -157,26 +89,30 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
["checkpoint"], pipeline, features_dir=output, model_path=output
if dask_client is not None:
pipeline = mario.wrap(["dask"], pipeline)
if dask_client is None:
logger.warning("`dask_client` not set. Your pipeline will run locally")
# create an experiment info file
with open(os.path.join(output, "Experiment_info.txt"), "wt") as f:
f.write("Pipeline steps:\n")
for i, name, estimator in pipeline._iter():
f.write(f"Step {i}: {name}\n{estimator!r}\n")
# train the pipeline
fit_samples = database.fit_samples() # [::50]
pipeline =
fit_samples = database.fit_samples()
for group in groups:"Running vanilla biometrics for group {group}")
predict_samples = database.predict_samples(group=group) # [::50]
predict_samples = database.predict_samples(group=group)
result = pipeline.decision_function(predict_samples)
with open(os.path.join(output, f"scores-{group}"), "w") as f:
scores_path = os.path.join(output, f"scores-{group}")
if isinstance(result, dask.bag.core.Bag):
if dask_client is None:
"`dask_client` not set. Your pipeline will run locally"
# write each partition into a zipped txt file
result =
......@@ -186,6 +122,7 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k"Writing bag results into files ...")
result.to_textfiles(pattern, last_endline=True, scheduler=dask_client)
with open(scores_path, "w") as f:
# concatenate scores into one score file
for path in sorted(
......@@ -193,8 +130,11 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
with, "rt") as f2:
# delete intermediate score files
with open(scores_path, "w") as f:
for sample in result:
f.write(pad_predicted_sample_to_score_line(sample, endl="\n"))
from . import database
from . import database_sql
from . import preprocessor
from . import extractor
......@@ -63,7 +63,7 @@ These five options are:
* ``--sub-directory``: A descriptive name for your experiment, which will serve as a sub-directory
The first four parameters, i.e., the ``database``, the ``preprocessor``, the ``extractor`` and the ``algorithm`` can be specified in several different ways.
For the start, we will use only the registered :ref:`Resources <>`.
For the start, we will use only the registered Resources.
These resources define the source code that will be used to compute the experiments, as well as all the meta-parameters of the algorithms (which we will call the *configuration*).
To get a list of registered resources, please call:
......@@ -15,14 +15,12 @@ Base Classes
Most of the base classes are reused from :ref:` <>`.
Only one base class that is specific to presentation attack detection, ``Algorithm``, is implemented in this package.
.. autosummary::
.. autosummary::
......@@ -30,13 +28,7 @@ Preprocessors and Extractors
Preprocessors and Extractors from the :ref:` <>`
package can also be used in this package. Please see
:any:`` for more details.
.. automodule:: bob.pad.base.algorithm
package can also be used in this package.
......@@ -42,7 +42,7 @@ command line below will install all the required packages:
.. code-block:: sh
$ source activate <bob_conda_environment>
$ conda activate <bob_conda_environment>
$ conda install \ \
bob.pad.base \
......@@ -12,29 +12,6 @@ Generic functions
Tools to run PAD experiments
Command line generation
.. autosummary::
.. autosummary::
.. autosummary::
......@@ -42,9 +19,5 @@ Details
.. automodule:: bob.pad.base
.. automodule::
.. autoclass:: FileSelector
.. include:: links.rst
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment