Commit b22ac980 authored by Amir MOHAMMADI

Merge branch 'dask-pipelines' into 'master'

Dask pipelines Improvements

See merge request !78
parents 8dcd5aac 3ad7b6c9
Pipeline #46074 passed with stages in 5 minutes and 33 seconds
@@ -6,26 +6,21 @@ from .PadBioFileDB import HighBioDatabase, HighPadDatabase
 # gets sphinx autodoc done right - don't remove it
 def __appropriate__(*args):
     """Says object was actually declared here, and not in the import module.
     Fixing sphinx warnings of not being able to find classes, when path is shortened.
     Parameters:
         *args: An iterable of objects to modify
     Resolves `Sphinx referencing issues
     <https://github.com/sphinx-doc/sphinx/issues/3048>`
     """
     for obj in args:
         obj.__module__ = __name__
 __appropriate__(
-    PadFile,
-    PadDatabase,
-    FileListPadDatabase,
-    Client,
-    HighBioDatabase,
-    HighPadDatabase
+    PadFile, PadDatabase, FileListPadDatabase, Client, HighBioDatabase, HighPadDatabase
 )
-__all__ = [_ for _ in dir() if not _.startswith('_')]
-import bob.bio.base.database
-class PadFile(bob.bio.base.database.BioFile):
+__all__ = [_ for _ in dir() if not _.startswith("_")]
+from bob.bio.base.database import BioFile
+class PadFile(BioFile):
     """A simple base class that defines basic properties of File object for the use in PAD experiments"""
     def __init__(
...
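Editorial note: the `__appropriate__` helper kept in the hunk above only re-assigns `__module__` so that Sphinx autodoc attributes classes to the shortened import path. A minimal, self-contained sketch of the idiom (the `_Example` class is invented for illustration):

# Standalone sketch of the __appropriate__ idiom; _Example is an invented class.
def __appropriate__(*args):
    """Mark objects as declared in this module so Sphinx resolves them here."""
    for obj in args:
        obj.__module__ = __name__


class _Example:
    """Stand-in for a class imported from a private submodule."""


__appropriate__(_Example)
# _Example.__module__ now reports this module's (shortened) name, which is what
# silences the Sphinx "reference target not found" warnings mentioned above.
__all__ = [_ for _ in dir() if not _.startswith("_")]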
 from .abstract_classes import Database
 from .legacy import DatabaseConnector
-from .implemented import FrameContainersToFrames
+# gets sphinx autodoc done right - don't remove it
+def __appropriate__(*args):
+    """Says object was actually declared here, and not in the import module.
+    Fixing sphinx warnings of not being able to find classes, when path is shortened.
+    Parameters:
+        *args: An iterable of objects to modify
+    Resolves `Sphinx referencing issues
+    <https://github.com/sphinx-doc/sphinx/issues/3048>`
+    """
+    for obj in args:
+        obj.__module__ = __name__
+__appropriate__(
+    Database,
+    DatabaseConnector,
+)
+__all__ = [_ for _ in dir() if not _.startswith("_")]
@@ -48,3 +48,10 @@ class DatabaseConnector(Database):
     def predict_samples(self, group="dev"):
         objects = self.database.all_files(groups=group, flat=True)
         return [_padfile_to_delayed_sample(k, self.database) for k in objects]
+
+    def __repr__(self) -> str:
+        return f"""{self.__class__.__name__}(
+            database={self.database},
+            annotation_type={self.annotation_type},
+            fixed_positions={self.fixed_positions}
+        )"""
"""Finalizes the scores that are produced by spoof.py """Finalizes the scores that are produced by spoof.py
""" """
import click import click
import numpy from bob.extension.scripts.click_helper import log_parameters
import logging from bob.extension.scripts.click_helper import verbosity_option
from bob.extension.scripts.click_helper import (
verbosity_option, log_parameters)
logger = logging.getLogger(__name__)
@click.command(
@click.command(name='finalize-scores', epilog='''\b name="finalize-scores",
epilog="""\b
Examples: Examples:
$ bin/bob pad finalize_scores /path/to/scores-dev $ bin/bob pad finalize_scores /path/to/scores-dev
$ bin/bob pad finalize_scores /path/to/scores-{dev,eval} $ bin/bob pad finalize_scores /path/to/scores-{dev,eval}
''') """,
@click.argument('scores', type=click.Path(exists=True, dir_okay=False), )
nargs=-1) @click.argument("scores", type=click.Path(exists=True, dir_okay=False), nargs=-1)
@click.option('-m', '--method', default='mean', @click.option(
type=click.Choice(['mean', 'min', 'max']), show_default=True, "-m",
help='The method to use when finalizing the scores.') "--method",
default="mean",
type=click.Choice(["mean", "min", "max"]),
show_default=True,
help="The method to use when finalizing the scores.",
)
@click.option("--backup/--no-backup", default=True, help="Whether to backup scores.")
@verbosity_option() @verbosity_option()
def finalize_scores(scores, method, **kwargs): def finalize_scores(scores, method, backup, verbose):
"""Finalizes the scores given by spoof.py """Finalizes the scores given by bob pad vanilla-pad
When using bob.pad.base, Algorithms can produce several score values for When using bob.pad.base, Algorithms can produce several score values for
each unique sample. You can use this script to average (or min/max) these each unique sample. You can use this script to average (or min/max) these
scores to have one final score per sample. scores to have one final score per sample.
The conversion is done in-place. The order of scores will change. The conversion is done in-place (original files will be backed up).
The order of scores will change.
""" """
import logging
import numpy
logger = logging.getLogger(__name__)
log_parameters(logger) log_parameters(logger)
mean = {'mean': numpy.nanmean, 'max': numpy.nanmax, 'min': numpy.nanmin}[method] mean = {"mean": numpy.nanmean, "max": numpy.nanmax, "min": numpy.nanmin}[method]
for path in scores: for path in scores:
new_lines = [] new_lines = []
with open(path) as f: with open(path) as f:
old_lines = f.readlines() old_lines = f.readlines()
old_lines.sort()
if backup:
with open(f"{path}.bak", "w") as f:
f.writelines(old_lines)
old_lines.sort()
for i, line in enumerate(old_lines): for i, line in enumerate(old_lines):
uniq, s = line.strip().rsplit(maxsplit=1) uniq, s = line.strip().rsplit(maxsplit=1)
s = float(s) s = float(s)
...@@ -47,14 +63,13 @@ def finalize_scores(scores, method, **kwargs): ...@@ -47,14 +63,13 @@ def finalize_scores(scores, method, **kwargs):
if uniq == last_line: if uniq == last_line:
last_scores.append(s) last_scores.append(s)
else: else:
new_lines.append('{} {}\n'.format( new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
last_line, mean(last_scores)))
last_scores = [s] last_scores = [s]
last_line = uniq last_line = uniq
else: # this else is for the for loop else: # this else is for the for loop
new_lines.append('{} {}\n'.format(last_line, mean(last_scores))) new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
with open(path, 'w') as f: with open(path, "w") as f:
f.writelines(new_lines) f.writelines(new_lines)
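Editorial note: to make the effect of the loop above concrete, here is a self-contained sketch of the same grouping/averaging applied to a few invented score lines (three text fields plus a float score). It is not the command itself, only the reduction it performs with `--method mean`.

# Invented score lines; each unique sample may appear more than once.
import numpy

old_lines = [
    "client1 client1 video_01 1.0\n",
    "client1 client1 video_01 3.0\n",
    "client2 attack video_02 -0.5\n",
]
old_lines.sort()

new_lines = []
last_line, last_scores = None, []
for line in old_lines:
    uniq, s = line.strip().rsplit(maxsplit=1)
    s = float(s)
    if uniq == last_line:
        last_scores.append(s)
    else:
        if last_line is not None:
            new_lines.append("{} {}\n".format(last_line, numpy.nanmean(last_scores)))
        last_scores = [s]
        last_line = uniq
else:  # runs after the loop, flushing the final group
    new_lines.append("{} {}\n".format(last_line, numpy.nanmean(last_scores)))

print("".join(new_lines), end="")
# client1 client1 video_01 2.0
# client2 attack video_02 -0.5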
"""Executes PAD pipeline""" """Executes PAD pipeline"""
import logging
import bob.pipelines as mario
import click import click
import joblib
from bob.extension.scripts.click_helper import ConfigCommand from bob.extension.scripts.click_helper import ConfigCommand
from bob.extension.scripts.click_helper import ResourceOption from bob.extension.scripts.click_helper import ResourceOption
from bob.extension.scripts.click_helper import verbosity_option from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
EPILOG = """\b
@click.command(
entry_point_group="bob.pad.config",
cls=ConfigCommand,
epilog="""\b
Command line examples\n Command line examples\n
----------------------- -----------------------
$ bob pad vanilla-pad my_experiment.py -vv $ bob pad vanilla-pad my_experiment.py -vv
""",
my_experiment.py must contain the following elements:
>>> preprocessor = my_preprocessor() \n
>>> extractor = my_extractor() \n
>>> algorithm = my_algorithm() \n
>>> checkpoints = EXPLAIN CHECKPOINTING \n
\b
Look at the following example
$ bob pipelines vanilla-biometrics ./bob/pipelines/config/distributed/sge_iobig_16cores.py \
./bob/pipelines/config/database/mobio_male.py \
./bob/pipelines/config/baselines/facecrop_pca.py
\b
TODO: Work out this help
"""
@click.command(
entry_point_group="bob.pad.config",
cls=ConfigCommand,
epilog=EPILOG,
) )
@click.option( @click.option(
"--pipeline", "--pipeline",
@@ -81,7 +47,7 @@ TODO: Work out this help
     "groups",
     type=click.Choice(["dev", "eval"]),
     multiple=True,
-    default=("dev",),
+    default=("dev", "eval"),
     help="If given, this value will limit the experiments belonging to a particular group",
 )
 @click.option(
@@ -89,7 +55,7 @@ TODO: Work out this help
     "--output",
     show_default=True,
     default="results",
-    help="Name of output directory",
+    help="Saves scores (and checkpoints) in this folder.",
 )
 @click.option(
     "--checkpoint",
@@ -99,56 +65,22 @@ TODO: Work out this help
     cls=ResourceOption,
 )
 @verbosity_option(cls=ResourceOption)
-def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **kwargs):
-    """Runs the simplest PAD pipeline.
+@click.pass_context
+def vanilla_pad(ctx, pipeline, database, dask_client, groups, output, checkpoint, **kwargs):
+    """Runs the simplest PAD pipeline."""
-    Such pipeline consists into three sub-pipelines.
-    In all of them, given raw data as input it does the following steps:
-    Sub-pipeline 1:\n
-    ---------------
-    Training background model. Some biometric algorithms demands the training of background model, for instance, PCA/LDA matrix or a Neural networks. This sub-pipeline handles that and it consists of 3 steps:
-    \b
-    raw_data --> preprocessing >> feature extraction >> train background model --> background_model
-    \b
-    Sub-pipeline 2:\n
-    ---------------
-    Creation of biometric references: This is a standard step in a biometric pipelines.
-    Given a set of samples of one identity, create a biometric reference (a.k.a template) for sub identity. This sub-pipeline handles that in 3 steps and they are the following:
-    \b
-    raw_data --> preprocessing >> feature extraction >> enroll(background_model) --> biometric_reference
-    Note that this sub-pipeline depends on the previous one
-    Sub-pipeline 3:\n
-    ---------------
-    Probing: This is another standard step in biometric pipelines. Given one sample and one biometric reference, computes a score. Such score has different meanings depending on the scoring method your biometric algorithm uses. It's out of scope to explain in a help message to explain what scoring is for different biometric algorithms.
-    raw_data --> preprocessing >> feature extraction >> probe(biometric_reference, background_model) --> score
-    Note that this sub-pipeline depends on the two previous ones
-    """
     import gzip
+    import logging
     import os
+    import sys
     from glob import glob
+    import bob.pipelines as mario
     import dask.bag
+    from bob.extension.scripts.click_helper import log_parameters
+    logger = logging.getLogger(__name__)
+    log_parameters(logger)
     os.makedirs(output, exist_ok=True)
@@ -157,35 +89,40 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
         ["checkpoint"], pipeline, features_dir=output, model_path=output
     )
-    if dask_client is not None:
-        pipeline = mario.wrap(["dask"], pipeline)
+    if dask_client is None:
+        logger.warning("`dask_client` not set. Your pipeline will run locally")
+
+    # create an experiment info file
+    with open(os.path.join(output, "Experiment_info.txt"), "wt") as f:
+        f.write(f"{sys.argv!r}\n")
+        f.write(f"database={database!r}\n")
+        f.write("Pipeline steps:\n")
+        for i, name, estimator in pipeline._iter():
+            f.write(f"Step {i}: {name}\n{estimator!r}\n")
     # train the pipeline
-    fit_samples = database.fit_samples() # [::50]
-    pipeline = pipeline.fit(fit_samples)
+    fit_samples = database.fit_samples()
+    pipeline.fit(fit_samples)
     for group in groups:
         logger.info(f"Running vanilla biometrics for group {group}")
-        predict_samples = database.predict_samples(group=group) # [::50]
+        predict_samples = database.predict_samples(group=group)
         result = pipeline.decision_function(predict_samples)
-        with open(os.path.join(output, f"scores-{group}"), "w") as f:
+        scores_path = os.path.join(output, f"scores-{group}")
         if isinstance(result, dask.bag.core.Bag):
-            if dask_client is None:
-                logger.warning(
-                    "`dask_client` not set. Your pipeline will run locally"
-                )
             # write each partition into a zipped txt file
             result = result.map(pad_predicted_sample_to_score_line)
             prefix, postfix = f"{output}/scores/scores-{group}-", ".txt.gz"
             pattern = f"{prefix}*{postfix}"
             os.makedirs(os.path.dirname(prefix), exist_ok=True)
             logger.info("Writing bag results into files ...")
             result.to_textfiles(pattern, last_endline=True, scheduler=dask_client)
+            with open(scores_path, "w") as f:
                 # concatenate scores into one score file
                 for path in sorted(
                     glob(pattern),
@@ -193,8 +130,11 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
                 ):
                     with gzip.open(path, "rt") as f2:
                         f.write(f2.read())
+                    # delete intermediate score files
+                    os.remove(path)
         else:
+            with open(scores_path, "w") as f:
                 for sample in result:
                     f.write(pad_predicted_sample_to_score_line(sample, endl="\n"))
...
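Editorial note: the command above only requires that the `pipeline` resource behaves like a scikit-learn style estimator with `fit` and `decision_function`, which is then wrapped with the checkpoint (and optionally dask) wrappers shown in the diff. A minimal sketch of that contract, using an invented dummy estimator rather than a real PAD pipeline:

# Dummy estimator illustrating the fit()/decision_function() contract that
# vanilla-pad relies on; it is an invented placeholder, not bob.pad.base code.
class DummyPadPipeline:
    def fit(self, samples):
        # a real pipeline would train on database.fit_samples()
        self.trained_ = True
        return self

    def decision_function(self, samples):
        # a real pipeline returns one score per probe sample
        return [0.0 for _ in samples]


pipeline = DummyPadPipeline().fit(["train_a", "train_b"])
print(pipeline.decision_function(["probe_1", "probe_2"]))  # [0.0, 0.0]

Such an object would normally be defined in a configuration file and passed on the command line, e.g. `bob pad vanilla-pad my_experiment.py -vv` as in the epilog above.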
 from . import database
 from . import database_sql
-from . import preprocessor
-from . import extractor
@@ -63,7 +63,7 @@ These five options are:
 * ``--sub-directory``: A descriptive name for your experiment, which will serve as a sub-directory
 The first four parameters, i.e., the ``database``, the ``preprocessor``, the ``extractor`` and the ``algorithm`` can be specified in several different ways.
-For the start, we will use only the registered :ref:`Resources <bob.bio.base.resources>`.
+For the start, we will use only the registered Resources.
 These resources define the source code that will be used to compute the experiments, as well as all the meta-parameters of the algorithms (which we will call the *configuration*).
 To get a list of registered resources, please call:
@@ -112,7 +112,7 @@ After the experiment has finished successfully, one or more text file containing
 all the scores are written. In this section, commands that helps to quickly
 evaluate a set of scores by generating metrics or plots are presented here.
 The scripts take as input either a 4-column or 5-column data format as specified
 in the documentation of :py:func:`bob.bio.base.score.load.four_column` or
 :py:func:`bob.bio.base.score.load.five_column`.
 Two sets of commands, ``bob pad`` and ``bob vuln`` are available for
...
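Editorial note: as a quick illustration of the score layout referred to above, here is a tiny sketch that parses invented 4-column lines (three text fields followed by a float score); see bob.bio.base.score.load.four_column for the authoritative definition.

# Invented 4-column score lines: three text fields followed by a float score.
lines = [
    "client1 client1 sample_001 1.25",
    "client1 attack sample_002 -0.75",
]
for line in lines:
    *fields, score = line.split()
    print(fields, float(score))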
@@ -15,14 +15,12 @@ Base Classes
 Most of the base classes are reused from :ref:`bob.bio.base <bob.bio.base>`.
 Only one base class that is presentation attack detection specific, ``Algorithm`` is implemented in this package.
-.. autosummary::
-    bob.pad.base.algorithm.Algorithm
-    bob.pad.base.algorithm.Predictions
 Implementations
 ~~~~~~~~~~~~~~~
 .. autosummary::
+    bob.pad.base.pipelines.vanilla_pad.Database
+    bob.pad.base.pipelines.vanilla_pad.DatabaseConnector
     bob.pad.base.database.PadDatabase
     bob.pad.base.database.PadFile
@@ -30,13 +28,7 @@ Preprocessors and Extractors
 ----------------------------
 Preprocessors and Extractors from the :ref:`bob.bio.base <bob.bio.base>`
-package can also be used in this package. Please see
-:any:`bob.bio.base.implemented` for more details.
-Algorithms
-----------
-.. automodule:: bob.pad.base.algorithm
+package can also be used in this package.
 Databases
...
@@ -42,7 +42,7 @@ command line below will install all the required packages:
 .. code-block:: sh
-    $ source activate <bob_conda_environment>
+    $ conda activate <bob_conda_environment>
     $ conda install bob.bio.base \
                     bob.bio.spear \
                     bob.pad.base \
...
@@ -12,29 +12,6 @@ Generic functions
 Tools to run PAD experiments
 ----------------------------
-Command line generation
-~~~~~~~~~~~~~~~~~~~~~~~
-.. autosummary::
-    bob.pad.base.tools.command_line_parser
-    bob.pad.base.tools.initialize
-    bob.pad.base.tools.command_line
-    bob.pad.base.tools.write_info
-    bob.pad.base.tools.FileSelector
-Algorithm
-~~~~~~~~~
-.. autosummary::
-    bob.pad.base.tools.train_projector
-    bob.pad.base.tools.project
-    bob.pad.base.algorithm
-Scoring
-~~~~~~~
-.. autosummary::
-    bob.bio.base.tools.compute_scores
 Details
 -------
@@ -42,9 +19,5 @@ Details
 .. automodule:: bob.pad.base
-.. automodule:: bob.pad.base.tools
-.. autoclass:: FileSelector
 .. include:: links.rst