Commit b22ac980 authored by Amir MOHAMMADI

Merge branch 'dask-pipelines' into 'master'

Dask pipelines Improvements

See merge request !78
parents 8dcd5aac 3ad7b6c9
Pipeline #46074 passed with stages in 5 minutes and 33 seconds
@@ -6,26 +6,21 @@ from .PadBioFileDB import HighBioDatabase, HighPadDatabase
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is shortened.
Parameters:
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is shortened.
Parameters:
*args: An iterable of objects to modify
Resolves `Sphinx referencing issues
<https://github.com/sphinx-doc/sphinx/issues/3048>`
"""
for obj in args:
obj.__module__ = __name__
__appropriate__(
PadFile, PadDatabase, FileListPadDatabase, Client, HighBioDatabase, HighPadDatabase
)
__all__ = [_ for _ in dir() if not _.startswith("_")]
from bob.bio.base.database import BioFile
class PadFile(BioFile):
"""A simple base class that defines basic properties of File object for the use in PAD experiments"""
def __init__(
......
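
For orientation, a hedged usage sketch of `PadFile`; its `__init__` is elided in this diff, so the `client_id`/`path`/`attack_type` argument names below are assumptions of this sketch, not confirmed by the commit:

# hedged sketch -- argument names are assumptions, not confirmed by this diff
bonafide = PadFile(client_id="client-001", path="real/sample-1")
attack = PadFile(client_id="client-001", path="attack/sample-1", attack_type="print")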
from .abstract_classes import Database
from .legacy import DatabaseConnector
from .implemented import FrameContainersToFrames
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is shortened.
Parameters:
*args: An iterable of objects to modify
Resolves `Sphinx referencing issues
<https://github.com/sphinx-doc/sphinx/issues/3048>`
"""
for obj in args:
obj.__module__ = __name__
__appropriate__(
Database,
DatabaseConnector,
)
__all__ = [_ for _ in dir() if not _.startswith("_")]
@@ -48,3 +48,10 @@ class DatabaseConnector(Database):
def predict_samples(self, group="dev"):
objects = self.database.all_files(groups=group, flat=True)
return [_padfile_to_delayed_sample(k, self.database) for k in objects]
def __repr__(self) -> str:
return f"""{self.__class__.__name__}(
database={self.database},
annotation_type={self.annotation_type},
fixed_positions={self.fixed_positions}
)"""
"""Finalizes the scores that are produced by spoof.py
"""
import click
import numpy
import logging
from bob.extension.scripts.click_helper import log_parameters
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
@click.command(
name="finalize-scores",
epilog="""\b
Examples:
$ bin/bob pad finalize_scores /path/to/scores-dev
$ bin/bob pad finalize_scores /path/to/scores-{dev,eval}
""",
)
@click.argument("scores", type=click.Path(exists=True, dir_okay=False), nargs=-1)
@click.option(
"-m",
"--method",
default="mean",
type=click.Choice(["mean", "min", "max"]),
show_default=True,
help="The method to use when finalizing the scores.",
)
@click.option("--backup/--no-backup", default=True, help="Whether to backup scores.")
@verbosity_option()
def finalize_scores(scores, method, backup, verbose):
"""Finalizes the scores given by bob pad vanilla-pad
When using bob.pad.base, Algorithms can produce several score values for
each unique sample. You can use this script to average (or min/max) these
scores to have one final score per sample.
The conversion is done in-place (original files will be backed up).
The order of scores will change.
"""
log_parameters(logger)
mean = {"mean": numpy.nanmean, "max": numpy.nanmax, "min": numpy.nanmin}[method]
for path in scores:
new_lines = []
with open(path) as f:
old_lines = f.readlines()
if backup:
with open(f"{path}.bak", "w") as f:
f.writelines(old_lines)
old_lines.sort()
for i, line in enumerate(old_lines):
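            # each line ends with the score; everything before it identifies the sample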
uniq, s = line.strip().rsplit(maxsplit=1)
s = float(s)
@@ -47,14 +63,13 @@ def finalize_scores(scores, method, **kwargs):
if uniq == last_line:
last_scores.append(s)
else:
new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
last_scores = [s]
last_line = uniq
else: # this else is for the for loop
new_lines.append("{} {}\n".format(last_line, mean(last_scores)))
with open(path, "w") as f:
f.writelines(new_lines)
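
To make the aggregation concrete, a small self-contained sketch of what this script does to duplicated per-sample scores (toy data, the "mean" method):

# toy illustration of the per-sample aggregation performed above
import numpy

lines = sorted([
    "client1 attack video1 0.20",
    "client1 attack video1 0.40",  # the same sample scored twice
    "client2 real video2 0.90",
])
scores = {}
for line in lines:
    uniq, s = line.strip().rsplit(maxsplit=1)
    scores.setdefault(uniq, []).append(float(s))
print({k: float(numpy.nanmean(v)) for k, v in scores.items()})
# {'client1 attack video1': 0.30..., 'client2 real video2': 0.9}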
"""Executes PAD pipeline"""
import logging
import bob.pipelines as mario
import click
import joblib
from bob.extension.scripts.click_helper import ConfigCommand
from bob.extension.scripts.click_helper import ResourceOption
from bob.extension.scripts.click_helper import verbosity_option
logger = logging.getLogger(__name__)
@click.command(
entry_point_group="bob.pad.config",
cls=ConfigCommand,
epilog="""\b
Command line examples\n
-----------------------
$ bob pad vanilla-pad my_experiment.py -vv
my_experiment.py must contain the following elements:
>>> preprocessor = my_preprocessor() \n
>>> extractor = my_extractor() \n
>>> algorithm = my_algorithm() \n
>>> checkpoints = EXPLAIN CHECKPOINTING \n
\b
Look at the following example
$ bob pipelines vanilla-biometrics ./bob/pipelines/config/distributed/sge_iobig_16cores.py \
./bob/pipelines/config/database/mobio_male.py \
./bob/pipelines/config/baselines/facecrop_pca.py
\b
TODO: Work out this help
"""
@click.command(
entry_point_group="bob.pad.config",
cls=ConfigCommand,
epilog=EPILOG,
""",
)
@click.option(
"--pipeline",
@@ -81,7 +47,7 @@ TODO: Work out this help
"groups",
type=click.Choice(["dev", "eval"]),
multiple=True,
default=("dev",),
default=("dev", "eval"),
help="If given, this value will limit the experiments belonging to a particular group",
)
@click.option(
@@ -89,7 +55,7 @@ TODO: Work out this help
"--output",
show_default=True,
default="results",
help="Name of output directory",
help="Saves scores (and checkpoints) in this folder.",
)
@click.option(
"--checkpoint",
@@ -99,56 +65,22 @@ TODO: Work out this help
cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption)
@click.pass_context
def vanilla_pad(ctx, pipeline, database, dask_client, groups, output, checkpoint, **kwargs):
"""Runs the simplest PAD pipeline."""
import gzip
import logging
import os
import sys
from glob import glob
import bob.pipelines as mario
import dask.bag
from bob.extension.scripts.click_helper import log_parameters
logger = logging.getLogger(__name__)
log_parameters(logger)
os.makedirs(output, exist_ok=True)
@@ -157,35 +89,40 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
["checkpoint"], pipeline, features_dir=output, model_path=output
)
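    # wrapping with ["dask"] makes fit/decision_function build a lazy dask
    # task graph; scores then come back as a dask Bag (handled below)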
if dask_client is not None:
pipeline = mario.wrap(["dask"], pipeline)
if dask_client is None:
logger.warning("`dask_client` not set. Your pipeline will run locally")
# create an experiment info file
with open(os.path.join(output, "Experiment_info.txt"), "wt") as f:
f.write(f"{sys.argv!r}\n")
f.write(f"database={database!r}\n")
f.write("Pipeline steps:\n")
for i, name, estimator in pipeline._iter():
f.write(f"Step {i}: {name}\n{estimator!r}\n")
# train the pipeline
fit_samples = database.fit_samples()
pipeline.fit(fit_samples)
for group in groups:
logger.info(f"Running vanilla biometrics for group {group}")
predict_samples = database.predict_samples(group=group)
result = pipeline.decision_function(predict_samples)
scores_path = os.path.join(output, f"scores-{group}")
if isinstance(result, dask.bag.core.Bag):
# write each partition into a zipped txt file
result = result.map(pad_predicted_sample_to_score_line)
prefix, postfix = f"{output}/scores/scores-{group}-", ".txt.gz"
pattern = f"{prefix}*{postfix}"
os.makedirs(os.path.dirname(prefix), exist_ok=True)
logger.info("Writing bag results into files ...")
result.to_textfiles(pattern, last_endline=True, scheduler=dask_client)
with open(scores_path, "w") as f:
# concatenate scores into one score file
for path in sorted(
glob(pattern),
@@ -193,8 +130,11 @@ def vanilla_pad(pipeline, database, dask_client, groups, output, checkpoint, **k
):
with gzip.open(path, "rt") as f2:
f.write(f2.read())
# delete intermediate score files
os.remove(path)
else:
with open(scores_path, "w") as f:
for sample in result:
f.write(pad_predicted_sample_to_score_line(sample, endl="\n"))
......
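
As a concrete counterpart to the epilog's `my_experiment.py` description, a hedged sketch of a config file the command could consume. Every class and helper below is a placeholder of this sketch; only the module-level `database` and `pipeline` names (matching the `--database`/`--pipeline` options) matter:

# my_experiment.py -- hedged sketch, not a tested configuration
import numpy
import bob.pipelines as mario
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from bob.pad.base.pipelines.vanilla_pad import DatabaseConnector

def flatten(samples):  # toy feature extractor, stands in for a real one
    return [numpy.ravel(s) for s in samples]

database = DatabaseConnector(MyPadDatabase())  # MyPadDatabase is hypothetical
pipeline = make_pipeline(
    mario.wrap(["sample"], FunctionTransformer(flatten)),
    mario.wrap(["sample"], LogisticRegression()),  # last step exposes decision_function
)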
from . import database
from . import database_sql
from . import preprocessor
from . import extractor
@@ -63,7 +63,7 @@ These five options are:
* ``--sub-directory``: A descriptive name for your experiment, which will serve as a sub-directory
The first four parameters, i.e., the ``database``, the ``preprocessor``, the ``extractor`` and the ``algorithm`` can be specified in several different ways.
To start, we will use only the registered resources.
These resources define the source code that will be used to compute the experiments, as well as all the meta-parameters of the algorithms (which we will call the *configuration*).
To get a list of registered resources, please call:
@@ -112,7 +112,7 @@ After the experiment has finished successfully, one or more text file containing
all the scores are written. In this section, we present commands that help to
quickly evaluate a set of scores by generating metrics or plots.
The scripts take as input either a 4-column or 5-column data format as specified
in the documentation of :py:func:`bob.bio.base.score.load.four_column` or
:py:func:`bob.bio.base.score.load.five_column`.
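
For instance, a hedged sketch of reading a 4-column file with the loader referenced above (the path is a placeholder):

.. code-block:: python

   from bob.bio.base.score.load import four_column

   # yields one (claimed_id, real_id, test_label, score) tuple per line
   for claimed_id, real_id, test_label, score in four_column("results/scores-dev"):
       print(test_label, score)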
Two sets of commands, ``bob pad`` and ``bob vuln``, are available for
......
@@ -15,14 +15,12 @@ Base Classes
Most of the base classes are reused from :ref:`bob.bio.base <bob.bio.base>`.
Only one base class that is specific to presentation attack detection, ``Algorithm``, is implemented in this package.
.. autosummary::
bob.pad.base.algorithm.Algorithm
bob.pad.base.algorithm.Predictions
Implementations
~~~~~~~~~~~~~~~
.. autosummary::
bob.pad.base.pipelines.vanilla_pad.Database
bob.pad.base.pipelines.vanilla_pad.DatabaseConnector
bob.pad.base.database.PadDatabase
bob.pad.base.database.PadFile
@@ -30,13 +28,7 @@ Preprocessors and Extractors
----------------------------
Preprocessors and Extractors from the :ref:`bob.bio.base <bob.bio.base>`
package can also be used in this package.
Databases
......
@@ -42,7 +42,7 @@ command line below will install all the required packages:
.. code-block:: sh
$ conda activate <bob_conda_environment>
$ conda install bob.bio.base \
bob.bio.spear \
bob.pad.base \
......
@@ -12,29 +12,6 @@ Generic functions
Tools to run PAD experiments
----------------------------
Command line generation
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
bob.pad.base.tools.command_line_parser
bob.pad.base.tools.initialize
bob.pad.base.tools.command_line
bob.pad.base.tools.write_info
bob.pad.base.tools.FileSelector
Algorithm
~~~~~~~~~
.. autosummary::
bob.pad.base.tools.train_projector
bob.pad.base.tools.project
bob.pad.base.algorithm
Scoring
~~~~~~~
.. autosummary::
bob.bio.base.tools.compute_scores
Details
-------
@@ -42,9 +19,5 @@ Details
.. automodule:: bob.pad.base
.. automodule:: bob.pad.base.tools
.. autoclass:: FileSelector
.. include:: links.rst