Commit 92e6d94e authored by Amir MOHAMMADI

Merge branch 'dask-pipelines'

parents 49342c2a a0de0b05
@@ -13,3 +13,4 @@ sphinx
dist
build
record.txt
.DS_Store
@@ -3,10 +3,7 @@ from . import database
from . import preprocessor
from . import extractor
from . import algorithm
from . import tools
from . import grid # only one file, not complete directory
from . import annotator
from . import baseline
from . import script
from . import test
@@ -6,6 +6,8 @@
import numpy
import os
from .. import utils
import warnings
class Algorithm (object):
"""This is the base class for all biometric recognition algorithms.
@@ -87,6 +89,13 @@ class Algorithm (object):
min_t_model_file_size=1000,
**kwargs # parameters from the derived class that should be reported in the __str__() function
):
warnings.warn("`bob.bio.base.algorithm.Algorithm` will be deprecated on 01/01/2021. "\
"Please implement your biometric algorithm using `bob.pipelines` (https://gitlab.idiap.ch/bob/bob.pipelines).", DeprecationWarning)
self.performs_projection = performs_projection
self.requires_projector_training = performs_projection and requires_projector_training
self.split_training_features_by_client = split_training_features_by_client
@@ -205,10 +214,11 @@ class Algorithm (object):
score : float
The fused similarity between the given ``models`` and the ``probe``.
"""
if isinstance(models, list):
return self.model_fusion_function([self.score(model, probe) for model in models])
return [self.probe_fusion_function(self.score(model, probe)) for model in models]
elif isinstance(models, numpy.ndarray):
return self.model_fusion_function([self.score(models[i,:], probe) for i in range(models.shape[0])])
return [self.probe_fusion_function(self.score(models[i,:], probe)) for i in range(models.shape[0])]
else:
raise ValueError("The model does not have the desired format (list, array, ...)")
@@ -9,7 +9,7 @@ class Annotator(object):
----------
read_original_data : callable
A function that loads the samples. The syntax is like
:any:`bob.bio.base.read_original_data`.
`bob.bio.base.read_original_data`.
"""
def __init__(self, read_original_data=None, **kwargs):
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
from .. import resource_keys, load_resource
def search_preprocessor(db_name, keys):
"""
Wrapper that searches for preprocessors for specific databases.
If not found, the default preprocessor is returned
"""
for k in keys:
if db_name.startswith(k):
return k
else:
return "default"
def get_available_databases():
"""
Get all the available databases through the database entry-points
"""
available_databases = dict()
all_databases = resource_keys('database', strip=[])
for database in all_databases:
try:
database_entry_point = load_resource(database, 'database')
available_databases[database] = dict()
# Checking if the database has data for the ZT normalization
available_databases[database]["has_zt"] = hasattr(database_entry_point, "zobjects") and hasattr(database_entry_point, "tobjects")
available_databases[database]["groups"] = []
# Searching for database groups
try:
groups = list(database_entry_point.groups()) or ["dev"]
for g in ["dev", "eval"]:
available_databases[database]["groups"] += [g] if g in groups else []
except Exception:
# In case the method groups is not implemented
available_databases[database]["groups"] = ["dev"]
except Exception:
pass
return available_databases
class Baseline(object):
"""
Base class to define baselines
A Baseline is composed of the triplet
:any:`bob.bio.base.preprocessor.Preprocessor`,
:any:`bob.bio.base.extractor.Extractor`, and
:any:`bob.bio.base.algorithm.Algorithm`
Attributes
----------
name : str
Name of the baseline. This name will be displayed in the command line
interface.
preprocessors : dict
Dictionary containing all possible preprocessors
extractor : str
Registered resource or a config file containing the feature extractor
algorithm : str
Registered resource or a config file containing the algorithm
"""
def __init__(self, name, preprocessors, extractor, algorithm, **kwargs):
super(Baseline, self).__init__(**kwargs)
self.name = name
self.preprocessors = preprocessors
self.extractor = extractor
self.algorithm = algorithm
from .Baseline import Baseline, search_preprocessor, get_available_databases
def get_config():
"""Returns a string containing the configuration information.
"""
import bob.extension
return bob.extension.get_config(__name__)
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is
shortened. Parameters:
*args: An iterable of objects to modify
Resolves `Sphinx referencing issues
<https://github.com/sphinx-doc/sphinx/issues/3048>`_
"""
for obj in args:
obj.__module__ = __name__
__appropriate__(
Baseline,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
import bob.bio.base
import scipy.spatial
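# A PCA algorithm scored with the Euclidean distance; a fractional subspace_dimension
# (here .95) is presumably interpreted as the fraction of variance/energy to keep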
algorithm = bob.bio.base.algorithm.PCA(
subspace_dimension = .95,
distance_function = scipy.spatial.distance.euclidean,
is_distance_function = True
)
from bob.bio.base.algorithm import PCA
algorithm = PCA(0.99)
\ No newline at end of file
import bob.bio.base
# define a queue with demanding parameters
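# (the queue strings below, e.g. '32G' or '4G-io-big', are presumably names of SGE
# submission queues on the computing farm; adapt them to your own cluster)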
grid = bob.bio.base.grid.Grid(
training_queue = '32G',
# preprocessing
preprocessing_queue = '4G-io-big',
# feature extraction
extraction_queue = '8G-io-big',
# feature projection
projection_queue = '8G-io-big',
# model enrollment
enrollment_queue = '8G-io-big',
# scoring
scoring_queue = '8G-io-big'
)
import bob.bio.base
# define a queue with demanding parameters
grid = bob.bio.base.grid.Grid(
training_queue = 'GPU',
# preprocessing
preprocessing_queue = '4G',
# feature extraction
extraction_queue = 'GPU',
# feature projection
projection_queue = '4G',
# model enrollment
enrollment_queue = '4G',
# scoring
scoring_queue = '4G'
)
import bob.bio.base
# define the queue using all the default parameters
grid = bob.bio.base.grid.Grid()
import bob.bio.base
# define the queue using all the default parameters
grid = bob.bio.base.grid.Grid(
grid_type = 'local',
number_of_parallel_processes = 4
)
# define a queue that is highly parallelized
grid_p8 = bob.bio.base.grid.Grid(
grid_type = 'local',
number_of_parallel_processes = 8
)
# define a queue that is highly parallelized
grid_p16 = bob.bio.base.grid.Grid(
grid_type = 'local',
number_of_parallel_processes = 16
)
from .csv_dataset import CSVDatasetDevEval, CSVToSampleLoader, CSVDatasetCrossValidation
from .file import BioFile
from .file import BioFileSet
from .database import BioDatabase
@@ -5,6 +6,7 @@ from .database import ZTBioDatabase
from .filelist import FileListBioDatabase
from . import filelist
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
@@ -25,7 +27,9 @@ __appropriate__(
BioFile,
BioFileSet,
BioDatabase,
ZTBioDatabase,
FileListBioDatabase
ZTBioDatabase,
CSVDatasetDevEval,
CSVToSampleLoader,
CSVDatasetCrossValidation
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
import os
from bob.pipelines import Sample, DelayedSample, SampleSet
import csv
import bob.io.base
import functools
from abc import ABCMeta, abstractmethod
import numpy as np
import itertools
class CSVBaseSampleLoader(metaclass=ABCMeta):
"""
Convert CSV files in the format below to either a list of
:any:`bob.pipelines.DelayedSample` or :any:`bob.pipelines.SampleSet`
.. code-block:: text
PATH,SUBJECT
path_1,subject_1
path_2,subject_2
path_i,subject_j
...
.. note::
This class should be extended
Parameters
----------
data_loader:
A python function that can be called without parameters to load the
sample in question from whatever medium
extension:
The file extension
"""
def __init__(self, data_loader, dataset_original_directory="", extension=""):
self.data_loader = data_loader
self.extension = extension
self.dataset_original_directory = dataset_original_directory
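# Sample attributes that should not be propagated as metadata when building SampleSets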
self.excluding_attributes = ["_data", "load", "key"]
@abstractmethod
def __call__(self, filename):
pass
@abstractmethod
def convert_row_to_sample(self, row, header):
pass
@abstractmethod
def convert_samples_to_samplesets(
self, samples, group_by_subject=True, references=None
):
pass
class CSVToSampleLoader(CSVBaseSampleLoader):
"""
Simple mechanism to convert CSV files in the format described in :any:`CSVBaseSampleLoader`
to either a list of :any:`bob.pipelines.DelayedSample` or :any:`bob.pipelines.SampleSet`
"""
def __call__(self, filename):
def check_header(header):
"""
A header should contain at least "SUBJECT" and "PATH"
"""
header = [h.lower() for h in header]
if not "subject" in header:
raise ValueError(
"The field `subject` is not available in your dataset."
)
if not "path" in header:
raise ValueError("The field `path` is not available in your dataset.")
with open(filename) as cf:
reader = csv.reader(cf)
header = next(reader)
check_header(header)
return [self.convert_row_to_sample(row, header) for row in reader]
def convert_row_to_sample(self, row, header):
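# The first two columns are PATH and SUBJECT; any remaining columns are attached to the sample as metadata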
path = row[0]
subject = row[1]
kwargs = dict([[h, r] for h, r in zip(header[2:], row[2:])])
return DelayedSample(
functools.partial(
self.data_loader,
os.path.join(self.dataset_original_directory, path + self.extension),
),
key=path,
subject=subject,
**kwargs,
)
def convert_samples_to_samplesets(
self, samples, group_by_subject=True, references=None
):
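# Copy every sample attribute except the raw data, the loader and the key onto the SampleSet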
def get_attribute_from_sample(sample):
return dict(
[
[attribute, sample.__dict__[attribute]]
for attribute in list(sample.__dict__.keys())
if attribute not in self.excluding_attributes
]
)
if group_by_subject:
# Grouping sample sets
sample_sets = dict()
for s in samples:
if s.subject not in sample_sets:
sample_sets[s.subject] = SampleSet(
[s], **get_attribute_from_sample(s)
)
else:
sample_sets[s.subject].append(s)
return list(sample_sets.values())
else:
return [
SampleSet([s], **get_attribute_from_sample(s), references=references)
for s in samples
]
class CSVDatasetDevEval:
"""
Generic filelist dataset for the :any:`bob.bio.base.pipelines.VanillaBiometrics` pipeline.
Check :ref:`vanilla_biometrics_features` for more details about the Vanilla Biometrics Dataset
interface.
To create a new dataset, you need to provide a directory structure similar to the one below:
.. code-block:: text
my_dataset/
my_dataset/my_protocol/
my_dataset/my_protocol/train.csv
my_dataset/my_protocol/dev_enroll.csv
my_dataset/my_protocol/dev_probe.csv
my_dataset/my_protocol/eval_enroll.csv
my_dataset/my_protocol/eval_probe.csv
...
In the directory structure above, `my_dataset` contains one directory per evaluation protocol
this dataset provides.
The `my_protocol` directory should contain at least two CSV files:
- dev_enroll.csv
- dev_probe.csv
Each row of those CSV files should contain i) the path to the raw data and ii) the subject label, used
for enrollment (:ref:`bob.bio.base.pipelines.vanilla_biometrics.abstract_classes.Database.references`) and
probing (:ref:`bob.bio.base.pipelines.vanilla_biometrics.abstract_classes.Database.probes`).
The structure of each CSV file should be as below:
.. code-block:: text
PATH,SUBJECT
path_1,subject_1
path_2,subject_2
path_i,subject_j
...
You might want to ship metadata within your samples (e.g. gender, age, annotations, ...).
To do so, simply add extra columns as below:
.. code-block:: text
PATH,SUBJECT,METADATA_1,METADATA_2,METADATA_k
path_1,subject_1,A,B,C
path_2,subject_2,A,B,1
path_i,subject_j,2,3,4
...
The files `my_dataset/my_protocol/eval_enroll.csv` and `my_dataset/my_protocol/eval_probe.csv`
are optional and are used in case a protocol contains data for evaluation.
Finally, the content of the file `my_dataset/my_protocol/train.csv` is used in case a protocol
contains data for training (:ref:`bob.bio.base.pipelines.vanilla_biometrics.abstract_classes.Database.background_model_samples`).
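As a minimal sketch (the directory and file paths below are purely illustrative), such a
protocol description could be consumed as follows:

.. code-block:: python

    from bob.bio.base.database import CSVDatasetDevEval, CSVToSampleLoader
    import bob.io.base

    dataset = CSVDatasetDevEval(
        dataset_protocol_path="/path/to/my_dataset",
        protocol_name="my_protocol",
        csv_to_sample_loader=CSVToSampleLoader(
            data_loader=bob.io.base.load,
            dataset_original_directory="/path/to/raw_data",
            extension=".png",
        ),
    )
    train_samples = dataset.background_model_samples()
    dev_references = dataset.references(group="dev")
    dev_probes = dataset.probes(group="dev")
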
Parameters
----------
dataset_protocol_path: str
Absolute path of the dataset protocol description
protocol_name: str
The name of the protocol
csv_to_sample_loader: :any:`CSVBaseSampleLoader`
Base class whose objective is to generate :any:`bob.pipelines.Sample`
and/or :any:`bob.pipelines.SampleSet` objects from CSV rows
"""
def __init__(
self,
dataset_protocol_path,
protocol_name,
csv_to_sample_loader=CSVToSampleLoader(
data_loader=bob.io.base.load, dataset_original_directory="", extension=""
),
):
def get_paths():
if not os.path.exists(dataset_protocol_path):
raise ValueError(f"The path `{dataset_protocol_path}` was not found")
# TODO: Unzip file if dataset path is a zip
protocol_path = os.path.join(dataset_protocol_path, protocol_name)
if not os.path.exists(protocol_path):
raise ValueError(f"The protocol `{protocol_name}` was not found")
train_csv = os.path.join(protocol_path, "train.csv")
dev_enroll_csv = os.path.join(protocol_path, "dev_enroll.csv")
dev_probe_csv = os.path.join(protocol_path, "dev_probe.csv")
eval_enroll_csv = os.path.join(protocol_path, "eval_enroll.csv")
eval_probe_csv = os.path.join(protocol_path, "eval_probe.csv")
# The minimum required is to have `dev_enroll_csv` and `dev_probe_csv`
train_csv = train_csv if os.path.exists(train_csv) else None
# Eval
eval_enroll_csv = (
eval_enroll_csv if os.path.exists(eval_enroll_csv) else None
)
eval_probe_csv = eval_probe_csv if os.path.exists(eval_probe_csv) else None
# Dev
if not os.path.exists(dev_enroll_csv):
raise ValueError(
f"The file `{dev_enroll_csv}` is required and it was not found"
)
if not os.path.exists(dev_probe_csv):
raise ValueError(
f"The file `{dev_probe_csv}` is required and it was not found"
)
return (
train_csv,
dev_enroll_csv,
dev_probe_csv,
eval_enroll_csv,
eval_probe_csv,
)
(
self.train_csv,
self.dev_enroll_csv,
self.dev_probe_csv,
self.eval_enroll_csv,
self.eval_probe_csv,
) = get_paths()
def get_dict_cache():
cache = dict()
cache["train"] = None
cache["dev_enroll_csv"] = None
cache["dev_probe_csv"] = None
cache["eval_enroll_csv"] = None
cache["eval_probe_csv"] = None
return cache
self.cache = get_dict_cache()
self.csv_to_sample_loader = csv_to_sample_loader
def background_model_samples(self):
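# Training samples are loaded lazily from train.csv and cached for subsequent calls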
self.cache["train"] = (
self.csv_to_sample_loader(self.train_csv)
if self.cache["train"] is None
else self.cache["train"]
)
return self.cache["train"]
def _get_samplesets(self, group="dev", purpose="enroll", group_by_subject=False):
fetching_probes = False
if purpose == "enroll":
cache_label = "dev_enroll_csv" if group == "dev" else "eval_enroll_csv"
else:
fetching_probes = True
cache_label = "dev_probe_csv" if group == "dev" else "eval_probe_csv"
if self.cache[cache_label] is not None:
return self.cache[cache_label]
references = None
if fetching_probes:
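# Probe sample sets carry the list of reference subjects they will be scored against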
references = list(set([s.subject for s in self.references(group=group)]))
samples = self.csv_to_sample_loader(self.__dict__[cache_label])
sample_sets = self.csv_to_sample_loader.convert_samples_to_samplesets(
samples, group_by_subject=group_by_subject, references=references
)
self.cache[cache_label] = sample_sets
return self.cache[cache_label]
def references(self, group="dev"):
return self._get_samplesets(
group=group, purpose="enroll", group_by_subject=True
)
def probes(self, group="dev"):
return self._get_samplesets(
group=group, purpose="probe", group_by_subject=False
)
class CSVDatasetCrossValidation:
"""
Generic filelist dataset for the :any:`bob.bio.base.pipelines.VanillaBiometrics` pipeline that
handles **CROSS VALIDATION**.
Check :ref:`vanilla_biometrics_features` for more details about the Vanilla Biometrics Dataset
interface.