Skip to content
Snippets Groups Projects
Commit f22853af authored by Pavel KORSHUNOV's avatar Pavel KORSHUNOV
Browse files

Adding support for FileList databases

parent 10d6c6cf
No related branches found
No related tags found
1 merge request: !9 "Adding support for Filelist-based databases"
Pipeline #
Showing
with 614 additions and 18 deletions
from .database import PadDatabase
from .file import PadFile from .file import PadFile
from .database import PadDatabase
from .filelist.query import FileListPadDatabase
from .filelist.models import Client
from . import filelist
# to fix sphinx warnings of not able to find classes, when path is shortened
PadDatabase.__module__ = "bob.pad.base.database"
PadFile.__module__ = "bob.pad.base.database"
# gets sphinx autodoc done right - don't remove it # gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
    """Make the given objects report this module as their place of declaration.

    Each object's ``__module__`` attribute is overwritten with this module's
    ``__name__``. This fixes sphinx warnings of not being able to find
    classes when the import path is shortened.

    Parameters:

      *args: An iterable of objects to modify

    Resolves `Sphinx referencing issues
    <https://github.com/sphinx-doc/sphinx/issues/3048>`
    """
    here = __name__
    for declared in args:
        declared.__module__ = here
# Re-home the classes re-exported by this package: __appropriate__ sets their
# ``__module__`` to this module's name so that sphinx can resolve them here.
__appropriate__(
    PadFile,
    PadDatabase,
    FileListPadDatabase,
    Client,
)
__all__ = [_ for _ in dir() if not _.startswith('_')] __all__ = [_ for _ in dir() if not _.startswith('_')]
...@@ -21,11 +21,8 @@ class PadDatabase(BioDatabase): ...@@ -21,11 +21,8 @@ class PadDatabase(BioDatabase):
name : str name : str
A unique name for the database. A unique name for the database.
all_files_options : dict protocol : str or ``None``
Dictionary of options passed to the second-level database query when retrieving all data. The name of the protocol that defines the default experimental setup for this database.
check_original_files_for_existence : bool
Enables to test for the original data files when querying the database.
original_directory : str original_directory : str
The directory where the original data of the database are stored. The directory where the original data of the database are stored.
...@@ -33,9 +30,6 @@ class PadDatabase(BioDatabase): ...@@ -33,9 +30,6 @@ class PadDatabase(BioDatabase):
original_extension : str original_extension : str
The file name extension of the original data. The file name extension of the original data.
protocol : str or ``None``
The name of the protocol that defines the default experimental setup for this database.
kwargs : ``key=value`` pairs kwargs : ``key=value`` pairs
The arguments of the :py:class:`bob.bio.base.database.BioDatabase` base class constructor. The arguments of the :py:class:`bob.bio.base.database.BioDatabase` base class constructor.
...@@ -44,14 +38,18 @@ class PadDatabase(BioDatabase): ...@@ -44,14 +38,18 @@ class PadDatabase(BioDatabase):
def __init__( def __init__(
self, self,
name, name,
all_files_options={}, # additional options for the database query that can be used to extract all files protocol='Default',
check_original_files_for_existence=False,
original_directory=None, original_directory=None,
original_extension=None, original_extension=None,
protocol='Default',
**kwargs # The rest of the default parameters of the base class **kwargs # The rest of the default parameters of the base class
): ):
super(PadDatabase, self).__init__(name=name, all_files_options=all_files_options, check_original_files_for_existence=check_original_files_for_existence, original_directory=original_directory, original_extension=original_extension, protocol=protocol, **kwargs) super(PadDatabase, self).__init__(
name=name,
protocol=protocol,
original_directory=original_directory,
original_extension=original_extension,
**kwargs)
def original_file_names(self, files): def original_file_names(self, files):
"""original_file_names(files) -> paths """original_file_names(files) -> paths
...@@ -124,7 +122,7 @@ class PadDatabase(BioDatabase): ...@@ -124,7 +122,7 @@ class PadDatabase(BioDatabase):
Usually it is either 'real' or 'attack'. Usually it is either 'real' or 'attack'.
model_ids : [various type] model_ids : [various type]
This parameter is not suported in PAD databases yet This parameter is not supported in PAD databases yet
""" """
raise NotImplementedError("This function must be implemented in your derived class.") raise NotImplementedError("This function must be implemented in your derived class.")
......
...@@ -29,5 +29,5 @@ class PadFile(BioFile): ...@@ -29,5 +29,5 @@ class PadFile(BioFile):
assert isinstance(attack_type, str) assert isinstance(attack_type, str)
# just copy the information # just copy the information
# The attack type of the sample, None if it is a genuine sample.
self.attack_type = attack_type self.attack_type = attack_type
"""The attack type of the sample, None if it is a genuine sample."""
from .models import ListReader, Client, FileListFile
from .query import FileListPadDatabase
from .driver import Interface
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
    """Declare the given objects as belonging to this module.

    Sets ``__module__`` of every passed object to the name of this module,
    which fixes sphinx warnings of not being able to find classes when the
    import path is shortened.

    Parameters:

      *args: An iterable of objects to modify

    Resolves `Sphinx referencing issues
    <https://github.com/sphinx-doc/sphinx/issues/3048>`
    """
    for target in args:
        setattr(target, '__module__', __name__)
# Re-home the classes re-exported by this package: __appropriate__ sets their
# ``__module__`` to this module's name so that sphinx can resolve them here.
__appropriate__(
    ListReader,
    Client,
    FileListFile,
    FileListPadDatabase,
    Interface,
)
# Export every name in the module namespace that does not start with an underscore.
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Laurent El Shafey <laurent.el-shafey@idiap.ch>
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Commands the PAD Filelists database can respond to.
"""
import os
import sys
from bob.db.base.driver import Interface as BaseInterface
def dumplist(args):
    """Dumps lists of files based on your criteria"""
    # Imported lazily, as in the rest of this driver module.
    from .query import FileListPadDatabase

    database = FileListPadDatabase(args.list_directory, 'pad_filelist')
    selected = database.objects(
        purposes=args.purpose,
        groups=args.group,
        protocol=args.protocol
    )

    # In self-test mode the listing is written to a null sink instead of stdout.
    if args.selftest:
        from bob.db.base.utils import null
        stream = null()
    else:
        stream = sys.stdout

    for entry in selected:
        full_path = entry.make_path(directory=args.directory, extension=args.extension)
        stream.write('%s\n' % full_path)

    return 0
def checkfiles(args):
    """Checks existence of files based on your criteria"""
    # Imported lazily, as in the rest of this driver module.
    from .query import FileListPadDatabase

    database = FileListPadDatabase(args.list_directory, 'pad_filelist')
    entries = database.objects(protocol=args.protocol)

    # Collect every entry whose resolved path is not present on the filesystem.
    missing = [
        entry for entry in entries
        if not os.path.exists(entry.make_path(args.directory, args.extension))
    ]

    # In self-test mode the report is written to a null sink instead of stdout.
    if args.selftest:
        from bob.db.base.utils import null
        stream = null()
    else:
        stream = sys.stdout

    if missing:
        for entry in missing:
            stream.write('Cannot find file "%s"\n' % entry.make_path(args.directory, args.extension))
        stream.write('%d files (out of %d) were not found at "%s"\n' % (len(missing), len(entries), args.directory))

    # The exit status is 0 whether or not files are missing.
    return 0
class Interface(BaseInterface):
    """Command-line driver interface of the PAD file-list database.

    Registers the ``dumplist`` and ``checkfiles`` sub-commands.
    """

    def name(self):
        """Returns the name under which this database is registered."""
        return 'pad_filelist'

    def version(self):
        """Returns the version of the installed ``bob.pad.base`` distribution."""
        import pkg_resources  # part of setuptools
        distribution = pkg_resources.require('bob.pad.base')[0]
        return distribution.version

    def files(self):
        """Returns an empty tuple."""
        return ()

    def type(self):
        """Returns the database type, which is ``'text'``."""
        return 'text'

    def add_commands(self, parser):
        """Adds the ``dumplist`` and ``checkfiles`` sub-commands to ``parser``."""
        import argparse
        from . import __doc__ as docs

        subparsers = self.setup_parser(
            parser,
            "Presentation Attack Detection File Lists database",
            docs)

        # the "dumplist" action
        dump_parser = subparsers.add_parser('dumplist', help=dumplist.__doc__)
        dump_parser.add_argument(
            '-l', '--list-directory', required=True,
            help="The directory which contains the file lists.")
        dump_parser.add_argument(
            '-d', '--directory', default='',
            help="if given, this path will be prepended to every entry returned.")
        dump_parser.add_argument(
            '-e', '--extension', default='',
            help="if given, this extension will be appended to every entry returned.")
        dump_parser.add_argument(
            '-u', '--purpose',
            help="if given, this value will limit the output files to those designed "
                 "for the given purposes.",
            choices=('real', 'attack', ''))
        dump_parser.add_argument(
            '-g', '--group',
            help="if given, this value will limit the output files to those belonging to a "
                 "particular protocolar group.",
            choices=('dev', 'eval', 'train', ''))
        dump_parser.add_argument(
            '-p', '--protocol', default=None,
            help="If set, the protocol is appended to the directory that contains the file lists.")
        # hidden switch: its help text is suppressed
        dump_parser.add_argument(
            '--self-test', dest="selftest", action='store_true',
            help=argparse.SUPPRESS)
        dump_parser.set_defaults(func=dumplist)  # action

        # the "checkfiles" action
        check_parser = subparsers.add_parser('checkfiles', help=checkfiles.__doc__)
        check_parser.add_argument(
            '-l', '--list-directory', required=True,
            help="The directory which contains the file lists.")
        check_parser.add_argument(
            '-d', '--directory', dest="directory", default='',
            help="if given, this path will be prepended to every entry returned.")
        check_parser.add_argument(
            '-e', '--extension', dest="extension", default='',
            help="if given, this extension will be appended to every entry returned.")
        check_parser.add_argument(
            '-p', '--protocol', default=None,
            help="If set, the protocol is appended to the directory that contains the file lists.")
        # hidden switch: its help text is suppressed
        check_parser.add_argument(
            '--self-test', dest="selftest", action='store_true',
            help=argparse.SUPPRESS)
        check_parser.set_defaults(func=checkfiles)  # action
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Thu Nov 17 16:09:22 CET 2016
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
This file defines simple Client and File interfaces that are comparable with other bob.db databases.
"""
import os
import fileinput
import re
from bob.pad.base.database import PadFile
class Client(object):
    """
    A client of this database; it carries ONLY the client id. Nothing special.
    """

    def __init__(self, client_id):
        #: The ID of the client, which is stored as a :py:class:`str` object.
        self.id = client_id
class FileListFile(PadFile):
    """
    A PAD file entry as read from a file list, initialized with the minimum required data.

    **Parameters**

    file_name : str
      The path of this file, relative to the basic directory.
      Please do not specify any file extensions.
      It is passed to the base class both as ``path`` and as ``file_id``,
      as it is assumed to be unique.

    client_id : various type
      The id of the client this file belongs to.
      The type of it is dependent on your implementation.

    attack_type : str or ``None``
      The attack type of the sample, ``None`` (the default) if it is a genuine sample.
      It is forwarded unchanged to the base class constructor.
    """

    def __init__(self, file_name, client_id, attack_type=None):
        super(FileListFile, self).__init__(
            client_id=client_id,
            path=file_name,
            attack_type=attack_type,
            file_id=file_name,
        )
#############################################################################
### internal access functions for the file lists; do not export!
#############################################################################
class ListReader(object):
    """Reads PAD file lists from disk and optionally keeps them in memory.

    **Parameters**

    store_lists : bool
      If true, every list returned by :py:meth:`read_list` is cached under its
      ``(group, type)`` pair and later requests for the same pair are served
      from memory without touching the list file again.
    """

    # A token is a run of word characters, '/', or any character in the ASCII
    # range '(' .. '.' (that is: ``( ) * + , - .``); everything else separates
    # tokens. Raw string so that ``\w`` is not an invalid string escape.
    _TOKEN_PATTERN = re.compile(r'[\w/(-.)]+')

    def __init__(self, store_lists):
        # cache layout: {group: {type: [FileListFile, ...]}}
        self.m_read_lists = {}
        self.m_store_lists = store_lists

    def _read_multi_column_list(self, list_file):
        """Parses ``list_file`` into a list of rows, each a list of string tokens.

        Lines without any token are skipped. Every remaining line must have two
        or three tokens, and the same number of tokens as the first parsed line.

        Raises ``RuntimeError`` if the file does not exist, cannot be read, or
        contains a line that violates the rules above.
        """
        if not os.path.isfile(list_file):
            raise RuntimeError('File %s does not exist.' % (list_file,))

        rows = []
        try:
            # A context manager guarantees that the file is closed even when a
            # malformed line raises below, so that a failed read cannot leave
            # state behind that breaks the next read.
            with open(list_file) as lines:
                for line in lines:
                    parsed_line = self._TOKEN_PATTERN.findall(line)
                    if not parsed_line:
                        continue
                    # perform some sanity checks
                    if len(parsed_line) not in (2, 3):
                        raise IOError("The read line '%s' from file '%s' could not be parsed successfully!" %
                                      (line.rstrip(), list_file))
                    if rows and len(rows[0]) != len(parsed_line):
                        raise IOError("The parsed line '%s' from file '%s' has a different number of elements "
                                      "than the first parsed line '%s'!" % (parsed_line, list_file, rows[0]))
                    # append the read line
                    rows.append(parsed_line)
        except IOError as e:
            raise RuntimeError("Error reading the file '%s' : '%s'." % (list_file, e))

        return rows

    def _read_column_list(self, list_file, column_count):
        """Reads ``list_file`` and turns every row into a ``FileListFile``.

        ``column_count`` must be 2 (rows of ``filename client_id``) or 3 (rows of
        ``filename client_id attack_type``); any other value raises
        ``ValueError`` as soon as a row is processed.
        """
        rows = self._read_multi_column_list(list_file)

        file_list = []
        for row in rows:
            if column_count == 2:
                assert len(row) == 2, "expected 2 columns, got %d" % len(row)
                # we expect: filename client_id
                file_list.append(FileListFile(file_name=row[0], client_id=row[1]))
            elif column_count == 3:
                assert len(row) == 3, "expected 3 columns, got %d" % len(row)
                # we expect: filename client_id attack_type
                file_list.append(FileListFile(file_name=row[0], client_id=row[1], attack_type=row[2]))
            else:
                raise ValueError("The given column count %d cannot be interpreted. This is a BUG, please "
                                 "report to the author." % column_count)

        return file_list

    def read_list(self, list_file, group, type=None):
        """Reads the list of Files from the given list file (if not done yet) and returns it.

        ``type`` must be ``'for_real'`` (two-column list) or ``'for_attack'``
        (three-column list); anything else raises ``ValueError``. Results are
        cached per ``(group, type)`` only when the reader was created with
        ``store_lists`` set.
        """
        cached = self.m_read_lists.setdefault(group, {})
        if type in cached:
            return cached[type]

        if type == 'for_real':
            files_list = self._read_column_list(list_file, 2)
        elif type == 'for_attack':
            files_list = self._read_column_list(list_file, 3)
        else:
            raise ValueError("The given type must be one of %s, but not '%s'" % (('for_real', 'for_attack'), type))

        if self.m_store_lists:
            cached[type] = files_list
        return files_list
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Thu Nov 17 16:09:22 CET 2016
#
# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""This module provides the Database interface allowing the user to query the
PAD database based on file lists provided in the corresponding directory.
"""
import os
from .models import Client, ListReader
from .. import PadFile
from .. import PadDatabase
from bob.bio.base.database import FileListBioDatabase
class FileListPadDatabase(FileListBioDatabase, PadDatabase):
    """This class provides a user-friendly interface to databases that are given as file lists.

    Keyword parameters:

    filelists_directory : str
      The directory that contains the filelists defining the protocol(s). If you use the protocol
      attribute when querying the database, it will be appended to the base directory, such that
      several protocols are supported by the same class instance of `bob.pad.base`.

    name : str
      The name of the database

    protocol : str
      The protocol of the database. This should be a folder inside ``filelists_directory``.

    pad_file_class : class
      The class that should be used to return the files.
      This can be `PadFile`, `PadVoiceFile`, or anything similar.

    original_directory : str or ``None``
      The directory, where the original data can be found

    original_extension : str or [str] or ``None``
      The filename extension of the original data, or multiple extensions

    annotation_directory : str or ``None``
      The directory, where additional annotation files can be found

    annotation_extension : str or ``None``
      The filename extension of the annotation files

    annotation_type : str
      The type of the annotation file to read, see `bob.db.base.read_annotation_file` for accepted formats.

    train_subdir : str or ``None``
      Specify a custom subdirectory for the filelists of the training set (default is 'train')

    dev_subdir : str or ``None``
      Specify a custom subdirectory for the filelists of the development set (default is 'dev')

    eval_subdir : str or ``None``
      Specify a custom subdirectory for the filelists of the evaluation set (default is 'eval')

    real_filename : str or ``None``
      The name of the list file containing the real files, with lines of the form
      ``filename client_id`` (default is 'for_real.lst')

    attack_filename : str or ``None``
      The name of the list file containing the attack files, with lines of the form
      ``filename client_id attack_type`` (default is 'for_attack.lst')

    keep_read_lists_in_memory : bool
      If set to true, the lists are read only once and stored in memory
    """

    def __init__(
            self,
            filelists_directory,
            name,
            protocol=None,
            pad_file_class=PadFile,

            original_directory=None,
            original_extension=None,

            # PAD annotations should be supported in the future
            annotation_directory=None,
            annotation_extension="",
            annotation_type=None,

            train_subdir=None,
            dev_subdir=None,
            eval_subdir=None,

            real_filename=None,  # File containing the real files
            attack_filename=None,  # File containing the attack files

            # if set to True (the RECOMMENDED default) lists are read only once and stored in memory.
            keep_read_lists_in_memory=True,
            **kwargs
    ):
        """We call PadDatabase.__init__() instead of super() because of we do not want
        bob.bio.base.database.FileListBioDatabase.__init__() to be called by super().
        bob.bio.base.database.FileListBioDatabase depends on bob.bio.base.database.ZTBioDatabase, which would
        throw an exception, since we do not implement here methods for ZT-based metric."""
        PadDatabase.__init__(
            self,
            name=name,
            protocol=protocol,
            original_directory=original_directory,
            original_extension=original_extension,
            annotation_directory=annotation_directory,
            annotation_extension=annotation_extension,
            annotation_type=annotation_type,
            filelists_directory=filelists_directory,
            # extra args for pretty printing
            train_sub_directory=train_subdir,
            dev_sub_directory=dev_subdir,
            eval_sub_directory=eval_subdir,
            real_filename=real_filename,
            attack_filename=attack_filename,
            **kwargs)

        self.pad_file_class = pad_file_class
        # one ListReader per protocol, created on demand by _list_reader()
        self.list_readers = {}

        self.m_base_dir = os.path.abspath(filelists_directory)
        if not os.path.isdir(self.m_base_dir):
            raise RuntimeError('Invalid directory specified %s.' % self.m_base_dir)

        # sub-directories for train, dev, and eval sets:
        self.m_dev_subdir = dev_subdir if dev_subdir is not None else 'dev'
        self.m_eval_subdir = eval_subdir if eval_subdir is not None else 'eval'
        self.m_train_subdir = train_subdir if train_subdir is not None else 'train'

        # real list:        format:   filename client_id
        self.m_real_filename = real_filename if real_filename is not None else 'for_real.lst'
        # attack list:        format:   filename client_id attack_type
        self.m_attack_filename = attack_filename if attack_filename is not None else 'for_attack.lst'

        self.keep_read_lists_in_memory = keep_read_lists_in_memory

    def _list_reader(self, protocol):
        """Returns the ``ListReader`` of the given protocol, creating it on first use.

        Raises ``ValueError`` if a protocol is given and its directory does not exist.
        """
        if protocol not in self.list_readers:
            if protocol is not None:
                protocol_dir = os.path.join(self.get_base_directory(), protocol)
                if not os.path.isdir(protocol_dir):
                    raise ValueError(
                        "The directory %s for the given protocol '%s' does not exist" % (protocol_dir, protocol))
            self.list_readers[protocol] = ListReader(self.keep_read_lists_in_memory)

        return self.list_readers[protocol]

    def _make_pad(self, files):
        """Converts the given file objects into instances of ``self.pad_file_class``."""
        return [
            self.pad_file_class(client_id=f.client_id, path=f.path, attack_type=f.attack_type, file_id=f.id)
            for f in files
        ]

    def groups(self, protocol=None):
        """This function returns the list of groups for this database.

        A group is reported when its sub-directory exists below the base
        directory (or below the protocol directory, if a protocol is given).

        protocol : str or ``None``
          The protocol for which the groups should be retrieved.

        Returns: a list of groups, in the order 'dev', 'eval', 'train'
        """
        root = self.get_base_directory()
        if protocol is not None:
            root = os.path.join(root, protocol)

        candidates = (
            ('dev', self.m_dev_subdir),
            ('eval', self.m_eval_subdir),
            ('train', self.m_train_subdir),
        )
        return [group for group, subdir in candidates if os.path.isdir(os.path.join(root, subdir))]

    def _get_list_file(self, group, type=None, protocol=None):
        """Returns the path of the list file for the given group, list type and protocol.

        ``type`` must be 'for_real' or 'for_attack'; any other value raises ``KeyError``.
        Any group other than 'dev' or 'eval' is mapped to the training sub-directory.
        """
        if protocol:
            base_directory = os.path.join(self.get_base_directory(), protocol)
        else:
            base_directory = self.get_base_directory()

        if group == 'dev':
            group_dir = self.m_dev_subdir
        elif group == 'eval':
            group_dir = self.m_eval_subdir
        else:
            group_dir = self.m_train_subdir

        list_name = {
            'for_real': self.m_real_filename,
            'for_attack': self.m_attack_filename,
        }[type]
        return os.path.join(base_directory, group_dir, list_name)

    def client_ids(self, protocol=None, groups=None):
        """Returns a list of client ids for the specific query by the user.

        Keyword Parameters:

        protocol : str or ``None``
          The protocol to consider

        groups : str or [str] or ``None``
          The groups to which the clients belong ("dev", "eval", "train").

        Returns: A list containing all the client ids which have the given properties.
        """
        available_groups = self.groups(protocol)
        groups = self.check_parameters_for_validity(groups, "group",
                                                    available_groups,
                                                    default_parameters=available_groups)

        return self.__client_id_list__(groups, 'for_real', protocol)

    def objects(self, groups=None, protocol=None, purposes=None, model_ids=None, **kwargs):
        """Returns a set of :py:class:`File` objects for the specific query by the user.

        Keyword Parameters:

        groups : str or [str] or ``None``
          One of the groups ("dev", "eval", "train") or a tuple with several of them.
          If 'None' is given (this is the default), it is considered the same as a
          tuple with all possible values.

        protocol : str or ``None``
          The protocol to consider

        purposes : str or [str] or ``None``
          The purposes required to be retrieved ("real", "attack") or a tuple
          with several of them. If 'None' is given (this is the default), it is
          considered the same as a tuple with all possible values.

        model_ids : [various type]
          This parameter is not supported in PAD databases yet

        Returns: A list of :py:class:`File` objects considering all the filtering criteria.
        """
        purposes = self.check_parameters_for_validity(purposes, "purpose", ('real', 'attack'))
        available_groups = self.groups(protocol)
        groups = self.check_parameters_for_validity(groups, "group",
                                                    available_groups,
                                                    default_parameters=available_groups)

        # first, collect all the lists that we want to process;
        # the result is ordered by group (train, dev, eval) and, within a group, real before attack
        lists = []
        for group in ('train', 'dev', 'eval'):
            if group not in groups:
                continue
            for purpose, list_type in (('real', 'for_real'), ('attack', 'for_attack')):
                if purpose in purposes:
                    list_file = self._get_list_file(group, list_type, protocol=protocol)
                    lists.append(self._list_reader(protocol).read_list(list_file, group, list_type))

        # now, flatten the lists and convert the entries to the configured PAD file class
        retval = [fileobj for flist in lists for fileobj in flist]
        return self._make_pad(retval)

    def annotations(self, file):
        """Returns the annotations of the given file, as provided by the parent class."""
        return super(FileListPadDatabase, self).annotations(file)
File added
File added
File added
data/attack2 F2 Attack_1
data/real2 F2
data/real3 M2
data/attack3 unknown Attack_2
data/real4 F4
data/attack1 M1 Attack_1
data/real1 M1
data/real2 F1
File added
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment