Pavel KORSHUNOV · 7b8030c4 · 10fa106b · 04a1faf2 · 7b8030c4 · 10fa106b
--- a/bob/pad/base/database/PadBioFileDB.py

+ 162

− 85
+++ b/bob/pad/base/database/PadBioFileDB.py

+ 162

− 85
 @@ -6,7 +6,7 @@ used by both verification and PAD experiments.
 from bob.pad.base.database import PadFile
 from bob.pad.base.database import FileListPadDatabase

-from bob.bio.base.database import BioDatabase
+from bob.bio.base.database import FileListBioDatabase
 from bob.bio.base.database.file import BioFile

 import bob.io.base
 @@ -48,66 +48,182 @@ class HighPadFile(PadFile):

 class HighPadDatabase(FileListPadDatabase):
+    """
+    File-list based PAD database whose lists and file class can be overridden.
+    """
    def __init__(self,
+                 filelists_directory=None,
                 original_directory="[DB_DATA_DIRECTORY]",
                 original_extension=".wav",
+                 file_class=None,
                 db_name='',
                 **kwargs):
+        """
+        Args:
+            filelists_directory: folder holding the file lists; when empty or ``None``,
+                the packaged folder ``'../lists/' + db_name`` (resolved relative to this module) is used
+            original_directory: forwarded unchanged to the base class constructor
+            original_extension: forwarded unchanged to the base class constructor
+            file_class: class handed to the base class as ``pad_file_class``;
+                defaults to ``HighPadFile``
+            db_name: passed to the base class as its second positional argument and
+                used to build the default lists folder
+            **kwargs: forwarded unchanged to the base class constructor
+        """
-        # call base class constructor
-        from pkg_resources import resource_filename
-        folder = resource_filename(__name__, '../lists/' + db_name)
-        super(HighPadDatabase, self).__init__(folder, db_name, pad_file_class=HighPadFile,
+        if not filelists_directory:
+            # if not provided, we assume the lists are located in '../lists'
+            from pkg_resources import resource_filename
+            filelists_directory = resource_filename(__name__, '../lists/' + db_name)
+        if not file_class:
+            file_class = HighPadFile
+        # call base class constructor with the resolved lists folder and file class
+        super(HighPadDatabase, self).__init__(filelists_directory, db_name, pad_file_class=file_class,
                                              original_directory=original_directory,
                                              original_extension=original_extension,
                                              **kwargs)


-class HighBioFile(BioFile):
-    def __init__(self, f):
-        """
-        Initializes this File object with an File equivalent from the underlying SQl-based interface for
-        database. Replace this class for the specific database.
-        """
-        super(HighBioFile, self).__init__(client_id=f.client_id, path=f.path, file_id=f.id)
-
-        self.__f = f
-
-    def load(self, directory=None, extension='.wav'):
-        path = self.make_path(directory, extension)
-        if extension == '.wav':
-            rate, audio = scipy.io.wavfile.read(path)
-            # We consider there is only 1 channel in the audio file => data[0]
-            return rate, numpy.cast['float'](audio)
-        elif extension == '.avi':
-            return bob.io.base.load(path)
-
-
-class HighBioDatabase(BioDatabase):
+class HighBioDatabase(FileListBioDatabase):
    """
    Implements verification API for querying High database.
    """

    def __init__(self,
+                 filelists_directory=None,
                 original_directory="[DB_DATA_DIRECTORY]",
                 original_extension=".wav",
                 db_name='',
+                 file_class=None,
                 **kwargs):
+        """
+        Args:
+            filelists_directory: folder holding the file lists; when empty or ``None``,
+                the packaged folder ``'../lists/' + db_name`` (resolved relative to this module) is used
+            original_directory: forwarded to the base class and to the wrapped PAD database
+            original_extension: forwarded to the base class and to the wrapped PAD database
+            db_name: passed to the base class as its second positional argument, to the
+                wrapped PAD database, and used to build the default lists folder
+            file_class: class handed to the base class as ``bio_file_class`` and to the
+                wrapped PAD database as ``file_class``; defaults to ``HighPadFile``
+            **kwargs: forwarded unchanged to BOTH the base class constructor and the
+                wrapped ``HighPadDatabase`` constructor
+        """
+        if not filelists_directory:
+            # if not provided, we assume the lists are located in '../lists'
+            from pkg_resources import resource_filename
+            filelists_directory = resource_filename(__name__, '../lists/' + db_name)
+        if not file_class:
+            file_class = HighPadFile
        # call base class constructors to open a session to the database
-        super(HighBioDatabase, self).__init__(name=db_name,
+        super(HighBioDatabase, self).__init__(filelists_directory, db_name,
+                                              bio_file_class=file_class,
                                              original_directory=original_directory,
                                              original_extension=original_extension, **kwargs)

-        self.__db = HighPadDatabase(db_name=db_name,
-                                    original_directory=original_directory,
-                                    original_extension=original_extension,
-                                    **kwargs)
+        # the wrapped PAD database is built from the same resolved arguments;
+        # model_ids_with_protocol() and objects() query it
+        self._pad_db = HighPadDatabase(filelists_directory=filelists_directory,
+                                        db_name=db_name,
+                                        file_class=file_class,
+                                        original_directory=original_directory,
+                                        original_extension=original_extension,
+                                        **kwargs)

+        # group names as used by the PAD lists (low level) and by the
+        # verification experiments (high level); consumed by convert_names_to_lowlevel()
        self.low_level_group_names = ('train', 'dev', 'eval')
        self.high_level_group_names = ('world', 'dev', 'eval')

+    def _convert_protocol(self, protocol=None):
+        """
+        Split a protocol name into its base name and its 'licit'/'spoof' modifier.
+
+        This conversion of the protocol with appended '-licit' or '-spoof' is a hack for verification experiments.
+        To adapt spoofing databases to the verification experiments, we need to be able to split a given protocol
+        into two parts: when data for licit (only real/genuine data is used) and data for spoof
+        (attacks are used instead of real data) is used in the experiment.
+        Hence, we use this trick with appending '-licit' or '-spoof' to the
+        protocol name, so we can distinguish these two scenarios.
+        By default, if nothing is appended, we assume licit protocol.
+        The distinction between licit and spoof is expressed via purposes parameters, but
+        the difference is in the terminology only.
+
+        Args:
+            protocol: protocol name, optionally ending in '-licit' or '-spoof';
+                ``None``, an empty string and '.' all mean "no protocol"
+
+        Returns: a ``(protocol, modifier)`` tuple.
+            ``(None, None)`` when no protocol was given; otherwise ``modifier`` is
+            'licit' or 'spoof' and ``protocol`` is the name without the modifier,
+            or ``None`` when nothing but the modifier was given.
+        """
+
+        # '.' is treated exactly like a missing protocol
+        if not protocol or protocol == '.':
+            return None, None
+
+        # lets check if we have an appendix to the protocol name
+        base, _, modifier = protocol.rpartition('-')
+
+        # if there was no correct appendix, we just assume the 'licit' option
+        # and keep the protocol name untouched
+        if modifier not in ('licit', 'spoof'):
+            return protocol, 'licit'
+
+        # everything except the appendix is the protocol; a name that consisted
+        # of the appendix only leaves an empty protocol, which is reported as None
+        return base or None, modifier
+
+    def _convert_purposes(self, purposes, modifier):
+        """
+        Return the PAD purposes to query for the licit or the spoof scenario.
+
+        We assume there is no enrollment data, since
+        PAD File database has real and attack lists only,
+        so we cannot assume any availability of enrollment data
+        If your PAD File lists also have for_model.lst
+        and/or for_probe.lst files, you need to change this method
+
+        Args:
+            purposes: The original purposes supplied by Bio verification framework.
+                They are NOT used: the result depends on ``modifier`` only.
+            modifier: Indicates whether it is licit or spoof scenario
+
+        Returns: a new list, ``['real', 'attack']`` for the spoof scenario and
+            ``['real']`` for anything else
+
+        """
+
+        # spoof scenario uses spoofed data for probe
+        # but, during scoring, this scenario also needs a real-probe data
+        # for cases when model_id is equal to client_id
+        # Hence, we request both real and attack data
+        if modifier == 'spoof':
+            return ['real', 'attack']
+
+        # licit scenario considers genuine data only
+        # we return all real data
+        return ['real']
+
+    def _filter_by_model_ids(self, objects, model_ids):
+        """
+        Keep only the File objects whose client_id is one of the requested model ids.
+
+        Note that the kept objects are modified in place: an object that has a
+        non-``None`` ``attack_type`` gets its ``client_id`` prefixed with 'attack/'.
+
+        Args:
+            objects: File objects derived from BioFile
+            model_ids: The list of the requested model Ids
+
+        Returns: The list of the kept File objects, in their original order;
+            an empty list when ``model_ids`` is empty or ``None``
+
+        """
+        if not model_ids:
+            return []
+
+        kept = []
+        for candidate in objects:
+            if candidate.client_id not in model_ids:
+                continue
+            # mark attacks so that they can be told apart from the real accesses of the same client
+            if getattr(candidate, 'attack_type', None) is not None:
+                candidate.client_id = 'attack/{}'.format(candidate.client_id)
+            kept.append(candidate)
+        return kept
+
+    def client_id_from_model_id(self, model_id, group='dev'):
+        """
+        Map a model id to the id of the client it belongs to.
+
+        The wrapped PAD database knows nothing about the model ids used in
+        verification experiments, so model ids and client ids are taken to be
+        identical, which also holds for most of the verification databases.
+
+        Args:
+            model_id: the model id to look up
+            group: accepted but not consulted
+
+        Returns: ``model_id``, unchanged
+        """
+        client_id = model_id
+        return client_id
+
    def model_ids_with_protocol(self, groups=None, protocol=None, **kwargs):
-        groups = self.convert_names_to_lowlevel(groups, self.low_level_group_names, self.high_level_group_names)
+        """
+        Return the model ids available for the given groups and protocol.
+
+        This wrapper around PAD database does not have a knowledge of
+        model ids used in verification experiments, so we just assume that
+        the model_ids are the same as client ids, which is actually true
+        for most of the verification databases as well.

-        return [client.id for client in self.__db.clients(groups=groups, **kwargs)]
+        Args:
+            groups: group names of the verification experiments; they are converted
+                to the group names of the PAD database before the query
+            protocol: protocol name, optionally carrying a '-licit' or '-spoof'
+                appendix; only the name without the appendix is passed on
+            **kwargs: forwarded unchanged to ``client_ids`` of the wrapped PAD database
+
+        Returns: whatever ``client_ids`` of the wrapped PAD database returns
+        """
+        # we need to correctly convert groups first
+        groups = self.convert_names_to_lowlevel(groups, self.low_level_group_names, self.high_level_group_names)
+        # we also need to convert protocol name (it can have either '-licit' or '-spoof' appendix)
+        # to the expected protocol name without appendix
+        return self._pad_db.client_ids(protocol=self._convert_protocol(protocol)[0], groups=groups, **kwargs)
+
+    def arrange_by_client(self, files):
+        """
+        Group the given files by client.
+
+        Args:
+            files: File objects providing a ``client_id`` attribute
+
+        Returns: a list of lists of files, one inner list per client; the inner
+            lists keep the input order of their files and the outer list is sorted
+            by the string form of the client id
+        """
+        grouped = {}
+        for item in files:
+            # clients are keyed by the string form of their id
+            grouped.setdefault(str(item.client_id), []).append(item)
+
+        return [grouped[key] for key in sorted(grouped)]

    def objects(self, protocol=None, purposes=None, model_ids=None, groups=None, **kwargs):
        """
 @@ -144,63 +260,24 @@ class HighBioDatabase(BioDatabase):
        # convert group names from the conventional names in verification experiments to the internal database names
        if groups is None:  # all groups are assumed
            groups = self.high_level_group_names
-        matched_groups = self.convert_names_to_lowlevel(groups, self.low_level_group_names, self.high_level_group_names)
-
-        # this conversion of the protocol with appended '-licit' or '-spoof' is a hack for verification experiments.
-        # To adapt spoofing databases to the verification experiments, we need to be able to split a given protocol
-        # into two parts: when data for licit (only real/genuine data is used) and data for spoof
-        # (attacks are used instead of real data) is used in the experiment.
-        # Hence, we use this trick with appending '-licit' or '-spoof' to the
-        # protocol name, so we can distinguish these two scenarios.
-        # By default, if nothing is appended, we assume licit protocol.
-        # The distinction between licit and spoof is expressed via purposes parameters, but
-        # the difference is in the terminology only.
-
-        # lets check if we have an appendix to the protocol name
-        appendix = None
-        if protocol:
-            appendix = protocol.split('-')[-1]
+        groups = self.convert_names_to_lowlevel(groups, self.low_level_group_names, self.high_level_group_names)

-        # if protocol was empty or there was no correct appendix, we just assume the 'licit' option
-        if not (appendix == 'licit' or appendix == 'spoof'):
-            appendix = 'licit'
-        else:
-            # put back everything except the appendix into the protocol
-            protocol = '-'.join(protocol.split('-')[:-1])
+        protocol, modifier = self._convert_protocol(protocol)
+        purposes = self._convert_purposes(purposes, modifier)

-        # if protocol was empty, we set it to the None
-        if not protocol:
-            protocol = None
+        # Query the underlying PAD database
+        objects = self._pad_db.objects(protocol=protocol, groups=groups, purposes=purposes, **kwargs)

-        correct_purposes = purposes
-        # licit protocol is for real access data only
-        if appendix == 'licit':
-            # by default we assume all real data, since this database has no enroll data
-            if purposes is None:
-                correct_purposes = ('real',)
-
-        # spoof protocol uses real data for enrollment and spoofed data for probe
-        # so, probe set is the same as attack set
-        if appendix == 'spoof':
-            # we return attack data only, since this database does not have explicit enroll data
-            if purposes is None:
-                correct_purposes = ('attack',)
-            # otherwise replace 'probe' with 'attack'
-            elif isinstance(purposes, (tuple, list)):
-                correct_purposes = []
-                for purpose in purposes:
-                    if purpose == 'probe':
-                        correct_purposes += ['attack']
-                    else:
-                        correct_purposes += [purpose]
-            elif purposes == 'probe':
-                correct_purposes = ('attack',)
-
-        # now, query the underline PAD database
-        objects = self.__db.objects(protocol=protocol, groups=matched_groups, purposes=correct_purposes, **kwargs)
+        # note that PAD database does not know anything about model_ids, so these are ignored
+        # Hence, for the spoofing protocol, we need to filter out the files and
+        # keep only those that belong to model_ids
+        # We also modify the client_id to reflect that it is an attack
+        if modifier == 'spoof' and model_ids is not None:
+            objects = self._filter_by_model_ids(objects, model_ids)

        # make sure to return BioFile representation of a file, not the database one
-        return [HighBioFile(f) for f in objects]
+        return [HighPadFile(client_id=f.client_id, path=f.path, file_id=f.path, attack_type=f.attack_type)
+                for f in objects]

    def annotations(self, file):
+        """
+        No-op as far as this hunk shows: the visible body is only ``pass``.
+
+        Args:
+            file: the file whose annotations are requested; not used by the visible body
+        """
        pass