Merge branch 'iso_metrics' into 'master'

Iso metrics This branch includes code for computing ISO metrics APCER and BPCER. Please merge it to master. See merge request !6

Merge branch 'iso_metrics' into 'master'
6b07fba1 · Pavel KORSHUNOV · 1af7a507 · f3e6ae8d · 6b07fba1 · 6b07fba1
Commit 6b07fba1 authored Dec 2, 2016 by Pavel KORSHUNOV
--- a/bob/pad/base/__init__.py
+++ b/bob/pad/base/__init__.py
 from . import database
 from . import algorithm
 from . import tools
+from . import evaluation

 from . import script
 from . import test

--- a/bob/pad/base/evaluation/PadIsoMetrics.py
+++ b/bob/pad/base/evaluation/PadIsoMetrics.py
+'''
+Created on 16 Nov 2016
+
+@author: sbhatta
+'''
+
+import sys, os
+import bob.io.base
+import numpy as np
+import bob.measure
+
+class PadIsoMetrics():
+
+    def __init__(self):
+        """ constructor. """
+
+        self.real_name = 'bonafide' #real_presentation_name #'real'
+        self.attack_name = 'attack' #attack_presentation_name #'attack'
+        
+    def save_scores_hdf5(self, outfile, scores_dict):
+        """ saves input scores_dict dictionary in a hdf5 formatted file"""
+
+        h5out = bob.io.base.HDF5File(outfile, "w")
+    
+        for p in scores_dict.keys():
+            if len(scores_dict[p]) == 1: # real_set
+                h5out.set(p, scores_dict[p][0] )
+                h5out.set_attribute('presentation', self.real_name, p)
+            else:
+                #write attacks
+                h5out.set(p, scores_dict[p][0] )
+                h5out.set_attribute('attack_potential', scores_dict[p][1], p)
+                h5out.set_attribute('presentation', self.attack_name, p)
+        
+        del h5out
+
+    def load_scores_hdf5(self, infile):
+        """ loads a hdf5 file, and trys to construct a dictionary of scores. Returns the score-dictionary."""
+
+        h5in = bob.io.base.HDF5File(infile, "r")
+         
+        scores_dict = {}         
+        h5in.cd('/')
+        class_labels = h5in.keys(relative='True')
+        for p in class_labels:
+            scores = h5in.get(p)
+            attrs = h5in.get_attributes(p)
+            if len(attrs) == 2: #then the two elements are 'presentation' and 'attack_potential'
+                ap = attrs['attack_potential']
+                scores_dict[p] = [scores, ap] 
+            else:
+                scores_dict[p] = [scores]
+            
+        del h5in
+        return scores_dict  
+        
+    
+    def eer(self, scores_dict):
+        """ computes EER threshold using the scores in the supplied dictionary 
+        Input:
+        scores_dict: dictionary where each key is the name of the presentation ('real' or one attack-type), 
+        and the corresponding value is a tuple: (scores, attack_potential).
+        'scores' should be a 1D numpy-array of floats containing scores
+        'attack_potential' should be one of the 3 letters 'A', 'B', or 'C')
+        Scores for 'real' presentations will not have an associated 'attack_potential',
+        so, if the value of a key is a tuple of length 1, the key-value pair is assumed
+        to represent a 'real'-presentation set.
+        Return:
+        tuple of three floats: (eer_threshold, far, frr). These are computed using functions from bob.measure.
+        """
+
+        real_scores = None
+        attack_scores = None
+        assert scores_dict is not None, 'no development score-set provided for computing EER'    
+        
+        for k in scores_dict.keys():
+            keyvalue = scores_dict[k]
+            if len(keyvalue)==2:
+                if attack_scores is None:
+                    attack_scores = scores_dict[k][0]
+                else:
+                    attack_scores = np.concatenate((attack_scores, scores_dict[k][0]))
+            else:
+                if len(keyvalue)==1:
+                    real_scores = scores_dict[k][0]
+        
+        assert (attack_scores is not None), 'Empty attack-scores list. Cannot compute EER'
+        assert (real_scores is not None), 'Empty real-scores list. Cannot compute EER.'
+        self.threshEER_dev = bob.measure.eer_threshold(attack_scores, real_scores)
+        
+        self.dev_far, self.dev_frr = bob.measure.farfrr(attack_scores, real_scores, self.threshEER_dev)
+#         self.eer_devel = 50.0*(self.dev_far + self.dev_frr)
+#         print('eer()::threshEER: %s' % self.threshEER_dev)
+        return (self.threshEER_dev, self.dev_far, self.dev_frr)
+
+
+    def hter(self, scores_dict, score_threshold):
+        """ computes HTER on test-set scores, using the supplied score-threshold.
+        Inputs: 
+        scores_dict: dictionary where each key is the name of the presentation ('real' or one attack-type), 
+        and the corresponding value is a tuple: (scores, attack_potential).
+        'scores' should be a 1D numpy-array of floats containing scores
+        'attack_potential' should be one of the 3 letters 'A', 'B', or 'C')
+        Scores for 'real' presentations will not have an associated 'attack_potential',
+        so, if the value of a key is a tuple of length 1, the key-value pair is assumed
+        to represent a 'real'-presentation set.
+        score_threshold: (float) value to be used for thresholding scores.
+        Return:
+        tuple of three floats: (hter, far, frr). These are computed using functions from bob.measure.
+        """
+
+        assert ((score_threshold is not None) and isinstance(score_threshold, (int, long, float)) ), 'input score_threshold should be a number (float or integer).'
+        
+        real_scores = None
+        attack_scores = None
+        assert scores_dict is not None, 'no test score-set available for computing HTER'    
+        
+        for k in scores_dict.keys():
+            key_value = scores_dict[k]
+            if len(key_value)==2:
+                if attack_scores is None:
+                    attack_scores = scores_dict[k][0]
+                else:
+                    attack_scores = np.concatenate((attack_scores, scores_dict[k][0]))
+            else:
+                if len(key_value)==1:
+                    real_scores = scores_dict[k][0]
+       
+        assert (attack_scores is not None), 'Empty attack-scores list. Cannot compute EER'
+        assert (real_scores is not None), 'Empty real-scores list. Cannot compute EER.'
+        test_far, test_frr = bob.measure.farfrr(attack_scores, real_scores, score_threshold)
+#         test_good_neg = bob.measure.correctly_classified_negatives(attack_scores, score_threshold).sum()
+#         test_good_pos = bob.measure.correctly_classified_positives(real_scores, score_threshold).sum()
+        hter = (test_far+test_frr)/2.0
+        
+        return (hter, test_far, test_frr)
+
+
+    def _check_attack_potential(self, attack_potential):
+        """ For now, we assume three levels of attack-potential: 'C'>'B'>'A' """
+
+        if attack_potential is None:
+            attack_potential = 'C'
+        if attack_potential not in ['A', 'B', 'C']:
+            attack_potential = 'C'
+        
+        return attack_potential
+    
+
+    def bpcer(self, scores, score_threshold=0.0):
+        """ computes BPCER  on test-set scores, using either the supplied score-threshold, 
+        or the threshold computed from the EER of the development set 
+        Inputs:
+        scores: a 1D numpy-array of scores corresponding to genuine (bona-fide) presentations.
+        score_threshold: a floating point number specifying the score-threshold to be used for deciding accept/reject.
+        
+        Return:
+        floating-point number representing the bpcer computed for the input score-set
+        """
+
+        bonafide_scores = None
+        if isinstance(scores, dict):
+            #extract 'real' scores from dictionary
+            for k in scores.keys():
+                key_value = scores[k]
+                if len(key_value) == 1:
+                    bonafide_scores = key_value[0]
+        else:
+            #verify that scores is a 1D numpy array
+            if isinstance(scores, np.ndarray) and len(scores.shape)==1:
+                bonafide_scores = scores
+        
+        assert bonafide_scores is not None, 'input scores does not contain bona-fide scores, for computing BPCER.'
+        assert isinstance(score_threshold, (int, long, float)), 'input score_threshold should be a number (float or integer).'
+        
+        correct_scores = bonafide_scores[bonafide_scores<score_threshold].shape[0]
+        
+        return correct_scores/float(bonafide_scores.shape[0]) 
+    
+
+    def apcer(self, scores_dict, attack_potential='C', score_threshold=0.0):
+        """computes APCER as defined in ISO standard. For now, we assume three levels of attack-potential: 'C'>'B'>'A' 
+        
+        Inputs:
+        scores_dict: a dictionary where each key corresponds to a specific PAI (presentation-attack-instrument)
+        Keys corresponding to PAIs will have as value a list of 2 elements: 
+        1st element: a 1D numpy-array of scores
+        2nd element: a single letter 'A', 'B', or 'C', specifying the attack-potential of the PAI.
+                             
+        attack_potential: a letter 'A', 'B', or 'C', specifying the attack_potential at which the APCER is to be computed
+        score_threshold: a floating point number specifying the score-threshold to be used for deciding accept/reject.
+             
+        Returns:
+        tuple consisting of 2 elements:
+        1st element: apcer at specified attack-potential
+        2nd element: dictionary of hter of individual PAIs that have attack-potential at or below input-parameter attack_potential.
+        """
+        
+        attack_potential = self._check_attack_potential( attack_potential)
+
+        attack_perf_dict = {} #dictionary to store the hter for each attack-type that is at or below specified attack-potential
+        result_list = []
+        for k in scores_dict.keys():
+            if len(scores_dict[k]) == 2: #consider only the keys where the value is a list of 2 elements
+                if scores_dict[k][1] <= attack_potential:
+                    scores =  scores_dict[k][0]
+                    result = (scores[scores>=score_threshold].shape[0])/float(scores.shape[0])
+                    result_list.append(result)
+                    attack_perf_dict[k]=result
+        
+        return (max(result_list), attack_perf_dict)  
+
--- a/bob/pad/base/evaluation/__init__.py
+++ b/bob/pad/base/evaluation/__init__.py
+from .PadIsoMetrics import PadIsoMetrics
+
+# Report the class as belonging to this package instead of the PadIsoMetrics
+# submodule; this fixes sphinx warnings about classes it cannot find when the
+# import path is shortened.
+PadIsoMetrics.__module__ = "bob.pad.base.evaluation"
+# Export every name of this module that does not start with an underscore;
+# this gets sphinx autodoc done right - don't remove it.
+__all__ = [_ for _ in dir() if not _.startswith('_')]
+
--- a/bob/pad/base/test/data/pad_devel_replaymobile_IqmScores_SVM.hdf5
+++ b/bob/pad/base/test/data/pad_devel_replaymobile_IqmScores_SVM.hdf5
--- a/bob/pad/base/test/data/pad_test_replaymobile_IqmScores_SVM.hdf5
+++ b/bob/pad/base/test/data/pad_test_replaymobile_IqmScores_SVM.hdf5
--- a/bob/pad/base/test/test_PadIsoMetrics.py
+++ b/bob/pad/base/test/test_PadIsoMetrics.py
+'''
+Created on 16 Nov 2016
+
+@author: sbhatta
+'''
+
+import sys, os
+import pkg_resources
+import bob.io.base
+import numpy as np
+import bob.measure
+from bob.pad.base.evaluation import PadIsoMetrics
+
+
+#def main(arguments):
+def main(command_line_parameters=None):
+
+    scorefile_devel = pkg_resources.resource_filename('bob.pad.base', 'test/data/pad_devel_replaymobile_IqmScores_SVM.hdf5')
+    scorefile_test  = pkg_resources.resource_filename('bob.pad.base', 'test/data/pad_test_replaymobile_IqmScores_SVM.hdf5')
+
+#     PAI_labels = [('mattescreen-photo', 'A'), ('mattescreen-video', 'A'), ('print-fixed', 'A'), ('print-hand','A') ]
+
+    #rms = PadIsoMetrics.PadIsoMetrics() # PadIsoMetrics(PAI_labels)
+    rms = PadIsoMetrics()
+
+    devel_dict = rms.load_scores_hdf5(scorefile_devel)
+    test_dict  = rms.load_scores_hdf5(scorefile_test)
+    
+    threshEER_dev, dev_far, dev_frr = rms.eer(devel_dict)
+    
+    eer_devel = 50.0*(dev_far + dev_frr)
+    print('threshEER_dev (grandtest): %s'  % threshEER_dev)
+    print('FRR, FAR (devel): %s %s'  % (dev_frr, dev_far))
+    print('EER (%%): %.3f%%'  % eer_devel)
+    
+    test_hter, test_far, test_frr = rms.hter(test_dict, threshEER_dev)
+    print("     * FAR : %.3f%%" % (100*test_far))
+    print("     * FRR : %.3f%%" % (100*test_frr))
+    print("     * HTER: %.3f%%" % (100*test_hter))
+    
+    test_bpcer = 100.0*rms.bpcer(test_dict, threshEER_dev)
+    print('BPCER from dict: %.3f%%'  % test_bpcer )
+    
+    bf_scores = test_dict['real'][0]
+    test_bpcer = 100.0*rms.bpcer(bf_scores, threshEER_dev)
+    print('BPCER from np-array: %.3f%%'  % test_bpcer )
+    
+    attack_apcer, attack_perf_dict =  rms.apcer(test_dict, 'C', threshEER_dev)
+    print('\nAPCER: %.3f%%'  % (100.0*attack_apcer) )
+    print('Performance for individual PAIs:')
+    for k in attack_perf_dict.keys():
+        print('%s: %.3f%%' %(k, 100.0*attack_perf_dict[k]))
+
+
+'''
+'''
+# Run the demonstration when this file is executed as a script. The command-line
+# arguments are handed to main(), which accepts but does not use them.
+if __name__ == '__main__':
+    main(sys.argv[1:])
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -39,6 +39,7 @@ nitpicky = True

 # Ignores stuff we can't easily resolve on other project's sphinx manuals
 nitpick_ignore = []
+keep_warnings = True

 # Allows the user to override warnings from a separate file
 if os.path.exists('nitpick-exceptions.txt'):

--- a/doc/implementation.rst
+++ b/doc/implementation.rst
@@ -86,6 +86,24 @@ Finally, the :py:class:`bob.pad.base.algorithm.Algorithm` class provides default

 * ``score_for_multiple_projections(self, toscore)``: In case your object store several features or scores, **call** this function to compute the average (or min, max, ...) of the scores.

+Evaluation
+~~~~~~~~~~
+This package includes a class :py:class:`bob.pad.base.evaluation.PadIsoMetrics` that can be used to compute the PAD metrics APCER and BPCER as defined in the ISO/IEC 30107-3 standard.
+The most important methods in the class are: ``eer()``,  ``hter()``,  ``apcer()``, and  ``bpcer()``. 
+The main point to note about these methods is that the input-scores should be organized in a dictionary.
+One dictionary should be created for each group ('devel', 'test', etc.). 
+The keys of the dictionary refer to the presentation-type ('bona-fide' or some presentation-attack-instrument (PAI)). 
+The value associated with each key is a tuple, containing either one or two elements. 
+For each key corresponding to a PAI, the value should be a tuple of 2 elements: (scores, attack_potential), where 'scores' is a 1D numpy-array of scores corresponding to presentations of that PAI, and 'attack_potential' is a single letter, either 'A', or 'B', or 'C', signifying the attack-potential of the PAI.
+For bona-fide presentations, no attack-potential is defined. Therefore, for a key representing bona-fide presentations, the value will be a tuple consisting of only one element: a 1D numpy-array of scores.
+Consequently, a key for which the value is a tuple of length 1 is interpreted as representing a bona-fide presentation.
+
+The methods ``eer()`` and ``hter()`` call the corresponding functions in `bob.measure` to compute the relevant thresholds and performance-measures, based on the input score-dictionary. 
+
+The class also provides methods for saving the score-dictionaries in a hdf5-file (``save_scores_hdf5()``), and for loading such a file (``load_scores_hdf5()``).
+
+For an example of how to use this class to evaluate a score-distribution, see the code provided in the file ``bob/pad/base/test/test_PadIsoMetrics.py``.
+

 Implemented Tools
 -----------------

--- a/doc/implemented.rst
+++ b/doc/implemented.rst
@@ -44,6 +44,12 @@ Algorithms

 .. automodule:: bob.pad.base.algorithm

+Evaluation
+~~~~~~~~~~
+
+.. automodule:: bob.pad.base.evaluation
+
+
 Databases
 ---------


--- a/doc/py_api.rst
+++ b/doc/py_api.rst
@@ -35,6 +35,12 @@ Scoring
 .. autosummary::
   bob.bio.base.tools.compute_scores

+Evaluation
+~~~~~~~~~~
+
+.. autosummary::
+   bob.pad.base.evaluation.PadIsoMetrics
+
 Details
 -------