From 263bd1818e2938c5aca608f8e557019497d30aea Mon Sep 17 00:00:00 2001
From: Tiago Freitas Pereira <tiagofrepereira@gmail.com>
Date: Wed, 9 Mar 2016 15:00:09 +0100
Subject: [PATCH] Tackled issue 6

---
 bob/measure/__init__.py   | 74 ++++++++++++++++++++++++++---------
 bob/measure/load.py       | 81 +++++++++++++++++++++++++++++----------
 bob/measure/test_error.py | 22 +++++++++++
 3 files changed, 138 insertions(+), 39 deletions(-)

diff --git a/bob/measure/__init__.py b/bob/measure/__init__.py
index 03d95c9..ff55169 100644
--- a/bob/measure/__init__.py
+++ b/bob/measure/__init__.py
@@ -78,9 +78,10 @@ def relevance (input, machine):
 
   return retval
 
-def recognition_rate(cmc_scores):
-  """recognition_rate(cmc_scores) -> RR
 
+def recognition_rate(cmc_scores, threshold=None):
+  """recognition_rate(cmc_scores) -> RR 
+  
   Calculates the recognition rate from the given input, which is identical
   to the rank 1 (C)MC value.
 
@@ -90,16 +91,23 @@ def recognition_rate(cmc_scores):
   the :py:func:`bob.measure.load.cmc_four_column` or
   :py:func:`bob.measure.load.cmc_five_column` function.
 
-  The recognition rate is defined as the number of test items, for which the
+  If **threshold** is set to `None`, the recognition rate is defined as the number of test items, for which the
   positive score is greater than or equal to all negative scores, divided by
-  the number of all test items.  If several positive scores for one test item
-  exist, the **highest** score is taken.
+  the number of all test items. If several positive scores for one test item exist, the **highest** score is taken.
 
-  **Parameters:**
+  If **threshold** is set to a value, the recognition rate is defined as the number of test items for which the
+  highest positive score is greater than both the threshold and all negative scores, divided by
+  the number of all test items. If several positive scores for one test item exist, the **highest** score is taken.
+  
+  If a particular test item has only negative scores, it is counted as correctly classified if all of its negative scores are lower than
+  the **threshold**. For such test items, setting the **threshold** is mandatory.
 
-  ``cmc_scores`` : [(array_like(1D, float), array_like(1D, float))]
-    A list of tuples, where each tuple contains the ``negative`` and ``positive`` scores for one probe of the database
+  **Parameters:**
+  
+    ``cmc_scores`` : CMC scores loaded with one of the functions (:py:func:`bob.measure.load.cmc_four_column` or
+  :py:func:`bob.measure.load.cmc_five_column`)
 
+    ``threshold`` : Decision threshold. If `None`, the decision threshold will be the **highest** positive score.
   **Returns:**
 
   ``RR`` : float
@@ -111,16 +119,38 @@ def recognition_rate(cmc_scores):
 
   correct = 0.
   for neg, pos in cmc_scores:
-    # get the maximum positive score for the current probe item
-    # (usually, there is only one positive score, but just in case...)
-    max_pos = numpy.max(pos)
-    # check if the positive score is smaller than all negative scores
-    if (neg < max_pos).all():
-      correct += 1.
+
+    #If threshold is none, let's use the highest positive score as the decision threshold
+    if(threshold is None):
+      # get the maximum positive score for the current probe item
+      # (usually, there is only one positive score, but just in case...)
+      max_pos = numpy.max(pos)
+      # check if all negative scores are smaller than the (highest) positive score
+      if (neg < max_pos).all():
+        correct += 1.
+        
+    else:
+
+      #If threshold is NOT None, we have an openset identification
+      if(len(pos)>0):
+        # if we have positive scores the comparison is considered correct
+        # if the positive score is higher than the threshold AND all negative scores
+        max_pos = numpy.max(pos)
+
+        if((threshold < max_pos) and (neg < max_pos).all()):
+            correct += 1.
+
+      else:
+        #If we don't have a positive score we only will consider 
+        #a correct classification if ALL the negative scores are smaller than the threshold
+        if (neg < threshold).all():
+          correct += 1.
 
   # return relative number of correctly matched scores
   return correct / float(len(cmc_scores))
 
+
+
 def cmc(cmc_scores):
   """cmc(cmc_scores) -> curve
 
@@ -143,6 +173,8 @@ def cmc(cmc_scores):
   ``cmc_scores`` : [(array_like(1D, float), array_like(1D, float))]
     A list of tuples, where each tuple contains the ``negative`` and ``positive`` scores for one probe of the database
 
+  Every tuple must contain at least one positive score; otherwise, a ``ValueError`` is raised.
+
   **Returns:**
 
   ``curve`` : array_like(2D, float)
@@ -156,14 +188,19 @@ def cmc(cmc_scores):
 
   # compute MC
   match_characteristic = numpy.zeros((max([len(neg) for (neg,pos) in cmc_scores])+1,), numpy.int)
+
   for neg, pos in cmc_scores:
+    if((type(pos)!=float) and (len(pos) == 0)):
+      raise ValueError("For the CMC computation at least one positive score is necessary. Please review who you are loading the scores. You must set `load_only_negatives=False` in the :py:func:`bob.measure.load.cmc_four_column` or `:py:func:`bob.measure.load.cmc_five_column` methods.")
+
     # get the maximum positive score for the current probe item
-    # (usually, there is only one positive score, but just in case...)
+    # (usually, there is only one positive score, but just in case...)    
     max_pos = numpy.max(pos)
-    # count the number of negative scores that are higher than the best positive score
-    index = numpy.sum(neg >= max_pos)
-    match_characteristic[index] += 1
 
+    # count the number of negative scores that are higher than the best positive score            
+    index = numpy.sum(neg >= max_pos)
+    match_characteristic[index] += 1  
+    
   # cumulate
   cumulative_match_characteristic = numpy.ndarray(match_characteristic.shape, numpy.float64)
   count = 0.
@@ -174,6 +211,7 @@ def cmc(cmc_scores):
   return cumulative_match_characteristic
 
 
+
 def get_config():
   """Returns a string containing the configuration information.
   """
diff --git a/bob/measure/load.py b/bob/measure/load.py
index 75a0591..9039db9 100644
--- a/bob/measure/load.py
+++ b/bob/measure/load.py
@@ -128,8 +128,10 @@ def split_four_column(filename):
 
   return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))
 
-def cmc_four_column(filename):
-  """cmc_four_column(filename) -> cmc_scores
+def cmc_four_column(filename, load_only_negatives=False):
+  """
+  cmc_four_column(filename) -> cmc_scores
+  
 
   Loads scores to compute CMC curves from a file in four column format.
   The four column file needs to be in the same format as described in :py:func:`four_column`,
@@ -140,20 +142,26 @@ def cmc_four_column(filename):
   Usually, the list of positive scores should contain only one element, but more are allowed.
   The result of this function can directly be passed to, e.g., the :py:func:`bob.measure.cmc` function.
 
+  
   **Parameters:**
 
   ``filename`` : str or file-like
     The file that will be opened with :py:func:`open_file` containing the scores.
 
+  ``load_only_negatives`` : boolean
+    Set this argument to **True** to also load the probes that have **only** negative scores (used for open-set recognition).
+
+
   **Returns:**
 
   ``cmc_scores`` : [(array_like(1D, float), array_like(1D, float))]
     A list of tuples, where each tuple contains the ``negative`` and ``positive`` scores for one probe of the database
+
   """
   # extract positives and negatives
   pos_dict = {}
   neg_dict = {}
-  # read four column list
+  # read four column list  
   for (client_id, probe_id, probe_name, score_str) in four_column(filename):
     try:
       score = float(score_str)
@@ -174,17 +182,28 @@ def cmc_four_column(filename):
   retval = []
   import logging
   logger = logging.getLogger('bob')
-  for probe_name in sorted(pos_dict.keys()):
-    if probe_name in neg_dict:
+  if(not load_only_negatives):
+    for probe_name in sorted(pos_dict.keys()):
+      if probe_name in neg_dict:
+        retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
+      else:
+        logger.warn('For probe name "%s" there are only positive scores. This probe name is ignored.' % probe_name)
+
+    #test if there are probes for which only negatives exist
+    for probe_name in sorted(neg_dict.keys()):
+      if not probe_name in pos_dict.keys():
+        logger.warn('For probe name "%s" there are only negative scores. This probe name is ignored.' % probe_name)
+
+  else:
+    for probe_name in sorted(pos_dict.keys()):
       retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
-    else:
-      logger.warn('For probe name "%s" there are only positive scores. This probe name is ignored.' % probe_name)
-  # test if there are probes for which only negatives exist
-  for probe_name in sorted(neg_dict.keys()):
-    if not probe_name in pos_dict.keys():
-       logger.warn('For probe name "%s" there are only negative scores. This probe name is ignored.' % probe_name)
+
+    for probe_name in sorted(neg_dict.keys()):
+      if not probe_name in pos_dict.keys():
+        retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array([], numpy.float64)))
 
   return retval
+  
 
 def five_column(filename):
   """five_column(filename) -> claimed_id, model_label, real_id, test_label, score
@@ -231,6 +250,7 @@ def five_column(filename):
       raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
     yield (field[0], field[1], field[2], field[3], score)
 
+
 def split_five_column(filename):
   """split_five_column(filename) -> negatives, positives
 
@@ -267,9 +287,11 @@ def split_five_column(filename):
 
   return (numpy.array(neg, numpy.float64), numpy.array(pos, numpy.float64))
 
-def cmc_five_column(filename):
-  """cmc_four_column(filename) -> cmc_scores
 
+def cmc_five_column(filename, load_only_negatives=False):
+  """
+  cmc_five_column(filename) -> cmc_scores
+  
   Loads scores to compute CMC curves from a file in five column format.
   The four column file needs to be in the same format as described in :py:func:`five_column`,
   and the ``test_label`` (column 4) has to contain the test/probe file name or a probe id.
@@ -284,10 +306,15 @@ def cmc_five_column(filename):
   ``filename`` : str or file-like
     The file that will be opened with :py:func:`open_file` containing the scores.
 
+  ``load_only_negatives`` : boolean
+    Set this argument to **True** to also load the probes that have **only** negative scores (used for open-set recognition).
+
+
   **Returns:**
 
   ``cmc_scores`` : [(array_like(1D, float), array_like(1D, float))]
     A list of tuples, where each tuple contains the ``negative`` and ``positive`` scores for one probe of the database
+
   """
   # extract positives and negatives
   pos_dict = {}
@@ -309,13 +336,25 @@ def cmc_five_column(filename):
   retval = []
   import logging
   logger = logging.getLogger('bob')
-  for probe_name in sorted(pos_dict.keys()):
-    if probe_name in neg_dict:
+  if(not load_only_negatives):
+
+    for probe_name in sorted(pos_dict.keys()):
+      if probe_name in neg_dict:
+        retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
+      else:
+        logger.warn('For probe name "%s" there are only positive scores. This probe name is ignored.' % probe_name)
+    # test if there are probes for which only negatives exist
+    for probe_name in sorted(neg_dict.keys()):
+      if not probe_name in pos_dict.keys():
+         logger.warn('For probe name "%s" there are only negative scores. This probe name is ignored.' % probe_name)
+  else:
+  
+    for probe_name in sorted(pos_dict.keys()):
       retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array(pos_dict[probe_name], numpy.float64)))
-    else:
-      logger.warn('For probe name "%s" there are only positive scores. This probe name is ignored.' % probe_name)
-  # test if there are probes for which only negatives exist
-  for probe_name in sorted(neg_dict.keys()):
-    if not probe_name in pos_dict.keys():
-       logger.warn('For probe name "%s" there are only negative scores. This probe name is ignored.' % probe_name)
+
+    for probe_name in sorted(neg_dict.keys()):
+      if not probe_name in pos_dict.keys():
+        retval.append((numpy.array(neg_dict[probe_name], numpy.float64), numpy.array([], numpy.float64)))
+  
+
   return retval
diff --git a/bob/measure/test_error.py b/bob/measure/test_error.py
index 3806918..45d026c 100644
--- a/bob/measure/test_error.py
+++ b/bob/measure/test_error.py
@@ -321,3 +321,25 @@ def test_calibration():
   assert min_cllr <= cllr
   assert cllr, 3.61833457
   assert min_cllr, 0.337364136
+  
+  
+
+def test_open_set_recognition_rate():
+  """Checks open-set recognition rates; values are compared explicitly, as ``assert a, b`` only tests the truthiness of ``a``."""
+  # No error files
+  scores = bob.measure.load.cmc_four_column(F("scores-cmc-4col-open-set.txt"), load_only_negatives=True)
+  assert abs(bob.measure.recognition_rate(scores, threshold=0.5) - 1.0) < 1e-8
+  assert abs(bob.measure.recognition_rate(scores, threshold=10.) - 0.222222222222) < 1e-8
+
+  # One error
+  scores = bob.measure.load.cmc_four_column(F("scores-cmc-4col-open-set-one-error.txt"),
+                                            load_only_negatives=True)
+  assert abs(bob.measure.recognition_rate(scores, threshold=0.5) - 0.888888888889) < 1e-8
+  assert abs(bob.measure.recognition_rate(scores, threshold=10.) - 0.222222222222) < 1e-8
+
+  # Two errors
+  scores = bob.measure.load.cmc_four_column(F("scores-cmc-4col-open-set-two-errors.txt"),
+                                            load_only_negatives=True)
+  assert abs(bob.measure.recognition_rate(scores, threshold=0.5) - 0.777777777778) < 1e-8
+  assert abs(bob.measure.recognition_rate(scores, threshold=10.) - 0.111111111111) < 1e-8
+
-- 
GitLab