Merge branch 'fix/empty-column' into 'master'

Scores loading fixes. Closes #194 and #196. See merge request !328.

Merge branch 'fix/empty-column' into 'master'
e2fa582c · Yannick DAYER · 9a790353 · f996874d · e2fa582c
Commit e2fa582c authored 11 months ago by Yannick DAYER
--- a/src/bob/bio/base/score/load.py
+++ b/src/bob/bio/base/score/load.py
@@ -2,14 +2,14 @@
 # vim: set fileencoding=utf-8 :
 # Mon 23 May 2011 16:23:05 CEST
-"""A set of utilities to load score files with different formats.
+"""A set of utilities to load score files with different formats."""
-"""
 import csv
 import logging
 import os
 import tarfile
+from collections import defaultdict
 from pathlib import Path
 import dask.dataframe
@@ -94,8 +94,8 @@ def four_column(filename):
      str: The claimed identity -- the client name of the model that was used in
      the comparison
-      str: The real identity -- the client name of the probe that was used in the
+      str: The real identity -- the client name of the probe that was used in
-      comparison
+      the comparison
      str: A label of the probe -- usually the probe file name, or the probe id
@@ -153,15 +153,19 @@ def get_split_dataframe(filename):
    -------
      dataframe: negatives, contains the list of scores (and metadata) for which
-        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
+        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
-        different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
+        columns are different. (see
+        :ref:`bob.bio.base.pipeline_simple_advanced_features`)
      dataframe: positives, contains the list of scores (and metadata) for which
-        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
+        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
-        identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
+        columns are identical. (see
+        :ref:`bob.bio.base.pipeline_simple_advanced_features`)
    """
-    df = dask.dataframe.read_csv(filename)
+    df = dask.dataframe.read_csv(
+        filename, dtype=defaultdict(lambda: str, {"score": float})
+    )
    genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
    impostors = df[df.probe_subject_id != df.bio_ref_subject_id]
@@ -184,15 +188,19 @@ def split_csv_scores(filename, score_column: str = "score"):
    -------
      array: negatives, 1D float array containing the list of scores, for which
-        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
+        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
-        different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
+        columns are different. (see
+        :ref:`bob.bio.base.pipeline_simple_advanced_features`)
      array: positives, 1D float array containing the list of scores, for which
-        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
+        the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
-        identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
+        columns are identical. (see
+        :ref:`bob.bio.base.pipeline_simple_advanced_features`)
    """
-    df = dask.dataframe.read_csv(filename)
+    df = dask.dataframe.read_csv(
+        filename, dtype=defaultdict(lambda: str, {"score": float})
+    )
    genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
    impostors = df[df.probe_subject_id != df.bio_ref_subject_id]
@@ -262,8 +270,8 @@ def five_column(filename):
      str: A label for the model -- usually the model file name, or the model id
-      str: The real identity -- the client name of the probe that was used in the
+      str: The real identity -- the client name of the probe that was used in
-      comparison
+      the comparison
      str: A label of the probe -- usually the probe file name, or the probe id
@@ -346,7 +354,8 @@ def scores(filename, ncolumns=None):
    Parameters:
    filename:  :py:class:`str`, ``file-like``:
-      The file object that will be opened with :py:func:`open_file` containing the scores.
+      The file object that will be opened with :py:func:`open_file` containing
+      the scores.
    ncolumns: any
      ignored
@@ -461,8 +470,8 @@ def load_score(filename, ncolumns=None, minimal=False, **kwargs):
        specifying the number of columns in the score file. If None is provided,
        the number of columns will be guessed.
-      minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, ``real_id``,
+      minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``,
-        and ``scores``.
+        ``real_id``, and ``scores``.
      **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`
@@ -624,9 +633,10 @@ def _estimate_score_file_format(filename, ncolumns=None):
 def _iterate_score_file(filename, csv_score_column: str = "score"):
-    """Opens the score file for reading and yields the score file line by line in a tuple/list.
+    """Opens the score file and yields the score file lines in a tuple/list.
-    The last element of the line (which is the score) will be transformed to float, the other elements will be str
+    The last element of the line (which is the score) will be transformed to
+    float, the other elements will be str.
    """
    if iscsv(filename):
        for row in _iterate_csv_score_file(
@@ -635,7 +645,7 @@ def _iterate_score_file(filename, csv_score_column: str = "score"):
            yield [
                row["bio_ref_subject_id"],
                row["probe_subject_id"],
-                row["probe_key"],
+                row["probe_template_id"],
                row[csv_score_column],
            ]
    else:
@@ -667,7 +677,9 @@ def _iterate_csv_score_file(filename, score_column: str = "score"):
 def _split_scores(
    score_lines, real_id_index, claimed_id_index=0, score_index=-1
 ):
-    """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives."""
+    """Take the output of :py:func:`four_column` or :py:func:`five_column` and
+    return negatives and positives.
+    """
    positives, negatives = [], []
    for line in score_lines:
        which = (
@@ -687,7 +699,9 @@ def _split_cmc_scores(
    claimed_id_index=0,
    score_index=-1,
 ):
-    """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores."""
+    """Takes the output of :py:func:`four_column` or :py:func:`five_column` and
+    returns cmc scores.
+    """
    if probe_name_index is None:
        probe_name_index = real_id_index + 1
    # extract positives and negatives
@@ -712,12 +726,16 @@ def _split_cmc_scores(
    # get all scores in the desired format
    return [
        (
-            numpy.array(neg_dict[probe_name], numpy.float64)
+            (
-            if probe_name in neg_dict
+                numpy.array(neg_dict[probe_name], numpy.float64)
-            else None,
+                if probe_name in neg_dict
-            numpy.array(pos_dict[probe_name], numpy.float64)
+                else None
-            if probe_name in pos_dict
+            ),
-            else None,
+            (
+                numpy.array(pos_dict[probe_name], numpy.float64)
+                if probe_name in pos_dict
+                else None
+            ),
        )
        for probe_name in probe_names
    ]