diff --git a/src/bob/bio/base/score/load.py b/src/bob/bio/base/score/load.py index 47043f85dfebb77a2eb940fdd33e4ebb7abe650c..8daf48abc6414d47f311299f721e897bd194df90 100644 --- a/src/bob/bio/base/score/load.py +++ b/src/bob/bio/base/score/load.py @@ -2,14 +2,14 @@ # vim: set fileencoding=utf-8 : # Mon 23 May 2011 16:23:05 CEST -"""A set of utilities to load score files with different formats. -""" +"""A set of utilities to load score files with different formats.""" import csv import logging import os import tarfile +from collections import defaultdict from pathlib import Path import dask.dataframe @@ -94,8 +94,8 @@ def four_column(filename): str: The claimed identity -- the client name of the model that was used in the comparison - str: The real identity -- the client name of the probe that was used in the - comparison + str: The real identity -- the client name of the probe that was used in + the comparison str: A label of the probe -- usually the probe file name, or the probe id @@ -153,15 +153,19 @@ def get_split_dataframe(filename): ------- dataframe: negatives, contains the list of scores (and metadata) for which - the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are - different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`) + the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` + columns are different. (see + :ref:`bob.bio.base.pipeline_simple_advanced_features`) dataframe: positives, contains the list of scores (and metadata) for which - the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are - identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`) + the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` + columns are identical. (see + :ref:`bob.bio.base.pipeline_simple_advanced_features`) """ - df = dask.dataframe.read_csv(filename) + df = dask.dataframe.read_csv( + filename, dtype=defaultdict(lambda: str, {"score": float}) + ) genuines = df[df.probe_subject_id == df.bio_ref_subject_id] impostors = df[df.probe_subject_id != df.bio_ref_subject_id] @@ -184,15 +188,19 @@ def split_csv_scores(filename, score_column: str = "score"): ------- array: negatives, 1D float array containing the list of scores, for which - the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are - different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`) + the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` + columns are different. (see + :ref:`bob.bio.base.pipeline_simple_advanced_features`) array: positives, 1D float array containing the list of scores, for which - the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are - identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`) + the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` + columns are identical. (see + :ref:`bob.bio.base.pipeline_simple_advanced_features`) """ - df = dask.dataframe.read_csv(filename) + df = dask.dataframe.read_csv( + filename, dtype=defaultdict(lambda: str, {"score": float}) + ) genuines = df[df.probe_subject_id == df.bio_ref_subject_id] impostors = df[df.probe_subject_id != df.bio_ref_subject_id] @@ -262,8 +270,8 @@ def five_column(filename): str: A label for the model -- usually the model file name, or the model id - str: The real identity -- the client name of the probe that was used in the - comparison + str: The real identity -- the client name of the probe that was used in + the comparison str: A label of the probe -- usually the probe file name, or the probe id @@ -346,7 +354,8 @@ def scores(filename, ncolumns=None): Parameters: filename: :py:class:`str`, ``file-like``: - The file object that will be opened with :py:func:`open_file` containing the scores. + The file object that will be opened with :py:func:`open_file` containing + the scores. ncolumns: any ignored @@ -461,8 +470,8 @@ def load_score(filename, ncolumns=None, minimal=False, **kwargs): specifying the number of columns in the score file. If None is provided, the number of columns will be guessed. - minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, ``real_id``, - and ``scores``. + minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, + ``real_id``, and ``scores``. **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt` @@ -624,9 +633,10 @@ def _estimate_score_file_format(filename, ncolumns=None): def _iterate_score_file(filename, csv_score_column: str = "score"): - """Opens the score file for reading and yields the score file line by line in a tuple/list. + """Opens the score file and yields the score file lines in a tuple/list. - The last element of the line (which is the score) will be transformed to float, the other elements will be str + The last element of the line (which is the score) will be transformed to + float, the other elements will be str. """ if iscsv(filename): for row in _iterate_csv_score_file( @@ -635,7 +645,7 @@ def _iterate_score_file(filename, csv_score_column: str = "score"): yield [ row["bio_ref_subject_id"], row["probe_subject_id"], - row["probe_key"], + row["probe_template_id"], row[csv_score_column], ] else: @@ -667,7 +677,9 @@ def _iterate_csv_score_file(filename, score_column: str = "score"): def _split_scores( score_lines, real_id_index, claimed_id_index=0, score_index=-1 ): - """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives.""" + """Take the output of :py:func:`four_column` or :py:func:`five_column` and + return negatives and positives. + """ positives, negatives = [], [] for line in score_lines: which = ( @@ -687,7 +699,9 @@ def _split_cmc_scores( claimed_id_index=0, score_index=-1, ): - """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores.""" + """Takes the output of :py:func:`four_column` or :py:func:`five_column` and + return cmc scores. + """ if probe_name_index is None: probe_name_index = real_id_index + 1 # extract positives and negatives @@ -712,12 +726,16 @@ def _split_cmc_scores( # get all scores in the desired format return [ ( - numpy.array(neg_dict[probe_name], numpy.float64) - if probe_name in neg_dict - else None, - numpy.array(pos_dict[probe_name], numpy.float64) - if probe_name in pos_dict - else None, + ( + numpy.array(neg_dict[probe_name], numpy.float64) + if probe_name in neg_dict + else None + ), + ( + numpy.array(pos_dict[probe_name], numpy.float64) + if probe_name in pos_dict + else None + ), ) for probe_name in probe_names ]