Skip to content
Snippets Groups Projects
Commit e2fa582c authored by Yannick DAYER
Browse files

Merge branch 'fix/empty-column' into 'master'

Scores loading fixes

Closes #194 and #196

See merge request !328
parents 9a790353 f996874d
No related branches found
No related tags found
1 merge request: !328 "Scores loading fixes"
Pipeline #89061 passed with stages
in 12 minutes and 18 seconds
......@@ -2,14 +2,14 @@
# vim: set fileencoding=utf-8 :
# Mon 23 May 2011 16:23:05 CEST
"""A set of utilities to load score files with different formats.
"""
"""A set of utilities to load score files with different formats."""
import csv
import logging
import os
import tarfile
from collections import defaultdict
from pathlib import Path
import dask.dataframe
......@@ -94,8 +94,8 @@ def four_column(filename):
str: The claimed identity -- the client name of the model that was used in
the comparison
str: The real identity -- the client name of the probe that was used in the
comparison
str: The real identity -- the client name of the probe that was used in
the comparison
str: A label of the probe -- usually the probe file name, or the probe id
......@@ -153,15 +153,19 @@ def get_split_dataframe(filename):
-------
dataframe: negatives, contains the list of scores (and metadata) for which
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
columns are different. (see
:ref:`bob.bio.base.pipeline_simple_advanced_features`)
dataframe: positives, contains the list of scores (and metadata) for which
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
columns are identical. (see
:ref:`bob.bio.base.pipeline_simple_advanced_features`)
"""
df = dask.dataframe.read_csv(filename)
df = dask.dataframe.read_csv(
filename, dtype=defaultdict(lambda: str, {"score": float})
)
genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
impostors = df[df.probe_subject_id != df.bio_ref_subject_id]
......@@ -184,15 +188,19 @@ def split_csv_scores(filename, score_column: str = "score"):
-------
array: negatives, 1D float array containing the list of scores, for which
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
different. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
columns are different. (see
:ref:`bob.bio.base.pipeline_simple_advanced_features`)
array: positives, 1D float array containing the list of scores, for which
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id`` columns are
identical. (see :ref:`bob.bio.base.pipeline_simple_advanced_features`)
the fields of the ``bio_ref_subject_id`` and ``probe_subject_id``
columns are identical. (see
:ref:`bob.bio.base.pipeline_simple_advanced_features`)
"""
df = dask.dataframe.read_csv(filename)
df = dask.dataframe.read_csv(
filename, dtype=defaultdict(lambda: str, {"score": float})
)
genuines = df[df.probe_subject_id == df.bio_ref_subject_id]
impostors = df[df.probe_subject_id != df.bio_ref_subject_id]
......@@ -262,8 +270,8 @@ def five_column(filename):
str: A label for the model -- usually the model file name, or the model id
str: The real identity -- the client name of the probe that was used in the
comparison
str: The real identity -- the client name of the probe that was used in
the comparison
str: A label of the probe -- usually the probe file name, or the probe id
......@@ -346,7 +354,8 @@ def scores(filename, ncolumns=None):
Parameters:
filename: :py:class:`str`, ``file-like``:
The file object that will be opened with :py:func:`open_file` containing the scores.
The file object that will be opened with :py:func:`open_file` containing
the scores.
ncolumns: any
ignored
......@@ -461,8 +470,8 @@ def load_score(filename, ncolumns=None, minimal=False, **kwargs):
specifying the number of columns in the score file. If None is provided,
the number of columns will be guessed.
minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, ``real_id``,
and ``scores``.
minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``,
``real_id``, and ``scores``.
**kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`
......@@ -624,9 +633,10 @@ def _estimate_score_file_format(filename, ncolumns=None):
def _iterate_score_file(filename, csv_score_column: str = "score"):
"""Opens the score file for reading and yields the score file line by line in a tuple/list.
"""Opens the score file and yields the score file lines in a tuple/list.
The last element of the line (which is the score) will be transformed to float, the other elements will be str
The last element of the line (which is the score) will be transformed to
float, the other elements will be str.
"""
if iscsv(filename):
for row in _iterate_csv_score_file(
......@@ -635,7 +645,7 @@ def _iterate_score_file(filename, csv_score_column: str = "score"):
yield [
row["bio_ref_subject_id"],
row["probe_subject_id"],
row["probe_key"],
row["probe_template_id"],
row[csv_score_column],
]
else:
......@@ -667,7 +677,9 @@ def _iterate_csv_score_file(filename, score_column: str = "score"):
def _split_scores(
score_lines, real_id_index, claimed_id_index=0, score_index=-1
):
"""Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives."""
"""Take the output of :py:func:`four_column` or :py:func:`five_column` and
return negatives and positives.
"""
positives, negatives = [], []
for line in score_lines:
which = (
......@@ -687,7 +699,9 @@ def _split_cmc_scores(
claimed_id_index=0,
score_index=-1,
):
"""Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores."""
"""Takes the output of :py:func:`four_column` or :py:func:`five_column` and
return cmc scores.
"""
if probe_name_index is None:
probe_name_index = real_id_index + 1
# extract positives and negatives
......@@ -712,12 +726,16 @@ def _split_cmc_scores(
# get all scores in the desired format
return [
(
numpy.array(neg_dict[probe_name], numpy.float64)
if probe_name in neg_dict
else None,
numpy.array(pos_dict[probe_name], numpy.float64)
if probe_name in pos_dict
else None,
(
numpy.array(neg_dict[probe_name], numpy.float64)
if probe_name in neg_dict
else None
),
(
numpy.array(pos_dict[probe_name], numpy.float64)
if probe_name in pos_dict
else None
),
)
for probe_name in probe_names
]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment