Commit 8d99cfaf authored by Amir Mohammadi

Load score more efficiently for negatives and positives

parent c3c1ecbc
...@@ -264,8 +264,7 @@ def cmc_five_column(filename): ...@@ -264,8 +264,7 @@ def cmc_five_column(filename):
return _split_cmc_scores(score_lines, 2) return _split_cmc_scores(score_lines, 2)
def load_score(filename, ncolumns=None, minimal=False, **kwargs):
def load_score(filename, ncolumns=None):
"""Load scores using numpy.loadtxt and return the data as a numpy array. """Load scores using numpy.loadtxt and return the data as a numpy array.
Parameters: Parameters:
...@@ -277,6 +276,11 @@ def load_score(filename, ncolumns=None): ...@@ -277,6 +276,11 @@ def load_score(filename, ncolumns=None):
specifying the number of columns in the score file. If None is provided, specifying the number of columns in the score file. If None is provided,
the number of columns will be guessed. the number of columns will be guessed.
minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, ``real_id``,
and ``score``.
**kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt`
Returns: Returns:
...@@ -300,6 +304,7 @@ def load_score(filename, ncolumns=None): ...@@ -300,6 +304,7 @@ def load_score(filename, ncolumns=None):
finally: finally:
f.close() f.close()
usecols = kwargs.pop('usecols', None)
if ncolumns == 4: if ncolumns == 4:
names = ('claimed_id', 'real_id', 'test_label', 'score') names = ('claimed_id', 'real_id', 'test_label', 'score')
converters = { converters = {
...@@ -307,6 +312,8 @@ def load_score(filename, ncolumns=None): ...@@ -307,6 +312,8 @@ def load_score(filename, ncolumns=None):
1: convertfunc, 1: convertfunc,
2: convertfunc, 2: convertfunc,
3: float} 3: float}
if minimal:
usecols = (0, 1, 3)
elif ncolumns == 5: elif ncolumns == 5:
names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score') names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
...@@ -316,12 +323,14 @@ def load_score(filename, ncolumns=None): ...@@ -316,12 +323,14 @@ def load_score(filename, ncolumns=None):
2: convertfunc, 2: convertfunc,
3: convertfunc, 3: convertfunc,
4: float} 4: float}
if minimal:
usecols = (0, 2, 4)
else: else:
raise ValueError("ncolumns of 4 and 5 are supported only.") raise ValueError("ncolumns of 4 and 5 are supported only.")
score_lines = numpy.genfromtxt( score_lines = numpy.genfromtxt(
open_file(filename, mode='rb'), dtype=None, names=names, open_file(filename, mode='rb'), dtype=None, names=names,
converters=converters, invalid_raise=True) converters=converters, invalid_raise=True, usecols=usecols, **kwargs)
new_dtype = [] new_dtype = []
for name in score_lines.dtype.names[:-1]: for name in score_lines.dtype.names[:-1]:
new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U'))) new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
...@@ -342,6 +351,13 @@ def get_negatives_positives(score_lines): ...@@ -342,6 +351,13 @@ def get_negatives_positives(score_lines):
return (negatives, positives) return (negatives, positives)
def get_negatives_positives_from_file(filename, **kwargs):
    """Load a score file in minimal mode and split it into negatives and
    positives.

    The file is read with :py:func:`load_score` using ``minimal=True``, so only
    the ``claimed_id``, ``real_id`` and ``score`` columns are loaded. The
    result is handed directly to :py:func:`get_negatives_positives`.

    Parameters:

      filename: The score file to read; passed unchanged to
        :py:func:`load_score`.

      **kwargs: Extra keyword arguments forwarded to :py:func:`load_score`.

    Returns:

      Whatever :py:func:`get_negatives_positives` returns for the loaded
      scores, i.e. the ``(negatives, positives)`` tuple.
    """
    return get_negatives_positives(
        load_score(filename, minimal=True, **kwargs))
def get_negatives_positives_all(score_lines_list): def get_negatives_positives_all(score_lines_list):
"""Take a list of outputs of load_score and return stacked negatives and """Take a list of outputs of load_score and return stacked negatives and
positives. positives.
......
...@@ -60,6 +60,12 @@ def test_load_score(): ...@@ -60,6 +60,12 @@ def test_load_score():
for name in normal_scores.dtype.names: for name in normal_scores.dtype.names:
assert all(normal_scores[name] == compressed_scores[name]) assert all(normal_scores[name] == compressed_scores[name])
# test minimal loading
minimal_scores = bob.measure.load.load_score(normal_score_file, minimal=True)
assert len(minimal_scores) == 910
assert len(minimal_scores.dtype) == 3
assert minimal_scores.dtype.names == ('claimed_id', 'real_id', 'score')
def test_dump_score(): def test_dump_score():
# This function tests the IO functionality of dumping score files # This function tests the IO functionality of dumping score files
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment