Commit 271b5897 authored by Amir MOHAMMADI

Merge branch '19-load_scores-extremely-memory-hungry' into 'master'

Resolve "load_scores extremely memory hungry"

Closes #19 

For now I have left the old score-reading functions in place, but it might be a good idea to remove them and base everything on iterators. What do you think?

See merge request !20
parents ba52bf43 fb8f306c
Pipeline #5206 passed with stages in 10 minutes and 46 seconds
...@@ -6,8 +6,10 @@ ...@@ -6,8 +6,10 @@
""" """
import numpy import numpy
import csv
import tarfile import tarfile
import os import os
import sys
import logging import logging
logger = logging.getLogger('bob.measure') logger = logging.getLogger('bob.measure')
...@@ -78,7 +80,7 @@ def four_column(filename): ...@@ -78,7 +80,7 @@ def four_column(filename):
opened with :py:func:`open_file` containing the scores. opened with :py:func:`open_file` containing the scores.
Returns: Yields:
str: The claimed identity -- the client name of the model that was used in str: The claimed identity -- the client name of the model that was used in
the comparison the comparison
...@@ -91,19 +93,8 @@ def four_column(filename): ...@@ -91,19 +93,8 @@ def four_column(filename):
float: The result of the comparison of the model and the probe float: The result of the comparison of the model and the probe
""" """
return _iterate_score_file(filename)
for i, l in enumerate(open_file(filename)):
if isinstance(l, bytes): l = l.decode('utf-8')
s = l.strip()
if len(s) == 0 or s[0] == '#': continue #empty or comment
field = [k.strip() for k in s.split()]
if len(field) < 4:
raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
try:
score = float(field[3])
except:
raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
yield (field[0], field[1], field[2], score)
def split_four_column(filename): def split_four_column(filename):
...@@ -135,8 +126,8 @@ def split_four_column(filename): ...@@ -135,8 +126,8 @@ def split_four_column(filename):
""" """
score_lines = load_score(filename, 4) score_lines = four_column(filename)
return get_negatives_positives(score_lines) return _split_scores(score_lines, 1)
def cmc_four_column(filename): def cmc_four_column(filename):
...@@ -168,22 +159,8 @@ def cmc_four_column(filename): ...@@ -168,22 +159,8 @@ def cmc_four_column(filename):
""" """
# extract positives and negatives score_lines = four_column(filename)
pos_dict = {} return _split_cmc_scores(score_lines, 1)
neg_dict = {}
# read four column list
for (client_id, probe_id, probe_name, score) in four_column(filename):
# check in which dict we have to put the score
correct_dict = pos_dict if client_id == probe_id else neg_dict
# append score
if probe_name in correct_dict:
correct_dict[probe_name].append(score)
else:
correct_dict[probe_name] = [score]
# convert that into the desired format
return _convert_cmc_scores(neg_dict, pos_dict)
def five_column(filename): def five_column(filename):
...@@ -205,7 +182,7 @@ def five_column(filename): ...@@ -205,7 +182,7 @@ def five_column(filename):
opened with :py:func:`open_file` containing the scores. opened with :py:func:`open_file` containing the scores.
Returns: Yields:
str: The claimed identity -- the client name of the model that was used in str: The claimed identity -- the client name of the model that was used in
the comparison the comparison
...@@ -221,18 +198,7 @@ def five_column(filename): ...@@ -221,18 +198,7 @@ def five_column(filename):
""" """
for i, l in enumerate(open_file(filename)): return _iterate_score_file(filename)
if isinstance(l, bytes): l = l.decode('utf-8')
s = l.strip()
if len(s) == 0 or s[0] == '#': continue #empty or comment
field = [k.strip() for k in s.split()]
if len(field) < 5:
raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l))
try:
score = float(field[4])
except:
raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l))
yield (field[0], field[1], field[2], field[3], score)
def split_five_column(filename): def split_five_column(filename):
...@@ -264,8 +230,8 @@ def split_five_column(filename): ...@@ -264,8 +230,8 @@ def split_five_column(filename):
""" """
score_lines = load_score(filename, 5) score_lines = four_column(filename)
return get_negatives_positives(score_lines) return _split_scores(score_lines, 2)
def cmc_five_column(filename): def cmc_five_column(filename):
...@@ -294,22 +260,9 @@ def cmc_five_column(filename): ...@@ -294,22 +260,9 @@ def cmc_five_column(filename):
``positive`` scores for one probe of the database. ``positive`` scores for one probe of the database.
""" """
# extract positives and negatives score_lines = four_column(filename)
pos_dict = {} return _split_cmc_scores(score_lines, 2)
neg_dict = {}
# read four column list
for (client_id, _, probe_id, probe_name, score) in five_column(filename):
# check in which dict we have to put the score
correct_dict = pos_dict if client_id == probe_id else neg_dict
# append score
if probe_name in correct_dict:
correct_dict[probe_name].append(score)
else:
correct_dict[probe_name] = [score]
# convert that into the desired format
return _convert_cmc_scores(neg_dict, pos_dict)
def load_score(filename, ncolumns=None): def load_score(filename, ncolumns=None):
...@@ -425,11 +378,48 @@ def dump_score(filename, score_lines): ...@@ -425,11 +378,48 @@ def dump_score(filename, score_lines):
numpy.savetxt(filename, score_lines, fmt=fmt) numpy.savetxt(filename, score_lines, fmt=fmt)
def _convert_cmc_scores(neg_dict, pos_dict): def _iterate_score_file(filename):
"""Converts the negative and positive scores read with """Opens the score file for reading and yields the score file line by line in a tuple/list.
:py:func:`cmc_four_column` or :py:func:`cmc_four_column` into a format that
is handled by the :py:func:`bob.measure.cmc` and similar functions. The last element of the line (which is the score) will be transformed to float, the other elements will be str
"""
opened = open_file(filename, 'rb')
if sys.version_info.major > 2:
import io
opened = io.TextIOWrapper(opened, newline="")
reader = csv.reader(opened, delimiter=' ')
for splits in reader:
splits[-1] = float(splits[-1])
yield splits
def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1):
"""Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives.
""" """
positives, negatives = [], []
for line in score_lines:
which = positives if line[claimed_id_index] == line[real_id_index] else negatives
which.append(line[score_index])
return (numpy.array(negatives), numpy.array(positives))
def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claimed_id_index = 0, score_index = -1):
"""Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores.
"""
if probe_name_index is None:
probe_name_index = real_id_index + 1
# extract positives and negatives
pos_dict = {}
neg_dict = {}
# read four column list
for line in score_lines:
which = pos_dict if line[claimed_id_index] == line[real_id_index] else neg_dict
probe_name = line[probe_name_index]
# append score
if probe_name not in which:
which[probe_name] = []
which[probe_name].append(line[score_index])
# convert to lists of tuples of ndarrays (or None) # convert to lists of tuples of ndarrays (or None)
probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys()))) probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys())))
......
...@@ -216,10 +216,10 @@ def test_plots(): ...@@ -216,10 +216,10 @@ def test_plots():
# EPC curve, you need to have a development and a test set. We will split, # EPC curve, you need to have a development and a test set. We will split,
# by the middle, the negatives and positives sample we have, just for the # by the middle, the negatives and positives sample we have, just for the
# sake of testing # sake of testing
dev_negatives = negatives[:(negatives.shape[0]/2)] dev_negatives = negatives[:(negatives.shape[0]//2)]
test_negatives = negatives[(negatives.shape[0]/2):] test_negatives = negatives[(negatives.shape[0]//2):]
dev_positives = positives[:(positives.shape[0]/2)] dev_positives = positives[:(positives.shape[0]//2)]
test_positives = positives[(positives.shape[0]/2):] test_positives = positives[(positives.shape[0]//2):]
xy = epc(dev_negatives, dev_positives, xy = epc(dev_negatives, dev_positives,
test_negatives, test_positives, 100) test_negatives, test_positives, 100)
# uncomment the next line to save a reference value # uncomment the next line to save a reference value
......
...@@ -20,7 +20,7 @@ def test_load_scores(): ...@@ -20,7 +20,7 @@ def test_load_scores():
load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column} load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column}
cols = {'4col' : 4, '5col' : 5} cols = {'4col' : 4, '5col' : 5}
for variant in ('4col', '5col'): for variant in cols:
# read score file in normal way # read score file in normal way
normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure')
normal_scores = list(load_functions[variant](normal_score_file)) normal_scores = list(load_functions[variant](normal_score_file))
...@@ -43,7 +43,7 @@ def test_load_score(): ...@@ -43,7 +43,7 @@ def test_load_score():
scores = [] scores = []
cols = {'4col' : 4, '5col' : 5} cols = {'4col' : 4, '5col' : 5}
for variant in ('4col', '5col'): for variant in cols:
# read score file in normal way # read score file in normal way
normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure')
normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant]) normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant])
...@@ -67,7 +67,7 @@ def test_dump_score(): ...@@ -67,7 +67,7 @@ def test_dump_score():
scores = [] scores = []
cols = {'4col' : 4, '5col' : 5} cols = {'4col' : 4, '5col' : 5}
for variant in ('4col', '5col'): for variant in cols:
# read score file # read score file
normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure')
normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant]) normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant])
...@@ -177,7 +177,7 @@ def test_from_openbr(): ...@@ -177,7 +177,7 @@ def test_from_openbr():
load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column} load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column}
try: try:
for variant in ('4col', '5col'): for variant in load_functions:
# first, do not define keyword arguments -- let the file get the model and probe ids being created automatically # first, do not define keyword arguments -- let the file get the model and probe ids being created automatically
bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, score_file_format="%sumn"%variant) bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, score_file_format="%sumn"%variant)
assert os.path.exists(score_file) assert os.path.exists(score_file)
...@@ -211,10 +211,10 @@ def test_from_openbr(): ...@@ -211,10 +211,10 @@ def test_from_openbr():
assert columns[i][j] == reference[i][j], str(columns[i]) + " != " + str(reference[i]) assert columns[i][j] == reference[i][j], str(columns[i]) + " != " + str(reference[i])
# check that the score is close (OpenBR write scores in float32 precision only) # check that the score is close (OpenBR write scores in float32 precision only)
assert abs(columns[i][-1] - numpy.float32(reference[i][-1])) <= 1e-8, str(columns[i][-1]) + " != " + str(reference[i][-1]) assert abs(columns[i][-1] - numpy.float32(reference[i][-1])) <= 1e-8, str(columns[i][-1]) + " != " + str(reference[i][-1])
#assert numpy.isclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) #assert numpy.isclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1])
assert numpy.allclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) assert numpy.allclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1])
finally: finally:
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment