From 5a688f7df52b501c1d660a8981efe82d31cba6de Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Fri, 21 Oct 2016 17:56:09 -0600 Subject: [PATCH 1/8] Re-added functions to load score files with generators; and use them in split_{four,five}_column --- bob/measure/load.py | 68 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index 3cbc430..52e32bd 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -6,6 +6,7 @@ """ import numpy +import csv import tarfile import os @@ -135,8 +136,8 @@ def split_four_column(filename): """ - score_lines = load_score(filename, 4) - return get_negatives_positives(score_lines) + score_lines = load_score_with_generator(filename, 4) + return get_negatives_positives_from_generator(score_lines) def cmc_four_column(filename): @@ -264,8 +265,8 @@ def split_five_column(filename): """ - score_lines = load_score(filename, 5) - return get_negatives_positives(score_lines) + score_lines = load_score_with_generator(filename, 5) + return get_negatives_positives_from_generator(score_lines) def cmc_five_column(filename): @@ -312,6 +313,65 @@ def cmc_five_column(filename): return _convert_cmc_scores(neg_dict, pos_dict) +COLUMNS = { + 4 : ('claimed_id', 'real_id', 'test_label', 'score'), + 5 : ('claimed_id', 'model_label', 'real_id', 'test_label', 'score') +} + +def load_score_with_generator(filename, ncolumns=None): + """Load scores using :py:class:`csv.reader` and yield the scores line by line in a dictionary. + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + ncolumns (:py:class:`int`, optional): 4, 5 or None (the default), + specifying the number of columns in the score file. If None is provided, + the number of columns will be guessed. + + + Yields: + + line: A dictionary which contains not only the actual ``score`` but also the + ``claimed_id``, ``real_id``, ``test_label`` (and ``['model_label']``) + """ + + if ncolumns is None: + f = open_file(filename) + try: + line = f.readline() + ncolumns = len(line.split()) + except Exception: + logger.warn('Could not guess the number of columns in file: {}. ' + 'Assuming 4 column format.'.format(filename)) + ncolumns = 4 + finally: + f.close() + elif ncolumns not in (4,5): + raise ValueError("ncolumns of 4 and 5 are supported only.") + + names = COLUMNS[ncolumns] + r = csv.reader(open_file(filename, mode='rb'), delimiter=' ') + for n, splits in enumerate(r): + assert len(splits) == ncolumns, "The line %d: %s of file %s is not compatible" % (n, " ".join(splits), filename) + splits[-1] = float(splits[-1]) + yield {names[i] : splits[i] for i in range(ncolumns)} + + +def get_negatives_positives_from_generator(score_lines): + """Take the output of :py:func:`load_score_with_generator` and return negatives and positives. This + function aims to replace split_four_column and split_five_column but takes a + different input. It's up to you to use which one. + """ + positives, negatives = [], [] + for line in score_lines: + which = positives if line['claimed_id'] == line['real_id'] else negatives + which.append(line['score']) + + return (numpy.array(negatives), numpy.array(positives)) + + def load_score(filename, ncolumns=None): """Load scores using numpy.loadtxt and return the data as a numpy array. -- GitLab From 7ed0fdca13efc3ce4ca132b23ec47856920561e2 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Fri, 21 Oct 2016 18:18:47 -0600 Subject: [PATCH 2/8] Implemented load_score_with_generator function using csv.DictReader --- bob/measure/load.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index 52e32bd..71dc0a8 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -319,7 +319,7 @@ COLUMNS = { } def load_score_with_generator(filename, ncolumns=None): - """Load scores using :py:class:`csv.reader` and yield the scores line by line in a dictionary. + """Load scores using :py:class:`csv.DictReader` and yield the scores line by line in a dictionary. Parameters: @@ -351,13 +351,10 @@ def load_score_with_generator(filename, ncolumns=None): elif ncolumns not in (4,5): raise ValueError("ncolumns of 4 and 5 are supported only.") - names = COLUMNS[ncolumns] - r = csv.reader(open_file(filename, mode='rb'), delimiter=' ') - for n, splits in enumerate(r): - assert len(splits) == ncolumns, "The line %d: %s of file %s is not compatible" % (n, " ".join(splits), filename) - splits[-1] = float(splits[-1]) - yield {names[i] : splits[i] for i in range(ncolumns)} - + reader = csv.DictReader(open_file(filename, mode='rb'), fieldnames=COLUMNS[ncolumns], delimiter=' ') + for splits in reader: + splits['score'] = float(splits['score']) + yield splits def get_negatives_positives_from_generator(score_lines): """Take the output of :py:func:`load_score_with_generator` and return negatives and positives. This -- GitLab From 0c19f066b39f971d223e349662816370dc3a4e82 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Thu, 27 Oct 2016 15:18:07 -0600 Subject: [PATCH 3/8] Rewrote split_four/five_column to use four/five_column; reimplemented four/five_column --- bob/measure/load.py | 111 ++++++++++---------------------------------- 1 file changed, 25 insertions(+), 86 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index 71dc0a8..0ef498c 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -79,7 +79,7 @@ def four_column(filename): opened with :py:func:`open_file` containing the scores. - Returns: + Yields: str: The claimed identity -- the client name of the model that was used in the comparison @@ -93,18 +93,10 @@ def four_column(filename): """ - for i, l in enumerate(open_file(filename)): - if isinstance(l, bytes): l = l.decode('utf-8') - s = l.strip() - if len(s) == 0 or s[0] == '#': continue #empty or comment - field = [k.strip() for k in s.split()] - if len(field) < 4: - raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l)) - try: - score = float(field[3]) - except: - raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l)) - yield (field[0], field[1], field[2], score) + reader = csv.reader(open_file(filename, mode='rb'), delimiter=' ') + for splits in reader: + splits[-1] = float(splits[-1]) + yield splits def split_four_column(filename): @@ -136,8 +128,8 @@ def split_four_column(filename): """ - score_lines = load_score_with_generator(filename, 4) - return get_negatives_positives_from_generator(score_lines) + score_lines = four_column(filename) + return _split_scores(score_lines, 1) def cmc_four_column(filename): @@ -206,7 +198,7 @@ def five_column(filename): opened with :py:func:`open_file` containing the scores. - Returns: + Yields: str: The claimed identity -- the client name of the model that was used in the comparison @@ -222,18 +214,10 @@ def five_column(filename): """ - for i, l in enumerate(open_file(filename)): - if isinstance(l, bytes): l = l.decode('utf-8') - s = l.strip() - if len(s) == 0 or s[0] == '#': continue #empty or comment - field = [k.strip() for k in s.split()] - if len(field) < 5: - raise SyntaxError('Line %d of file "%s" is invalid: %s' % (i, filename, l)) - try: - score = float(field[4]) - except: - raise SyntaxError('Cannot convert score to float at line %d of file "%s": %s' % (i, filename, l)) - yield (field[0], field[1], field[2], field[3], score) + reader = csv.reader(open_file(filename, mode='rb'), delimiter=' ') + for splits in reader: + splits[-1] = float(splits[-1]) + yield splits def split_five_column(filename): @@ -265,8 +249,8 @@ def split_five_column(filename): """ - score_lines = load_score_with_generator(filename, 5) - return get_negatives_positives_from_generator(score_lines) + score_lines = four_column(filename) + return _split_scores(score_lines, 2) def cmc_five_column(filename): @@ -313,62 +297,6 @@ def cmc_five_column(filename): return _convert_cmc_scores(neg_dict, pos_dict) -COLUMNS = { - 4 : ('claimed_id', 'real_id', 'test_label', 'score'), - 5 : ('claimed_id', 'model_label', 'real_id', 'test_label', 'score') -} - -def load_score_with_generator(filename, ncolumns=None): - """Load scores using :py:class:`csv.DictReader` and yield the scores line by line in a dictionary. - - Parameters: - - filename (:py:class:`str`, ``file-like``): The file object that will be - opened with :py:func:`open_file` containing the scores. - - ncolumns (:py:class:`int`, optional): 4, 5 or None (the default), - specifying the number of columns in the score file. If None is provided, - the number of columns will be guessed. - - - Yields: - - line: A dictionary which contains not only the actual ``score`` but also the - ``claimed_id``, ``real_id``, ``test_label`` (and ``['model_label']``) - """ - - if ncolumns is None: - f = open_file(filename) - try: - line = f.readline() - ncolumns = len(line.split()) - except Exception: - logger.warn('Could not guess the number of columns in file: {}. ' - 'Assuming 4 column format.'.format(filename)) - ncolumns = 4 - finally: - f.close() - elif ncolumns not in (4,5): - raise ValueError("ncolumns of 4 and 5 are supported only.") - - reader = csv.DictReader(open_file(filename, mode='rb'), fieldnames=COLUMNS[ncolumns], delimiter=' ') - for splits in reader: - splits['score'] = float(splits['score']) - yield splits - -def get_negatives_positives_from_generator(score_lines): - """Take the output of :py:func:`load_score_with_generator` and return negatives and positives. This - function aims to replace split_four_column and split_five_column but takes a - different input. It's up to you to use which one. - """ - positives, negatives = [], [] - for line in score_lines: - which = positives if line['claimed_id'] == line['real_id'] else negatives - which.append(line['score']) - - return (numpy.array(negatives), numpy.array(positives)) - - def load_score(filename, ncolumns=None): """Load scores using numpy.loadtxt and return the data as a numpy array. @@ -482,6 +410,17 @@ def dump_score(filename, score_lines): numpy.savetxt(filename, score_lines, fmt=fmt) +def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1): + """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives. + """ + positives, negatives = [], [] + for line in score_lines: + which = positives if line[claimed_id_index] == line[real_id_index] else negatives + which.append(line[score_index]) + + return (numpy.array(negatives), numpy.array(positives)) + + def _convert_cmc_scores(neg_dict, pos_dict): """Converts the negative and positive scores read with :py:func:`cmc_four_column` or :py:func:`cmc_four_column` into a format that -- GitLab From 0ee21702737c8bd6a7962e19190ede311ec0c303 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Thu, 27 Oct 2016 17:48:19 -0600 Subject: [PATCH 4/8] Solved Python3 issue with csv reading from bytes --- bob/measure/load.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index 0ef498c..d599656 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -9,6 +9,7 @@ import numpy import csv import tarfile import os +import sys import logging logger = logging.getLogger('bob.measure') @@ -93,7 +94,12 @@ def four_column(filename): """ - reader = csv.reader(open_file(filename, mode='rb'), delimiter=' ') + opened = open_file(filename, 'rb') + if sys.version_info.major > 2: + import io + opened = io.TextIOWrapper(opened, newline="") + + reader = csv.reader(opened, delimiter=' ') for splits in reader: splits[-1] = float(splits[-1]) yield splits @@ -214,7 +220,12 @@ def five_column(filename): """ - reader = csv.reader(open_file(filename, mode='rb'), delimiter=' ') + opened = open_file(filename, 'rb') + if sys.version_info.major > 2: + import io + opened = io.TextIOWrapper(opened, newline="") + + reader = csv.reader(opened, delimiter=' ') for splits in reader: splits[-1] = float(splits[-1]) yield splits -- GitLab From 59dc5aba6885e385ea9fc76d98f930e16f229e69 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Thu, 27 Oct 2016 17:48:46 -0600 Subject: [PATCH 5/8] Fixed python3 warning --- bob/measure/test_error.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bob/measure/test_error.py b/bob/measure/test_error.py index f369ddb..fd00788 100644 --- a/bob/measure/test_error.py +++ b/bob/measure/test_error.py @@ -216,10 +216,10 @@ def test_plots(): # EPC curve, you need to have a development and a test set. We will split, # by the middle, the negatives and positives sample we have, just for the # sake of testing - dev_negatives = negatives[:(negatives.shape[0]/2)] - test_negatives = negatives[(negatives.shape[0]/2):] - dev_positives = positives[:(positives.shape[0]/2)] - test_positives = positives[(positives.shape[0]/2):] + dev_negatives = negatives[:(negatives.shape[0]//2)] + test_negatives = negatives[(negatives.shape[0]//2):] + dev_positives = positives[:(positives.shape[0]//2)] + test_positives = positives[(positives.shape[0]//2):] xy = epc(dev_negatives, dev_positives, test_negatives, test_positives, 100) # uncomment the next line to save a reference value -- GitLab From 504a69fc1c20f1a0dbae01155fe93eaea99e2c0f Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Thu, 27 Oct 2016 17:50:02 -0600 Subject: [PATCH 6/8] Removed extra keys in tests, and use iteration over dictionary instead --- bob/measure/test_io.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bob/measure/test_io.py b/bob/measure/test_io.py index 12a50b5..0e6b829 100644 --- a/bob/measure/test_io.py +++ b/bob/measure/test_io.py @@ -20,7 +20,7 @@ def test_load_scores(): load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column} cols = {'4col' : 4, '5col' : 5} - for variant in ('4col', '5col'): + for variant in cols: # read score file in normal way normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_scores = list(load_functions[variant](normal_score_file)) @@ -43,7 +43,7 @@ def test_load_score(): scores = [] cols = {'4col' : 4, '5col' : 5} - for variant in ('4col', '5col'): + for variant in cols: # read score file in normal way normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant]) @@ -67,7 +67,7 @@ def test_dump_score(): scores = [] cols = {'4col' : 4, '5col' : 5} - for variant in ('4col', '5col'): + for variant in cols: # read score file normal_score_file = bob.io.base.test_utils.datafile('dev-%s.txt' % variant, 'bob.measure') normal_scores = bob.measure.load.load_score(normal_score_file, cols[variant]) @@ -177,7 +177,7 @@ def test_from_openbr(): load_functions = {'4col' : bob.measure.load.four_column, '5col' : bob.measure.load.five_column} try: - for variant in ('4col', '5col'): + for variant in load_functions: # first, do not define keyword arguments -- let the file get the model and probe ids being created automatically bob.measure.openbr.write_score_file(matrix_file, mask_file, score_file, score_file_format="%sumn"%variant) assert os.path.exists(score_file) @@ -211,10 +211,10 @@ def test_from_openbr(): assert columns[i][j] == reference[i][j], str(columns[i]) + " != " + str(reference[i]) # check that the score is close (OpenBR write scores in float32 precision only) assert abs(columns[i][-1] - numpy.float32(reference[i][-1])) <= 1e-8, str(columns[i][-1]) + " != " + str(reference[i][-1]) - + #assert numpy.isclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) assert numpy.allclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) - + finally: shutil.rmtree(temp_dir) -- GitLab From 18ecfb37dac4d7203e6a39e2d154caaf131bb385 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Fri, 28 Oct 2016 10:22:48 -0600 Subject: [PATCH 7/8] Moved loading of scores into separate function --- bob/measure/load.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index d599656..55d2a48 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -93,16 +93,8 @@ def four_column(filename): float: The result of the comparison of the model and the probe """ + return _iterate_score_file(filename) - opened = open_file(filename, 'rb') - if sys.version_info.major > 2: - import io - opened = io.TextIOWrapper(opened, newline="") - - reader = csv.reader(opened, delimiter=' ') - for splits in reader: - splits[-1] = float(splits[-1]) - yield splits def split_four_column(filename): @@ -220,15 +212,7 @@ def five_column(filename): """ - opened = open_file(filename, 'rb') - if sys.version_info.major > 2: - import io - opened = io.TextIOWrapper(opened, newline="") - - reader = csv.reader(opened, delimiter=' ') - for splits in reader: - splits[-1] = float(splits[-1]) - yield splits + return _iterate_score_file(filename) def split_five_column(filename): @@ -421,6 +405,22 @@ def dump_score(filename, score_lines): numpy.savetxt(filename, score_lines, fmt=fmt) +def _iterate_score_file(filename): + """Opens the score file for reading and yields the score file line by line in a tuple/list. + + The last element of the line (which is the score) will be transformed to float, the other elements will be str + """ + opened = open_file(filename, 'rb') + if sys.version_info.major > 2: + import io + opened = io.TextIOWrapper(opened, newline="") + + reader = csv.reader(opened, delimiter=' ') + for splits in reader: + splits[-1] = float(splits[-1]) + yield splits + + def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1): """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives. """ -- GitLab From fb8f306c8acffa9f2b503abe95979ffb6d956b63 Mon Sep 17 00:00:00 2001 From: Manuel Gunther Date: Fri, 28 Oct 2016 11:40:17 -0600 Subject: [PATCH 8/8] Also provide generic function for cmc scores --- bob/measure/load.py | 55 ++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/bob/measure/load.py b/bob/measure/load.py index 55d2a48..07a6329 100644 --- a/bob/measure/load.py +++ b/bob/measure/load.py @@ -159,22 +159,8 @@ def cmc_four_column(filename): """ - # extract positives and negatives - pos_dict = {} - neg_dict = {} - # read four column list - for (client_id, probe_id, probe_name, score) in four_column(filename): - # check in which dict we have to put the score - correct_dict = pos_dict if client_id == probe_id else neg_dict - - # append score - if probe_name in correct_dict: - correct_dict[probe_name].append(score) - else: - correct_dict[probe_name] = [score] - - # convert that into the desired format - return _convert_cmc_scores(neg_dict, pos_dict) + score_lines = four_column(filename) + return _split_cmc_scores(score_lines, 1) def five_column(filename): @@ -274,22 +260,9 @@ def cmc_five_column(filename): ``positive`` scores for one probe of the database. """ - # extract positives and negatives - pos_dict = {} - neg_dict = {} - # read four column list - for (client_id, _, probe_id, probe_name, score) in five_column(filename): - # check in which dict we have to put the score - correct_dict = pos_dict if client_id == probe_id else neg_dict - - # append score - if probe_name in correct_dict: - correct_dict[probe_name].append(score) - else: - correct_dict[probe_name] = [score] + score_lines = four_column(filename) + return _split_cmc_scores(score_lines, 2) - # convert that into the desired format - return _convert_cmc_scores(neg_dict, pos_dict) def load_score(filename, ncolumns=None): @@ -431,12 +404,22 @@ def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index return (numpy.array(negatives), numpy.array(positives)) - -def _convert_cmc_scores(neg_dict, pos_dict): - """Converts the negative and positive scores read with - :py:func:`cmc_four_column` or :py:func:`cmc_four_column` into a format that - is handled by the :py:func:`bob.measure.cmc` and similar functions. +def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claimed_id_index = 0, score_index = -1): + """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores. """ + if probe_name_index is None: + probe_name_index = real_id_index + 1 + # extract positives and negatives + pos_dict = {} + neg_dict = {} + # read four column list + for line in score_lines: + which = pos_dict if line[claimed_id_index] == line[real_id_index] else neg_dict + probe_name = line[probe_name_index] + # append score + if probe_name not in which: + which[probe_name] = [] + which[probe_name].append(line[score_index]) # convert to lists of tuples of ndarrays (or None) probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys()))) -- GitLab