diff --git a/bob/bio/base/score/__init__.py b/bob/bio/base/score/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0843f864c78ec3c1bcc87b509641eaab6a2a011d --- /dev/null +++ b/bob/bio/base/score/__init__.py @@ -0,0 +1,6 @@ +from .load import (open_file, split, cmc, split_four_column, four_column, + split_five_column, five_column, scores, load_score, + dump_score + ) +from .openbr import (write_matrix, write_score_file) + diff --git a/bob/bio/base/score/load.py b/bob/bio/base/score/load.py new file mode 100644 index 0000000000000000000000000000000000000000..b7de295fa0f42dd33d1bafc7bf03efd636060e3e --- /dev/null +++ b/bob/bio/base/score/load.py @@ -0,0 +1,547 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : +# Mon 23 May 2011 16:23:05 CEST + +"""A set of utilities to load score files with different formats. +""" + +import numpy +import csv +import tarfile +import os +import sys + +import logging +logger = logging.getLogger('bob.bio.base') + + +def open_file(filename, mode='rt'): + """Opens the given score file for reading. + + Score files might be raw text files, or a tar-file including a single score + file inside. + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The name of the score file to + open, or a file-like object open for reading. If a file name is given, + the according file might be a raw text file or a (compressed) tar file + containing a raw text file. + + + Returns: + + + ``file-like``: A read-only file-like object as it would be returned by + :py:func:`open`. + + """ + + if not isinstance(filename, str) and hasattr(filename, 'read'): + # It seems that this is an open file + return filename + + if not os.path.isfile(filename): + raise IOError("Score file '%s' does not exist." % filename) + if not tarfile.is_tarfile(filename): + return open(filename, mode) + + # open the tar file for reading + tar = tarfile.open(filename, 'r') + # get the first file in the tar file + tar_info = tar.next() + while tar_info is not None and not tar_info.isfile(): + tar_info = tar.next() + # check that one file was found in the archive + if tar_info is None: + raise IOError( + "The given file is a .tar file, but it does not contain any file.") + + # open the file for reading + return tar.extractfile(tar_info) + + +def four_column(filename): + """Loads a score set from a single file and yield its lines + + Loads a score set from a single file and yield its lines (to avoid loading + the score file at once into memory). This function verifies that all fields + are correctly placed and contain valid fields. The score file must contain + the following information in each line: + + .. code-block:: text + + claimed_id real_id test_label score + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Yields: + + str: The claimed identity -- the client name of the model that was used in + the comparison + + str: The real identity -- the client name of the probe that was used in the + comparison + + str: A label of the probe -- usually the probe file name, or the probe id + + float: The result of the comparison of the model and the probe + + """ + return _iterate_score_file(filename) + + +def split_four_column(filename): + """Loads a score set from a single file and splits the scores + + Loads a score set from a single file and splits the scores between negatives + and positives. 
The score file has to respect the 4 column format as defined + in the method :py:func:`four_column`. + + This method avoids loading and allocating memory for the strings present in + the file. We only keep the scores. + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Returns: + + array: negatives, 1D float array containing the list of scores, for which + the ``claimed_id`` and the ``real_id`` are different + (see :py:func:`four_column`) + + array: positives, 1D float array containing the list of scores, for which + the ``claimed_id`` and the ``real_id`` are identical + (see :py:func:`four_column`) + + """ + + score_lines = four_column(filename) + return _split_scores(score_lines, 1) + + +def cmc_four_column(filename): + """Loads scores to compute CMC curves from a file in four column format. + + The four column file needs to be in the same format as described in + :py:func:`four_column`, and the ``test_label`` (column 3) has to contain the + test/probe file name or a probe id. + + This function returns a list of tuples. For each probe file, the tuple + consists of a list of negative scores and a list of positive scores. + Usually, the list of positive scores should contain only one element, but + more are allowed. The result of this function can directly be passed to, + e.g., the :py:func:`bob.measure.cmc` function. + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Returns: + + :py:class:`list`: A list of tuples, where each tuple contains the + ``negative`` and ``positive`` scores for one probe of the database. Both + ``negatives`` and ``positives`` can be either an 1D + :py:class:`numpy.ndarray` of type ``float``, or ``None``. + + """ + + score_lines = four_column(filename) + return _split_cmc_scores(score_lines, 1) + + +def five_column(filename): + """Loads a score set from a single file and yield its lines + + Loads a score set from a single file and yield its lines (to avoid loading + the score file at once into memory). This function verifies that all fields + are correctly placed and contain valid fields. The score file must contain + the following information in each line: + + .. code-block:: text + + claimed_id model_label real_id test_label score + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Yields: + + str: The claimed identity -- the client name of the model that was used in + the comparison + + str: A label for the model -- usually the model file name, or the model id + + str: The real identity -- the client name of the probe that was used in the + comparison + + str: A label of the probe -- usually the probe file name, or the probe id + + float: The result of the comparison of the model and the probe + + """ + + return _iterate_score_file(filename) + + +def split_five_column(filename): + """Loads a score set from a single file and splits the scores + + Loads a score set from a single file in five column format and splits the + scores between negatives and positives. The score file has to respect the 5 + column format as defined in the method :py:func:`five_column`. + + This method avoids loading and allocating memory for the strings present in + the file. We only keep the scores. 
+ + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Returns: + + array: negatives, 1D float array containing the list of scores, for which + the ``claimed_id`` and the ``real_id`` are different + (see :py:func:`five_column`) + + array: positives, 1D float array containing the list of scores, for which + the ``claimed_id`` and the ``real_id`` are identical + (see :py:func:`five_column`) + + """ + + score_lines = five_column(filename) + return _split_scores(score_lines, 2) + + +def cmc_five_column(filename): + """Loads scores to compute CMC curves from a file in five column format. + + The five column file needs to be in the same format as described in + :py:func:`five_column`, and the ``test_label`` (column 4) has to contain the + test/probe file name or a probe id. + + This function returns a list of tuples. For each probe file, the tuple + consists of a list of negative scores and a list of positive scores. + Usually, the list of positive scores should contain only one element, but + more are allowed. The result of this function can directly be passed to, + e.g., the :py:func:`bob.measure.cmc` function. + + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + + Returns: + + :py:class:`list`: A list of tuples, where each tuple contains the + ``negative`` and ``positive`` scores for one probe of the database. + + """ + score_lines = five_column(filename) + return _split_cmc_scores(score_lines, 2) + + +def scores(filename, ncolumns=None): + """scores(filename, ncolumns=None) -> tuple + + Loads the scores from the given score file and yields its lines. + Depending on the score file format, four or five elements are yielded; see :py:func:`bob.bio.base.score.four_column` and :py:func:`bob.bio.base.score.five_column` for details. + + Parameters: + + filename: :py:class:`str`, ``file-like``: + The file object that will be opened with :py:func:`open_file` containing the scores. + + ncolumns: any + ignored + + Yields: + + tuple: + see :py:func:`bob.bio.base.score.four_column` or :py:func:`bob.bio.base.score.five_column` + """ + return _iterate_score_file(filename) + + +def split(filename, ncolumns=None): + """split(filename, ncolumns=None) -> negatives, positives + + Loads the scores from the given score file and splits them into positives and negatives. + + Depending on the score file format, it calls :py:func:`bob.bio.base.score.split_four_column` or :py:func:`bob.bio.base.score.split_five_column`; see these functions for details. + + Parameters: + + filename: :py:class:`str`, ``file-like``: + The file object that will be opened with :py:func:`open_file` containing the scores. + + ncolumns: int or ``None`` + If specified to be ``4`` or ``5``, the score file will be assumed to be in the given format. 
+ If not specified, the score file format will be estimated automatically + + Returns: + + negatives: 1D :py:class:`numpy.ndarray` of type float + This array contains the list of scores, for which the ``claimed_id`` and the ``real_id`` are different (see :py:func:`four_column`) + + positives: 1D :py:class:`numpy.ndarray` of type float + This array contains the list of scores, for which the ``claimed_id`` and the ``real_id`` are identical (see :py:func:`four_column`) + + """ + ncolumns = _estimate_score_file_format(filename, ncolumns) + if ncolumns == 4: + return split_four_column(filename) + else: + assert ncolumns == 5 + return split_five_column(filename) + + +def cmc(filename, ncolumns=None): + """cmc(filename, ncolumns=None) -> list + + Loads scores to compute CMC curves. + + Depending on the score file format, it calls see :py:func:`bob.bio.base.score.cmc_four_column` and `:py:func:`bob.bio.base.score.cmc_five_column` for details. + + Parameters: + + filename (:py:class:`str` or ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + ncolumns: (:py:class:`int`, Optional): If specified to be ``4`` or ``5``, + the score file will be assumed to be in the given format. If not + specified, the score file format will be estimated automatically + + Returns: + + :py:class:`list`: [(neg,pos)] A list of tuples, where each tuple contains the + ``negative`` and ``positive`` scores for one probe of the database. + + """ + ncolumns = _estimate_score_file_format(filename, ncolumns) + if ncolumns == 4: + return cmc_four_column(filename) + else: + assert ncolumns == 5 + return cmc_five_column(filename) + + +def load_score(filename, ncolumns=None, minimal=False, **kwargs): + """Load scores using numpy.loadtxt and return the data as a numpy array. + + Parameters: + + filename (:py:class:`str`, ``file-like``): The file object that will be + opened with :py:func:`open_file` containing the scores. + + ncolumns (:py:class:`int`, optional): 4, 5 or None (the default), + specifying the number of columns in the score file. If None is provided, + the number of columns will be guessed. + + minimal (:py:class:`bool`, optional): If True, only loads ``claimed_id``, ``real_id``, + and ``scores``. 
+ + **kwargs: Keyword arguments passed to :py:func:`numpy.genfromtxt` + + + Returns: + + array: An array which contains not only the actual scores but also the + ``claimed_id``, ``real_id``, ``test_label`` and ``['model_label']`` + + """ + + def convertfunc(x): + return x + + ncolumns = _estimate_score_file_format(filename, ncolumns) + + usecols = kwargs.pop('usecols', None) + if ncolumns == 4: + names = ('claimed_id', 'real_id', 'test_label', 'score') + converters = { + 0: convertfunc, + 1: convertfunc, + 2: convertfunc, + 3: float} + if minimal: + usecols = (0, 1, 3) + + elif ncolumns == 5: + names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score') + converters = { + 0: convertfunc, + 1: convertfunc, + 2: convertfunc, + 3: convertfunc, + 4: float} + if minimal: + usecols = (0, 2, 4) + else: + raise ValueError("ncolumns of 4 and 5 are supported only.") + + score_lines = numpy.genfromtxt( + open_file(filename, mode='rb'), dtype=None, names=names, + converters=converters, invalid_raise=True, usecols=usecols, **kwargs) + new_dtype = [] + for name in score_lines.dtype.names[:-1]: + new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U'))) + new_dtype.append(('score', float)) + score_lines = numpy.array(score_lines, new_dtype) + return score_lines + + +def get_negatives_positives(score_lines): + """Take the output of load_score and return negatives and positives. This + function aims to replace split_four_column and split_five_column but takes a + different input. It's up to you to use which one. + """ + + pos_mask = score_lines['claimed_id'] == score_lines['real_id'] + positives = score_lines['score'][pos_mask] + negatives = score_lines['score'][numpy.logical_not(pos_mask)] + return (negatives, positives) + + +def get_negatives_positives_from_file(filename, **kwargs): + """Loads the scores first efficiently and then calls + get_negatives_positives""" + score_lines = load_score(filename, minimal=True, **kwargs) + return get_negatives_positives(score_lines) + + +def get_negatives_positives_all(score_lines_list): + """Take a list of outputs of load_score and return stacked negatives and + positives. + """ + + negatives, positives = [], [] + for score_lines in score_lines_list: + neg_pos = get_negatives_positives(score_lines) + negatives.append(neg_pos[0]) + positives.append(neg_pos[1]) + negatives = numpy.vstack(negatives).T + positives = numpy.vstack(positives).T + return (negatives, positives) + + +def get_all_scores(score_lines_list): + """Take a list of outputs of load_score and return stacked scores""" + + return numpy.vstack([score_lines['score'] + for score_lines in score_lines_list]).T + + +def dump_score(filename, score_lines): + """Dump scores that were loaded using :py:func:`load_score` + The number of columns is automatically detected. + """ + + if len(score_lines.dtype) == 5: + fmt = '%s %s %s %s %.9f' + elif len(score_lines.dtype) == 4: + fmt = '%s %s %s %.9f' + else: + raise ValueError("Only scores with 4 and 5 columns are supported.") + numpy.savetxt(filename, score_lines, fmt=fmt) + + +def _estimate_score_file_format(filename, ncolumns=None): + """Estimates the score file format from the given score file. + If ``ncolumns`` is in ``(4,5)``, then ``ncolumns`` is returned instead. + """ + if ncolumns in (4, 5): + return ncolumns + + f = open_file(filename, 'rb') + try: + line = f.readline() + ncolumns = len(line.split()) + except Exception: + logger.warn('Could not guess the number of columns in file: {}. 
' + 'Assuming 4 column format.'.format(filename)) + ncolumns = 4 + finally: + f.close() + return ncolumns + + +def _iterate_score_file(filename): + """Opens the score file for reading and yields the score file line by line in a tuple/list. + + The last element of the line (which is the score) will be transformed to float, the other elements will be str + """ + opened = open_file(filename, 'rb') + if sys.version_info.major > 2: + import io + if not isinstance(opened, io.TextIOWrapper): + opened = io.TextIOWrapper(opened, newline="") + + reader = csv.reader(opened, delimiter=' ') + for splits in reader: + splits[-1] = float(splits[-1]) + yield splits + + +def _split_scores(score_lines, real_id_index, claimed_id_index=0, score_index=-1): + """Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives. + """ + positives, negatives = [], [] + for line in score_lines: + which = positives if line[claimed_id_index] == line[ + real_id_index] else negatives + which.append(line[score_index]) + + return (numpy.array(negatives), numpy.array(positives)) + + +def _split_cmc_scores(score_lines, real_id_index, probe_name_index=None, claimed_id_index=0, score_index=-1): + """Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores. + """ + if probe_name_index is None: + probe_name_index = real_id_index + 1 + # extract positives and negatives + pos_dict = {} + neg_dict = {} + # read four column list + for line in score_lines: + which = pos_dict if line[claimed_id_index] == line[ + real_id_index] else neg_dict + probe_name = line[probe_name_index] + # append score + if probe_name not in which: + which[probe_name] = [] + which[probe_name].append(line[score_index]) + + # convert to lists of tuples of ndarrays (or None) + probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys()))) + # get all scores in the desired format + return [( + numpy.array(neg_dict[probe_name], + numpy.float64) if probe_name in neg_dict else None, + numpy.array(pos_dict[probe_name], + numpy.float64) if probe_name in pos_dict else None + ) for probe_name in probe_names] diff --git a/bob/bio/base/score/openbr.py b/bob/bio/base/score/openbr.py new file mode 100644 index 0000000000000000000000000000000000000000..e32bdb0e7d01cabd4463412eac6889f312f438d3 --- /dev/null +++ b/bob/bio/base/score/openbr.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : + + +"""This file includes functionality to convert between Bob's four column or + five column score files and the Matrix files used in OpenBR.""" + + +import numpy +import sys +import logging +logger = logging.getLogger("bob.measure") + +from .load import open_file, four_column, five_column + + +def write_matrix( + score_file, + matrix_file, + mask_file, + model_names=None, + probe_names=None, + score_file_format='4column', + gallery_file_name='unknown-gallery.lst', + probe_file_name='unknown-probe.lst', + search=None): + """Writes the OpenBR matrix and mask files (version 2), given a score file. + + If gallery and probe names are provided, the matrices in both files will be + sorted by gallery and probe names. Otherwise, the order will be the same as + given in the score file. + + If ``search`` is given (as an integer), the resulting matrix files will be in + the *search* format, keeping the given number of gallery scores with the + highest values for each probe. + + .. 
warning:: + + When provided with a 4-column score file, this function will only work if + there is only a single model id for each client. + + Parameters: + + score_file (str): The 4 or 5 column style score file written by bob. + + matrix_file (str): The OpenBR matrix file that should be written. + Usually, the file name extension is ``.mtx`` + + mask_file (str): The OpenBR mask file that should be written. + The mask file defines which values are positives, negatives, or are to be + ignored. Usually, the file name extension is ``.mask`` + + model_names (:py:class:`list`, optional): A list of strings. If given, the matrix will be + written in the same order as the given model names. The model names must + be identical to the second column in the 5-column ``score_file``. + + .. note:: + + If the score file is in four column format, the model_names must be + the client ids stored in the first column. In this case, there might + be only a single model per client. + + Only the scores of the given models will be considered. + + probe_names (:py:class:`list`, optional): A list of strings. If given, + the matrix will be written in the same order as the given probe names + (the ``path`` of the probe). The probe names are identical to the third + column of the 4-column (or the fourth column of the 5-column) + ``score_file``. Only the scores of the given probe names will be + considered in this case. + + score_file_format (:py:class:`str`, optional): One of ``('4column', + '5column')``. The format in which the ``score_file`` is written; defaults to + ``'4column'`` + + gallery_file_name (:py:class:`str`, optional): The name of the gallery file + that will be written in the header of the OpenBR files. + + probe_file_name (:py:class:`str`, optional): The name of the probe file that + will be written in the header of the OpenBR files. + + search (:py:class:`int`, optional): If given, the scores will be sorted per + probe, keeping the specified number of highest scores. If the given + number is higher than the number of models, ``NaN`` values will be added, and the + mask will contain ``0x00`` values. 
+ + """ + + def _write_matrix(filename, matrix): + # Helper function to write a matrix file as required by OpenBR + with open(filename, 'wb') as f: + # write the first four lines + header = "S2\n%s\n%s\nM%s %d %d " % ( + gallery_file_name, probe_file_name, 'B' if matrix.dtype == numpy.uint8 else 'F', matrix.shape[0], matrix.shape[1]) + footer = "\n" + if sys.version_info[0] > 2: + header, footer = header.encode('utf-8'), footer.encode('utf-8') + f.write(header) + # write magic number + numpy.array(0x12345678, numpy.int32).tofile(f) + f.write(footer) + # write the matrix + matrix.tofile(f) + + # define read functions, and which information should be read + read_function = {'4column': four_column, + '5column': five_column}[score_file_format] + offset = {'4column': 0, '5column': 1}[score_file_format] + + # first, read the score file and estimate model and probe names, if not given + if model_names is None or probe_names is None: + model_names, probe_names = [], [] + model_set, probe_set = set(), set() + + # read the score file + for line in read_function(score_file): + model, probe = line[offset], line[2 + offset] + if model not in model_set: + model_names.append(model) + model_set.add(model) + if probe not in probe_set: + probe_names.append(probe) + probe_set.add(probe) + + if search is None: + # create a shortcut to get indices for client and probe subset (to + # increase speed) + model_dict, probe_dict = {}, {} + for i, m in enumerate(model_names): + model_dict[m] = i + for i, p in enumerate(probe_names): + probe_dict[p] = i + + # create the matrices in the desired size + matrix = numpy.ndarray((len(probe_names), len(model_names)), numpy.float32) + matrix[:] = numpy.nan + mask = numpy.zeros(matrix.shape, numpy.uint8) + + # now, iterate through the score file and fill in the matrix + for line in read_function(score_file): + client, model, id, probe, score = line[0], line[offset], line[ + 1 + offset], line[2 + offset], line[3 + offset] + + assert model in model_dict, "model " + model + " unknown" + assert probe in probe_dict, "probe " + probe + " unknown" + + model_index = model_dict[model] + probe_index = probe_dict[probe] + + # check, if we have already written something into that matrix element + if mask[probe_index, model_index]: + logger.warn("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[ + probe_index, model_index], client, probe, score) + + matrix[probe_index, model_index] = score + mask[probe_index, model_index] = 0xff if client == id else 0x7f + + else: + # get the correct search parameter, if negative + if search < 0: + search = len(model_names) + + # create the matrices in the desired size + matrix = numpy.ndarray((len(probe_names), search), numpy.float32) + matrix[:] = numpy.nan + mask = numpy.zeros(matrix.shape, numpy.uint8) + + # get the scores, sorted by probe + scores = {} + for line in read_function(score_file): + client, model, id, probe, score = line[0], line[offset], line[ + 1 + offset], line[2 + offset], line[3 + offset] + + if probe not in scores: + scores[probe] = [] + scores[probe].append((score, 0xff if client == id else 0x7f)) + + # go ahead and sort the scores per probe + sorted_scores = {} + for k, v in scores.items(): + sorted_scores[k] = sorted(v, key=lambda x: x[0], reverse=True) + + # now, write matrix + for p, probe in enumerate(probe_names): + if probe in scores: + for m in range(min(search, len(sorted_scores[probe]))): + matrix[p, m], mask[p, m] = sorted_scores[probe][m] + + # OK, now finally write the file in 
the desired format + _write_matrix(mask_file, mask) + _write_matrix(matrix_file, matrix) + + +def write_score_file( + matrix_file, + mask_file, + score_file, + models_ids=None, + probes_ids=None, + model_names=None, + probe_names=None, + score_file_format='4column', + replace_nan=None +): + """Writes the Bob score file in the desired format from OpenBR files. + + Writes a Bob score file in the desired format (four or five column), given + the OpenBR matrix and mask files. + + In principle, the score file can be written based on the matrix and mask + files, and the format suffice the requirements to compute CMC curves. + However, the contents of the score files can be adapted. If given, the + ``models_ids`` and ``probes_ids`` define the **client ids** of model and + probe, and they have to be in the same order as used to compute the OpenBR + matrix. The ``model_names`` and ``probe_names`` define the **paths** of + model and probe, and they should be in the same order as the ids. + + In rare cases, the OpenBR matrix contains NaN values, which Bob's score files + cannot handle. You can use the ``replace_nan`` parameter to decide, what to + do with these values. By default (``None``), these values are ignored, i.e., + not written into the score file. This is, what OpenBR is doing as well. + However, you can also set ``replace_nan`` to any value, which will be written + instead of the NaN values. + + + Parameters: + + matrix_file (str): The OpenBR matrix file that should be read. Usually, the + file name extension is ``.mtx`` + + mask_file (str): The OpenBR mask file that should be read. Usually, the + file name extension is ``.mask`` + + score_file (str): Path to the 4 or 5 column style score file that should be + written. + + models_ids (:py:class:`list`, optional): A list of strings with the client + ids of the models that will be written in the first column of the score + file. If given, the size must be identical to the number of models + (gallery templates) in the OpenBR files. If not given, client ids of the + model will be identical to the **gallery index** in the matrix file. + + probes_ids (:py:class:`list`, optional): A list of strings with the client + ids of the probes that will be written in the second/third column of the + four/five column score file. If given, the size must be identical to the + number of probe templates in the OpenBR files. It will be checked that + the OpenBR mask fits to the model/probe client ids. If not given, the + probe ids will be estimated automatically, i.e., to fit the OpenBR + matrix. + + model_names (:py:class:`list`, optional): A list of strings with the model + path written in the second column of the five column score file. If not + given, the model index in the OpenBR file will be used. + + .. note:: + + This entry is ignored in the four column score file format. + + probe_names (:py:class:`list`, optional): A list of probe path to be + written in the third/fourth column in the four/five column score file. If + given, the size must be identical to the number of probe templates in the + OpenBR files. If not given, the probe index in the OpenBR file will be + used. + + score_file_format (:py:class:`str`, optional): One of ``('4column', + '5column')``. The format, in which the ``score_file`` is; defaults to + ``'4column'`` + + replace_nan (:py:class:`float`, optional): If NaN values are encountered in + the OpenBR matrix (which are not ignored due to the mask being non-NULL), + this value will be written instead. 
If ``None``, the values will not be + written in the score file at all. + + """ + + def _read_matrix(filename): + py3 = sys.version_info[0] >= 3 + # Helper function to read a matrix file as written by OpenBR + with open(filename, 'rb') as f: + # get version + header = f.readline() + if py3: + header = header.decode("utf-8") + assert header[:2] == "S2" + # skip gallery and probe files + f.readline() + f.readline() + # read size and type of matrix + size = f.readline() + if py3: + size = size.decode("utf-8") + splits = size.rstrip().split() + # TODO: check the endianess of the magic number stored in split[3] + assert splits[0][0] == 'M' + w, h = int(splits[1]), int(splits[2]) + # read matrix data + data = numpy.fromfile( + f, dtype={'B': numpy.uint8, 'F': numpy.float32}[splits[0][1]]) + assert data.shape[0] == w * h + data.shape = (w, h) + return data + + # check parameters + if score_file_format not in ("4column", "5column"): + raise ValueError( + "The given score file format %s is not known; choose one of ('4column', '5column')" % score_file_format) + # get type of score file + four_col = score_file_format == "4column" + + # read the two matrices + scores = _read_matrix(matrix_file) + mask = _read_matrix(mask_file) + + # generate the id lists, if not given + if models_ids is None: + models_ids = [str(g + 1) for g in range(mask.shape[1])] + assert len(models_ids) == mask.shape[1] + + if probes_ids is None: + probes_ids = [] + # iterate over all probes + for p in range(mask.shape[0]): + # get indices, where model and probe id should be identical + equal_indices = numpy.where(mask[p] == 0xff) + if len(equal_indices): + # model id found, use the first one + probes_ids.append(models_ids[equal_indices[0][0]]) + else: + # no model found; add non-existing id + probes_ids.append("unknown") + else: + assert len(probes_ids) == mask.shape[0] + # check that the probes client ids are in the correct order + for p in range(mask.shape[0]): + for g in range(mask.shape[1]): + if mask[p, g] == 0x7f: + if models_ids[g] == probes_ids[p]: + raise ValueError("The probe id %s with index %d should not be identical to model id %s with index %d" % ( + probes_ids[p], p, models_ids[g], g)) + elif mask[p, g] == 0xff: + if models_ids[g] != probes_ids[p]: + raise ValueError("The probe id %s with index %d should be identical to model id %s with index %d" % ( + probes_ids[p], p, models_ids[g], g)) + + # generate model and probe names, if not given + if not four_col and model_names is None: + model_names = [str(g + 1) for g in range(mask.shape[1])] + if probe_names is None: + probe_names = [str(p + 1) for p in range(mask.shape[0])] + + # iterate through the files and write scores + with open(score_file, 'w') as f: + for g in range(mask.shape[1]): + for p in range(mask.shape[0]): + if mask[p, g]: + score = scores[p, g] + # handle NaN values + if numpy.isnan(score): + if replace_nan is None: + continue + score = replace_nan + # write score file + if four_col: + f.write("%s %s %s %3.8f\n" % + (models_ids[g], probes_ids[p], probe_names[p], score)) + else: + f.write("%s %s %s %s %3.8f\n" % (models_ids[g], model_names[ + g], probes_ids[p], probe_names[p], score)) diff --git a/bob/bio/base/test/test_io.py b/bob/bio/base/test/test_io.py new file mode 100644 index 0000000000000000000000000000000000000000..3a30d75b6a2dda2d3827f67a2e701731a06e3dd7 --- /dev/null +++ b/bob/bio/base/test/test_io.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8 : +# Andre Anjos <andre.anjos@idiap.ch> +# Wed 11 Dec 15:14:08 2013 
CET +# +# Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland + +"""Tests the IO functionality of bob.bio.base.score.""" + +import numpy +import tempfile +import os +import shutil + +import bob.io.base.test_utils +from .. import score + + +def test_load_scores(): + # This function tests the IO functionality of loading score files in + # different ways + + load_functions = {'4col': score.four_column, + '5col': score.five_column} + cols = {'4col': 4, '5col': 5} + + for variant in cols: + # read score file in normal way + normal_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.txt' % variant, 'bob.bio.base', 'test/data') + normal_scores = list(load_functions[variant](normal_score_file)) + + assert len(normal_scores) == 910 + assert all(len(s) == cols[variant] for s in normal_scores) + + # read the compressed score file + compressed_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.tar.gz' % variant, 'bob.bio.base', 'test/data') + compressed_scores = list(load_functions[variant](compressed_score_file)) + + assert len(compressed_scores) == len(normal_scores) + assert all(len(c) == cols[variant] for c in compressed_scores) + assert all(c[i] == s[i] for c, s in zip(compressed_scores, + normal_scores) for i in range(cols[variant])) + + # Use auto-estimated score file contents + # read score file in normal way + normal_scores = list(score.scores(normal_score_file)) + + assert len(normal_scores) == 910 + assert all(len(s) == cols[variant] for s in normal_scores) + + # read the compressed score file + compressed_scores = list(score.scores(compressed_score_file)) + + assert len(compressed_scores) == len(normal_scores) + assert all(len(c) == cols[variant] for c in compressed_scores) + assert all(c[i] == s[i] for c, s in zip(compressed_scores, + normal_scores) for i in range(cols[variant])) + + +def test_split_scores(): + # This function tests the IO functionality of loading score files in + # different ways + + split_functions = {'4col': score.split_four_column, + '5col': score.split_five_column} + cols = {'4col': 4, '5col': 5} + + for variant in cols: + # read score file in normal way + normal_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.txt' % variant, 'bob.bio.base', 'test/data') + negatives, positives = split_functions[variant](normal_score_file) + + assert len(negatives) == 520, len(negatives) + assert len(positives) == 390, len(positives) + + # read the compressed score file + compressed_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.tar.gz' % variant, 'bob.bio.base', 'test/data') + negatives, positives = split_functions[variant](compressed_score_file) + + assert len(negatives) == 520, len(negatives) + assert len(positives) == 390, len(positives) + + # Use auto-estimated score file contents + # read score file in normal way + negatives, positives = score.split(normal_score_file) + + assert len(negatives) == 520, len(negatives) + assert len(positives) == 390, len(positives) + + # read the compressed score file + negatives, positives = score.split(compressed_score_file) + + assert len(negatives) == 520, len(negatives) + assert len(positives) == 390, len(positives) + + +def test_load_score(): + # This function tests the IO functionality of loading score files in + # different ways + + scores = [] + cols = {'4col': 4, '5col': 5} + + for variant in cols: + # read score file in normal way + normal_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.txt' % variant, 'bob.bio.base', 'test/data') + normal_scores = score.load_score( + 
normal_score_file, cols[variant]) + + assert len(normal_scores) == 910 + assert len(normal_scores.dtype) == cols[variant] + + # read the compressed score file + compressed_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.tar.gz' % variant, 'bob.bio.base', 'test/data') + compressed_scores = score.load_score( + compressed_score_file, cols[variant]) + + assert len(compressed_scores) == len(normal_scores) + assert len(compressed_scores.dtype) == cols[variant] + for name in normal_scores.dtype.names: + assert all(normal_scores[name] == compressed_scores[name]) + + # test minimal loading + minimal_scores = score.load_score( + normal_score_file, minimal=True) + assert len(minimal_scores) == 910 + assert len(minimal_scores.dtype) == 3 + assert minimal_scores.dtype.names == ('claimed_id', 'real_id', 'score') + + +def test_dump_score(): + # This function tests the IO functionality of dumping score files + + scores = [] + cols = {'4col': 4, '5col': 5} + + for variant in cols: + # read score file + normal_score_file = bob.io.base.test_utils.datafile( + 'dev-%s.txt' % variant, 'bob.bio.base', 'test/data') + normal_scores = score.load_score( + normal_score_file, cols[variant]) + + with tempfile.TemporaryFile() as f: + score.dump_score(f, normal_scores) + f.seek(0) + loaded_scores = score.load_score(f, cols[variant]) + + for name in normal_scores.dtype.names: + assert all(normal_scores[name] == loaded_scores[name]) + + +def _check_binary_identical(name1, name2): + # see: http://www.peterbe.com/plog/using-md5-to-check-equality-between-files + from hashlib import md5 + # tests if two files are binary identical + with open(name1, 'rb') as f1: + with open(name2, 'rb') as f2: + assert md5(f1.read()).digest() == md5(f2.read()).digest() + + +def test_openbr_verify(): + # This function tests that the conversion to the OpenBR verify file works + # as expected + temp_dir = tempfile.mkdtemp(prefix='bob_test') + + # define output files + openbr_extensions = ('.mtx', '.mask') + matrix_file, mask_file = [os.path.join( + temp_dir, "scores%s") % ext for ext in openbr_extensions] + + try: + for variant in ('4col', '5col'): + # get score file + score_file = bob.io.base.test_utils.datafile( + 'scores-cmc-%s.txt' % variant, 'bob.bio.base', 'test/data') + + # first round, do not define keyword arguments -- let the file get the + # gallery and probe ids automatically + kwargs = {} + for i in range(2): + # get the files by automatically obtaining the identities + score.write_matrix( + score_file, matrix_file, mask_file, score_file_format="%sumn" % variant, **kwargs) + + assert os.path.isfile(matrix_file) and os.path.isfile(mask_file) + + # check that they are binary identical to the reference files (which + # are tested to work and give the same results with OpenBR) + matrix_ref, mask_ref = [bob.io.base.test_utils.datafile( + 'scores%s' % ext, 'bob.bio.base', 'test/data') for ext in openbr_extensions] + _check_binary_identical(matrix_file, matrix_ref) + _check_binary_identical(mask_file, mask_ref) + + # define new kwargs for second round, i.e., define model and probe names + # these names are identical to what is found in the score file, which + # in turn comes from the AT&T database + model_type = {"4col": "%d", "5col": "s%d"}[variant] + dev_ids = (3, 4, 7, 8, 9, 13, 15, 18, 19, 22, 23, + 25, 28, 30, 31, 32, 35, 37, 38, 40) + kwargs['model_names'] = [model_type % c for c in dev_ids] + kwargs['probe_names'] = ["s%d/%d" % + (c, i) for c in dev_ids for i in (1, 3, 6, 8, 10)] + + finally: + shutil.rmtree(temp_dir) + + +def 
test_openbr_search(): + # This function tests that the conversion to the OpenBR search file works + # as expected + temp_dir = tempfile.mkdtemp(prefix='bob_test') + + # define output files + openbr_extensions = ('.mtx', '.mask') + matrix_file, mask_file = [os.path.join( + temp_dir, "search%s") % ext for ext in openbr_extensions] + + try: + for variant in ('4col', '5col'): + # get score file + score_file = bob.io.base.test_utils.datafile( + 'scores-cmc-%s.txt' % variant, 'bob.bio.base', 'test/data') + + # first round, do not define keyword arguments -- let the file get the + # gallery and probe ids automatically + kwargs = {} + for i in range(2): + # get the files by automatically obtaining the identities + score.write_matrix( + score_file, matrix_file, mask_file, score_file_format="%sumn" % variant, search=50, **kwargs) + + assert os.path.isfile(matrix_file) and os.path.isfile(mask_file) + + # check that they are binary identical to the reference files (which + # are tested to work and give the same results with OpenBR) + matrix_ref, mask_ref = [bob.io.base.test_utils.datafile( + 'search%s' % ext, 'bob.bio.base', 'test/data') for ext in openbr_extensions] + _check_binary_identical(matrix_file, matrix_ref) + _check_binary_identical(mask_file, mask_ref) + + # define new kwargs for second round, i.e., define model and probe names + # these names are identical to what is found in the score file, which + # in turn comes from the AT&T database + model_type = {"4col": "%d", "5col": "s%d"}[variant] + dev_ids = (3, 4, 7, 8, 9, 13, 15, 18, 19, 22, 23, + 25, 28, 30, 31, 32, 35, 37, 38, 40) + kwargs['model_names'] = [model_type % c for c in dev_ids] + kwargs['probe_names'] = ["s%d/%d" % + (c, i) for c in dev_ids for i in (1, 3, 6, 8, 10)] + + finally: + shutil.rmtree(temp_dir) + + +def test_from_openbr(): + # This function tests that the conversion from the OpenBR matrices work as + # expected + temp_dir = tempfile.mkdtemp(prefix='bob_test') + + # define input files + openbr_extensions = ('.mtx', '.mask') + matrix_file, mask_file = [bob.io.base.test_utils.datafile( + 'scores%s' % ext, 'bob.bio.base', 'test/data') for ext in openbr_extensions] + + score_file = os.path.join(temp_dir, "scores") + load_functions = {'4col': score.four_column, + '5col': score.five_column} + + try: + for variant in load_functions: + # first, do not define keyword arguments -- let the file get the model + # and probe ids being created automatically + score.write_score_file( + matrix_file, mask_file, score_file, score_file_format="%sumn" % variant) + assert os.path.exists(score_file) + # read the score file with bobs functionality + columns = list(load_functions[variant](score_file)) + + # check the contents + assert len(columns) == 2000 + + # now, generate model and probe names and ids + model_type = {"4col": "%d", "5col": "s%d"}[variant] + dev_ids = (3, 4, 7, 8, 9, 13, 15, 18, 19, 22, 23, + 25, 28, 30, 31, 32, 35, 37, 38, 40) + model_names = ["s%d" % c for c in dev_ids] + probe_names = ["s%d/%d" % (c, i) + for c in dev_ids for i in (1, 3, 6, 8, 10)] + models_ids = ["%d" % c for c in dev_ids] + probes_ids = ["%d" % c for c in dev_ids for i in (1, 3, 6, 8, 10)] + + score.write_score_file(matrix_file, mask_file, score_file, models_ids=models_ids, probes_ids=probes_ids, + model_names=model_names, probe_names=probe_names, score_file_format="%sumn" % variant) + + # check that we re-generated the bob score file + reference_file = bob.io.base.test_utils.datafile( + 'scores-cmc-%s.txt' % variant, 'bob.bio.base', 'test/data') + + # 
assert that we can (almost) reproduce the score file + # ... read both files + columns = list(load_functions[variant](score_file)) + reference = list(load_functions[variant](reference_file)) + assert len(columns) == len(reference) + for i in range(len(columns)): + for j in range(len(columns[i]) - 1): + # check that the model and probe names are fine + assert columns[i][j] == reference[i][j], str( + columns[i]) + " != " + str(reference[i]) + # check that the score is close (OpenBR write scores in float32 + # precision only) + assert abs(columns[i][-1] - numpy.float32(reference[i][-1]) + ) <= 1e-8, str(columns[i][-1]) + " != " + str(reference[i][-1]) + + #assert numpy.isclose(columns[i][-1], reference[i][-1], atol = 1e-3, rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) + assert numpy.allclose(columns[i][-1], reference[i][-1], atol=1e-3, + rtol=1e-8), str(columns[i][-1]) + " != " + str(reference[i][-1]) + + finally: + shutil.rmtree(temp_dir)
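
Usage note (not part of the patch): the sketch below shows how the new `bob.bio.base.score` loading API is meant to be used. The file name `dev-4col.txt` is a hypothetical 4-column score file used only for illustration; the column count is detected automatically, so the same calls also work for 5-column files.

```python
# Minimal sketch of the new loading API -- 'dev-4col.txt' is a hypothetical
# score file in the 4-column format, not a file shipped with this patch.
from bob.bio.base import score

score_file = 'dev-4col.txt'

# Iterate lazily over the score lines; the number of columns (4 or 5) is
# estimated automatically and the last element is converted to float.
for line in score.scores(score_file):
    claimed_id, real_id, probe_label, value = line[0], line[-3], line[-2], line[-1]
    print(claimed_id, real_id, probe_label, value)

# Split the scores into impostor (negative) and genuine (positive) arrays.
negatives, positives = score.split(score_file)

# Group the scores per probe, e.g., to pass them on to bob.measure.cmc.
cmc_scores = score.cmc(score_file)

# Load everything into a structured numpy array and write it back out.
score_lines = score.load_score(score_file)
score.dump_score('scores-copy.txt', score_lines)
```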
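Similarly, a sketch of the OpenBR conversion helpers added in `openbr.py`, again with placeholder file names and assuming a 4-column score file with a single model per client:

```python
# Minimal sketch of the OpenBR conversion helpers -- all file names below are
# placeholders for illustration only.
from bob.bio.base import score

# Convert a bob score file into an OpenBR matrix/mask pair.  With 4-column
# files this requires a single model per client (see the warning above).
score.write_matrix('dev-4col.txt', 'scores.mtx', 'scores.mask',
                   score_file_format='4column')

# Convert the OpenBR pair back into a bob score file; NaN entries in the
# matrix are skipped unless replace_nan is set to a substitute value.
score.write_score_file('scores.mtx', 'scores.mask', 'roundtrip-4col.txt',
                       score_file_format='4column', replace_nan=None)
```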