Commit c36b74f2 authored by André Anjos

Merge branch 'epc_with_thresholds' into 'master'

Add an option to the epc function to also return the thresholds that were used during the calculation

See merge request !28
parents 142014da c2697a01
Pipeline #8623 passed with stages in 26 minutes and 24 seconds
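For reference, bob.measure.epc computes the Expected Performance Curve from a development and a test score set; the change advertised above makes it possible to also retrieve the decision thresholds picked on the development set for each operating point. The implementation change itself sits in one of the collapsed diffs; the hunks expanded below are mostly PEP8 clean-ups. The following sketch only illustrates the intent: the keyword name thresholds and the extra output row are assumptions derived from the merge request title, not confirmed API:

    import numpy
    import bob.measure

    # synthetic development and test scores (negatives and positives)
    dev_neg = numpy.random.normal(-1.0, 1.0, 500)
    dev_pos = numpy.random.normal(+1.0, 1.0, 500)
    test_neg = numpy.random.normal(-1.0, 1.0, 500)
    test_pos = numpy.random.normal(+1.0, 1.0, 500)

    # existing behaviour: a 2 x npoints array (cost weight, test HTER)
    curve = bob.measure.epc(dev_neg, dev_pos, test_neg, test_pos, 100)

    # with the new option (hypothetical keyword name): additionally return
    # the threshold chosen on the development set for each operating point
    curve_thr = bob.measure.epc(dev_neg, dev_pos, test_neg, test_pos, 100,
                                thresholds=True)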
@@ -66,12 +66,13 @@ def min_cllr(negatives, positives):
pos = sorted(positives)
N = len(neg)
P = len(pos)
I = N+P
# now, iterate through both score sets and add a 0 for negative and 1 for positive scores
n, p = 0,0
I = N + P
# now, iterate through both score sets and add a 0 for negative and 1 for
# positive scores
n, p = 0, 0
ideal = numpy.zeros(I)
neg_indices = [0]*N
pos_indices = [0]*P
neg_indices = [0] * N
pos_indices = [0] * P
for i in range(I):
if p < P and (n == N or neg[n] > pos[p]):
pos_indices[p] = i
@@ -88,12 +89,12 @@ def min_cllr(negatives, positives):
# disable runtime warnings for a short time since log(0) will raise a warning
old_warn_setup = numpy.seterr(divide='ignore')
# ... compute logs
posterior_log_odds = numpy.log(popt)-numpy.log(1.-popt);
log_prior_odds = math.log(float(P)/float(N));
posterior_log_odds = numpy.log(popt) - numpy.log(1. - popt)
log_prior_odds = math.log(float(P) / float(N))
# ... activate old warnings
numpy.seterr(**old_warn_setup)
llrs = posterior_log_odds - log_prior_odds;
llrs = posterior_log_odds - log_prior_odds
# some weird addition
# for i in range(I):
......
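The two hunks above only re-format the PAV-based minimum Cllr computation; its behaviour is unchanged. As a reminder of how the function is typically driven, here is a small sketch; the module path bob.measure.calibration is an assumption about where min_cllr is exposed:

    import numpy
    from bob.measure import calibration

    # synthetic impostor (negative) and genuine (positive) scores
    neg = numpy.random.normal(-1.0, 1.0, 1000)
    pos = numpy.random.normal(+1.0, 1.0, 1000)

    # minimum cost of log-likelihood ratios after optimal calibration
    print("min Cllr: %.4f" % calibration.min_cllr(neg, pos))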
(Two further file diffs are collapsed and not shown here.)
@@ -55,7 +55,8 @@ def open_file(filename, mode='rt'):
tar_info = tar.next()
# check that one file was found in the archive
if tar_info is None:
raise IOError("The given file is a .tar file, but it does not contain any file.")
raise IOError(
"The given file is a .tar file, but it does not contain any file.")
# open the file for reading
return tar.extractfile(tar_info)
@@ -348,7 +349,6 @@ def cmc(filename, ncolumns=None):
return cmc_five_column(filename)
def load_score(filename, ncolumns=None, minimal=False, **kwargs):
"""Load scores using numpy.loadtxt and return the data as a numpy array.
@@ -383,29 +383,29 @@ def load_score(filename, ncolumns=None, minimal=False, **kwargs):
if ncolumns == 4:
names = ('claimed_id', 'real_id', 'test_label', 'score')
converters = {
0: convertfunc,
1: convertfunc,
2: convertfunc,
3: float}
0: convertfunc,
1: convertfunc,
2: convertfunc,
3: float}
if minimal:
usecols = (0, 1, 3)
elif ncolumns == 5:
names = ('claimed_id', 'model_label', 'real_id', 'test_label', 'score')
converters = {
0: convertfunc,
1: convertfunc,
2: convertfunc,
3: convertfunc,
4: float}
0: convertfunc,
1: convertfunc,
2: convertfunc,
3: convertfunc,
4: float}
if minimal:
usecols = (0, 2, 4)
else:
raise ValueError("ncolumns of 4 and 5 are supported only.")
score_lines = numpy.genfromtxt(
open_file(filename, mode='rb'), dtype=None, names=names,
converters=converters, invalid_raise=True, usecols=usecols, **kwargs)
open_file(filename, mode='rb'), dtype=None, names=names,
converters=converters, invalid_raise=True, usecols=usecols, **kwargs)
new_dtype = []
for name in score_lines.dtype.names[:-1]:
new_dtype.append((name, str(score_lines.dtype[name]).replace('S', 'U')))
@@ -506,17 +506,19 @@ def _iterate_score_file(filename):
yield splits
def _split_scores(score_lines, real_id_index, claimed_id_index = 0, score_index = -1):
def _split_scores(score_lines, real_id_index, claimed_id_index=0, score_index=-1):
"""Take the output of :py:func:`four_column` or :py:func:`five_column` and return negatives and positives.
"""
positives, negatives = [], []
for line in score_lines:
which = positives if line[claimed_id_index] == line[real_id_index] else negatives
which = positives if line[claimed_id_index] == line[
real_id_index] else negatives
which.append(line[score_index])
return (numpy.array(negatives), numpy.array(positives))
def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claimed_id_index = 0, score_index = -1):
def _split_cmc_scores(score_lines, real_id_index, probe_name_index=None, claimed_id_index=0, score_index=-1):
"""Takes the output of :py:func:`four_column` or :py:func:`five_column` and return cmc scores.
"""
if probe_name_index is None:
@@ -526,7 +528,8 @@ def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claim
neg_dict = {}
# read four column list
for line in score_lines:
which = pos_dict if line[claimed_id_index] == line[real_id_index] else neg_dict
which = pos_dict if line[claimed_id_index] == line[
real_id_index] else neg_dict
probe_name = line[probe_name_index]
# append score
if probe_name not in which:
@@ -537,6 +540,8 @@ def _split_cmc_scores(score_lines, real_id_index, probe_name_index = None, claim
probe_names = sorted(set(neg_dict.keys()).union(set(pos_dict.keys())))
# get all scores in the desired format
return [(
numpy.array(neg_dict[probe_name], numpy.float64) if probe_name in neg_dict else None,
numpy.array(pos_dict[probe_name], numpy.float64) if probe_name in pos_dict else None
numpy.array(neg_dict[probe_name],
numpy.float64) if probe_name in neg_dict else None,
numpy.array(pos_dict[probe_name],
numpy.float64) if probe_name in pos_dict else None
) for probe_name in probe_names]
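The hunks in this file are again purely cosmetic (line wrapping and PEP8 spacing). A minimal round trip through the loading helpers touched here, using only names that appear in this merge request (load_score and get_negatives_positives, both from bob.measure.load; the score file name is a placeholder):

    from bob.measure.load import load_score, get_negatives_positives

    # a tiny 4-column score file: <claimed_id> <real_id> <test_label> <score>
    with open('scores-dev.txt', 'w') as f:
        f.write("model1 model1 probe_a 1.2\n")
        f.write("model1 model2 probe_b -0.7\n")

    scores = load_score('scores-dev.txt', ncolumns=4)
    negatives, positives = get_negatives_positives(scores)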
(Another file diff is collapsed and not shown here.)
@@ -15,15 +15,15 @@ from .load import open_file, four_column, five_column
def write_matrix(
score_file,
matrix_file,
mask_file,
model_names = None,
probe_names = None,
score_file_format = '4column',
gallery_file_name = 'unknown-gallery.lst',
probe_file_name = 'unknown-probe.lst',
search = None):
score_file,
matrix_file,
mask_file,
model_names=None,
probe_names=None,
score_file_format='4column',
gallery_file_name='unknown-gallery.lst',
probe_file_name='unknown-probe.lst',
search=None):
"""Writes the OpenBR matrix and mask files (version 2), given a score file.
If gallery and probe names are provided, the matrices in both files will be
@@ -87,12 +87,14 @@ def write_matrix(
"""
def _write_matrix(filename, matrix):
## Helper function to write a matrix file as required by OpenBR
# Helper function to write a matrix file as required by OpenBR
with open(filename, 'wb') as f:
# write the first four lines
header = "S2\n%s\n%s\nM%s %d %d " % (gallery_file_name, probe_file_name, 'B' if matrix.dtype == numpy.uint8 else 'F', matrix.shape[0], matrix.shape[1])
header = "S2\n%s\n%s\nM%s %d %d " % (
gallery_file_name, probe_file_name, 'B' if matrix.dtype == numpy.uint8 else 'F', matrix.shape[0], matrix.shape[1])
footer = "\n"
if sys.version_info[0] > 2: header, footer = header.encode('utf-8'), footer.encode('utf-8')
if sys.version_info[0] > 2:
header, footer = header.encode('utf-8'), footer.encode('utf-8')
f.write(header)
# write magic number
numpy.array(0x12345678, numpy.int32).tofile(f)
@@ -100,10 +102,10 @@ def write_matrix(
# write the matrix
matrix.tofile(f)
# define read functions, and which information should be read
read_function = {'4column' : four_column, '5column' : five_column}[score_file_format]
offset = {'4column' : 0, '5column' : 1}[score_file_format]
read_function = {'4column': four_column,
'5column': five_column}[score_file_format]
offset = {'4column': 0, '5column': 1}[score_file_format]
# first, read the score file and estimate model and probe names, if not given
if model_names is None or probe_names is None:
@@ -112,7 +114,7 @@ def write_matrix(
# read the score file
for line in read_function(score_file):
model, probe = line[offset], line[2+offset]
model, probe = line[offset], line[2 + offset]
if model not in model_set:
model_names.append(model)
model_set.add(model)
@@ -121,10 +123,13 @@ def write_matrix(
probe_set.add(probe)
if search is None:
# create a shortcut to get indices for client and probe subset (to increase speed)
# create a shortcut to get indices for client and probe subset (to
# increase speed)
model_dict, probe_dict = {}, {}
for i,m in enumerate(model_names): model_dict[m]=i
for i,p in enumerate(probe_names): probe_dict[p]=i
for i, m in enumerate(model_names):
model_dict[m] = i
for i, p in enumerate(probe_names):
probe_dict[p] = i
# create the matrices in the desired size
matrix = numpy.ndarray((len(probe_names), len(model_names)), numpy.float32)
@@ -133,7 +138,8 @@ def write_matrix(
# now, iterate through the score file and fill in the matrix
for line in read_function(score_file):
client, model, id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
client, model, id, probe, score = line[0], line[offset], line[
1 + offset], line[2 + offset], line[3 + offset]
assert model in model_dict, "model " + model + " unknown"
assert probe in probe_dict, "probe " + probe + " unknown"
@@ -143,7 +149,8 @@ def write_matrix(
# check, if we have already written something into that matrix element
if mask[probe_index, model_index]:
logger.warn("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[probe_index, model_index], client, probe, score)
logger.warn("Overwriting existing matrix '%f' element of client '%s' and probe '%s' with '%f'", matrix[
probe_index, model_index], client, probe, score)
matrix[probe_index, model_index] = score
mask[probe_index, model_index] = 0xff if client == id else 0x7f
@@ -161,7 +168,8 @@ def write_matrix(
# get the scores, sorted by probe
scores = {}
for line in read_function(score_file):
client, model, id, probe, score = line[0], line[offset], line[1+offset], line[2+offset], line[3+offset]
client, model, id, probe, score = line[0], line[offset], line[
1 + offset], line[2 + offset], line[3 + offset]
if probe not in scores:
scores[probe] = []
@@ -169,14 +177,14 @@ def write_matrix(
# go ahead and sort the scores per probe
sorted_scores = {}
for k,v in scores.items(): sorted_scores[k] = sorted(v, key=lambda x: x[0], reverse=True)
for k, v in scores.items():
sorted_scores[k] = sorted(v, key=lambda x: x[0], reverse=True)
# now, write matrix
for p, probe in enumerate(probe_names):
if probe in scores:
for m in range(min(search, len(sorted_scores[probe]))):
matrix[p,m], mask[p,m] = sorted_scores[probe][m]
matrix[p, m], mask[p, m] = sorted_scores[probe][m]
# OK, now finally write the file in the desired format
_write_matrix(mask_file, mask)
@@ -187,12 +195,12 @@ def write_score_file(
matrix_file,
mask_file,
score_file,
models_ids = None,
probes_ids = None,
model_names = None,
probe_names = None,
score_file_format = '4column',
replace_nan = None
models_ids=None,
probes_ids=None,
model_names=None,
probe_names=None,
score_file_format='4column',
replace_nan=None
):
"""Writes the Bob score file in the desired format from OpenBR files.
@@ -266,32 +274,36 @@ def write_score_file(
"""
def _read_matrix(filename):
py3 = sys.version_info[0] >=3
## Helper function to read a matrix file as written by OpenBR
py3 = sys.version_info[0] >= 3
# Helper function to read a matrix file as written by OpenBR
with open(filename, 'rb') as f:
# get version
header = f.readline()
if py3: header = header.decode("utf-8")
if py3:
header = header.decode("utf-8")
assert header[:2] == "S2"
# skip gallery and probe files
f.readline()
f.readline()
# read size and type of matrix
size = f.readline()
if py3: size = size.decode("utf-8")
if py3:
size = size.decode("utf-8")
splits = size.rstrip().split()
# TODO: check the endianness of the magic number stored in split[3]
assert splits[0][0] == 'M'
w,h = int(splits[1]), int(splits[2])
w, h = int(splits[1]), int(splits[2])
# read matrix data
data = numpy.fromfile(f, dtype={'B':numpy.uint8, 'F': numpy.float32}[splits[0][1]])
assert data.shape[0] == w*h
data.shape = (w,h)
data = numpy.fromfile(
f, dtype={'B': numpy.uint8, 'F': numpy.float32}[splits[0][1]])
assert data.shape[0] == w * h
data.shape = (w, h)
return data
# check parameters
if score_file_format not in ("4column", "5column"):
raise ValueError("The given score file format %s is not known; choose one of ('4column', '5column')" % score_file_format)
raise ValueError(
"The given score file format %s is not known; choose one of ('4column', '5column')" % score_file_format)
# get type of score file
four_col = score_file_format == "4column"
@@ -301,7 +313,7 @@ def write_score_file(
# generate the id lists, if not given
if models_ids is None:
models_ids = [str(g+1) for g in range(mask.shape[1])]
models_ids = [str(g + 1) for g in range(mask.shape[1])]
assert len(models_ids) == mask.shape[1]
if probes_ids is None:
@@ -321,29 +333,36 @@ def write_score_file(
# check that the probes client ids are in the correct order
for p in range(mask.shape[0]):
for g in range(mask.shape[1]):
if mask[p,g] == 0x7f:
if models_ids[g] == probes_ids[p]: raise ValueError("The probe id %s with index %d should not be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
elif mask[p,g] == 0xff:
if models_ids[g] != probes_ids[p]: raise ValueError("The probe id %s with index %d should be identical to model id %s with index %d" % (probes_ids[p], p, models_ids[g], g))
if mask[p, g] == 0x7f:
if models_ids[g] == probes_ids[p]:
raise ValueError("The probe id %s with index %d should not be identical to model id %s with index %d" % (
probes_ids[p], p, models_ids[g], g))
elif mask[p, g] == 0xff:
if models_ids[g] != probes_ids[p]:
raise ValueError("The probe id %s with index %d should be identical to model id %s with index %d" % (
probes_ids[p], p, models_ids[g], g))
# generate model and probe names, if not given
if not four_col and model_names is None:
model_names = [str(g+1) for g in range(mask.shape[1])]
model_names = [str(g + 1) for g in range(mask.shape[1])]
if probe_names is None:
probe_names = [str(p+1) for p in range(mask.shape[0])]
probe_names = [str(p + 1) for p in range(mask.shape[0])]
# iterate through the files and write scores
with open(score_file, 'w') as f:
for g in range(mask.shape[1]):
for p in range(mask.shape[0]):
if mask[p,g]:
score = scores[p,g]
if mask[p, g]:
score = scores[p, g]
# handle NaN values
if numpy.isnan(score):
if replace_nan is None: continue
if replace_nan is None:
continue
score = replace_nan
# write score file
if four_col:
f.write("%s %s %s %3.8f\n" % (models_ids[g], probes_ids[p], probe_names[p], score))
f.write("%s %s %s %3.8f\n" %
(models_ids[g], probes_ids[p], probe_names[p], score))
else:
f.write("%s %s %s %s %3.8f\n" % (models_ids[g], model_names[g], probes_ids[p], probe_names[p], score))
f.write("%s %s %s %s %3.8f\n" % (models_ids[g], model_names[
g], probes_ids[p], probe_names[p], score))
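None of the hunks in this file change behaviour either; the conversion between Bob score files and OpenBR matrix/mask files works as before. A sketch of the two entry points, using the keyword arguments visible in the signatures above (the module path bob.measure.openbr and the file names are assumptions and placeholders):

    from bob.measure.openbr import write_matrix, write_score_file

    # Bob 4-column score file -> OpenBR matrix and mask files
    write_matrix(score_file='scores-dev.txt',
                 matrix_file='scores.mtx',
                 mask_file='scores.mask',
                 score_file_format='4column')

    # and back: OpenBR matrix and mask files -> Bob 4-column score file
    write_score_file(matrix_file='scores.mtx',
                     mask_file='scores.mask',
                     score_file='scores-roundtrip.txt',
                     score_file_format='4column')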
@@ -3,7 +3,7 @@
# Mon 23 May 2011 14:36:14 CEST
def log_values(min_step = -4, counts_per_step = 4):
def log_values(min_step=-4, counts_per_step=4):
"""Computes log-scaled values between :math:`10^{M}` and 1
This function computes log-scaled values between :math:`10^{M}` and 1
@@ -32,7 +32,7 @@ def log_values(min_step = -4, counts_per_step = 4):
"""
import math
return [math.pow(10., i * 1./counts_per_step) for i in range(min_step*counts_per_step,0)] + [1.]
return [math.pow(10., i * 1. / counts_per_step) for i in range(min_step * counts_per_step, 0)] + [1.]
def roc(negatives, positives, npoints=100, CAR=False, **kwargs):
@@ -88,12 +88,12 @@ def roc(negatives, positives, npoints=100, CAR=False, **kwargs):
from . import roc as calc
out = calc(negatives, positives, npoints)
if not CAR:
return pyplot.plot(100.0*out[0,:], 100.0*out[1,:], **kwargs)
return pyplot.plot(100.0 * out[0, :], 100.0 * out[1, :], **kwargs)
else:
return pyplot.semilogx(100.0*out[0,:], 100.0*(1-out[1,:]), **kwargs)
return pyplot.semilogx(100.0 * out[0, :], 100.0 * (1 - out[1, :]), **kwargs)
def roc_for_far(negatives, positives, far_values = log_values(), **kwargs):
def roc_for_far(negatives, positives, far_values=log_values(), **kwargs):
"""Plots the ROC curve for the given list of False Acceptance Rates (FAR).
This method will call ``matplotlib`` to plot the ROC curve for a system which
@@ -141,7 +141,7 @@ def roc_for_far(negatives, positives, far_values = log_values(), **kwargs):
from matplotlib import pyplot
from . import roc_for_far as calc
out = calc(negatives, positives, far_values)
return pyplot.semilogx(100.0*out[0,:], 100.0*(1-out[1,:]), **kwargs)
return pyplot.semilogx(100.0 * out[0, :], 100.0 * (1 - out[1, :]), **kwargs)
def precision_recall_curve(negatives, positives, npoints=100, **kwargs):
@@ -189,11 +189,11 @@ def precision_recall_curve(negatives, positives, npoints=100, **kwargs):
from matplotlib import pyplot
from . import precision_recall_curve as calc
out = calc(negatives, positives, npoints)
return pyplot.plot(100.0*out[0,:], 100.0*out[1,:], **kwargs)
return pyplot.plot(100.0 * out[0, :], 100.0 * out[1, :], **kwargs)
def epc(dev_negatives, dev_positives, test_negatives, test_positives,
npoints=100, **kwargs):
npoints=100, **kwargs):
"""Plots Expected Performance Curve (EPC) as defined in the paper:
Bengio, S., Keller, M., Mariéthoz, J. (2004). The Expected Performance Curve.
@@ -255,8 +255,8 @@ def epc(dev_negatives, dev_positives, test_negatives, test_positives,
from . import epc as calc
out = calc(dev_negatives, dev_positives, test_negatives, test_positives,
npoints)
return pyplot.plot(out[0,:], 100.0*out[1,:], **kwargs)
npoints)
return pyplot.plot(out[0, :], 100.0 * out[1, :], **kwargs)
def det(negatives, positives, npoints=100, axisfontsize='x-small', **kwargs):
@@ -354,7 +354,7 @@ def det(negatives, positives, npoints=100, axisfontsize='x-small', **kwargs):
"0.995", "0.998", "0.999",
"0.9995", "0.9998", "0.9999",
"0.99995", "0.99998", "0.99999"
]
]
desiredLabels = [
"0.001", "0.002", "0.005",
@@ -366,7 +366,7 @@ def det(negatives, positives, npoints=100, axisfontsize='x-small', **kwargs):
"99.5", "99.8", "99.9",
"99.95", "99.98", "99.99",
"99.995", "99.998", "99.999"
]
]
# this will actually do the plotting
from matplotlib import pyplot
@@ -374,11 +374,11 @@ def det(negatives, positives, npoints=100, axisfontsize='x-small', **kwargs):
from . import ppndf
out = calc(negatives, positives, npoints)
retval = pyplot.plot(out[0,:], out[1,:], **kwargs)
retval = pyplot.plot(out[0, :], out[1, :], **kwargs)
# now the trick: we must plot the tick marks by hand using the PPNDF method
pticks = [ppndf(float(v)) for v in desiredTicks]
ax = pyplot.gca() #and finally we set our own tick marks
ax = pyplot.gca() # and finally we set our own tick marks
ax.set_xticks(pticks)
ax.set_xticklabels(desiredLabels, size=axisfontsize)
ax.set_yticks(pticks)
@@ -421,9 +421,10 @@ def det_axis(v, **kwargs):
# treat input
try:
tv = list(v) #normal input
if len(tv) != 4: raise IndexError
tv = [ppndf(float(k)/100) for k in tv]
tv = list(v) # normal input
if len(tv) != 4:
raise IndexError
tv = [ppndf(float(k) / 100) for k in tv]
cur = pyplot.axis()
# limits must be within bounds
@@ -446,7 +447,7 @@ def det_axis(v, **kwargs):
return pyplot.axis(tv, **kwargs)
def cmc(cmc_scores, logx = True, **kwargs):
def cmc(cmc_scores, logx=True, **kwargs):
"""Plots the (cumulative) match characteristics and returns the maximum rank.
This function plots a CMC curve using the given CMC scores, which can be read
@@ -483,15 +484,14 @@ def cmc(cmc_scores, logx = True, **kwargs):
out = calc(cmc_scores)
if logx:
pyplot.semilogx(range(1, len(out)+1), out * 100, **kwargs)
pyplot.semilogx(range(1, len(out) + 1), out * 100, **kwargs)
else:
pyplot.plot(range(1, len(out)+1), out * 100, **kwargs)
pyplot.plot(range(1, len(out) + 1), out * 100, **kwargs)
return len(out)
def detection_identification_curve(cmc_scores, far_values = log_values(), rank
= 1, logx = True, **kwargs):
def detection_identification_curve(cmc_scores, far_values=log_values(), rank=1, logx=True, **kwargs):
"""Plots the Detection & Identification curve over the FAR
This curve is designed to be used in an open set identification protocol, and
@@ -539,16 +539,21 @@ def detection_identification_curve(cmc_scores, far_values = log_values(), rank
from matplotlib import pyplot
from . import far_threshold, detection_identification_rate
# for each probe for which no positives exist, get the highest negative score; and sort them to compute the FAR thresholds
negatives = sorted(max(neg) for neg,pos in cmc_scores if (pos is None or not numpy.array(pos).size) and neg is not None)
# for each probe for which no positives exist, get the highest negative
# score; and sort them to compute the FAR thresholds
negatives = sorted(max(neg) for neg, pos in cmc_scores if (
pos is None or not numpy.array(pos).size) and neg is not None)
if not negatives:
raise ValueError("There need to be at least one pair with only negative scores")
raise ValueError(
"There needs to be at least one pair with only negative scores")
# compute thresholds based on FAR values
thresholds = [far_threshold(negatives, [], v, True) for v in far_values]
# compute detection and identification rate based on the thresholds for the given rank
rates = [100.*detection_identification_rate(cmc_scores, t, rank) for t in thresholds]
# compute detection and identification rate based on the thresholds for
# the given rank
rates = [
100. * detection_identification_rate(cmc_scores, t, rank) for t in thresholds]
# plot curve
if logx:
......
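As in the other files, the plotting helpers only receive whitespace and line-wrapping fixes. A short sketch of driving one of them, assuming the module shown here is exposed as bob.measure.plot:

    import numpy
    from matplotlib import pyplot
    from bob.measure import plot

    neg = numpy.random.normal(-1.0, 1.0, 500)
    pos = numpy.random.normal(+1.0, 1.0, 500)

    # ROC with the correct acceptance rate over a log-scaled FAR axis
    plot.roc(neg, pos, npoints=100, CAR=True, label='system A')
    pyplot.xlabel('FAR (%)')
    pyplot.ylabel('CAR (%)')
    pyplot.legend()
    pyplot.savefig('roc.png')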
@@ -33,11 +33,10 @@ Examples:
import os
import sys
from .eval_threshold import apthres
import bob.core
logger = bob.core.log.setup("bob.measure")
from .eval_threshold import apthres
def main(user_input=None):
@@ -51,14 +50,12 @@ def main(user_input=None):
completions = dict(
prog=os.path.basename(sys.argv[0]),
version=pkg_resources.require('bob.measure')[0].version
)
version=pkg_resources.require('bob.measure')[0].version)
args = docopt.docopt(
__doc__ % completions,
argv=argv,
version=completions['version'],
)
version=completions['version'],)
# Sets-up logging
verbosity = int(args['--verbose'])
@@ -67,9 +64,9 @@ def main(user_input=None):
# handles threshold validation
try:
args['<threshold>'] = float(args['<threshold>'])
except:
raise docopt.DocoptExit("cannot convert %s into float for threshold" % \
args['<threshold>'])
except Exception:
raise docopt.DocoptExit("cannot convert %s into float for threshold" %
args['<threshold>'])
from ..load import load_score, get_negatives_positives
neg, pos = get_negatives_positives(load_score(args['<scores>']))
......
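The script hunks above only tighten the exception handling (catching Exception instead of a bare except) and re-wrap arguments; the threshold is still parsed as a float and then applied to the scores loaded via load_score and get_negatives_positives. Applying such a fixed threshold essentially amounts to the following sketch, which uses bob.measure.farfrr (not part of this diff) to report the error rates; the score file name and threshold value are placeholders:

    import bob.measure
    from bob.measure.load import load_score, get_negatives_positives

    neg, pos = get_negatives_positives(load_score('scores-dev.txt'))
    threshold = 0.0  # the value passed as <threshold> on the command line
    far, frr = bob.measure.farfrr(neg, pos, threshold)
    print("FAR: %.2f%%  FRR: %.2f%%" % (100. * far, 100. * frr))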