Commit fd183226 authored by Amir MOHAMMADI

New handling of scores

parent d605a113
@@ -21,7 +21,7 @@ class MLP(AlgorithmBob):
def __init__(self,
n_systems=2,
hidden_layers=None,
hidden_layers=(3,),
seed=None,
machine=None,
trainer=None,
@@ -29,9 +29,7 @@
super(MLP, self).__init__(
classifier=self,
*args, **kwargs)
if hidden_layers is None:
hidden_layers = [3]
self.mlp_shape = [n_systems] + hidden_layers + [1]
self.mlp_shape = [n_systems] + list(hidden_layers) + [1]
self.seed = seed
self.machine = machine
self.trainer = trainer
......
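The constructor change above swaps the None default (and its in-body fallback) for an immutable tuple, and the network shape is now built from whatever sequence is passed. A minimal sketch of the resulting shape, in plain Python with illustrative values:

# Illustration only; values are made up, not part of the commit.
n_systems = 2
hidden_layers = (3,)  # new default: one hidden layer with 3 units
mlp_shape = [n_systems] + list(hidden_layers) + [1]
# mlp_shape == [2, 3, 1]: two input scores, one hidden layer of three units, one fused output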
@@ -5,14 +5,13 @@
from __future__ import print_function, absolute_import, division
import os
import numpy as np
from bob.io.base import create_directories_safe
from bob.measure.load import load_score, get_all_scores,\
get_negatives_positives_all, dump_score
from bob.measure.load import load_score, dump_score
from bob.bio.base import utils
from ..tools import parse_arguments, write_info
from ..tools import parse_arguments, write_info, get_gza_from_lines_list, \
check_consistency, get_scores, remove_nan
import bob.core
logger = bob.core.log.setup("bob.fusion.base")
@@ -25,35 +24,68 @@ def fuse(args, command_line_parameters):
write_info(args, command_line_parameters)
# load the scores
score_lines_list_train = [load_score(path, ncolumns=args.score_type)
for path in args.train_files]
if args.dev_files:
score_lines_list_dev = [load_score(path, ncolumns=args.score_type)
for path in args.dev_files]
scores_dev = get_all_scores(score_lines_list_dev)
(neg, pos) = get_negatives_positives_all(score_lines_list_dev)
if args.eval_files:
score_lines_list_eval = [load_score(path, ncolumns=args.score_type)
for path in args.eval_files]
scores_eval = get_all_scores(score_lines_list_eval)
# genuine, zero-effort impostor, and attack score lists for the
# train, development, and evaluation data.
idx1, gen_lt, zei_lt, atk_lt = get_gza_from_lines_list(score_lines_list_train)
if args.dev_files:
_, gen_ld, zei_ld, atk_ld = get_gza_from_lines_list(score_lines_list_dev)
if args.eval_files:
_, gen_le, zei_le, atk_le = get_gza_from_lines_list(score_lines_list_eval)
# check if score lines are consistent
if not args.skip_check:
score_lines0 = score_lines_list_dev[0]
for score_lines in score_lines_list_dev[1:]:
assert(np.all(score_lines['claimed_id'] == score_lines0['claimed_id']))
assert(np.all(score_lines['real_id'] == score_lines0['real_id']))
check_consistency(gen_lt, zei_lt, atk_lt)
if args.dev_files:
check_consistency(gen_ld, zei_ld, atk_ld)
if args.eval_files:
check_consistency(gen_le, zei_le, atk_le)
scores_train = get_scores(gen_lt, zei_lt, atk_lt)
train_neg = get_scores(zei_lt, atk_lt)
train_pos = get_scores(gen_lt)
if args.dev_files:
scores_dev = get_scores(gen_ld, zei_ld, atk_ld)
dev_neg = get_scores(zei_ld, atk_ld)
dev_pos = get_scores(gen_ld)
else:
dev_neg, dev_pos = None, None
if args.eval_files:
score_lines0 = score_lines_list_eval[0]
for score_lines in score_lines_list_eval[1:]:
assert(np.all(score_lines['claimed_id'] == score_lines0['claimed_id']))
assert(np.all(score_lines['real_id'] == score_lines0['real_id']))
scores_eval = get_scores(gen_le, zei_le, atk_le)
# check for nan values
found_nan = False
found_nan, _, scores_train = remove_nan(scores_train, found_nan)
found_nan, _, train_neg = remove_nan(train_neg, found_nan)
found_nan, _, train_pos = remove_nan(train_pos, found_nan)
if args.dev_files:
found_nan, nan_dev, scores_dev = remove_nan(scores_dev, found_nan)
found_nan, _, dev_neg = remove_nan(dev_neg, found_nan)
found_nan, _, dev_pos = remove_nan(dev_pos, found_nan)
if args.eval_files:
found_nan, nan_eval, scores_eval = remove_nan(scores_eval, found_nan)
if found_nan:
logger.warn('Some nan values were removed.')
# train the preprocessors
algorithm.train_preprocessors(scores_dev)
algorithm.train_preprocessors(scores_train)
# preprocess data
train_neg, train_pos = algorithm.preprocess(train_neg), algorithm.preprocess(train_pos)
if args.dev_files:
scores_dev = algorithm.preprocess(scores_dev)
dev_neg, dev_pos = algorithm.preprocess(dev_neg), algorithm.preprocess(dev_pos)
if args.eval_files:
scores_eval = algorithm.preprocess(scores_eval)
neg, pos = algorithm.preprocess(neg), algorithm.preprocess(pos)
# train the model
if utils.check_file(args.model_file, args.force, 1000):
@@ -61,16 +93,16 @@ def fuse(args, command_line_parameters):
"- Fusion: model '%s' already exists.", args.model_file)
algorithm = algorithm.load(args.model_file)
else:
algorithm.train(neg, pos)
algorithm.train(train_neg, train_pos, dev_neg, dev_pos)
algorithm.save(args.model_file)
# fuse the scores (dev)
if utils.check_file(args.fused_dev_file, args.force, 1000):
logger.info(
"- Fusion: scores '%s' already exists.", args.fused_dev_file)
else:
elif args.dev_files:
fused_scores_dev = algorithm.fuse(scores_dev)
score_lines = score_lines_list_dev[0]
score_lines = score_lines_list_dev[idx1][~nan_dev]
score_lines['score'] = fused_scores_dev
create_directories_safe(os.path.dirname(args.fused_dev_file))
dump_score(args.fused_dev_file, score_lines)
@@ -82,7 +114,7 @@ def fuse(args, command_line_parameters):
"- Fusion: scores '%s' already exists.", args.fused_eval_file)
else:
fused_scores_eval = algorithm.fuse(scores_eval)
score_lines = score_lines_list_eval[0]
score_lines = score_lines_list_eval[idx1][~nan_eval]
score_lines['score'] = fused_scores_eval
create_directories_safe(os.path.dirname(args.fused_eval_file))
dump_score(args.fused_eval_file, score_lines)
......
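The rewritten script therefore expects per-system training score files and, optionally, development and evaluation files, plus a directory for the experiment artifacts. A hedged sketch of an invocation, mirroring the calls in the tests below; the file names are placeholders and the 'llr' algorithm resource is the one used in those tests:

# Hypothetical call; all paths are placeholders.
from bob.fusion.base.script.bob_fuse import main as bob_fuse
bob_fuse(['-t', 'scores-train-1', 'scores-train-2',   # one score file per system
          '-d', 'scores-dev-1', 'scores-dev-2',
          '-e', 'scores-eval-1', 'scores-eval-2',
          '-a', 'llr',                                 # fusion algorithm resource
          '-s', 'fusion_result'])                      # fused scores-dev/scores-eval land here by default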
#!/usr/bin/env python
"""Plot decision boundraries of the fusion algorithm.
Usage:
plot_fusion_decision_boundary.py SCORE_FILE SCORE_FILE MODEL_FILE
[-v... | --verbose...] [options]
plot_fusion_decision_boundary.py (-h | --help)
plot_fusion_decision_boundary.py (-V | --version)
Options:
-o, --output PLOT_FILE The path to save the plot. [default: scatter.pdf]
--score-type {4,5} The format the scores are provided. [default: 4]
-v, --verbose Increase the verbosity level from 0 (only error
messages) to 1 (warnings), 2 (log messages), 3 (debug
information) by adding the --verbose option as often
as desired (e.g. '-vvv' for debug). [default: 0]
-g, --group N If given, scores will be grouped into N samples.
Give -1 for no grouping. [default: -1]
--grouping {random, kmeans} The grouping algorithm used. [default: kmeans]
-h --help Show this screen.
-V, --version Show version.
"""
from docopt import docopt
"""Plot decision boundraries of the fusion algorithm."""
import argparse
import matplotlib.pyplot as plt
import numpy
import numpy as np
import bob.fusion.base
import bob.core
from bob.measure.load import load_score, get_negatives_positives,\
get_all_scores
from bob.measure import eer_threshold
from ..tools import grouping
from bob.measure.load import load_score
from ..tools import grouping, get_gza_from_lines_list, \
get_scores, remove_nan, check_consistency
logger = bob.core.log.setup("bob.fusion.base")
@@ -53,14 +30,14 @@ def plot_boundary_decision(algorithm, scores, score_labels, threshold,
'''
Plots the boundary decision of the Algorithm
@param score_labels numpy.array A (scores.shape[0]) array containing
@param score_labels np.array A (scores.shape[0]) array containing
the true labels of scores.
@param threshold float threshold of the decision boundary
'''
if legends is None:
legends = ['Impostor', 'Genuine']
markers = ['x', 'o']
legends = ['Impostor', 'Attack', 'Genuine']
markers = ['x', 'o', 's']
if scores.shape[1] > 2:
raise NotImplementedError(
@@ -74,27 +51,25 @@ def plot_boundary_decision(algorithm, scores, score_labels, threshold,
Y = score_labels
x_min, x_max = X[:, i1].min() - x_pad, X[:, i1].max() + x_pad
y_min, y_max = X[:, i2].min() - y_pad, X[:, i2].max() + y_pad
xx, yy = numpy.meshgrid(
numpy.linspace(x_min, x_max, resolution),
numpy.linspace(y_min, y_max, resolution))
temp = numpy.c_[xx.ravel(), yy.ravel()]
xx, yy = np.meshgrid(
np.linspace(x_min, x_max, resolution),
np.linspace(y_min, y_max, resolution))
temp = np.c_[xx.ravel(), yy.ravel()]
temp = algorithm.preprocess(temp)
Z = (algorithm.fuse(temp) > threshold).reshape(xx.shape)
contourf = plt.contour(xx, yy, Z, 1, alpha=1, cmap=plt.cm.viridis)
contourf = plt.contourf(xx, yy, Z, 1, alpha=1, cmap=plt.cm.viridis)
if do_grouping:
negatives, positives = X[numpy.logical_not(Y)], X[Y]
negatives, positives = grouping(negatives, positives, **kwargs)
X = numpy.concatenate((negatives, positives), axis=0)
Y = numpy.concatenate(
(numpy.zeros(negatives.shape[0], dtype=numpy.bool8),
numpy.ones(positives.shape[0], dtype=numpy.bool8)),
axis=0)
negatives, positives = X[numpy.logical_not(Y)], X[Y]
colors = plt.cm.viridis(numpy.linspace(0, 1, 2))
for i, X in enumerate((negatives, positives)):
gen = grouping(X[Y == 0, :], **kwargs)
zei = grouping(X[Y == 1, :], **kwargs)
atk = grouping(X[Y == 2, :], **kwargs)
else:
gen = X[Y == 0, :]
zei = X[Y == 1, :]
atk = X[Y == 2, :]
colors = plt.cm.viridis(np.linspace(0, 1, 3))
for i, X in enumerate((zei, atk, gen)):
plt.scatter(
X[:, 0], X[:, 1], marker=markers[i], alpha=alpha,
c=colors[i], label=legends[i])
@@ -112,38 +87,90 @@ def plot_boundary_decision(algorithm, scores, score_labels, threshold,
def main(command_line_parameters=None):
args = docopt(__doc__, argv=command_line_parameters,
version=bob.fusion.base.get_config())
bob.core.log.set_verbosity_level(logger, args['--verbose'])
# setup command line parameters
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-e', '--eval-files', nargs='+', required=True,
help='A list of score files to be plotted, usually the '
'evaluation set.')
parser.add_argument('-m', '--model-file', required=True,
help='Path to Model.pkl of a saved bob.fusion.algorithm.')
parser.add_argument('-t', '--threshold', type=float, default=0, required=True,
help='The threshold to classify scores after fusion. '
'Usually calculated from the fused development set.')
parser.add_argument('-o', '--output', default='scatter.pdf',
help='The path to save the plot.')
parser.add_argument('-g', '--group', default=0, type=int,
help='If given, scores will be grouped into N samples.')
parser.add_argument('-G', '--grouping', choices=('random', 'kmeans'),
default='kmeans',
help='The grouping algorithm to be used.')
parser.add_argument('--skip-check', action='store_true',
help='If provided, score files are not checked '
'for consistency')
parser.add_argument('--score-type', choices=(4, 5), default=None,
help='The format in which the scores are provided.')
bob.core.log.add_command_line_option(parser)
# parse command line options
args = parser.parse_args(command_line_parameters)
bob.core.log.set_verbosity_level(logger, args.verbose)
# load the algorithm
algorithm = bob.fusion.base.algorithm.Algorithm()
algorithm = algorithm.load(args['MODEL_FILE'])
algorithm = algorithm.load(args.model_file)
# load the scores
score_lines_list = [
load_score(path, int(args['--score-type'])) for
path in args['SCORE_FILE']]
scores = get_all_scores(score_lines_list)
score_lines = numpy.array(score_lines_list[0])
score_lines['score'] = algorithm.fuse(algorithm.preprocess(scores))
threshold = eer_threshold(*get_negatives_positives(score_lines))
score_labels = score_lines['claimed_id'] == score_lines['real_id']
score_lines_list_eval = [load_score(path, ncolumns=args.score_type)
for path in args.eval_files]
# genuine, zero-effort impostor, and attack lists
idx1, gen_le, zei_le, atk_le = get_gza_from_lines_list(score_lines_list_eval)
# check if score lines are consistent
if not args.skip_check:
check_consistency(gen_le, zei_le, atk_le)
# concatenate the scores and create the labels
scores = get_scores(gen_le, zei_le, atk_le)
score_labels = np.zeros((scores.shape[0],))
gensize = gen_le[0].shape[0]
zeisize = zei_le[0].shape[0]
score_labels[:gensize] = 0
score_labels[gensize: gensize + zeisize] = 1
score_labels[gensize + zeisize:] = 2
found_nan, nan_idx, scores = remove_nan(scores, False)
score_labels = score_labels[~nan_idx]
if found_nan:
logger.warn('{} nan values were removed.'.format(np.sum(nan_idx)))
# plot the decision boundary
do_grouping = True
if int(args['--group']) == -1:
if args.group < 1:
do_grouping = False
plot_boundary_decision(
algorithm, scores, score_labels, threshold,
algorithm, scores, score_labels, args.threshold,
do_grouping=do_grouping,
npoints=int(args['--group']),
npoints=args.group,
seed=0,
gformat=args['--grouping']
gformat=args.grouping
)
plt.savefig(args['--output'])
plt.savefig(args.output)
plt.close()
if __name__ == '__main__':
main()
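With docopt replaced by argparse, the plot script is driven by explicit options instead of positional arguments. A hedged example call; the paths are placeholders and the threshold would normally be computed from the fused development scores:

# Hypothetical call; all paths are placeholders.
from bob.fusion.base.script.plot_fusion_decision_boundary import main as plot_boundary
plot_boundary(['-e', 'scores-eval-1', 'scores-eval-2',
               '-m', 'fusion_result/Model.pkl',
               '-t', '0.0',            # decision threshold applied after fusion
               '-o', 'scatter.pdf',
               '-g', '500',            # group the scores into 500 samples per class
               '-G', 'kmeans'])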
@@ -8,7 +8,7 @@ from bob.fusion.base.script.bob_fuse import main as bob_fuse
from bob.fusion.base.script.plot_fusion_decision_boundary import main as plot_fusion_decision_boundary
from bob.io.base.test_utils import datafile
dev_files = [datafile("scores-dev-1", 'bob.fusion.base'),
train_files = [datafile("scores-dev-1", 'bob.fusion.base'),
datafile("scores-dev-2", 'bob.fusion.base')]
eval_files = [datafile("scores-eval-1", 'bob.fusion.base'),
datafile("scores-eval-2", 'bob.fusion.base')]
@@ -22,22 +22,22 @@ def test_scripts():
fused_eval_file = os.path.join(tmpdir, 'scores-eval')
# test normally
cmd = ['-i'] + dev_files + ['-o', fused_dev_file, '-a', 'llr']
cmd = ['-s', tmpdir, '-t'] + train_files + ['-o', fused_dev_file, '-a', 'llr']
bob_fuse(cmd)
cmd = ['-i'] + dev_files + ['-I'] + eval_files + ['-o', fused_dev_file, '-O', fused_eval_file, '-a', 'llr']
cmd = ['-s', tmpdir, '-t'] + train_files + ['-e'] + eval_files + ['-o', fused_dev_file, '-O', fused_eval_file, '-a', 'llr']
bob_fuse(cmd)
# make inconsistency
wrong_dev2 = os.path.join(tmpdir, 'scores-dev-2')
with open(wrong_dev2, 'w') as f1, open(dev_files[1]) as f2:
with open(wrong_dev2, 'w') as f1, open(train_files[1]) as f2:
lines = f2.readlines()
temp = lines[0].split()
temp = (temp[0], 'temp1_id', temp[2], temp[3])
lines[0] = ' '.join(temp) + '\n'
f1.writelines(lines)
cmd = ['-i'] + dev_files[0:1] + [wrong_dev2] + ['-o', fused_dev_file, '-a', 'llr']
cmd = ['-s', tmpdir, '-t'] + train_files[0:1] + [wrong_dev2] + ['-o', fused_dev_file, '-a', 'llr']
try:
bob_fuse(cmd)
except AssertionError:
@@ -46,13 +46,13 @@ def test_scripts():
raise Exception('An AssertionError should have been raised.')
# this should not raise an error
cmd = ['-i'] + dev_files[0:1] + [wrong_dev2] + ['-o', fused_dev_file, '-a', 'llr', '--skip-check']
cmd = ['-s', tmpdir, '-t'] + train_files[0:1] + [wrong_dev2] + ['-o', fused_dev_file, '-a', 'llr', '--skip-check']
bob_fuse(cmd)
# test plot
model_file = os.path.join(tmpdir, 'Model.pkl')
output = os.path.join(tmpdir, 'scatter.pdf')
cmd = dev_files + [model_file, '-o', output]
cmd = train_files + [model_file, '-o', output]
plot_fusion_decision_boundary(cmd)
finally:
......
from common import *
from command_line import *
from plotting import *
......
@@ -36,25 +36,31 @@ def command_line_parser(description=__doc__, exclude_resources_from=[]):
description=description,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--dev-files', required=True,
parser.add_argument('-t', '--train-files', required=True,
nargs='+', help="A list of score files of "
"the development set.")
parser.add_argument('-I', '--eval-files', nargs='+',
"the train set.")
parser.add_argument('-d', '--dev-files', nargs='+',
help="A list of score files of the development set; "
"if given it must be the same number of files "
"as the --train-files.")
parser.add_argument('-e', '--eval-files', nargs='+',
help="A list of score files of the evaluation set; "
"if given it must be the same number of files "
"as the --dev-files.")
parser.add_argument('-o', '--fused-dev-file',
required=True, help='The fused development score file.')
parser.add_argument(
'-O', '--fused-eval-file', help='The fused evaluation score file.')
parser.add_argument('--score-type', choices=[4, 5], default=4,
help='The format the scores are provided.')
"as the --train-files.")
parser.add_argument('-o', '--fused-dev-file', default=None,
help='The fused development score file. '
'Default is "scores-dev" in the --save-directory')
parser.add_argument('-O', '--fused-eval-file', default=None,
help='The fused evaluation score file. '
'Default is "scores-eval" in the --save-directory')
parser.add_argument('--score-type', choices=[4, 5], default=None,
help='The format in which the scores are provided. If not '
'given, the number of columns will be guessed.')
parser.add_argument('--skip-check', action='store_true',
help='If provided, score files are not checked '
'for consistency')
parser.add_argument('-s', '--save-directory', help='The directory to save '
'the experiment artifacts. If not given, the directory'
' of fused-dev-file will be used.')
'the experiment artifacts.', default='fusion_result')
config_group = parser.add_argument_group(
'Parameters defining the experiment', ' Most of these parameters can be a'
@@ -95,8 +101,10 @@ def initialize(parsers, command_line_parameters=None, skips=[]):
args.algorithm, 'algorithm', imports=args.imports)
# set base directories
if args.save_directory is None:
args.save_directory = os.path.dirname(args.fused_dev_file)
if args.fused_dev_file is None:
args.fused_dev_file = os.path.join(args.save_directory, 'scores-dev')
if args.fused_eval_file is None:
args.fused_eval_file = os.path.join(args.save_directory, 'scores-eval')
# result files
args.info_file = os.path.join(args.save_directory, 'Experiment.info')
......
import numpy as np
import bob.core
logger = bob.core.log.setup("bob.fusion.base")
def get_2negatives_1positive(score_lines):
gen_mask = score_lines['claimed_id'] == score_lines['real_id']
atk_mask = np.logical_or(np.char.count(score_lines['real_id'], 'spoof/') > 0,
np.char.count(score_lines['real_id'], 'attack') > 0)
zei_mask = np.logical_and(np.logical_not(gen_mask), np.logical_not(atk_mask))
gen = score_lines[gen_mask]
zei = score_lines[zei_mask]
atk = score_lines[atk_mask]
return (gen, zei, atk, gen_mask, zei_mask, atk_mask)
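A minimal sketch of how these masks split 4-column score lines into genuine, zero-effort impostor, and attack groups; the ids below are invented for illustration, and only the 'spoof/'/'attack' substring convention from the code matters. The import path is an assumption about where the helper lives after this commit:

import numpy as np
from bob.fusion.base.tools import get_2negatives_1positive  # assumed import path

dt = [('claimed_id', 'U16'), ('real_id', 'U32'), ('test_label', 'U16'), ('score', float)]
lines = np.array([
    ('client1', 'client1',        'probe-1',  1.5),   # genuine: claimed_id == real_id
    ('client1', 'client2',        'probe-2', -0.7),   # zero-effort impostor
    ('client1', 'attack/client1', 'probe-3',  0.9),   # attack: real_id contains 'attack'
], dtype=dt)
gen, zei, atk, gen_mask, zei_mask, atk_mask = get_2negatives_1positive(lines)
# gen_mask -> [True, False, False]; zei_mask -> [False, True, False]; atk_mask -> [False, False, True]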
def check_consistency(gen_l, zei_l, atk_l):
if len(gen_l) < 2:
logger.error('Check failed since fewer than two systems are available.')
for score_lines_list in (gen_l, zei_l, atk_l):
if not score_lines_list:
continue
score_lines0 = score_lines_list[0]
for score_lines in score_lines_list[1:]:
assert(np.all(score_lines['claimed_id'] == score_lines0['claimed_id']))
assert(np.all(score_lines['real_id'] == score_lines0['real_id']))
def get_scores(*args):
scores = []
for temp in zip(*args):
scores.append(np.concatenate([a['score'] for a in temp], axis=0))
return np.vstack(scores).T
def remove_nan(samples, found_nan):
ncls = samples.shape[1]
nans = np.isnan(samples[:, 0])
for i in range(1, ncls):
nans = np.logical_or(nans, np.isnan(samples[:, i]))
return np.any(nans) or found_nan, nans, samples[~nans, :]
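remove_nan drops any row of the stacked score matrix that contains a NaN in any system column and reports whether anything was removed. A small hedged sketch (the import path is an assumption):

import numpy as np
from bob.fusion.base.tools import remove_nan  # assumed import path

samples = np.array([[0.1, 0.2],
                    [np.nan, 0.3],
                    [0.4, 0.5]])
found_nan, nan_mask, cleaned = remove_nan(samples, False)
# found_nan -> True, nan_mask -> [False, True, False], cleaned keeps the first and last rows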
def get_gza_from_lines_list(score_lines_list):
gen_l, zei_l, atk_l = [], [], []
for score_lines in score_lines_list:
gen, zei, atk, _, _, _ = get_2negatives_1positive(score_lines)
gen_l.append(gen)
zei_l.append(zei)
atk_l.append(atk)
zei_lengths = []
for zei in zei_l:
zei_lengths.append(zei.size)
zei_lengths = np.array(zei_lengths)
idx1 = 0 # used later if it does not enter the if.
if not (np.all(zei_lengths == 0) or np.all(zei_lengths > 0)):
# generate the missing ones
# find one that has zei
idx1 = zei_lengths.nonzero()[0][0]
zei_full = zei_l[idx1]
for idx2 in np.where(zei_lengths == 0)[0]:
if zei_l[idx2] is None:
continue
temp = np.array(zei_full)
# make sure we replace all scores.
temp['score'] = np.nan
# get the list of ids
real_ids = np.unique(temp['real_id'])
# find the PAD score for that id and use it as the replacement
for real_id in real_ids:
# get the list of test_labels
test_labels = np.unique(temp['test_label'][temp['real_id'] == real_id])
for test_label in test_labels:
idx3 = np.logical_and(temp['real_id'] == real_id,
temp['test_label'] == test_label)
idx4 = np.logical_and(gen_l[idx2]['real_id'] == real_id,
gen_l[idx2]['test_label'] == test_label)
temp['score'][idx3] = gen_l[idx2]['score'][idx4]
zei_l[idx2] = temp
return idx1, gen_l, zei_l, atk_l
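Taken together, these helpers implement the new score handling used in bob_fuse.py above: per-system score lines are split into the three groups, padded when a system lacks zero-effort impostor scores, checked for consistency, stacked into a (samples x systems) matrix, and cleaned of NaNs. A hedged end-to-end sketch with placeholder file names (the import path is an assumption):

# Hypothetical flow; file names are placeholders.
from bob.measure.load import load_score
from bob.fusion.base.tools import (get_gza_from_lines_list, check_consistency,
                                   get_scores, remove_nan)  # assumed import path

score_lines_list = [load_score('scores-system-1'), load_score('scores-system-2')]
idx1, gen_l, zei_l, atk_l = get_gza_from_lines_list(score_lines_list)
check_consistency(gen_l, zei_l, atk_l)      # asserts claimed_id/real_id agree across systems
scores = get_scores(gen_l, zei_l, atk_l)    # rows: genuine, then zero-effort, then attack; columns: systems
found_nan, nan_mask, scores = remove_nan(scores, False)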
@@ -4,38 +4,24 @@ import numpy
import bob.learn.em
def grouping(negatives, positives,
gformat='random', npoints=500, seed=None, **kwargs):
def grouping(scores, gformat='random', npoints=500, seed=None, **kwargs):
negatives = numpy.asarray(negatives)
positives = numpy.asarray(positives)
scores = numpy.asarray(scores)
if(gformat == "kmeans"):
kmeans_negatives = bob.learn.em.KMeansMachine(npoints, 2)
kmeans_positives = bob.learn.em.KMeansMachine(npoints, 2)
kmeansTrainer = bob.learn.em.KMeansTrainer()
kmeans_machine = bob.learn.em.KMeansMachine(npoints, 2)
kmeans_trainer = bob.learn.em.KMeansTrainer()
bob.learn.em.train(
kmeansTrainer, kmeans_negatives, negatives, max_iterations=500,
kmeans_trainer, kmeans_machine, scores, max_iterations=500,
convergence_threshold=0.1)
bob.learn.em.train(
kmeansTrainer, kmeans_positives, positives, max_iterations=500,
convergence_threshold=0.1)
negatives = kmeans_negatives.means
positives = kmeans_positives.means
scores = kmeans_machine.means
elif(gformat == "random"):
if seed is not None:
numpy.random.seed(seed)
negatives_indexes = numpy.array(
numpy.random.rand(npoints) * negatives.shape[0], dtype=int)
positives_indexes = numpy.array(
numpy.random.rand(npoints) * positives.shape[0], dtype=int)
scores_indexes = numpy.array(
numpy.random.rand(npoints) * scores.shape[0], dtype=int)
negatives = negatives[negatives_indexes]
positives = positives[positives_indexes]
scores = scores[scores_indexes]
return negatives, positives
return scores
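grouping() now reduces a single score array, rather than separate negative/positive sets, to at most npoints representative points, either k-means centroids or a random subsample; the plot script above calls it once per class. A hedged usage sketch (the import path is an assumption):

import numpy
from bob.fusion.base.tools import grouping  # assumed import path

scores = numpy.random.rand(10000, 2)        # e.g. all zero-effort impostor scores of two systems
centroids = grouping(scores, gformat='kmeans', npoints=500)          # 500 k-means centroids
subsample = grouping(scores, gformat='random', npoints=500, seed=0)  # 500 randomly picked rows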