Skip to content
Snippets Groups Projects
Commit 5814a2de authored by Amir MOHAMMADI's avatar Amir MOHAMMADI
Browse files

Merge branch 'fix-gen' into 'master'

Fixing and adding features to the scores generation script

See merge request !198
parents 213d6069 3c096a7b
No related branches found
No related tags found
1 merge request!198Fixing and adding features to the scores generation script
Pipeline #42833 passed
......@@ -3,21 +3,17 @@
import os
import logging
import numpy
import random
import click
from click.types import FLOAT
from bob.extension.scripts.click_helper import verbosity_option
import bob.core
from bob.io.base import create_directories_safe
from bob.measure.script import common_options
logger = logging.getLogger(__name__)
NUM_NEG = 5000
NUM_POS = 5000
def gen_score_distr(
    mean_neg, mean_pos, sigma_neg=10, sigma_pos=10, n_neg=5000, n_pos=5000, seed=0
):
    """Generate scores from two normal distributions.

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        STDev for negative scores
    sigma_pos : float
        STDev for positive scores
    n_neg: int
        The number of negative scores generated
    n_pos: int
        The number of positive scores generated
    seed: int
        A value to initialize the Random Number generator. Giving the same
        value (or not specifying 'seed') on two different calls will generate
        the same lists of scores.

    Returns
    -------
    neg_scores : numpy.ndarray
        Negative scores
    pos_scores : numpy.ndarray
        Positive scores
    """
    logger = logging.getLogger(__name__)

    # Seeding makes the output reproducible: same seed => same scores.
    logger.debug("Initializing RNG.")
    numpy.random.seed(seed)

    logger.info(f"Generating {n_neg} negative and {n_pos} positive scores.")

    # Negative scores are drawn first, so the per-seed sequence is stable.
    neg_scores = numpy.random.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = numpy.random.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores
def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    five_col=False,
):
    """Writes score distributions to a file in 4 or 5 column format.

    Parameters
    ----------
    neg : array_like
        Scores for negative samples.
    pos : array_like
        Scores for positive samples.
    filename : str
        The path to write the scores to.
    n_subjects: int
        Number of different subjects
    n_probes_per_subject: int
        Number of different samples used as probe for each subject
    n_unknown_subjects: int
        The number of unknown (no registered model) subjects
    neg_unknown: None or list
        The scores of the unknown subjects
    five_col : bool
        If 5-column format, else 4-column
    """
    logger = logging.getLogger(__name__)

    logger.debug(f"Creating result directories ('{filename}').")
    dirname = os.path.dirname(filename)
    if dirname:  # a bare filename has no directory component to create
        os.makedirs(dirname, exist_ok=True)
    s_subjects = ["x%d" % i for i in range(n_subjects)]

    logger.debug("Writing scores to files.")
    with open(filename, "wt") as f:
        # One line per probe of each subject against their own model.
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            s_name = s_subjects[i // n_probes_per_subject % n_subjects]
            s_five = " " if not five_col else " d" + s_name + " "
            probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
            f.write("%s%s%s %s %f\n" % (s_name, s_five, s_name, probe_id, score))

        # One line per probe against each non-matching reference model.
        logger.debug("Writing negative scores.")
        for i, score in enumerate(neg):
            n_impostors = n_subjects - 1
            ref = s_subjects[i // n_probes_per_subject // n_impostors % n_subjects]
            impostors = [s for s in s_subjects if s != ref]  # ignore pos
            probe = impostors[i // n_probes_per_subject % n_impostors]
            # BUGFIX: the trailing space was missing here, gluing the 5-col
            # client id to the probe subject id on negative lines.
            s_five = " " if not five_col else " d" + ref + " "
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            f.write("%s%s%s %s %f\n" % (ref, s_five, probe, probe_id, score))

        # Unknown (open-set) probes score against every registered model.
        logger.debug("Writing unknown scores.")
        if neg_unknown is not None:
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                ref = s_subjects[
                    i // n_probes_per_subject // n_unknown_subjects % n_subjects
                ]
                probe = s_unknown_subjects[
                    i // n_probes_per_subject % n_unknown_subjects
                ]
                s_five = " " if not five_col else " d" + ref + " "
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                f.write("%s%s%s %s %f\n" % (ref, s_five, probe, probe_id, score))
@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev' and 'scores-eval' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval} -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Variance for the positive score distributions",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Variance for the negative score distributions",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for openset plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option("--five-col/--four-col", default=False, show_default=True)
@verbosity_option()
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in 4col or 5col format. The scores are generated
    using Gaussian distribution whose mean and variance are an input
    parameter. The generated scores can be used as hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos
    and --n-neg options are used as number of genuine and impostor
    comparisons.
    """
    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        pos_count = n_probes_per_subject * n_subjects
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data with distinct seeds so dev and eval scores differ but
    # remain reproducible across runs.
    logger.info("Generating dev scores.")
    neg_dev, pos_dev = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=0,
    )
    logger.info("Generating eval scores.")
    neg_eval, pos_eval = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=1,
    )
    # For simplicity I will use the same distribution for dev-eval
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files
    logger.info("Saving results.")
    write_scores_to_file(
        neg_dev,
        pos_dev,
        os.path.join(outdir, "scores-dev"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        five_col,
    )
    write_scores_to_file(
        neg_eval,
        pos_eval,
        os.path.join(outdir, "scores-eval"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        five_col,
    )
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Yannick Dayer <yannick.dayer@idiap.ch>
# Mon 14 Sep 2020 17:00:41 UTC+02
"""Tests for the bob.bio.base.script.gen module
The gen module generates synthetic scores and saves them to a file for
demonstration and test purpose.
"""
import os
import numpy
from click.testing import CliRunner
from bob.extension.scripts.click_helper import assert_click_runner_result
from bob.bio.base.script.gen import gen, gen_score_distr
import logging
logger = logging.getLogger(__name__)
logger.setLevel(
"DEBUG"
) # If NOTSET (default), will be changed to ERROR at CliRunner.invoke
def _count_lines(path):
    # Return the number of lines in a text file.
    with open(path) as f:
        return sum(1 for _ in f)


def _assert_score_files(temp_path, expected_lines):
    # Check that both score files exist and hold the expected number of lines.
    for group in ("dev", "eval"):
        score_file = os.path.join(temp_path, f"scores-{group}")
        assert os.path.exists(score_file), f"{group} scores file not created."
        assert _count_lines(score_file) == expected_lines


def test_gen():
    """
    Tests that the main gen command works as expected
    """
    # Define a click runner to invoke click commands
    runner = CliRunner()
    with runner.isolated_filesystem():
        temp_path = "./gen_test_temp_dir/"
        common_args = ["-mm", "10", "-mnm", "-10", "-sp", "1", "-sn", "1"]

        # First case: force the exact number of scores with -f/--force-count.
        n_subjects = 5
        n_probes_per_subject = 5
        n_unknown_subjects = 2
        n_pos, n_neg, n_unk = 10, 60, 20
        logger.info("Calling 'gen' with a specific amount of scores.")
        result = runner.invoke(
            gen,
            args=common_args
            + [
                "-p",
                f"{n_probes_per_subject}",
                "-s",
                f"{n_subjects}",
                "-u",
                f"{n_unknown_subjects}",
                "-f",
                "--n-pos",
                f"{n_pos}",
                "--n-neg",
                f"{n_neg}",
                "--n-unk",
                f"{n_unk}",
                f"{temp_path}",
            ],
        )
        assert_click_runner_result(result)
        _assert_score_files(temp_path, n_pos + n_neg + n_unk)

        # Remaining cases: the counts are derived from the subject/probe
        # parameters: (description, n_subjects, n_probes, n_unknown).
        cases = [
            ("without a specific amount", 5, 5, 2),
            ("without unknown subjects", 5, 2, 0),
            ("with no subjects", 0, 2, 0),
            ("with no probes", 5, 0, 2),
            ("with only unknowns", 5, 0, 2),
        ]
        for label, n_subjects, n_probes_per_subject, n_unknown_subjects in cases:
            n_pos = n_subjects * n_probes_per_subject
            n_neg = n_subjects * (n_subjects - 1) * n_probes_per_subject
            n_unk = n_unknown_subjects * n_subjects * n_probes_per_subject
            logger.info(f"Calling 'gen' {label}.")
            result = runner.invoke(
                gen,
                args=common_args
                + [
                    "-p",
                    f"{n_probes_per_subject}",
                    "-s",
                    f"{n_subjects}",
                    "-u",
                    f"{n_unknown_subjects}",
                    f"{temp_path}",
                ],
            )
            assert_click_runner_result(result)
            _assert_score_files(temp_path, n_pos + n_neg + n_unk)
def test_gen_score_dist():
    """
    Tests that the scores generation works as expected
    """
    neg, pos = gen_score_distr(
        mean_neg=-10, mean_pos=10, sigma_neg=1, sigma_pos=1, n_neg=20, n_pos=20, seed=0
    )

    # Reference values for seed 0 with the parameters above.
    expected_neg = numpy.array(
        [
            -8.23594765,
            -9.59984279,
            -9.02126202,
            -7.7591068,
            -8.13244201,
            -10.97727788,
            -9.04991158,
            -10.15135721,
            -10.10321885,
            -9.5894015,
            -9.85595643,
            -8.54572649,
            -9.23896227,
            -9.87832498,
            -9.55613677,
            -9.66632567,
            -8.50592093,
            -10.20515826,
            -9.6869323,
            -10.85409574,
        ]
    )
    expected_pos = numpy.array(
        [
            7.44701018,
            10.6536186,
            10.8644362,
            9.25783498,
            12.26975462,
            8.54563433,
            10.04575852,
            9.81281615,
            11.53277921,
            11.46935877,
            10.15494743,
            10.37816252,
            9.11221425,
            8.01920353,
            9.65208785,
            10.15634897,
            11.23029068,
            11.20237985,
            9.61267318,
            9.69769725,
        ]
    )

    # Same checks for both score lists: count, element type, exact values.
    for scores, label, expected in (
        (neg, "negative", expected_neg),
        (pos, "positive", expected_pos),
    ):
        assert (
            len(scores) == 20
        ), f"Incorrect number of {label} scores generated ({len(scores)})"
        assert all(
            isinstance(s, (numpy.floating, float)) for s in scores
        ), "A score was not a float"
        assert numpy.allclose(scores, expected), "Unexpected score generated"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment