Skip to content
Snippets Groups Projects

Merge 'fix-gen' (already applied on 'master') onto 'dask-pipelines'

Merged Yannick DAYER requested to merge fix-gen into dask-pipelines
2 files
+ 662
73
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 275
73
@@ -3,21 +3,17 @@
import os
import logging
import numpy
import random
import click
from click.types import FLOAT
from bob.extension.scripts.click_helper import verbosity_option
import bob.core
from bob.io.base import create_directories_safe
from bob.measure.script import common_options
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Fixed number of negative / positive scores.
# NOTE(review): gen_score_distr also accepts n_neg / n_pos parameters with the
# same default of 5000; check whether these constants are still needed.
NUM_NEG = 5000
NUM_POS = 5000
def gen_score_distr(
    mean_neg, mean_pos, sigma_neg=10, sigma_pos=10, n_neg=5000, n_pos=5000, seed=0
):
    """Generate scores from normal distributions

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        STDev for negative scores
    sigma_pos : float
        STDev for positive scores
    n_pos: int
        The number of positive scores generated
    n_neg: int
        The number of negative scores generated
    seed: int
        A value to initialize the Random Number generator. Giving the same
        value (or not specifying 'seed') on two different calls will generate
        the same lists of scores.

    Returns
    -------
    neg_scores : :any:`numpy.ndarray`
        Negative scores (1-D array of length ``n_neg``)
    pos_scores : :any:`numpy.ndarray`
        Positive scores (1-D array of length ``n_pos``)
    """
    logger.debug("Initializing RNG.")
    # A private generator seeded with `seed` yields the same draws as seeding
    # the global numpy RNG would, but leaves the process-wide random state
    # untouched for any other code relying on it.
    rng = numpy.random.RandomState(seed)

    logger.info("Generating %s negative and %s positive scores.", n_neg, n_pos)

    # Negatives are drawn first, then positives: the order matters for
    # reproducibility since both come from the same generator.
    neg_scores = rng.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = rng.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores
def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    five_col=False,
):
    """Writes score distributions

    Subjects are named ``x0 .. x<n_subjects-1>``, unknown subjects
    ``u0 .. u<n_unknown_subjects-1>`` and probes ``<subject>_<k>``. Labels are
    assigned deterministically from the position of each score, cycling
    through probes first, then probe subjects, then references.

    Parameters
    ----------
    neg : list or :any:`numpy.ndarray`
        Scores for negative samples.
    pos : list or :any:`numpy.ndarray`
        Scores for positive samples.
    filename : str
        The path to write the score to.
    n_subjects: int
        Number of different subjects
    n_probes_per_subject: int
        Number of different samples used as probe for each subject
    n_unknown_subjects: int
        The number of unknown (no registered model) subjects
    neg_unknown: None or list
        The scores of unknown subjects
    five_col : bool
        If 5-column format, else 4-column

    Raises
    ------
    ValueError
        If the subject / probe counts cannot label the given scores (fewer
        than one subject or probe, fewer than two subjects while negative
        scores are given, or no unknown subject while unknown scores are
        given).
    """
    has_unknown = neg_unknown is not None and len(neg_unknown) > 0

    # Validate before touching the file system: the index arithmetic below
    # would otherwise fail with a bare ZeroDivisionError after the output
    # file has already been truncated.
    if len(pos) or len(neg) or has_unknown:
        if n_subjects < 1:
            raise ValueError("n_subjects must be at least 1 to write scores")
        if n_probes_per_subject < 1:
            raise ValueError("n_probes_per_subject must be at least 1 to write scores")
    if len(neg) and n_subjects < 2:
        raise ValueError("negative scores require at least 2 subjects")
    if has_unknown and n_unknown_subjects < 1:
        raise ValueError("unknown scores require at least 1 unknown subject")

    logger.debug(f"Creating result directories ('{filename}').")
    create_directories_safe(os.path.dirname(filename))

    s_subjects = ["x%d" % i for i in range(n_subjects)]

    def model_column(ref):
        # Separator between the reference and the probe subject: a single
        # space in 4-column files, the extra " d<ref> " column in 5-column.
        return " d" + ref + " " if five_col else " "

    logger.debug("Writing scores to files.")
    with open(filename, "wt") as f:
        # Generate one line per probe (unless "--force-count" specified)
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            ref = s_subjects[(i // n_probes_per_subject) % n_subjects]
            probe_id = "%s_%d" % (ref, i % n_probes_per_subject)
            f.write(
                "%s%s%s %s %f\n" % (ref, model_column(ref), ref, probe_id, score)
            )

        # Generate one line per probe against each ref (unless "--force-count"
        # specified)
        logger.debug("Writing negative scores.")
        n_impostors = n_subjects - 1
        for i, score in enumerate(neg):
            probe_block = i // n_probes_per_subject
            ref_idx = (probe_block // n_impostors) % n_subjects
            # Pick the k-th subject among those that are not the reference:
            # skipping over the reference's own index is the same as indexing
            # into the subject list with the reference removed.
            k = probe_block % n_impostors
            probe_idx = k if k < ref_idx else k + 1
            ref = s_subjects[ref_idx]
            probe = s_subjects[probe_idx]
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            f.write(
                "%s%s%s %s %f\n" % (ref, model_column(ref), probe, probe_id, score)
            )

        if neg_unknown is not None:
            logger.debug("Writing unknown scores.")
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                probe_block = i // n_probes_per_subject
                ref = s_subjects[(probe_block // n_unknown_subjects) % n_subjects]
                probe = s_unknown_subjects[probe_block % n_unknown_subjects]
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                f.write(
                    "%s%s%s %s %f\n"
                    % (ref, model_column(ref), probe, probe_id, score)
                )
@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev' and 'scores-eval' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval} -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    # The value is used as the `scale` of the normal distribution, i.e. a
    # standard deviation and not a variance.
    help="Standard deviation for the positive score distributions",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the negative score distributions",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for openset plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option("--five-col/--four-col", default=False, show_default=True)
@verbosity_option()
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in 4col or 5col format. The scores are generated
    using Gaussian distribution whose mean and standard deviation are an input
    parameter. The generated scores can be used as hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos
    and --n-neg options are used as number of genuine and impostor
    comparisons (and --n-unk as number of unknown-subject comparisons when
    -u is given).
    """
    # NOTE: the docstring above is shown verbatim by `--help`; keep it
    # user-facing. Extra keyword arguments (e.g. from the verbosity option)
    # are accepted through **kwargs and not used here.

    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        pos_count = n_probes_per_subject * n_subjects
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data; dev and eval use different seeds so that the two
    # files do not contain the same scores.
    logger.info("Generating dev scores.")
    neg_dev, pos_dev = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=0,
    )
    logger.info("Generating eval scores.")
    neg_eval, pos_eval = gen_score_distr(
        mean_non_match,
        mean_match,
        sigma_negative,
        sigma_positive,
        n_neg=neg_count,
        n_pos=pos_count,
        seed=1,
    )

    # For simplicity the same unknown-subject scores are written to both the
    # dev and the eval file. They are only generated when unknown subjects
    # are requested, since their labels are derived from that count.
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files
    logger.info("Saving results.")
    write_scores_to_file(
        neg_dev,
        pos_dev,
        os.path.join(outdir, "scores-dev"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        five_col,
    )
    write_scores_to_file(
        neg_eval,
        pos_eval,
        os.path.join(outdir, "scores-eval"),
        n_subjects,
        n_probes_per_subject,
        n_unknown_subjects,
        neg_unknown,
        five_col,
    )
Loading