"""Generate random scores.
"""
import csv
import logging
import os

import click
import numpy

from bob.extension.scripts.click_helper import verbosity_option
from bob.io.base import create_directories_safe

logger = logging.getLogger(__name__)


def gen_score_distr(
    mean_neg,
    mean_pos,
    sigma_neg=10,
    sigma_pos=10,
    n_neg=5000,
    n_pos=5000,
    seed=0,
):
    """Generate scores from normal distributions

    Parameters
    ----------
    mean_neg : float
        Mean for negative scores
    mean_pos : float
        Mean for positive scores
    sigma_neg : float
        STDev for negative scores
    sigma_pos : float
        STDev for positive scores
    n_pos: int
        The number of positive scores generated
    n_neg: int
        The number of negative scores generated
    seed: int
        A value to initialize the Random Number generator. Giving the same
        value (or not specifying 'seed') on two different calls will generate
        the same lists of scores.

    Returns
    -------
    neg_scores : :py:class:`numpy.ndarray`
        Negatives scores, 1D array of ``n_neg`` elements
    pos_scores : :py:class:`numpy.ndarray`
        Positive scores, 1D array of ``n_pos`` elements
    """

    logger.debug("Initializing RNG.")
    # Use a private generator instead of ``numpy.random.seed``: it produces the
    # same legacy stream for a given seed, but does not reset the process-wide
    # RNG state that unrelated code may be relying on.
    rng = numpy.random.RandomState(seed)

    logger.info("Generating %s negative and %s positive scores.", n_neg, n_pos)

    # Negatives are drawn first, then positives, from the same stream.
    neg_scores = rng.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
    pos_scores = rng.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)

    return neg_scores, pos_scores

def write_scores_to_file(
    neg,
    pos,
    filename,
    n_subjects=5,
    n_probes_per_subject=5,
    n_unknown_subjects=0,
    neg_unknown=None,
    to_csv=True,
    five_col=False,
    metadata=None,
):
    """Writes score distributions

    Parameters
    ----------
    neg : :py:class:`numpy.ndarray`
        Scores for negative samples.
    pos : :py:class:`numpy.ndarray`
        Scores for positive samples.
    filename : str
        The path to write the score to.
    n_subjects: int
        Number of different subjects
    n_probes_per_subject: int
        Number of different samples used as probe for each subject
    n_unknown_subjects: int
        The number of unknown (no registered model) subjects
    neg_unknown: None or list
        The scores of the unknown subjects; nothing is written for them when
        ``None``.
    to_csv: bool
        Use the CSV format, else the legacy 4 or 5 columns format.
    five_col : bool
        If 5-colum format, else 4-column
    metadata : None or dict
        Extra columns appended to every CSV row (keys go in the header, values
        are repeated on each line). Only used when ``to_csv`` is true. When
        ``None``, ``{"meta0": "data0", "meta1": "data1"}`` is used.
    """
    if metadata is None:
        metadata = {"meta0": "data0", "meta1": "data1"}
    meta_values = list(metadata.values())

    logger.debug(f"Creating result directories ('{filename}').")
    create_directories_safe(os.path.dirname(filename))
    s_subjects = ["x%d" % i for i in range(n_subjects)]
    n_impostors = n_subjects - 1

    logger.debug("Writing scores to files.")

    # The csv module expects files opened with newline="" so that its own line
    # terminator is not translated a second time; the legacy writer keeps the
    # default text-mode newline handling.
    with open(filename, "wt", newline="" if to_csv else None) as f:
        if to_csv:
            csv_writer = csv.writer(f)
            csv_writer.writerow(
                ["bio_ref_subject_id", "probe_subject_id", "key", "score"]
                + list(metadata.keys())
            )

            def write_row(ref, probe, probe_id, score):
                csv_writer.writerow([ref, probe, probe_id, score] + meta_values)

        else:

            def write_row(ref, probe, probe_id, score):
                # The optional 5th column (model label "d<ref>") sits between
                # the reference and the probe subject; it must be surrounded
                # by spaces so the columns stay separated.
                model = " d" + ref + " " if five_col else " "
                f.write(
                    "%s%s%s %s %f\n" % (ref, model, probe, probe_id, score)
                )

        # Generate one line per probe (unless "--force-count" specified)
        logger.debug("Writing positive scores.")
        for i, score in enumerate(pos):
            s_name = s_subjects[(i // n_probes_per_subject) % n_subjects]
            probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
            write_row(s_name, s_name, probe_id, score)

        # Generate one line per probe against each ref (unless "--force-count" specified)
        logger.debug("Writing negative scores.")
        for i, score in enumerate(neg):
            ref_idx = (i // (n_probes_per_subject * n_impostors)) % n_subjects
            # Index into "all subjects except the reference": positions at or
            # after the reference are shifted by one to skip it (ignore pos).
            impostor_idx = (i // n_probes_per_subject) % n_impostors
            if impostor_idx >= ref_idx:
                impostor_idx += 1
            ref = s_subjects[ref_idx]
            probe = s_subjects[impostor_idx]
            probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
            write_row(ref, probe, probe_id, score)

        logger.debug("Writing unknown scores.")
        if neg_unknown is not None:
            s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
            for i, score in enumerate(neg_unknown):
                ref = s_subjects[
                    (i // (n_probes_per_subject * n_unknown_subjects))
                    % n_subjects
                ]
                probe = s_unknown_subjects[
                    (i // n_probes_per_subject) % n_unknown_subjects
                ]
                probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
                write_row(ref, probe, probe_id, score)


@click.command(
    epilog="""
Scores generation examples:

Output 'scores-dev.csv' and 'scores-eval.csv' in a new folder 'generated_scores/':

  $ bob bio gen ./generated_scores

Output scores similar to a system evaluated on the AT&T dataset dev group:

  $ bob bio gen -s 20 -p 5 ./generated_scores

Output a given number of scores in each file:

  $ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores

Include unknown subjects scores:

  $ bob bio gen -s 5 -u 2 ./generated_scores

Change the mean and standard deviation of the scores distributions:

  $ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores

You can observe the distributions histograms in a pdf file with:

  $ bob bio hist -e ./generated_scores/scores-{dev,eval}.csv -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
    "-mm",
    "--mean-match",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the positive scores distribution",
)
@click.option(
    "-mnm",
    "--mean-non-match",
    default=-10,
    type=click.FLOAT,
    show_default=True,
    help="Mean for the negative scores distribution",
)
@click.option(
    "-p",
    "--n-probes-per-subject",
    default=5,
    type=click.INT,
    show_default=True,
    help="Number of probes per subject",
)
@click.option(
    "-s",
    "--n-subjects",
    default=50,
    type=click.INT,
    show_default=True,
    help="Number of subjects",
)
@click.option(
    "-sp",
    "--sigma-positive",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the positive scores distribution",
)
@click.option(
    "-sn",
    "--sigma-negative",
    default=10,
    type=click.FLOAT,
    show_default=True,
    help="Standard deviation for the negative scores distribution",
)
@click.option(
    "-u",
    "--n-unknown-subjects",
    default=0,
    type=click.INT,
    show_default=True,
    help="Number of unknown subjects (useful for open-set plots)",
)
@click.option(
    "-f",
    "--force-count",
    "force_count",
    is_flag=True,
    help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
    "--n-pos",
    "n_pos",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
    "--n-neg",
    "n_neg",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
    "--n-unk",
    "n_unk",
    default=5000,
    type=click.INT,
    show_default=True,
    help="Number of Unknown verifications (number of lines in the file)",
)
@click.option(
    "--csv/--legacy",
    default=True,
    show_default=True,
    help="Write the CSV format, or the legacy 4/5-column format (the output "
    "files are named 'scores-{dev,eval}.csv' in both cases)",
)
@click.option(
    "--five-col/--four-col",
    default=False,
    show_default=True,
    help="Number of columns of the legacy format (only used with --legacy)",
)
@verbosity_option()
def gen(
    outdir,
    mean_match,
    mean_non_match,
    n_probes_per_subject,
    n_subjects,
    sigma_positive,
    sigma_negative,
    n_unknown_subjects,
    csv,
    five_col,
    force_count,
    n_pos,
    n_neg,
    n_unk,
    **kwargs,
):
    """Generate random scores.

    Generates random scores in CSV (default) or legacy 4col/5col format. The
    scores are generated using Gaussian distributions whose mean and standard
    deviation are input parameters. The generated scores can be used as
    hypothetical datasets.

    This command generates scores relative to the number of subjects and
    probes per subjects, unless the -f flag is set. In that case, the --n-pos
    and --n-neg options are used as number of genuine and impostor
    comparisons.
    """

    # Compute the number of verifications needed
    if force_count:
        neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
    else:
        # One reference (model), and `n_probes_per_subject` probes per subject
        neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
        pos_count = n_probes_per_subject * n_subjects
        unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject

    # Generate the data: same distributions for both groups, but a distinct
    # seed per group so that the dev and eval files differ.
    scores = {}
    for group, seed in (("dev", 0), ("eval", 1)):
        logger.info(f"Generating {group} scores.")
        scores[group] = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=neg_count,
            n_pos=pos_count,
            seed=seed,
        )

    # For simplicity I will use the same distribution for dev-eval
    if n_unknown_subjects:
        logger.info("Generating unknown scores.")
        # Only the negative half is needed: unknown probes never match.
        neg_unknown, _ = gen_score_distr(
            mean_non_match,
            mean_match,
            sigma_negative,
            sigma_positive,
            n_neg=unknown_count,
            n_pos=0,
            seed=2,
        )
    else:
        neg_unknown = None

    # Write the data into files
    logger.info("Saving results.")
    for group, (neg, pos) in scores.items():
        write_scores_to_file(
            neg,
            pos,
            os.path.join(outdir, f"scores-{group}.csv"),
            n_subjects,
            n_probes_per_subject,
            n_unknown_subjects,
            neg_unknown,
            to_csv=csv,
            five_col=five_col,
        )