diff --git a/src/bob/bio/base/score/load.py b/src/bob/bio/base/score/load.py index e2c87b816a31ddcc78e16dc9c3ad10be43810c56..47043f85dfebb77a2eb940fdd33e4ebb7abe650c 100644 --- a/src/bob/bio/base/score/load.py +++ b/src/bob/bio/base/score/load.py @@ -169,7 +169,7 @@ def get_split_dataframe(filename): return impostors, genuines -def split_csv_scores(filename): +def split_csv_scores(filename, score_column: str = "score"): """Loads a score set that was written with :any:`bob.bio.base.pipelines.CSVScoreWriter` Parameters @@ -178,6 +178,8 @@ def split_csv_scores(filename): filename (:py:class:`str`, ``file-like``): The file object that will be opened with :py:func:`open_file` containing the scores. + score_column: The CSV column that contains the score values. + Returns ------- @@ -196,8 +198,8 @@ def split_csv_scores(filename): impostors = df[df.probe_subject_id != df.bio_ref_subject_id] return ( - impostors["score"].to_dask_array().compute(), - genuines["score"].to_dask_array().compute(), + impostors[score_column].to_dask_array().compute(), + genuines[score_column].to_dask_array().compute(), ) @@ -358,7 +360,7 @@ def scores(filename, ncolumns=None): return _iterate_score_file(filename) -def split(filename, ncolumns=None, sort=False): +def split(filename, ncolumns=None, sort=False, csv_score_column: str = "score"): """Loads the scores from the given score file and splits them into positives and negatives. Depending on the score file format, it calls see @@ -376,6 +378,8 @@ def split(filename, ncolumns=None, sort=False): estimated automatically sort : :obj:`bool`, optional If ``True``, will return sorted negatives and positives + csv_score_column : + When loading a CSV file, specifies the column that holds scores. Returns ------- @@ -388,7 +392,7 @@ def split(filename, ncolumns=None, sort=False): the ``real_id`` are identical (see :py:func:`four_column`) """ if iscsv(filename): - neg, pos = split_csv_scores(filename) + neg, pos = split_csv_scores(filename, score_column=csv_score_column) else: ncolumns = _estimate_score_file_format(filename, ncolumns) if ncolumns == 4: @@ -404,7 +408,7 @@ def split(filename, ncolumns=None, sort=False): return neg, pos -def cmc(filename, ncolumns=None): +def cmc(filename, ncolumns=None, csv_score_column: str = "score"): """cmc(filename, ncolumns=None) -> list Loads scores to compute CMC curves. @@ -422,6 +426,9 @@ def cmc(filename, ncolumns=None): the score file will be assumed to be in the given format. If not specified, the score file format will be estimated automatically + csv_score_column: When loading a CSV file, specifies the column that holds + scores. + Returns: :any:`list`: [(neg,pos)] A list of tuples, where each tuple contains the @@ -462,7 +469,7 @@ def load_score(filename, ncolumns=None, minimal=False, **kwargs): Returns: - array: An array which contains not only the actual scores but also the + array: An array which contains not only the actual ``score`` but also the ``claimed_id``, ``real_id``, ``test_label`` and ``['model_label']`` """ @@ -536,15 +543,15 @@ def load_files(filenames, func_load): return res -def get_negatives_positives(score_lines): +def get_negatives_positives(score_lines, score_column: str = "score"): """Take the output of load_score and return negatives and positives. This function aims to replace split_four_column and split_five_column but takes a different input. It's up to you to use which one. """ pos_mask = score_lines["claimed_id"] == score_lines["real_id"] - positives = score_lines["score"][pos_mask] - negatives = score_lines["score"][numpy.logical_not(pos_mask)] + positives = score_lines[score_column][pos_mask] + negatives = score_lines[score_column][numpy.logical_not(pos_mask)] return (negatives, positives) @@ -552,17 +559,19 @@ def get_negatives_positives_from_file(filename, **kwargs): """Loads the scores first efficiently and then calls get_negatives_positives""" score_lines = load_score(filename, minimal=True, **kwargs) - return get_negatives_positives(score_lines) + return get_negatives_positives(score_lines, score_column="score") -def get_negatives_positives_all(score_lines_list): +def get_negatives_positives_all(score_lines_list, score_column: str = "score"): """Take a list of outputs of load_score and return stacked negatives and positives. """ negatives, positives = [], [] for score_lines in score_lines_list: - neg_pos = get_negatives_positives(score_lines) + neg_pos = get_negatives_positives( + score_lines, score_column=score_column + ) negatives.append(neg_pos[0]) positives.append(neg_pos[1]) negatives = numpy.vstack(negatives).T @@ -570,11 +579,11 @@ def get_negatives_positives_all(score_lines_list): return (negatives, positives) -def get_all_scores(score_lines_list): +def get_all_scores(score_lines_list, score_column: str = "score"): """Take a list of outputs of load_score and return stacked scores""" return numpy.vstack( - [score_lines["score"] for score_lines in score_lines_list] + [score_lines[score_column] for score_lines in score_lines_list] ).T @@ -614,18 +623,20 @@ def _estimate_score_file_format(filename, ncolumns=None): return ncolumns -def _iterate_score_file(filename): +def _iterate_score_file(filename, csv_score_column: str = "score"): """Opens the score file for reading and yields the score file line by line in a tuple/list. The last element of the line (which is the score) will be transformed to float, the other elements will be str """ if iscsv(filename): - for row in _iterate_csv_score_file(filename): + for row in _iterate_csv_score_file( + filename, score_column=csv_score_column + ): yield [ row["bio_ref_subject_id"], row["probe_subject_id"], row["probe_key"], - row["score"], + row[csv_score_column], ] else: opened = open_file(filename, "rb") @@ -640,16 +651,16 @@ def _iterate_score_file(filename): yield splits -def _iterate_csv_score_file(filename): +def _iterate_csv_score_file(filename, score_column: str = "score"): """Opens a CSV score file for reading and yields each line in a dict. - The `score` field of the line will be cast to float, the other elements will - be str. + The ``score_column`` field of the line will be cast to float, the other + elements will be str. """ opened = open_file(filename) reader = csv.DictReader(opened) for row in reader: - row["score"] = float(row["score"]) + row[score_column] = float(row[score_column]) yield row @@ -712,13 +723,13 @@ def _split_cmc_scores( ] -def split_csv_vuln(filename): +def split_csv_vuln(filename, score_column: str = "score"): """Loads vulnerability scores from a CSV score file. Returns the scores split between positive and negative as well as licit and presentation attack (spoof). - The CSV must contain a `probe_attack_type` column with each field either + The CSV must contain a ``probe_attack_type`` column with each field either containing a str defining the attack type (spoof), or empty (licit). Parameters @@ -735,14 +746,14 @@ def split_csv_vuln(filename): """ logger.debug(f"Loading CSV score file: '{filename}'") split_scores = {"licit_neg": [], "licit_pos": [], "spoof": []} - for row in _iterate_csv_score_file(filename): + for row in _iterate_csv_score_file(filename, score_column=score_column): if not row["probe_attack_type"]: # licit if row["probe_subject_id"] == row["bio_ref_subject_id"]: - split_scores["licit_pos"].append(row["score"]) + split_scores["licit_pos"].append(row[score_column]) else: - split_scores["licit_neg"].append(row["score"]) + split_scores["licit_neg"].append(row[score_column]) else: - split_scores["spoof"].append(row["score"]) + split_scores["spoof"].append(row[score_column]) logger.debug( f"Found {len(split_scores['licit_neg'])} negative (ZEI), " f"{len(split_scores['licit_pos'])} positive (licit), and " diff --git a/src/bob/bio/base/script/commands.py b/src/bob/bio/base/script/commands.py index c5a20f5d343bcc755d13c76272e820e551e8d8d4..7d130654b7c0cdc3123fc58fb6a1a699ea5460bc 100644 --- a/src/bob/bio/base/script/commands.py +++ b/src/bob/bio/base/script/commands.py @@ -1,4 +1,5 @@ """ Click commands for ``bob.bio.base`` """ +import functools import logging import click @@ -57,11 +58,30 @@ def rank_option(**kwargs): criteria=CRITERIA, ) @common_options.cost_option() -def metrics(ctx, scores, evaluation, **kwargs): +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def metrics(ctx, scores, evaluation, score_column, **kwargs): if "criterion" in ctx.meta and ctx.meta["criterion"] == "rr": - process = bio_figure.Metrics(ctx, scores, evaluation, load.cmc) + process = bio_figure.Metrics( + ctx, + scores, + evaluation, + functools.partial(load.cmc, csv_score_column=score_column), + ) else: - process = bio_figure.Metrics(ctx, scores, evaluation, load.split) + process = bio_figure.Metrics( + ctx, + scores, + evaluation, + functools.partial(load.split, csv_score_column=score_column), + ) process.run() @@ -70,8 +90,22 @@ def metrics(ctx, scores, evaluation, **kwargs): score_format=SCORE_FORMAT, command="bob bio roc" ) ) -def roc(ctx, scores, evaluation, **kwargs): - process = bio_figure.Roc(ctx, scores, evaluation, load.split) +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def roc(ctx, scores, evaluation, score_column, **kwargs): + process = bio_figure.Roc( + ctx, + scores, + evaluation, + functools.partial(load.split, csv_score_column=score_column), + ) process.run() @@ -80,8 +114,22 @@ def roc(ctx, scores, evaluation, **kwargs): score_format=SCORE_FORMAT, command="bob bio det" ) ) -def det(ctx, scores, evaluation, **kwargs): - process = bio_figure.Det(ctx, scores, evaluation, load.split) +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def det(ctx, scores, evaluation, score_column, **kwargs): + process = bio_figure.Det( + ctx, + scores, + evaluation, + functools.partial(load.split, csv_score_column=score_column), + ) process.run() @@ -90,8 +138,22 @@ def det(ctx, scores, evaluation, **kwargs): score_format=SCORE_FORMAT, command="bob bio epc" ) ) -def epc(ctx, scores, **kwargs): - process = measure_figure.Epc(ctx, scores, True, load.split) +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def epc(ctx, scores, score_column, **kwargs): + process = measure_figure.Epc( + ctx, + scores, + True, + functools.partial(load.split, csv_score_column=score_column), + ) process.run() @@ -100,8 +162,22 @@ def epc(ctx, scores, **kwargs): score_format=SCORE_FORMAT, command="bob bio hist" ) ) -def hist(ctx, scores, evaluation, **kwargs): - process = bio_figure.Hist(ctx, scores, evaluation, load.split) +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def hist(ctx, scores, evaluation, score_column, **kwargs): + process = bio_figure.Hist( + ctx, + scores, + evaluation, + functools.partial(load.split, csv_score_column=score_column), + ) process.run() @@ -111,8 +187,23 @@ def hist(ctx, scores, evaluation, **kwargs): ), criteria=CRITERIA, ) +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "NOT YET IMPLEMENTED. " + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) @common_options.cost_option() -def evaluate(ctx, scores, evaluation, **kwargs): +def evaluate(ctx, scores, evaluation, score_column, **kwargs): + if score_column != "score": + raise NotImplementedError( + "'evaluate' does not yet support files with scores in columns " + "other than 'score'." + ) common_options.evaluate_flow( ctx, scores, evaluation, metrics, roc, det, epc, hist, **kwargs ) @@ -127,9 +218,25 @@ def evaluate(ctx, scores, evaluation, **kwargs): ), criteria=CRITERIA, ) -def multi_metrics(ctx, scores, evaluation, protocols_number, **kwargs): +@click.option( + "--score-column", + default="score", + show_default=True, + help=( + "Selects the CSV column to consider as scores. This is ignored for " + "non-CSV files. The column must contain numerical values." + ), +) +def multi_metrics( + ctx, scores, evaluation, protocols_number, score_column, **kwargs +): ctx.meta["min_arg"] = protocols_number * (2 if evaluation else 1) - process = bio_figure.MultiMetrics(ctx, scores, evaluation, load.split) + process = bio_figure.MultiMetrics( + ctx, + scores, + evaluation, + functools.partial(load.split, csv_score_column=score_column), + ) process.run()