diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py index 95cfc9c180f867fd1fd948dedb1c3a48bd10bb15..f4d697f160a59f2d8c149650cc4c02d56a14743a 100644 --- a/bob/ip/binseg/engine/evaluator.py +++ b/bob/ip/binseg/engine/evaluator.py @@ -16,7 +16,6 @@ import torchvision.transforms.functional as VF import h5py from ..utils.metric import base_metrics -from ..utils.plot import precision_recall_f1iso_confintval import logging @@ -50,7 +49,7 @@ def _posneg(pred, gt, threshold): return tp_tensor, fp_tensor, tn_tensor, fn_tensor -def _sample_metrics(pred, gt): +def _sample_metrics(pred, gt, bins): """ Calculates metrics on one single sample and saves it to disk @@ -64,6 +63,10 @@ def _sample_metrics(pred, gt): gt : torch.Tensor ground-truth (annotations) + bins : int + number of bins to use for threshold analysis. The step size is + calculated from this by dividing ``1.0/bins``. + Returns ------- @@ -82,10 +85,10 @@ def _sample_metrics(pred, gt): """ - step_size = 0.01 + step_size = 1.0/bins data = [] - for threshold in numpy.arange(0.0, 1.0, step_size): + for index, threshold in enumerate(numpy.arange(0.0, 1.0, step_size)): tp_tensor, fp_tensor, tn_tensor, fn_tensor = _posneg( pred, gt, threshold @@ -107,6 +110,7 @@ def _sample_metrics(pred, gt): data.append( [ + index, threshold, precision, recall, @@ -120,6 +124,7 @@ def _sample_metrics(pred, gt): return pandas.DataFrame( data, columns=( + "index", "threshold", "precision", "recall", @@ -254,6 +259,7 @@ def run( """ # Collect overall metrics + bins = 100 #number of thresholds to analyse for data = {} for sample in tqdm(dataset): @@ -268,7 +274,7 @@ def run( raise RuntimeError( f"{stem} entry already exists in data. Cannot overwrite." ) - data[stem] = _sample_metrics(pred, gt) + data[stem] = _sample_metrics(pred, gt, bins) if overlayed_folder is not None: overlay_image = _sample_analysis( @@ -286,8 +292,8 @@ def run( df_metrics = pandas.concat(data.values()) # Report and Averages - avg_metrics = df_metrics.groupby("threshold").mean() - std_metrics = df_metrics.groupby("threshold").std() + avg_metrics = df_metrics.groupby("index").mean() + std_metrics = df_metrics.groupby("index").std() # Uncomment below for F1-score calculation based on average precision and # metrics instead of F1-scores of individual images. 
This method is in line @@ -306,20 +312,24 @@ def run( avg_metrics["std_f1"] = std_metrics["f1_score"] maxf1 = avg_metrics["f1_score"].max() - optimal_f1_threshold = avg_metrics["f1_score"].idxmax() + maxf1_index = avg_metrics["f1_score"].idxmax() + maxf1_threshold = avg_metrics["threshold"][maxf1_index] logger.info( f"Maximum F1-score of {maxf1:.5f}, achieved at " - f"threshold {optimal_f1_threshold:.2f} (chosen *a posteriori*)" + f"threshold {maxf1_threshold:.3f} (chosen *a posteriori*)" ) if threshold is not None: - f1_a_priori = avg_metrics["f1_score"][threshold] + # get the closest possible threshold we have + index = int(round(bins*threshold)) + f1_a_priori = avg_metrics["f1_score"][index] + actual_threshold = avg_metrics["threshold"][index] logger.info( - f"F1-score of {f1_a_priori:.5f}, at threshold {threshold:.5f} " - f"(chosen *a priori*)" + f"F1-score of {f1_a_priori:.5f}, at threshold " + f"{actual_threshold:.3f} (chosen *a priori*)" ) if output_folder is not None: @@ -335,22 +345,7 @@ def run( ) avg_metrics.to_csv(metrics_path) - # Plotting - np_avg_metrics = avg_metrics.to_numpy().T - figure_path = os.path.join(output_folder, "precision-recall.pdf") - logger.info(f"Saving overall precision-recall plot at {figure_path}...") - fig = precision_recall_f1iso_confintval( - [np_avg_metrics[0]], - [np_avg_metrics[1]], - [np_avg_metrics[7]], - [np_avg_metrics[8]], - [np_avg_metrics[10]], - [np_avg_metrics[11]], - ["data"], - ) - fig.savefig(figure_path) - - return optimal_f1_threshold + return maxf1_threshold def compare_annotators(baseline, other, output_folder, overlayed_folder=None): @@ -395,7 +390,7 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None): raise RuntimeError( f"{stem} entry already exists in data. " f"Cannot overwrite." ) - data[stem] = _sample_metrics(pred, gt) + data[stem] = _sample_metrics(pred, gt, 2) if overlayed_folder is not None: overlay_image = _sample_analysis( @@ -413,8 +408,8 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None): df_metrics = pandas.concat(data.values()) # Report and Averages - avg_metrics = df_metrics.groupby("threshold").mean() - std_metrics = df_metrics.groupby("threshold").std() + avg_metrics = df_metrics.groupby("index").mean() + std_metrics = df_metrics.groupby("index").std() # Uncomment below for F1-score calculation based on average precision and # metrics instead of F1-scores of individual images. 
This method is in line @@ -432,9 +427,13 @@ def compare_annotators(baseline, other, output_folder, overlayed_folder=None): avg_metrics["re_lower"] = avg_metrics["recall"] - avg_metrics["std_re"] avg_metrics["std_f1"] = std_metrics["f1_score"] - metrics_path = os.path.join(output_folder, "metrics.csv") + # we actually only need to keep the second row of the pandas dataframe + # with threshold == 0.5 - the first row is redundant + avg_metrics.drop(0, inplace=True) + + metrics_path = os.path.join(output_folder, "metrics-second-annotator.csv") logger.info(f"Saving averages over all input images at {metrics_path}...") avg_metrics.to_csv(metrics_path) maxf1 = avg_metrics["f1_score"].max() - logger.info(f"Maximum F1-score of {maxf1:.5f} (second annotator)") + logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)") diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py index 3b2b0ae998e8dbf05015dda950975e78a042b176..e27b6296901587038184153c750a8e1310cec67b 100644 --- a/bob/ip/binseg/script/compare.py +++ b/bob/ip/binseg/script/compare.py @@ -8,12 +8,105 @@ from bob.extension.scripts.click_helper import ( AliasedGroup, ) -from ..utils.plot import combined_precision_recall_f1iso_confintval +import pandas + +from ..utils.plot import precision_recall_f1iso import logging logger = logging.getLogger(__name__) +def _validate_threshold(t, dataset): + """Validates the user threshold selection. Returns parsed threshold.""" + + if t is None: + return t + + try: + # we try to convert it to float first + t = float(t) + if t < 0.0 or t > 1.0: + raise ValueError("Float thresholds must be within range [0.0, 1.0]") + except ValueError: + # it is a bit of text - assert dataset with name is available + if not isinstance(dataset, dict): + raise ValueError( + "Threshold should be a floating-point number " + "if your provide only a single dataset for evaluation" + ) + if t not in dataset: + raise ValueError( + f"Text thresholds should match dataset names, " + f"but {t} is not available among the datasets provided (" + f"({', '.join(dataset.keys())})" + ) + + return t + + +def _load_and_plot(data, threshold=None): + """Plots comparison chart of all evaluated models + + Parameters + ---------- + + data : dict + A dict in which keys are the names of the systems and the values are + paths to ``metrics.csv`` style files. + + threshold : :py:class:`float`, :py:class:`str`, Optional + A value indicating which threshold to choose for plotting a "F1-score" + (black) dot on the various curves. If set to ``None``, then plot the + maximum F1-score on that curve. If set to a floating-point value, then + plot the F1-score that is obtained on that particular threshold. If + set to a string, it should match one of the keys in ``data``. It then + first calculate the threshold reaching the maximum F1-score on that + particular dataset and then applies that threshold to all other sets. + + + Returns + ------- + + figure : matplotlib.figure.Figure + A figure, with all systems combined into a single plot. 
+ + """ + + if isinstance(threshold, str): + logger.info(f"Calculating threshold from maximum F1-score at " + f"'{threshold}' dataset...") + metrics_path = data[threshold] + df = pandas.read_csv(metrics_path) + + maxf1 = df["f1_score"].max() + use_threshold = df["threshold"][df["f1_score"].idxmax()] + logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'") + + elif isinstance(threshold, float): + use_threshold = threshold + logger.info(f"Dataset '*': threshold = {use_threshold:.3f}'") + + names = [] + dfs = [] + thresholds = [] + + # loads all data + for name, metrics_path in data.items(): + + logger.info(f"Loading metrics from {metrics_path}...") + df = pandas.read_csv(metrics_path) + + if threshold is None: + use_threshold = df["threshold"][df["f1_score"].idxmax()] + logger.info(f"Dataset '{name}': threshold = {use_threshold:.3f}'") + + names.append(name) + dfs.append(df) + thresholds.append(use_threshold) + + return precision_recall_f1iso(names, dfs, thresholds, confidence=True) + + @click.command( epilog="""Examples: @@ -36,8 +129,23 @@ logger = logging.getLogger(__name__) default="comparison.pdf", type=click.Path(), ) + +@click.option( + "--threshold", + "-t", + help="This number is used to select which F1-score to use for " + "representing a system performance. If not set, we report the maximum " + "F1-score in the set, which is equivalent to threshold selection a " + "posteriori (biased estimator). You can either set this value to a " + "floating-point number in the range [0.0, 1.0], or to a string, naming " + "one of the systems which will be used to calculate the threshold " + "leading to the maximum F1-score and then applied to all other sets.", + default=None, + show_default=False, + required=False, +) @verbosity_option() -def compare(label_path, output, **kwargs): +def compare(label_path, output, threshold, **kwargs): """Compares multiple systems together""" # hack to get a dictionary from arguments passed to input @@ -46,6 +154,11 @@ def compare(label_path, output, **kwargs): " composed of name-path entries") data = dict(zip(label_path[::2], label_path[1::2])) - fig = combined_precision_recall_f1iso_confintval(data) + threshold = _validate_threshold(threshold, data) + + fig = _load_and_plot(data, threshold=threshold) logger.info(f"Saving plot at {output}") fig.savefig(output) + + # TODO: print table with all results + pass diff --git a/bob/ip/binseg/script/evaluate.py b/bob/ip/binseg/script/evaluate.py index 5f82ace1ddaaf6e9dcfd462a254a860771013d72..c8448e6d5bfebfd2fc388c245a9854637b4c3b70 100644 --- a/bob/ip/binseg/script/evaluate.py +++ b/bob/ip/binseg/script/evaluate.py @@ -107,16 +107,6 @@ def _validate_threshold(t, dataset): cls=ResourceOption, show_default=True, ) -@click.option( - "--second-annotator-folder", - "-O", - help="Path where to store the analysis result for second annotator " - "comparisons (only used if --second-annotator is also passed)", - required=True, - default="second-annotator", - type=click.Path(), - cls=ResourceOption, -) @click.option( "--overlayed", "-O", @@ -133,11 +123,9 @@ def _validate_threshold(t, dataset): ) @click.option( "--threshold", - "-T", - help="If you set --overlayed, then you can provide a value to be used as " - "threshold to be applied on probability maps and decide for positives and " - "negatives. This binary output will be used to define true and false " - "positives, and false negatives for the overlay analysis. 
This number " + "-t", + help="This number is used to define positives and negatives from " + "probability maps, and report F1-scores (a priori). It " "should either come from the training set or a separate validation set " "to avoid biasing the analysis. Optionally, if you provide a multi-set " "dataset as input, this may also be the name of an existing set from " @@ -155,7 +143,6 @@ def evaluate( predictions_folder, dataset, second_annotator, - second_annotator_folder, overlayed, threshold, **kwargs, @@ -173,7 +160,6 @@ def evaluate( "dataset": dataset, "output_folder": output_folder, "second_annotator": second_annotator, - "second_annotator_folder": second_annotator_folder, } else: for k, v in dataset.items(): @@ -184,9 +170,6 @@ def evaluate( "dataset": v, "output_folder": os.path.join(output_folder, k), "second_annotator": second_annotator.get(k), - "second_annotator_folder": os.path.join( - second_annotator_folder, k - ), } if isinstance(threshold, str): @@ -209,7 +192,7 @@ def evaluate( compare_annotators( v["dataset"], v["second_annotator"], - v["second_annotator_folder"], + v["output_folder"], os.path.join(overlayed, "second-annotator") if overlayed else None, diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py index 074056c1abce92656afcb3186cd0876f4e131f11..5458036828a09fadb76a6df65d43be79c0b7bef9 100644 --- a/bob/ip/binseg/script/experiment.py +++ b/bob/ip/binseg/script/experiment.py @@ -380,14 +380,12 @@ def experiment( logger.info(f"Setting --threshold={threshold}...") analysis_folder = os.path.join(output_folder, "analysis") - second_annotator_folder = os.path.join(analysis_folder, "second-annotator") ctx.invoke( evaluate, output_folder=analysis_folder, predictions_folder=predictions_folder, dataset=dataset, second_annotator=second_annotator, - second_annotator_folder=second_annotator_folder, overlayed=overlayed_folder, threshold=threshold, verbose=verbose, @@ -412,8 +410,8 @@ def experiment( if k.startswith("_"): logger.info(f"Skipping dataset '{k}' (not to be compared)") continue - systems += [f"{k} (2nd. annot.)", - os.path.join(second_annotator_folder, k, "metrics.csv")] + systems += [f"{k} (2nd. 
annot.)", os.path.join(analysis_folder, k, + "metrics-second-annotator.csv")] output_pdf = os.path.join(output_folder, "comparison.pdf") ctx.invoke(compare, label_path=systems, output=output_pdf, verbose=verbose) diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py index e39fba54f952c0faa98f64b7b8849d8602a411b0..6e2d8be26a42ba0992af601b235afd98c46123e1 100644 --- a/bob/ip/binseg/test/test_cli.py +++ b/bob/ip/binseg/test/test_cli.py @@ -127,14 +127,13 @@ def _check_experiment_stare(overlay): # check evaluation outputs eval_folder = os.path.join(output_folder, "analysis") - second_folder = os.path.join(eval_folder, "second-annotator") assert os.path.exists(os.path.join(eval_folder, "train", "metrics.csv")) assert os.path.exists(os.path.join(eval_folder, "test", "metrics.csv")) assert os.path.exists( - os.path.join(second_folder, "train", "metrics.csv") + os.path.join(eval_folder, "train", "metrics-second-annotator.csv") ) assert os.path.exists( - os.path.join(second_folder, "test", "metrics.csv") + os.path.join(eval_folder, "test", "metrics-second-annotator.csv") ) overlay_folder = os.path.join(output_folder, "overlayed", "analysis") @@ -172,8 +171,7 @@ def _check_experiment_stare(overlay): r"^Started evaluation$": 1, r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 3, r"^F1-score of.*\(chosen \*a priori\*\)$": 2, - r"^Maximum F1-score of .* \(second annotator\)$": 2, - r"^Saving overall precision-recall plot at .*$": 2, + r"^F1-score of.*\(second annotator; threshold=0.5\)$": 2, r"^Ended evaluation$": 1, r"^Started comparison$": 1, r"^Loading metrics from": 4, @@ -341,7 +339,6 @@ def _check_evaluate(runner): config.flush() output_folder = "evaluations" - second_folder = "evaluations-2nd" overlay_folder = os.path.join("overlayed", "analysis") result = runner.invoke( evaluate, @@ -351,13 +348,13 @@ def _check_evaluate(runner): f"--output-folder={output_folder}", "--predictions-folder=predictions", f"--overlayed={overlay_folder}", - f"--second-annotator-folder={second_folder}", ], ) _assert_exit_0(result) assert os.path.exists(os.path.join(output_folder, "metrics.csv")) - assert os.path.exists(os.path.join(second_folder, "metrics.csv")) + assert os.path.exists(os.path.join(output_folder, + "metrics-second-annotator.csv")) # check overlayed images are there (since we requested them) basedir = os.path.join(overlay_folder, "stare-images") @@ -369,7 +366,7 @@ def _check_evaluate(runner): r"^Saving averages over all input images.*$": 2, r"^Maximum F1-score of.*\(chosen \*a posteriori\*\)$": 1, r"^F1-score of.*\(chosen \*a priori\*\)$": 1, - r"^Maximum F1-score of .* \(second annotator\)$": 1, + r"^F1-score of.*\(second annotator; threshold=0.5\)$": 1, } buf.seek(0) logging_output = buf.read() @@ -393,7 +390,6 @@ def _check_compare(runner): with stdout_logging() as buf: output_folder = "evaluations" - second_folder = "evaluations-2nd" result = runner.invoke( compare, [ @@ -402,7 +398,7 @@ def _check_compare(runner): "test", os.path.join(output_folder, "metrics.csv"), "test (2nd. 
human)", - os.path.join(second_folder, "metrics.csv"), + os.path.join(output_folder, "metrics-second-annotator.csv"), ], ) _assert_exit_0(result) diff --git a/bob/ip/binseg/utils/plot.py b/bob/ip/binseg/utils/plot.py index 9f897d5b0e70dd60e76185ed196ea2e34dd3425b..0bbea34ae37e035807bddfe604714b05a18445e8 100644 --- a/bob/ip/binseg/utils/plot.py +++ b/bob/ip/binseg/utils/plot.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import contextlib from itertools import cycle import numpy @@ -15,72 +16,49 @@ import logging logger = logging.getLogger(__name__) -def precision_recall_f1iso(precision, recall, names): - """Creates a precision-recall plot of the given data. +@contextlib.contextmanager +def _precision_recall_canvas(title=None): + """Generates a canvas to draw precision-recall curves + + Works like a context manager, yielding a figure and an axes set in which + the precision-recall curves should be added to. The figure already + contains F1-ISO lines and is preset to a 0-1 square region. Once the + context is finished, ``fig.tight_layout()`` is called. - The plot will be annotated with F1-score iso-lines (in which the F1-score - maintains the same value) Parameters ---------- - precision : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the Y coordinates of the plot, or the - precision, or a 2D np array in which the rows correspond to each of the - system's precision coordinates. - - recall : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the X coordinates of the plot, or the - recall, or a 2D np array in which the rows correspond to each of the - system's recall coordinates. + title : :py:class:`str`, Optional + Optional title to add to this plot - names : :py:class:`list` - An iterable over the names of each of the systems along the rows of - ``precision`` and ``recall`` - - Returns - ------- + Yields + ------ figure : matplotlib.figure.Figure - A matplotlib figure you can save or display + The figure that should be finally returned to the user + + axes : matplotlib.figure.Axes + An axis set where to precision-recall plots should be added to """ - fig, ax1 = plt.subplots(1) - lines = ["-", "--", "-.", ":"] - linecycler = cycle(lines) - for p, r, n in zip(precision, recall, names): - # Plots only from the point where recall reaches its maximum, otherwise, we - # don't see a curve... 
- i = r.argmax() - pi = p[i:] - ri = r[i:] - valid = (pi + ri) > 0 - f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid]) - # optimal point along the curve - argmax = f1.argmax() - opi = pi[argmax] - ori = ri[argmax] - # Plot Recall/Precision as threshold changes - ax1.plot( - ri[pi > 0], - pi[pi > 0], - next(linecycler), - label="[F={:.4f}] {}".format(f1.max(), n), - ) - ax1.plot( - ori, opi, marker="o", linestyle=None, markersize=3, color="black" - ) - ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2) - if len(names) > 1: - plt.legend(loc="lower left", framealpha=0.5) - ax1.set_xlabel("Recall") - ax1.set_ylabel("Precision") - ax1.set_xlim([0.0, 1.0]) - ax1.set_ylim([0.0, 1.0]) + fig, axes1 = plt.subplots(1) + + # Names and bounds + axes1.set_xlabel("Recall") + axes1.set_ylabel("Precision") + axes1.set_xlim([0.0, 1.0]) + axes1.set_ylim([0.0, 1.0]) + + if title is not None: + axes1.set_title(title) + + axes1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2) + axes2 = axes1.twinx() + # Annotates plot with F1-score iso-lines - ax2 = ax1.twinx() f_scores = numpy.linspace(0.1, 0.9, num=9) tick_locs = [] tick_labels = [] @@ -90,197 +68,168 @@ def precision_recall_f1iso(precision, recall, names): (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1) tick_locs.append(y[-1]) tick_labels.append("%.1f" % f_score) - ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False) - ax2.set_ylabel("iso-F", color="green", alpha=0.3) - ax2.set_ylim([0.0, 1.0]) - ax2.yaxis.set_label_coords(1.015, 0.97) - ax2.set_yticks(tick_locs) # notice these are invisible - for k in ax2.set_yticklabels(tick_labels): + axes2.tick_params(axis="y", which="both", pad=0, right=False, left=False) + axes2.set_ylabel("iso-F", color="green", alpha=0.3) + axes2.set_ylim([0.0, 1.0]) + axes2.yaxis.set_label_coords(1.015, 0.97) + axes2.set_yticks(tick_locs) # notice these are invisible + for k in axes2.set_yticklabels(tick_labels): k.set_color("green") k.set_alpha(0.3) k.set_size(8) + # we should see some of axes 1 axes - ax1.spines["right"].set_visible(False) - ax1.spines["top"].set_visible(False) - ax1.spines["left"].set_position(("data", -0.015)) - ax1.spines["bottom"].set_position(("data", -0.015)) + axes1.spines["right"].set_visible(False) + axes1.spines["top"].set_visible(False) + axes1.spines["left"].set_position(("data", -0.015)) + axes1.spines["bottom"].set_position(("data", -0.015)) + # we shouldn't see any of axes 2 axes - ax2.spines["right"].set_visible(False) - ax2.spines["top"].set_visible(False) - ax2.spines["left"].set_visible(False) - ax2.spines["bottom"].set_visible(False) + axes2.spines["right"].set_visible(False) + axes2.spines["top"].set_visible(False) + axes2.spines["left"].set_visible(False) + axes2.spines["bottom"].set_visible(False) + + # yield execution, lets user draw precision-recall plots, and the legend + # before tighteneing the layout + yield fig, axes1 + plt.tight_layout() - return fig -def precision_recall_f1iso_confintval( - precision, recall, pr_upper, pr_lower, re_upper, re_lower, names -): - """Creates a precision-recall plot of the given data, with confidence - intervals +def precision_recall_f1iso(label, df, threshold, confidence=True): + """Creates a precision-recall plot with confidence intervals + + This function creates and returns a Matplotlib figure with a + precision-recall plot containing shaded confidence intervals. The plot + will be annotated with F1-score iso-lines (in which the F1-score maintains + the same value). 
- The plot will be annotated with F1-score iso-lines (in which the F1-score - maintains the same value) Parameters ---------- - precision : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the Y coordinates of the plot, or the - precision, or a 2D array in which the rows correspond to each - of the system's precision coordinates. - - recall : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the X coordinates of the plot, or - the recall, or a 2D array in which the rows correspond to each - of the system's recall coordinates. - - pr_upper : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the upper bound of the confidence - interval for the Y coordinates of the plot, or the precision upper - bound, or a 2D array in which the rows correspond to each of the - system's precision upper-bound coordinates. - - pr_lower : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the lower bound of the confidence - interval for the Y coordinates of the plot, or the precision lower - bound, or a 2D array in which the rows correspond to each of the - system's precision lower-bound coordinates. - - re_upper : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the upper bound of the confidence - interval for the Y coordinates of the plot, or the recall upper bound, - or a 2D array in which the rows correspond to each of the system's - recall upper-bound coordinates. - - re_lower : :py:class:`numpy.ndarray` or :py:class:`list` - A list of 1D arrays containing the lower bound of the confidence - interval for the Y coordinates of the plot, or the recall lower bound, - or a 2D array in which the rows correspond to each of the system's - recall lower-bound coordinates. - - names : :py:class:`list` - An iterable over the names of each of the systems along the rows of - ``precision`` and ``recall`` + label : :py:class:`list` + A list of names to be associated to each line + + df : :py:class:`pandas.DataFrame` + A dataframe that is produced by our evaluator engine, indexed by + integer "thresholds", containing the following columns: ``threshold`` + (sorted ascending), ``precision``, ``recall``, ``pr_upper`` (upper + precision bounds), ``pr_lower`` (lower precision bounds), ``re_upper`` + (upper recall bounds), ``re_lower`` (lower recall bounds). + + Dataframes with a single entry are treated specially as these are + considered "second-annotator" performances. A single dot and a line + showing the variability is drawn in these cases. + + threshold : :py:class:`list` + A list of thresholds to graph with a dot for each set. Specific + threshold values do not affect "second-annotator" dataframes. + + confidence : :py:class:`bool`, Optional + If set, draw confidence intervals for each line, using ``*_upper`` and + ``*_lower`` entries. 
Returns ------- + figure : matplotlib.figure.Figure - A matplotlib figure you can save or display + A matplotlib figure you can save or display (uses an ``agg`` backend) """ - fig, ax1 = plt.subplots(1) lines = ["-", "--", "-.", ":"] colors = [ - "#1f77b4", - "#ff7f0e", - "#2ca02c", - "#d62728", - "#9467bd", - "#8c564b", - "#e377c2", - "#7f7f7f", - "#bcbd22", - "#17becf", - ] + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + "#17becf", + ] colorcycler = cycle(colors) linecycler = cycle(lines) - for p, r, pu, pl, ru, rl, n in zip( - precision, recall, pr_upper, pr_lower, re_upper, re_lower, names - ): - # Plots only from the point where recall reaches its maximum, otherwise, we - # don't see a curve... - i = r.argmax() - pi = p[i:] - ri = r[i:] - pui = pu[i:] - pli = pl[i:] - rui = ru[i:] - rli = rl[i:] - valid = (pi + ri) > 0 - f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid]) - # optimal point along the curve - argmax = f1.argmax() - opi = pi[argmax] - ori = ri[argmax] - # Plot Recall/Precision as threshold changes - ax1.plot( - ri[pi > 0], - pi[pi > 0], - next(linecycler), - label="[F={:.4f}] {}".format(f1.max(), n), - ) - ax1.plot( - ori, opi, marker="o", linestyle=None, markersize=3, color="black" - ) - # Plot confidence - # Upper bound - # ax1.plot(r95ui[p95ui>0], p95ui[p95ui>0]) - # Lower bound - # ax1.plot(r95li[p95li>0], p95li[p95li>0]) - # create the limiting polygon - vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1])) - vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1])) - # hacky workaround to plot 2nd human - if numpy.isclose(numpy.mean(rui), rui[1], rtol=1e-05): - logger.warning("Found 2nd human annotator in metrics - patching...") - p = plt.Polygon( - numpy.column_stack((vert_x, vert_y)), - facecolor="none", - alpha=0.2, - edgecolor=next(colorcycler), - lw=2, - ) - else: - p = plt.Polygon( - numpy.column_stack((vert_x, vert_y)), - facecolor=next(colorcycler), - alpha=0.2, - edgecolor="none", - lw=0.2, - ) - ax1.add_artist(p) - ax1.grid(linestyle="--", linewidth=1, color="gray", alpha=0.2) - if len(names) > 1: - plt.legend(loc="lower left", framealpha=0.5) - ax1.set_xlabel("Recall") - ax1.set_ylabel("Precision") - ax1.set_xlim([0.0, 1.0]) - ax1.set_ylim([0.0, 1.0]) - # Annotates plot with F1-score iso-lines - ax2 = ax1.twinx() - f_scores = numpy.linspace(0.1, 0.9, num=9) - tick_locs = [] - tick_labels = [] - for f_score in f_scores: - x = numpy.linspace(0.01, 1) - y = f_score * x / (2 * x - f_score) - (l,) = plt.plot(x[y >= 0], y[y >= 0], color="green", alpha=0.1) - tick_locs.append(y[-1]) - tick_labels.append("%.1f" % f_score) - ax2.tick_params(axis="y", which="both", pad=0, right=False, left=False) - ax2.set_ylabel("iso-F", color="green", alpha=0.3) - ax2.set_ylim([0.0, 1.0]) - ax2.yaxis.set_label_coords(1.015, 0.97) - ax2.set_yticks(tick_locs) # notice these are invisible - for k in ax2.set_yticklabels(tick_labels): - k.set_color("green") - k.set_alpha(0.3) - k.set_size(8) - # we should see some of axes 1 axes - ax1.spines["right"].set_visible(False) - ax1.spines["top"].set_visible(False) - ax1.spines["left"].set_position(("data", -0.015)) - ax1.spines["bottom"].set_position(("data", -0.015)) - # we shouldn't see any of axes 2 axes - ax2.spines["right"].set_visible(False) - ax2.spines["top"].set_visible(False) - ax2.spines["left"].set_visible(False) - ax2.spines["bottom"].set_visible(False) - plt.tight_layout() + with _precision_recall_canvas(title=None) as (fig, axes): + + 
legend = [] + + for kn, kdf, kt in zip(label, df, threshold): + + # plots only from the point where recall reaches its maximum, + # otherwise, we don't see a curve... + max_recall = kdf["recall"].idxmax() + pi = kdf.precision[max_recall:] + ri = kdf.recall[max_recall:] + + valid = (pi + ri) > 0 + f1 = 2 * (pi[valid] * ri[valid]) / (pi[valid] + ri[valid]) + + # optimal point along the curve + bins = len(kdf) + index = int(round(bins*kt)) + index = min(index, len(kdf)-1) #avoids out of range indexing + + # plots Recall/Precision as threshold changes + label = f"{kn} (F1={kdf.f1_score[index]:.4f})" + color = next(colorcycler) + + if len(kdf) == 1: + # plot black dot for F1-score at select threshold + marker, = axes.plot(kdf.recall[index], kdf.precision[index], + marker="*", markersize=6, color=color, alpha=0.8, + linestyle="None") + line, = axes.plot(kdf.recall[index], kdf.precision[index], + linestyle="None", color=color, alpha=0.2) + legend.append(([marker, line], label)) + else: + # line first, so marker gets on top + style = next(linecycler) + line, = axes.plot(ri[pi > 0], pi[pi > 0], color=color, + linestyle=style) + marker, = axes.plot(kdf.recall[index], kdf.precision[index], + marker="o", linestyle=style, markersize=4, + color=color, alpha=0.8) + legend.append(([marker, line], label)) + + if confidence: + + pui = kdf.pr_upper[max_recall:] + pli = kdf.pr_lower[max_recall:] + rui = kdf.re_upper[max_recall:] + rli = kdf.re_lower[max_recall:] + + # Plot confidence + # Upper bound + # create the limiting polygon + vert_x = numpy.concatenate((rui[pui > 0], rli[pli > 0][::-1])) + vert_y = numpy.concatenate((pui[pui > 0], pli[pli > 0][::-1])) + + # hacky workaround to plot 2nd human + if len(kdf) == 1: #binary system, very likely + logger.warning("Found 2nd human annotator - patching...") + p, = axes.plot(vert_x, vert_y, color=color, alpha=0.1, lw=3) + else: + p = plt.Polygon( + numpy.column_stack((vert_x, vert_y)), + facecolor=color, + alpha=0.2, + edgecolor="none", + lw=0.2, + ) + legend[-1][0].append(p) + axes.add_artist(p) + + if len(label) > 1: + axes.legend([tuple(k[0]) for k in legend], [k[1] for k in legend], + loc="lower left", fancybox=True, framealpha=0.7) + return fig @@ -311,48 +260,3 @@ def loss_curve(df): plt.tight_layout() fig = ax1.get_figure() return fig - - -def combined_precision_recall_f1iso_confintval(data): - """Plots comparison chart of all evaluated models - - Parameters - ---------- - - data : dict - A dict in which keys are the names of the systems and the values are - paths to ``metrics.csv`` style files. - - - Returns - ------- - - figure : matplotlib.figure.Figure - A figure, with all systems combined into a single plot. 
-
-    """
-
-    precisions = []
-    recalls = []
-    pr_ups = []
-    pr_lows = []
-    re_ups = []
-    re_lows = []
-    names = []
-
-    for name, metrics_path in data.items():
-        logger.info(f"Loading metrics from {metrics_path}...")
-        df = pandas.read_csv(metrics_path)
-        precisions.append(df.precision.to_numpy())
-        recalls.append(df.recall.to_numpy())
-        pr_ups.append(df.pr_upper.to_numpy())
-        pr_lows.append(df.pr_lower.to_numpy())
-        re_ups.append(df.re_upper.to_numpy())
-        re_lows.append(df.re_lower.to_numpy())
-        names.append(name)
-
-    fig = precision_recall_f1iso_confintval(
-        precisions, recalls, pr_ups, pr_lows, re_ups, re_lows, names
-    )
-
-    return fig
diff --git a/doc/evaluation.rst b/doc/evaluation.rst
index 3646969173ddde58f56b87d2f08c718f5c34477c..0368ad6ddd23ab5c231a0e3c29fe8f88f9c0e51b 100644
--- a/doc/evaluation.rst
+++ b/doc/evaluation.rst
@@ -65,7 +65,7 @@ Evaluation
 ----------
 
 In evaluation, we input an **annotated** dataset and predictions to generate
-performance figures that can help analysis of a trained model. Evaluation is
+performance summaries that help analysis of a trained model. Evaluation is
 done using the :ref:`evaluate command <bob.ip.binseg.cli.evaluate>` followed
 by the model and the annotated dataset configuration, and the path to the
 pretrained weights via the ``--weight`` argument.
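
Usage notes for the interfaces this patch introduces follow below.

`_sample_metrics()` now takes a `bins` argument and emits an integer `index`
column next to `threshold`; `run()` and `compare_annotators()` group by that
column instead of by the floating-point threshold, presumably to avoid relying
on float equality across per-image dataframes. The sketch below illustrates
the bin/threshold bookkeeping and the *a priori* index lookup; it reuses names
from the patch (`bins`, `index`) but is not itself part of it, and the chosen
threshold value is arbitrary.

    import numpy

    bins = 100               # number of thresholds analysed, as set in run()
    step_size = 1.0 / bins   # same step computed inside _sample_metrics()

    # thresholds actually visited: 0.00, 0.01, ..., 0.99 (1.0 is excluded),
    # one dataframe row per (index, threshold) pair
    thresholds = list(enumerate(numpy.arange(0.0, 1.0, step_size)))

    # an *a priori* threshold requested by the user maps to the closest
    # available bin, exactly like run() does before reading the F1-score
    threshold = 0.476
    index = int(round(bins * threshold))   # -> 48, i.e. the bin at 0.48

    # indices run from 0 to bins-1, so values very close to 1.0 need the
    # clamping the plotting code applies with min(index, len(df) - 1)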
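`_validate_threshold()` in `compare.py` accepts either a float in [0.0, 1.0],
used as-is on every curve, or the name of one of the compared systems, in
which case `_load_and_plot()` finds the threshold that maximises the F1-score
on that system and applies it to all others. A minimal sketch of the two
paths, using hypothetical metrics paths (both helpers are module-private):

    from bob.ip.binseg.script.compare import _load_and_plot, _validate_threshold

    # label -> metrics.csv mapping, as assembled by the compare command
    data = {
        "train": "analysis/train/metrics.csv",
        "test": "analysis/test/metrics.csv",
    }

    # a float is only range-checked and then used on every curve
    t = _validate_threshold(0.5, data)       # -> 0.5

    # a string must name one of the systems; the best threshold found on
    # that system is later transferred to all other curves
    t = _validate_threshold("train", data)   # -> "train"

    fig = _load_and_plot(data, threshold=t)
    fig.savefig("comparison.pdf")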
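At the command-line level, `compare` gains the same `--threshold`/`-t`
semantics: unset reports the maximum (*a posteriori*) F1-score of each curve,
a float fixes the operating point, and a system name transfers the best
threshold from that system to the others. A sketch in the style of
`test_cli.py`, with placeholder paths (the metrics files must exist for the
command to succeed):

    from click.testing import CliRunner

    from bob.ip.binseg.script.compare import compare

    runner = CliRunner()
    result = runner.invoke(
        compare,
        [
            "-vv",
            "train", "analysis/train/metrics.csv",
            "test", "analysis/test/metrics.csv",
            "test (2nd. human)", "analysis/test/metrics-second-annotator.csv",
            "--threshold=train",  # reuse the best threshold found on "train"
        ],
    )
    assert result.exit_code == 0  # writes comparison.pdf by default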
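`precision_recall_f1iso()` replaces both old array-based plotters: it now
takes parallel lists of labels, evaluator dataframes (one row per threshold
bin) and per-system thresholds, and shades confidence intervals from the
`*_upper`/`*_lower` columns when `confidence=True`. A minimal sketch, again
with placeholder paths to `metrics.csv` files written by the evaluator:

    import pandas

    from bob.ip.binseg.utils.plot import precision_recall_f1iso

    names = ["train", "test"]
    dfs = [
        pandas.read_csv("analysis/train/metrics.csv"),
        pandas.read_csv("analysis/test/metrics.csv"),
    ]
    # one threshold per curve; the marker is drawn at the closest bin,
    # i.e. at index round(len(df) * threshold), clamped to the last row
    thresholds = [0.5, 0.5]

    fig = precision_recall_f1iso(names, dfs, thresholds, confidence=True)
    fig.savefig("precision-recall.pdf")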
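The iso-F1 decoration shared by the old plotting functions is factored into
the `_precision_recall_canvas()` context manager, which yields a pre-decorated
figure/axes pair and calls `tight_layout()` on exit. A sketch drawing a
synthetic curve on that canvas (the helper is module-private and the data is
made up for illustration):

    import numpy

    from bob.ip.binseg.utils.plot import _precision_recall_canvas

    recall = numpy.linspace(0.0, 1.0, 100)
    precision = 1.0 - 0.3 * recall  # synthetic, monotonically decreasing

    with _precision_recall_canvas(title="example") as (fig, axes):
        # anything drawn here lands on top of the iso-F1 guide lines
        axes.plot(recall, precision, label="synthetic system")
        axes.legend(loc="lower left")

    fig.savefig("canvas-example.pdf")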