diff --git a/bob/ip/binseg/engine/evaluator.py b/bob/ip/binseg/engine/evaluator.py
index 6e957cadca8867205188ea6026286c51d8b4f60a..1163584894f1b95c64363ff5d4675a5ea39fce85 100644
--- a/bob/ip/binseg/engine/evaluator.py
+++ b/bob/ip/binseg/engine/evaluator.py
@@ -15,7 +15,7 @@ import torchvision.transforms.functional as VF
 
 import h5py
 
-from ..utils.metric import base_measures
+from ..utils.measure import base_measures
 
 import logging
 
@@ -49,9 +49,9 @@ def _posneg(pred, gt, threshold):
     return tp_tensor, fp_tensor, tn_tensor, fn_tensor
 
 
-def _sample_metrics(pred, gt, bins):
+def _sample_measures(pred, gt, bins):
     """
-    Calculates metrics on one single sample and saves it to disk
+    Calculates measures on one single sample and saves it to disk
 
 
     Parameters
@@ -71,7 +71,7 @@ def _sample_metrics(pred, gt, bins):
     Returns
     -------
 
-    metrics : pandas.DataFrame
+    measures : pandas.DataFrame
 
         A pandas dataframe with the following columns:
 
@@ -94,7 +94,7 @@ def _sample_metrics(pred, gt, bins):
             pred, gt, threshold
         )
 
-        # calc metrics from scalars
+        # calc measures from scalars
         tp_count = torch.sum(tp_tensor).item()
         fp_count = torch.sum(fp_tensor).item()
         tn_count = torch.sum(tn_tensor).item()
@@ -221,7 +221,7 @@ def run(
     threshold=None,
 ):
     """
-    Runs inference and calculates metrics
+    Runs inference and calculates measures
 
 
     Parameters
@@ -232,7 +232,7 @@ def run(
 
     name : str
         the local name of this dataset (e.g. ``train``, or ``test``), to be
-        used when saving metrics files.
+        used when saving measures files.
 
     predictions_folder : str
         folder where predictions for the dataset images has been previously
@@ -263,7 +263,7 @@ def run(
 
     """
 
-    # Collect overall metrics
+    # Collect overall measures
     bins = 1000  # number of thresholds to analyse for
     data = {}
 
@@ -279,7 +279,7 @@ def run(
             raise RuntimeError(
                 f"{stem} entry already exists in data. Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt, bins)
+        data[stem] = _sample_measures(pred, gt, bins)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -291,31 +291,31 @@ def run(
             overlay_image.save(fullpath)
 
     # Merges all dataframes together
-    df_metrics = pandas.concat(data.values())
+    df_measures = pandas.concat(data.values())
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("index").mean()
-    std_metrics = df_metrics.groupby("index").std()
+    avg_measures = df_measures.groupby("index").mean()
+    std_measures = df_measures.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
-    # metrics instead of F1-scores of individual images. This method is in line
+    # measures instead of F1-scores of individual images. This method is in line
     # with Maninis et. al. (2016)
     #
-    # avg_metrics["f1_score"] = \
-    #     (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #     (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    maxf1 = avg_metrics["f1_score"].max()
-    maxf1_index = avg_metrics["f1_score"].idxmax()
-    maxf1_threshold = avg_metrics["threshold"][maxf1_index]
+    # avg_measures["f1_score"] = \
+    #     (2* avg_measures["precision"]*avg_measures["recall"])/ \
+    #     (avg_measures["precision"]+avg_measures["recall"])
+
+    avg_measures["std_pr"] = std_measures["precision"]
+    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
+    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["std_re"] = std_measures["recall"]
+    avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
+    avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
+    avg_measures["std_f1"] = std_measures["f1_score"]
+
+    maxf1 = avg_measures["f1_score"].max()
+    maxf1_index = avg_measures["f1_score"].idxmax()
+    maxf1_threshold = avg_measures["threshold"][maxf1_index]
 
     logger.info(
         f"Maximum F1-score of {maxf1:.5f}, achieved at "
@@ -326,8 +326,8 @@ def run(
 
         # get the closest possible threshold we have
        index = int(round(bins * threshold))
-        f1_a_priori = avg_metrics["f1_score"][index]
-        actual_threshold = avg_metrics["threshold"][index]
+        f1_a_priori = avg_measures["f1_score"][index]
+        actual_threshold = avg_measures["threshold"][index]
 
         logger.info(
             f"F1-score of {f1_a_priori:.5f}, at threshold "
@@ -337,11 +337,11 @@ def run(
     if output_folder is not None:
         logger.info(f"Output folder: {output_folder}")
         os.makedirs(output_folder, exist_ok=True)
-        metrics_path = os.path.join(output_folder, f"{name}.csv")
+        measures_path = os.path.join(output_folder, f"{name}.csv")
         logger.info(
-            f"Saving averages over all input images at {metrics_path}..."
+            f"Saving averages over all input images at {measures_path}..."
         )
-        avg_metrics.to_csv(metrics_path)
+        avg_measures.to_csv(measures_path)
 
     return maxf1_threshold
 
@@ -364,7 +364,7 @@ def compare_annotators(baseline, other, name, output_folder,
 
     name : str
         the local name of this dataset (e.g. ``train-second-annotator``, or
-        ``test-second-annotator``), to be used when saving metrics files.
+        ``test-second-annotator``), to be used when saving measures files.
 
     output_folder : str
         folder where to store results
@@ -378,7 +378,7 @@ def compare_annotators(baseline, other, name, output_folder,
     logger.info(f"Output folder: {output_folder}")
     os.makedirs(output_folder, exist_ok=True)
 
-    # Collect overall metrics
+    # Collect overall measures
     data = {}
 
     for baseline_sample, other_sample in tqdm(
@@ -392,7 +392,7 @@ def compare_annotators(baseline, other, name, output_folder,
             raise RuntimeError(
                 f"{stem} entry already exists in data. " f"Cannot overwrite."
             )
-        data[stem] = _sample_metrics(pred, gt, 2)
+        data[stem] = _sample_measures(pred, gt, 2)
 
         if overlayed_folder is not None:
             overlay_image = _sample_analysis(
@@ -405,33 +405,33 @@ def compare_annotators(baseline, other, name, output_folder,
             overlay_image.save(fullpath)
 
     # Merges all dataframes together
-    df_metrics = pandas.concat(data.values())
-    df_metrics.drop(0, inplace=True)
+    df_measures = pandas.concat(data.values())
+    df_measures.drop(0, inplace=True)
 
     # Report and Averages
-    avg_metrics = df_metrics.groupby("index").mean()
-    std_metrics = df_metrics.groupby("index").std()
+    avg_measures = df_measures.groupby("index").mean()
+    std_measures = df_measures.groupby("index").std()
 
     # Uncomment below for F1-score calculation based on average precision and
     # {name} instead of F1-scores of individual images. This method is in line
     # with Maninis et. al. (2016)
     #
-    # avg_metrics["f1_score"] = \
-    #     (2* avg_metrics["precision"]*avg_metrics["recall"])/ \
-    #     (avg_metrics["precision"]+avg_metrics["recall"])
-
-    avg_metrics["std_pr"] = std_metrics["precision"]
-    avg_metrics["pr_upper"] = avg_metrics["precision"] + std_metrics["precision"]
-    avg_metrics["pr_lower"] = avg_metrics["precision"] - std_metrics["precision"]
-    avg_metrics["std_re"] = std_metrics["recall"]
-    avg_metrics["re_upper"] = avg_metrics["recall"] + std_metrics["recall"]
-    avg_metrics["re_lower"] = avg_metrics["recall"] - std_metrics["recall"]
-    avg_metrics["std_f1"] = std_metrics["f1_score"]
-
-    metrics_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
-    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
-    logger.info(f"Saving averages over all input images at {metrics_path}...")
-    avg_metrics.to_csv(metrics_path)
-
-    maxf1 = avg_metrics["f1_score"].max()
+    # avg_measures["f1_score"] = \
+    #     (2* avg_measures["precision"]*avg_measures["recall"])/ \
+    #     (avg_measures["precision"]+avg_measures["recall"])
+
+    avg_measures["std_pr"] = std_measures["precision"]
+    avg_measures["pr_upper"] = avg_measures["precision"] + std_measures["precision"]
+    avg_measures["pr_lower"] = avg_measures["precision"] - std_measures["precision"]
+    avg_measures["std_re"] = std_measures["recall"]
+    avg_measures["re_upper"] = avg_measures["recall"] + std_measures["recall"]
+    avg_measures["re_lower"] = avg_measures["recall"] - std_measures["recall"]
+    avg_measures["std_f1"] = std_measures["f1_score"]
+
+    measures_path = os.path.join(output_folder, "second-annotator", f"{name}.csv")
+    os.makedirs(os.path.dirname(measures_path), exist_ok=True)
+    logger.info(f"Saving averages over all input images at {measures_path}...")
+    avg_measures.to_csv(measures_path)
+
+    maxf1 = avg_measures["f1_score"].max()
     logger.info(f"F1-score of {maxf1:.5f} (second annotator; threshold=0.5)")
diff --git a/bob/ip/binseg/engine/ssltrainer.py b/bob/ip/binseg/engine/ssltrainer.py
index d56202074bd77c4513b243eff0f71dfec4e8307e..d8b66b69d5e729de87abafd4963a9fc71a4a87d9 100644
--- a/bob/ip/binseg/engine/ssltrainer.py
+++ b/bob/ip/binseg/engine/ssltrainer.py
@@ -12,7 +12,7 @@ import pandas
 import torch
 from tqdm import tqdm
 
-from ..utils.metric import SmoothedValue
+from ..utils.measure import SmoothedValue
 from ..utils.plot import loss_curve
 
 import logging
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index d5591526fc149248f950e69694443335c85728a0..00f9318212e999c742f0c87380f6b45cf1c61a5a 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -11,7 +11,7 @@ import distutils.version
 import torch
 from tqdm import tqdm
 
-from ..utils.metric import SmoothedValue
+from ..utils.measure import SmoothedValue
 from ..utils.summary import summary
 from ..utils.resources import cpu_constants, gpu_constants, cpu_log, gpu_log
 
diff --git a/bob/ip/binseg/script/analyze.py b/bob/ip/binseg/script/analyze.py
index bd66611d635c5a31b7163c0b69eb9da1ee5e955e..8a7e502139a47ae71292c173f211c3a92ac973f9 100644
--- a/bob/ip/binseg/script/analyze.py
+++ b/bob/ip/binseg/script/analyze.py
@@ -149,7 +149,7 @@ def analyze(
            └── second-annotator/  #if set, store overlayed images for the
                                   #second annotator here
        └── analysis /  #the outputs of the analysis of both train/test sets
-                       #includes second-annotator "metrics" as well, if
+                       #includes second-annotator "measures" as well, if
                        # configured
 
     N.B.: The tool is designed to prevent analysis bias and allows one to
diff --git a/bob/ip/binseg/script/compare.py b/bob/ip/binseg/script/compare.py
index 813a5cb8c392ceb2eb280d9e45a36752b01839b5..dd06106de0a5a4ba7871a9a5910aaa4fa887f2f7 100644
--- a/bob/ip/binseg/script/compare.py
+++ b/bob/ip/binseg/script/compare.py
@@ -55,11 +55,11 @@ def _load(data, threshold=None):
 
     data : dict
         A dict in which keys are the names of the systems and the values are
-        paths to ``metrics.csv`` style files.
+        paths to ``measures.csv`` style files.
 
     threshold : :py:class:`float`, :py:class:`str`, Optional
         A value indicating which threshold to choose for selecting a "F1-score"
-        If set to ``None``, then use the maximum F1-score on that metrics file.
+        If set to ``None``, then use the maximum F1-score on that measures file.
         If set to a floating-point value, then use the F1-score that is
         obtained on that particular threshold. If set to a string, it should
         match one of the keys in ``data``. It then first calculate the
@@ -74,7 +74,7 @@ def _load(data, threshold=None):
        A dict in which keys are the names of the systems and the values are
        dictionaries that contain two keys:
 
-        * ``df``: A :py:class:`pandas.DataFrame` with the metrics data loaded
+        * ``df``: A :py:class:`pandas.DataFrame` with the measures data loaded
          to
        * ``threshold``: A threshold to be used for summarization, depending
          on the ``threshold`` parameter set on the input
@@ -84,8 +84,8 @@ def _load(data, threshold=None):
     if isinstance(threshold, str):
         logger.info(f"Calculating threshold from maximum F1-score at "
                 f"'{threshold}' dataset...")
-        metrics_path = data[threshold]
-        df = pandas.read_csv(metrics_path)
+        measures_path = data[threshold]
+        df = pandas.read_csv(measures_path)
 
         maxf1 = df.f1_score.max()
         use_threshold = df.threshold[df.f1_score.idxmax()]
@@ -101,10 +101,10 @@ def _load(data, threshold=None):
 
     # loads all data
     retval = {}
-    for name, metrics_path in data.items():
+    for name, measures_path in data.items():
 
-        logger.info(f"Loading metrics from {metrics_path}...")
-        df = pandas.read_csv(metrics_path)
+        logger.info(f"Loading measures from {measures_path}...")
+        df = pandas.read_csv(measures_path)
 
         if threshold is None:
             use_threshold = df.threshold[df.f1_score.idxmax()]
@@ -119,9 +119,9 @@
     epilog="""Examples:
 
 \b
-    1. Compares system A and B, with their own pre-computed metric files:
+    1. Compares system A and B, with their own pre-computed measure files:
 \b
-       $ bob binseg compare -vv A path/to/A/metrics.csv B path/to/B/metrics.csv
+       $ bob binseg compare -vv A path/to/A/train.csv B path/to/B/test.csv
 """,
 )
 @click.argument(
@@ -182,7 +182,7 @@ def compare(label_path, output_figure, table_format, output_table, threshold,
 
     threshold = _validate_threshold(threshold, data)
 
-    # load all data metrics
+    # load all data measures
     data = _load(data, threshold=threshold)
 
     if output_figure is not None:
diff --git a/bob/ip/binseg/script/experiment.py b/bob/ip/binseg/script/experiment.py
index cbbfd56f0754327b6bb93abde03b4718c387d930..050910c38c29745382de7c7d9e310db427f7f9ea 100644
--- a/bob/ip/binseg/script/experiment.py
+++ b/bob/ip/binseg/script/experiment.py
@@ -247,7 +247,7 @@ def experiment(
            └── second-annotator/  #if set, store overlayed images for the
                                   #second annotator here
        └── analysis /  #the outputs of the analysis of both train/test sets
-                       #includes second-annotator "metrics" as well, if
+                       #includes second-annotator "measures" as well, if
                        # configured
 
     Training is performed for a configurable number of epochs, and generates at
diff --git a/bob/ip/binseg/test/test_batchmetrics.py b/bob/ip/binseg/test/test_batchmeasures.py
similarity index 85%
rename from bob/ip/binseg/test/test_batchmetrics.py
rename to bob/ip/binseg/test/test_batchmeasures.py
index 09ffe250a805a00718d2cb4687ea2bbe5e49daf4..d6fb2cb8e768546cff2be14558ccae25d619d716 100644
--- a/bob/ip/binseg/test/test_batchmetrics.py
+++ b/bob/ip/binseg/test/test_batchmeasures.py
@@ -9,7 +9,7 @@ import torch
 import pandas
 import numpy
 
-from ..engine.evaluator import _sample_metrics
+from ..engine.evaluator import _sample_measures
 
 import logging
 logger = logging.getLogger(__name__)
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 class Tester(unittest.TestCase):
     """
-    Unit test for batch metrics
+    Unit test for batch measures
     """
 
     def setUp(self):
@@ -29,10 +29,10 @@
         self.ground_truths = torch.randint(low=0, high=2, size=(2, 1, 420, 420))
         self.names = ["Bob", "Tim"]
 
-    def test_batch_metrics(self):
+    def test_batch_measures(self):
         dfs = []
         for pred, gt in zip(self.predictions, self.ground_truths):
-            dfs.append(_sample_metrics(pred, gt, 100))
+            dfs.append(_sample_measures(pred, gt, 100))
 
         bm = pandas.concat(dfs)
         self.assertEqual(len(bm), 2 * 100)
diff --git a/bob/ip/binseg/test/test_cli.py b/bob/ip/binseg/test/test_cli.py
index 0b0d20af4fb3fe312afb2c95ed7acd577d737b40..74187b1ca1842dca8d0043a85a14a25cfd5d0ed1 100644
--- a/bob/ip/binseg/test/test_cli.py
+++ b/bob/ip/binseg/test/test_cli.py
@@ -178,7 +178,7 @@ def _check_experiment_stare(overlay):
         r"^F1-score of.*\(second annotator; threshold=0.5\)$": 2,
         r"^Ended evaluation$": 1,
         r"^Started comparison$": 1,
-        r"^Loading metrics from": 4,
+        r"^Loading measures from": 4,
         r"^Creating and saving plot at": 1,
         r"^Tabulating performance summary...": 1,
         r"^Saving table at": 1,
@@ -403,7 +403,7 @@ def _check_compare(runner):
             compare,
             [
                 "-vv",
-                # label - path to metrics
+                # label - path to measures
                 "test",
                 os.path.join(output_folder, "test.csv"),
                 "test (2nd. human)",
@@ -418,7 +418,7 @@ def _check_compare(runner):
         assert os.path.exists("comparison.rst")
 
         keywords = {
-            r"^Loading metrics from": 2,
+            r"^Loading measures from": 2,
             r"^Creating and saving plot at": 1,
             r"^Tabulating performance summary...": 1,
             r"^Saving table at": 1,
diff --git a/bob/ip/binseg/utils/table.py b/bob/ip/binseg/utils/table.py
index e8c4d64c3b65c52c0741c7f44b62284cf214ad71..097891260e9171b19b1f89fc40246b344e7815ad 100644
--- a/bob/ip/binseg/utils/table.py
+++ b/bob/ip/binseg/utils/table.py
@@ -3,7 +3,7 @@
 
 import tabulate
 
-from .metric import auc
+from .measure import auc
 
 
 def performance_table(data, fmt):