Commit 894bae97 authored by André Anjos

[script.analyze] Fix location of metrics files after evaluator changes

parent 49790bc8
1 merge request: !12 Streamlining
#!/usr/bin/env python
# coding=utf-8
import os
import click
from bob.extension.scripts.click_helper import (
    verbosity_option,
    ConfigCommand,
    ResourceOption,
)

from .binseg import save_sh_command

import logging

logger = logging.getLogger(__name__)
@click.command(
    entry_point_group="bob.ip.binseg.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Re-evaluates a pre-trained M2U-Net model with DRIVE (vessel
       segmentation), on the CPU, by running inference and evaluation on
       results from its test set:

       $ bob binseg analyze -vv m2unet drive --weight=model.path
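
\b
    2. A variation of the previous example that also saves overlayed images,
       marking true and false positives and false negatives on the inputs
       (simply add the ``--overlayed`` flag described below):

       $ bob binseg analyze -vv m2unet drive --weight=model.path --overlayed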
""",
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store experiment outputs (created if it does not exist)",
    required=True,
    type=click.Path(),
    default="results",
    cls=ResourceOption,
)
@click.option(
    "--model",
    "-m",
    help="A torch.nn.Module instance implementing the network to be evaluated",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "bob.ip.binseg.data.utils.SampleList2TorchDataset instances. All "
    "datasets whose names do not start with an underscore (``_``) are used "
    "for prediction and evaluation. Dataset descriptions include all "
    "required pre-processing, including any data augmentation, which may "
    "be excluded for prediction and evaluation purposes",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--second-annotator",
    "-S",
    help="A dataset or dictionary, like in --dataset, with the same "
    "sample keys, but with annotations from a different annotator that is "
    "going to be compared to the one in --dataset",
    required=False,
    default=None,
    cls=ResourceOption,
    show_default=True,
)
@click.option(
    "--batch-size",
    "-b",
    help="Number of samples in every batch (this parameter affects "
    "memory requirements for the network). If the number of samples in "
    "the batch is larger than the total number of samples available, "
    "this value is truncated. If this number is smaller, then batches of "
    "the specified size are created and fed to the network until there "
    "are no more new samples to feed (epoch is finished). If the total "
    "number of samples is not a multiple of the batch-size, the last "
    "batch will be smaller than the first.",
    required=True,
    show_default=True,
    default=1,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--device",
    "-x",
    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
    show_default=True,
    required=True,
    default="cpu",
    cls=ResourceOption,
)
@click.option(
    "--overlayed/--no-overlayed",
    "-O",
    help="Creates overlayed representations of the output probability maps, "
    "similar to --overlayed in prediction-mode, except it includes "
    "distinctive colours for true and false positives and false negatives. "
    "If not set, overlayed images are **not** produced.",
    show_default=True,
    default=False,
    required=False,
    cls=ResourceOption,
)
@click.option(
    "--weight",
    "-w",
    help="Path or URL to pretrained model file (.pth extension)",
    required=True,
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption)
@click.pass_context
def analyze(
    ctx,
    model,
    output_folder,
    batch_size,
    dataset,
    second_annotator,
    device,
    overlayed,
    weight,
    verbose,
    **kwargs,
):
"""Runs a complete evaluation from prediction to comparison
This script is just a wrapper around the individual scripts for running
prediction and evaluating FCN models. It organises the output in a
preset way:
.. code-block:: text
└─ <output-folder>/
├── predictions/ #the prediction outputs for the train/test set
├── overlayed/ #the overlayed outputs for the train/test set
├── predictions/ #predictions overlayed on the input images
├── analysis/ #predictions overlayed on the input images
├ #including analysis of false positives, negatives
├ #and true positives
└── second-annotator/ #if set, store overlayed images for the
#second annotator here
└── analysis / #the outputs of the analysis of both train/test sets
#includes second-annotator "metrics" as well, if
# configured
N.B.: The tool is designed to prevent analysis bias and allows one to
provide separate subsets for training and evaluation. Instead of using
simple datasets, datasets for full experiment running should be
dictionaries with specific subset names:
* ``__train__``: dataset used for training, prioritarily. It is typically
the dataset containing data augmentation pipelines.
* ``train`` (optional): a copy of the ``__train__`` dataset, without data
augmentation, that will be evaluated alongside other sets available
* ``*``: any other name, not starting with an underscore character (``_``),
will be considered a test set for evaluation.
N.B.2: The threshold used for calculating the F1-score on the test set, or
overlay analysis (false positives, negatives and true positives overprinted
on the original image) also follows the logic above.
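
    For illustration only, a compatible dataset dictionary could look like
    the following sketch (``augmented_train``, ``train`` and ``test`` are
    hypothetical placeholders for SampleList2TorchDataset instances defined
    elsewhere in a configuration resource):

    .. code-block:: python

       # hypothetical configuration sketch: keys starting with an underscore
       # are reserved for training; all remaining keys are predicted,
       # evaluated and compared by this script
       dataset = {
           "__train__": augmented_train,  # with data augmentation pipelines
           "train": train,                # same samples, no augmentation
           "test": test,                  # held-out set used for evaluation
       }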
"""
    command_sh = os.path.join(output_folder, "command.sh")
    if not os.path.exists(command_sh):
        # only save if the experiment has not yet stored a similar file
        save_sh_command(command_sh)
    ## Prediction
    logger.info("Started prediction")

    from .predict import predict

    predictions_folder = os.path.join(output_folder, "predictions")
    overlayed_folder = (
        os.path.join(output_folder, "overlayed", "predictions")
        if overlayed
        else None
    )

    ctx.invoke(
        predict,
        output_folder=predictions_folder,
        model=model,
        dataset=dataset,
        batch_size=batch_size,
        device=device,
        weight=weight,
        overlayed=overlayed_folder,
        verbose=verbose,
    )
    logger.info("Ended prediction")
    ## Evaluation
    logger.info("Started evaluation")

    from .evaluate import evaluate

    overlayed_folder = (
        os.path.join(output_folder, "overlayed", "analysis")
        if overlayed
        else None
    )

    # choosing the overlayed threshold: prefer tuning it on a validation set,
    # fall back to the training set, and use a fixed 0.5 if neither exists
    if "validation" in dataset:
        threshold = "validation"
    elif "train" in dataset:
        threshold = "train"
    else:
        threshold = 0.5
    logger.info(f"Setting --threshold={threshold}...")

    analysis_folder = os.path.join(output_folder, "analysis")
    ctx.invoke(
        evaluate,
        output_folder=analysis_folder,
        predictions_folder=predictions_folder,
        dataset=dataset,
        second_annotator=second_annotator,
        overlayed=overlayed_folder,
        threshold=threshold,
        verbose=verbose,
    )
    logger.info("Ended evaluation")
    ## Comparison
    logger.info("Started comparison")

    # compare performances on the various sets; the compare script expects a
    # flat list alternating labels and paths to the corresponding CSV files
    from .compare import compare

    systems = []
    for k, v in dataset.items():
        if k.startswith("_"):
            logger.info(f"Skipping dataset '{k}' (not to be compared)")
            continue
        systems += [k, os.path.join(analysis_folder, f"{k}.csv")]
    if second_annotator is not None:
        for k, v in second_annotator.items():
            if k.startswith("_"):
                logger.info(f"Skipping dataset '{k}' (not to be compared)")
                continue
            systems += [
                f"{k} (2nd. annot.)",
                os.path.join(analysis_folder, "second-annotator", f"{k}.csv"),
            ]
    output_figure = os.path.join(output_folder, "comparison.pdf")
    output_table = os.path.join(output_folder, "comparison.rst")

    ctx.invoke(
        compare,
        label_path=systems,
        output_figure=output_figure,
        output_table=output_table,
        threshold=threshold,
        verbose=verbose,
    )
    logger.info("Ended comparison")