Commit a9dbca59 authored 1 year ago by Daniel CARRON

Added experiment script

parent 84c7a7f7
Pipeline #76442 failed 1 year ago (stages: qa, test, doc, dist)
Showing 1 changed file:

src/ptbench/scripts/experiment.py (new file, 0 → 100644): 342 additions, 0 deletions
# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later

import os
import shutil

import click

from clapper.click import ConfigCommand, ResourceOption, verbosity_option
from clapper.logging import setup

logger = setup(__name__.split(".")[0], format="%(levelname)s: %(message)s")

from .utils import save_sh_command


@click.command(
    entry_point_group="ptbench.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
 1. Trains a pasa model with shenzhen dataset, on the CPU, for only two
    epochs, then runs inference and evaluation on stock datasets, reporting
    performance as a table and a figure:

   .. code:: sh

      $ ptbench experiment -vv pasa shenzhen --epochs=2
""",
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store experiment outputs (created if does not exist)",
    required=True,
    type=click.Path(),
    default="results",
    cls=ResourceOption,
)
@click.option(
    "--model",
    "-m",
    help="A lightning module instance implementing the network to be trained",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--datamodule",
    "-d",
    help="A lightning data module containing the training and validation sets",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--batch-size",
    "-b",
    help="Number of samples in every batch (this parameter affects "
    "memory requirements for the network). If the number of samples in "
    "the batch is larger than the total number of samples available for "
    "training, this value is truncated. If this number is smaller, then "
    "batches of the specified size are created and fed to the network "
    "until there are no more new samples to feed (epoch is finished). "
    "If the total number of training samples is not a multiple of the "
    "batch-size, the last batch will be smaller than the first, unless "
    "--drop-incomplete-batch is set, in which case this batch is not used.",
    required=True,
    show_default=True,
    default=1,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--batch-chunk-count",
    "-c",
    help="Number of chunks in every batch (this parameter affects "
    "memory requirements for the network). The number of samples "
    "loaded for every iteration will be batch-size/batch-chunk-count. "
    "batch-size needs to be divisible by batch-chunk-count, otherwise an "
    "error will be raised. This parameter is used to reduce the number of "
    "samples loaded in each iteration, in order to reduce the memory usage "
    "in exchange for processing time (more iterations). This is especially "
    "interesting when one is running on GPUs with limited RAM. The "
    "default of 1 forces the whole batch to be processed at once. Otherwise "
    "the batch is broken into batch-chunk-count pieces, and gradients are "
    "accumulated to complete each batch.",
    required=True,
    show_default=True,
    default=1,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--drop-incomplete-batch/--no-drop-incomplete-batch",
    "-D",
    help="If set, may drop the last batch in an epoch if it is "
    "incomplete. If you set this option, you should also consider "
    "increasing the total number of training epochs, as the total number "
    "of training steps may be reduced.",
    required=True,
    show_default=True,
    default=False,
    cls=ResourceOption,
)
@click.option(
    "--epochs",
    "-e",
    help="Number of epochs (complete training set passes) to train for. "
    "If continuing from a saved checkpoint, ensure you provide a greater "
    "number of epochs than the one saved in the checkpoint to be loaded.",
    show_default=True,
    required=True,
    default=1000,
    type=click.IntRange(min=1),
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-period",
    "-p",
    help="Number of epochs after which a checkpoint is saved. "
    "A value of zero disables check-pointing. If checkpointing is "
    "enabled and training stops, it is automatically resumed from the "
    "last saved checkpoint if training is restarted with the same "
    "configuration.",
    show_default=True,
    required=False,
    default=None,
    type=click.IntRange(min=0),
    cls=ResourceOption,
)
@click.option(
    "--device",
    "-d",
    help='A string indicating the device to use (e.g. "cpu" or "cuda:0")',
    show_default=True,
    required=True,
    default="cpu",
    cls=ResourceOption,
)
@click.option(
    "--cache-samples/--no-cache-samples",
    help="If set to True, loads the samples into memory; "
    "otherwise loads them at runtime.",
    required=True,
    show_default=True,
    default=False,
    cls=ResourceOption,
)
@click.option(
    "--seed",
    "-s",
    help="Seed to use for the random number generator",
    show_default=True,
    required=False,
    default=42,
    type=click.IntRange(min=0),
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-P",
    help="""Use multiprocessing for data loading: if set to -1 (default),
    disables multiprocessing data loading. Set to 0 to enable as many data
    loading instances as there are processing cores in the system. Set to
    >= 1 to enable that many multiprocessing instances for data loading.""",
    type=click.IntRange(min=-1),
    show_default=True,
    required=True,
    default=-1,
    cls=ResourceOption,
)
@click.option(
    "--monitoring-interval",
    "-I",
    help="""Time between checks for the use of resources during each training
    epoch. An interval of 5 seconds, for example, will lead to CPU and GPU
    resources being probed every 5 seconds during each training epoch.
    Values registered in the training logs correspond to averages (or maxima)
    observed through possibly many probes in each epoch. Notice that setting a
    very small value may cause the probing process to become extremely busy,
    potentially biasing the overall perception of resource usage.""",
    type=click.FloatRange(min=0.1),
    show_default=True,
    required=True,
    default=5.0,
    cls=ResourceOption,
)
@click.option(
    "--resume-from",
    help="Which checkpoint to resume training from. If set, can be one of "
    "`best`, `last`, or a path to a model checkpoint.",
    type=str,
    required=False,
    default=None,
    cls=ResourceOption,
)
@click.option(
    "--balance-classes/--no-balance-classes",
    "-B/-N",
    help="""If set, balances the weights of the random sampler during
    training so that samples from all sample classes are picked
    equitably. It also sets the training (and validation) losses to account
    for the populations of each class.""",
    required=True,
    show_default=True,
    default=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="This number is used to define the number of threshold steps to "
    "consider when evaluating the highest possible F1-score on test data.",
    default=1000,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--plot-limits",
    "-L",
    help="""If set, this option affects the performance comparison plots. It
    must be a 4-tuple containing the bounds of the plot for the x and y axes
    respectively (format: [x_low, x_high, y_low, y_high]). If not set, uses
    the normal bounds ([0, 1, 0, 1]) for the performance curve.""",
    default=[0.0, 1.0, 0.0, 1.0],
    show_default=True,
    nargs=4,
    type=float,
    cls=ResourceOption,
)
@verbosity_option(logger=logger, cls=ResourceOption)
@click.pass_context
def experiment(
    ctx,
    model,
    output_folder,
    epochs,
    batch_size,
    batch_chunk_count,
    drop_incomplete_batch,
    datamodule,
    checkpoint_period,
    device,
    cache_samples,
    seed,
    parallel,
    monitoring_interval,
    resume_from,
    balance_classes,
    steps,
    **kwargs,
):
    """Runs a complete experiment, from training to prediction and evaluation.

    This script is just a wrapper around the individual scripts for training,
    running prediction, evaluating and comparing model performance. It
    organises the output in a preset way::

    \b
       └─ <output-folder>/
          ├── command
          ├── model/  # the generated model will be here
          ├── predictions/  # the prediction outputs for the sets
          └── evaluations/  # the outputs of the evaluations for the sets
    """

    command_sh = os.path.join(output_folder, "command.sh")
    if os.path.exists(command_sh):
        # keep a backup copy (command.sh~) of the previous run's command
        backup = command_sh + "~"
        if os.path.exists(backup):
            os.unlink(backup)
        shutil.move(command_sh, backup)
    save_sh_command(output_folder)

    # training
    logger.info("Started training")

    from .train import train

    train_output_folder = os.path.join(output_folder, "model")

    ctx.invoke(
        train,
        model=model,
        output_folder=train_output_folder,
        epochs=epochs,
        batch_size=batch_size,
        batch_chunk_count=batch_chunk_count,
        drop_incomplete_batch=drop_incomplete_batch,
        datamodule=datamodule,
        checkpoint_period=checkpoint_period,
        device=device,
        cache_samples=cache_samples,
        seed=seed,
        parallel=parallel,
        monitoring_interval=monitoring_interval,
        resume_from=resume_from,
        balance_classes=balance_classes,
    )
    logger.info("Ended training")

    # prediction
    logger.info("Started predicting")

    from .predict import predict

    # preferably, we use the best model on the validation set
    # otherwise, we get the last saved model
    model_file = os.path.join(
        train_output_folder, "model_lowest_valid_loss.ckpt"
    )
    if not os.path.exists(model_file):
        model_file = os.path.join(train_output_folder, "model_final_epoch.ckpt")

    predictions_folder = os.path.join(output_folder, "predictions")

    ctx.invoke(
        predict,
        output_folder=predictions_folder,
        model=model,
        datamodule=datamodule,
        device=device,
        weight=model_file,
    )
    logger.info("Ended predicting")

    # evaluation
    logger.info("Started evaluating")

    from .evaluate import evaluate

    evaluations_folder = os.path.join(output_folder, "evaluations")

    ctx.invoke(
        evaluate,
        output_folder=evaluations_folder,
        predictions_folder=predictions_folder,
        datamodule=datamodule,
        threshold="train",
        steps=steps,
    )
    logger.info("Ended evaluating")
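The epilog above shows the smallest possible run. The batch options compose with it: per the --batch-chunk-count help text, each iteration loads batch-size/batch-chunk-count samples and accumulates gradients across the chunks, so --batch-size=16 --batch-chunk-count=4 loads 16/4 = 4 samples at a time and accumulates over 4 chunks per optimizer step. A hypothetical invocation along those lines, reusing the pasa and shenzhen configs from the epilog's own example:

.. code:: sh

   $ ptbench experiment -vv pasa shenzhen --epochs=2 \
       --batch-size=16 --batch-chunk-count=4 --output-folder=results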
This diff is collapsed.
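The divisibility constraint stated in that help text is simple to express on its own. A minimal sketch, where chunk_size is a hypothetical helper and not part of ptbench:

.. code:: py

   def chunk_size(batch_size: int, batch_chunk_count: int) -> int:
       # number of samples loaded per iteration, as described in the
       # --batch-chunk-count help: batch-size must divide evenly
       if batch_size % batch_chunk_count:
           raise ValueError("batch-size must be divisible by batch-chunk-count")
       return batch_size // batch_chunk_count


   assert chunk_size(16, 4) == 4  # 16/4 = 4 samples per iteration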
Click to expand it.
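Internally, experiment() chains the existing train, predict and evaluate commands through click's ctx.invoke(), which calls another command's callback in-process instead of spawning subprocesses. A minimal, self-contained sketch of that pattern, with hypothetical command names (step and wrapper are not part of ptbench):

.. code:: py

   import click


   @click.command()
   @click.option("--output-folder", default="results")
   def step(output_folder):
       # stand-in for one of the chained commands (train/predict/evaluate)
       click.echo(f"step writing to {output_folder}")


   @click.command()
   @click.option("--output-folder", default="results")
   @click.pass_context
   def wrapper(ctx, output_folder):
       # ctx.invoke() runs the target callback directly, bypassing its
       # command-line parsing; parameters must be forwarded explicitly,
       # which is why experiment() passes each option through by name.
       ctx.invoke(step, output_folder=output_folder + "/sub")


   if __name__ == "__main__":
       wrapper()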
Daniel CARRON (@dcarron) mentioned this commit in issue #19 (closed) · 1 year ago