From 2e79aae009be598b2e78da47cb31a83b5acb90cf Mon Sep 17 00:00:00 2001
From: Gokhan Ozbulak <gokhan.ozbulak@idiap.ch>
Date: Tue, 28 May 2024 15:19:47 +0200
Subject: [PATCH] Change flag for batch accumulation. #25

---
 src/mednet/engine/trainer.py     | 17 +++++++++--------
 src/mednet/scripts/experiment.py |  4 ++--
 src/mednet/scripts/train.py      | 27 ++++++++++++++-------------
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/mednet/engine/trainer.py b/src/mednet/engine/trainer.py
index 5ea8ccae..0fe79c81 100644
--- a/src/mednet/engine/trainer.py
+++ b/src/mednet/engine/trainer.py
@@ -26,7 +26,7 @@ def run(
     max_epochs: int,
     output_folder: pathlib.Path,
     monitoring_interval: int | float,
-    batch_chunk_count: int,
+    accumulate_grad_batches: int,
     checkpoint: pathlib.Path | None,
 ):
     """Fit a CNN model using supervised learning and save it to disk.
@@ -60,12 +60,13 @@ def run(
     monitoring_interval
         Interval, in seconds (or fractions), through which we should monitor
         resources during training.
-    batch_chunk_count
-        If this number is different than 1, then each batch will be divided in
-        this number of chunks. Gradients will be accumulated to perform each
-        mini-batch. This is particularly interesting when one has limited RAM
-        on the GPU, but would like to keep training with larger batches. One
-        exchanges for longer processing times in this case.
+    accumulate_grad_batches
+        Number of accumulations for backward propagation to accumulate gradients
+        over k batches before stepping the optimizer. The default of 1 forces
+        the whole batch to be processed at once. Otherwise the batch is multiplied
+        by accumulate-grad-batches pieces, and gradients are accumulated to complete
+        each step. This is especially interesting when one is training on GPUs with
+        a limited amount of onboard RAM.
     checkpoint
         Path to an optional checkpoint file to load.
     """
@@ -118,7 +119,7 @@ def run(
         accelerator=accelerator,
         devices=devices,
         max_epochs=max_epochs,
-        accumulate_grad_batches=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         logger=tensorboard_logger,
         check_val_every_n_epoch=validation_period,
         log_every_n_steps=len(datamodule.train_dataloader()),
diff --git a/src/mednet/scripts/experiment.py b/src/mednet/scripts/experiment.py
index 67e16b64..12a70115 100644
--- a/src/mednet/scripts/experiment.py
+++ b/src/mednet/scripts/experiment.py
@@ -40,7 +40,7 @@ def experiment(
     output_folder,
     epochs,
     batch_size,
-    batch_chunk_count,
+    accumulate_grad_batches,
     drop_incomplete_batch,
     datamodule,
     validation_period,
@@ -79,7 +79,7 @@ def experiment(
         output_folder=train_output_folder,
         epochs=epochs,
         batch_size=batch_size,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         drop_incomplete_batch=drop_incomplete_batch,
         datamodule=datamodule,
         validation_period=validation_period,
diff --git a/src/mednet/scripts/train.py b/src/mednet/scripts/train.py
index efbdac88..8ae78f79 100644
--- a/src/mednet/scripts/train.py
+++ b/src/mednet/scripts/train.py
@@ -79,18 +79,19 @@ def reusable_options(f):
         cls=ResourceOption,
     )
     @click.option(
-        "--batch-chunk-count",
-        "-c",
-        help="Number of chunks in every batch (this parameter affects "
-        "memory requirements for the network). The number of samples "
-        "loaded for every iteration will be batch-size*batch-chunk-count. "
-        "This parameter is used to reduce the number of samples loaded in each "
-        "iteration, in order to reduce the memory usage in exchange for "
-        "processing time (more iterations). This is especially interesting "
+        "--accumulate-grad-batches",
+        "-a",
+        help="Number of accumulations for backward propagation to accumulate "
+        "gradients over k batches before stepping the optimizer. This "
+        "parameter, used in conjunction with the batch-size, may be used to "
+        "reduce the number of samples loaded in each iteration, to affect memory "
+        "usage in exchange for processing time (more iterations), since the "
+        "optimizer only steps after the requested number of batches has been "
+        "accumulated. This is especially interesting "
         "when one is training on GPUs with limited RAM. The default of 1 forces "
         "the whole batch to be processed at once. Otherwise the batch is "
-        "multiplied by batch-chunk-count pieces, and gradients are accumulated "
-        "to complete each batch.",
+        "multiplied by accumulate-grad-batches pieces, and gradients are accumulated "
+        "to complete each step.",
         required=True,
         show_default=True,
         default=1,
@@ -235,7 +236,7 @@ def train(
     output_folder,
     epochs,
     batch_size,
-    batch_chunk_count,
+    accumulate_grad_batches,
     drop_incomplete_batch,
     datamodule,
     validation_period,
@@ -340,7 +341,7 @@ def train(
         split_name=datamodule.split_name,
         epochs=epochs,
         batch_size=batch_size,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         drop_incomplete_batch=drop_incomplete_batch,
         validation_period=validation_period,
         cache_samples=cache_samples,
@@ -363,6 +364,6 @@ def train(
         max_epochs=epochs,
         output_folder=output_folder,
         monitoring_interval=monitoring_interval,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         checkpoint=checkpoint_file,
     )
--
GitLab
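
For reference, the minimal sketch below is not part of the patch: it only illustrates how the renamed option maps onto the PyTorch Lightning Trainer call patched in src/mednet/engine/trainer.py. The value 4, the "auto"/1 accelerator settings, and the model/datamodule objects are placeholders; the other Trainer arguments used by mednet are omitted.

import lightning.pytorch as pl

# With accumulate_grad_batches=k, Lightning runs the forward/backward pass on k
# consecutive batches (each with batch_size samples) and only then steps the
# optimizer, so the effective batch size is batch_size * k while peak GPU memory
# stays close to that of a single batch.
accumulate_grad_batches = 4  # e.g. passed via `--accumulate-grad-batches 4` / `-a 4`

trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=10,
    accumulate_grad_batches=accumulate_grad_batches,
)

# trainer.fit(model, datamodule=datamodule)  # model and datamodule as configured elsewhere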