From 2e79aae009be598b2e78da47cb31a83b5acb90cf Mon Sep 17 00:00:00 2001
From: Gokhan Ozbulak <gokhan.ozbulak@idiap.ch>
Date: Tue, 28 May 2024 15:19:47 +0200
Subject: [PATCH] Change flag for batch accumulation. #25

---
 src/mednet/engine/trainer.py     | 17 +++++++++--------
 src/mednet/scripts/experiment.py |  4 ++--
 src/mednet/scripts/train.py      | 27 ++++++++++++++-------------
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/mednet/engine/trainer.py b/src/mednet/engine/trainer.py
index 5ea8ccae..0fe79c81 100644
--- a/src/mednet/engine/trainer.py
+++ b/src/mednet/engine/trainer.py
@@ -26,7 +26,7 @@ def run(
     max_epochs: int,
     output_folder: pathlib.Path,
     monitoring_interval: int | float,
-    batch_chunk_count: int,
+    accumulate_grad_batches: int,
     checkpoint: pathlib.Path | None,
 ):
     """Fit a CNN model using supervised learning and save it to disk.
@@ -60,12 +60,13 @@ def run(
     monitoring_interval
         Interval, in seconds (or fractions), through which we should monitor
         resources during training.
-    batch_chunk_count
-        If this number is different than 1, then each batch will be divided in
-        this number of chunks. Gradients will be accumulated to perform each
-        mini-batch. This is particularly interesting when one has limited RAM
-        on the GPU, but would like to keep training with larger batches. One
-        exchanges for longer processing times in this case.
+    accumulate_grad_batches
+        Number of accumulations for backward propagation to accumulate gradients
+        over k batches before stepping the optimizer. The default of 1 forces
+        the whole batch to be processed at once. Otherwise the batch is multiplied
+        by accumulate-grad-batches pieces, and gradients are accumulated to complete
+        each step. This is especially interesting when one is training on GPUs with
+        a limited amount of onboard RAM.
     checkpoint
         Path to an optional checkpoint file to load.
     """
@@ -118,7 +119,7 @@ def run(
         accelerator=accelerator,
         devices=devices,
         max_epochs=max_epochs,
-        accumulate_grad_batches=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         logger=tensorboard_logger,
         check_val_every_n_epoch=validation_period,
         log_every_n_steps=len(datamodule.train_dataloader()),
diff --git a/src/mednet/scripts/experiment.py b/src/mednet/scripts/experiment.py
index 67e16b64..12a70115 100644
--- a/src/mednet/scripts/experiment.py
+++ b/src/mednet/scripts/experiment.py
@@ -40,7 +40,7 @@ def experiment(
     output_folder,
     epochs,
     batch_size,
-    batch_chunk_count,
+    accumulate_grad_batches,
     drop_incomplete_batch,
     datamodule,
     validation_period,
@@ -79,7 +79,7 @@ def experiment(
         output_folder=train_output_folder,
         epochs=epochs,
         batch_size=batch_size,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         drop_incomplete_batch=drop_incomplete_batch,
         datamodule=datamodule,
         validation_period=validation_period,
diff --git a/src/mednet/scripts/train.py b/src/mednet/scripts/train.py
index efbdac88..8ae78f79 100644
--- a/src/mednet/scripts/train.py
+++ b/src/mednet/scripts/train.py
@@ -79,18 +79,19 @@ def reusable_options(f):
         cls=ResourceOption,
     )
     @click.option(
-        "--batch-chunk-count",
-        "-c",
-        help="Number of chunks in every batch (this parameter affects "
-        "memory requirements for the network). The number of samples "
-        "loaded for every iteration will be batch-size*batch-chunk-count. "
-        "This parameter is used to reduce the number of samples loaded in each "
-        "iteration, in order to reduce the memory usage in exchange for "
-        "processing time (more iterations). This is especially interesting "
+        "--accumulate-grad-batches",
+        "-a",
+        help="Number of accumulations for backward propagation to accumulate "
+        "gradients over k batches before stepping the optimizer. This "
+        "parameter, used in conjunction with the batch-size, may be used to "
+        "reduce the number of samples loaded in each iteration, to affect memory "
+        "usage in exchange for processing time (more iterations), since the "
+        "optimizer only steps after the requested number of batches has been "
+        "accumulated. This is especially interesting "
         "when one is training on GPUs with limited RAM. The default of 1 forces "
         "the whole batch to be processed at once. Otherwise the batch is "
-        "multiplied by batch-chunk-count pieces, and gradients are accumulated "
-        "to complete each batch.",
+        "multiplied by accumulate-grad-batches pieces, and gradients are accumulated "
+        "to complete each step.",
         required=True,
         show_default=True,
         default=1,
@@ -235,7 +236,7 @@ def train(
     output_folder,
     epochs,
     batch_size,
-    batch_chunk_count,
+    accumulate_grad_batches,
     drop_incomplete_batch,
     datamodule,
     validation_period,
@@ -340,7 +341,7 @@ def train(
         split_name=datamodule.split_name,
         epochs=epochs,
         batch_size=batch_size,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         drop_incomplete_batch=drop_incomplete_batch,
         validation_period=validation_period,
         cache_samples=cache_samples,
@@ -363,6 +364,6 @@ def train(
         max_epochs=epochs,
         output_folder=output_folder,
         monitoring_interval=monitoring_interval,
-        batch_chunk_count=batch_chunk_count,
+        accumulate_grad_batches=accumulate_grad_batches,
         checkpoint=checkpoint_file,
     )
--
GitLab
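
For reference, the minimal sketch below is not part of the patch: it only illustrates how the renamed option maps onto the PyTorch Lightning Trainer call patched in src/mednet/engine/trainer.py. The value 4, the "auto"/1 accelerator settings, and the model/datamodule objects are placeholders; the other Trainer arguments used by mednet are omitted.

import lightning.pytorch as pl

# With accumulate_grad_batches=k, Lightning runs the forward/backward pass on k
# consecutive batches (each with batch_size samples) and only then steps the
# optimizer, so the effective batch size is batch_size * k while peak GPU memory
# stays close to that of a single batch.
accumulate_grad_batches = 4  # e.g. passed via `--accumulate-grad-batches 4` / `-a 4`

trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=10,
    accumulate_grad_batches=accumulate_grad_batches,
)

# trainer.fit(model, datamodule=datamodule)  # model and datamodule as configured elsewhere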