Gokhan OZBULAK
--- a/src/mednet/engine/trainer.py

+ 9

− 8
+++ b/src/mednet/engine/trainer.py

+ 9

− 8
 @@ -26,7 +26,7 @@ def run(
 @@ -26,7 +26,7 @@ def run(
    max_epochs: int,
    output_folder: pathlib.Path,
    monitoring_interval: int | float,
-    batch_chunk_count: int,
+    accumulate_grad_batches: int,
    checkpoint: pathlib.Path | None,
 ):
    """Fit a CNN model using supervised learning and save it to disk.
 @@ -60,12 +60,13 @@ def run(
 @@ -60,12 +60,13 @@ def run(
    monitoring_interval
        Interval, in seconds (or fractions), through which we should monitor
        resources during training.
-    batch_chunk_count
+    accumulate_grad_batches
-        If this number is different than 1, then each batch will be divided in
+        Number of batches over which gradients are accumulated before
-        this number of chunks.  Gradients will be accumulated to perform each
+        stepping the optimizer. The default of 1 steps the optimizer after
-        mini-batch.   This is particularly interesting when one has limited RAM
+        every batch. Otherwise, gradients from this many consecutive batches
-        on the GPU, but would like to keep training with larger batches.  One
+        are accumulated to complete each step, which multiplies the effective
-        exchanges for longer processing times in this case.
+        batch size by this number. This is especially interesting when one is
+        training on GPUs with a limited amount of onboard RAM.
    checkpoint
        Path to an optional checkpoint file to load.
    """
 @@ -118,7 +119,7 @@ def run(
 @@ -118,7 +119,7 @@ def run(
            accelerator=accelerator,
            devices=devices,
            max_epochs=max_epochs,
-            accumulate_grad_batches=batch_chunk_count,
+            accumulate_grad_batches=accumulate_grad_batches,
            logger=tensorboard_logger,
            check_val_every_n_epoch=validation_period,
            log_every_n_steps=len(datamodule.train_dataloader()),