diff --git a/README.rst b/README.rst
index f88bd22386c136862f6ebbb791cbd3c45d93dfd3..2121792a0ecaeae8bc1e59296dc3287edfc9dc77 100644
--- a/README.rst
+++ b/README.rst
@@ -20,11 +20,6 @@
 
 This package is part of the signal-processing and machine learning toolbox Bob_.
 
-.. todo::
-
-   **Complete the sentence above to include one phrase about your
-   package!  Once this is done, delete this to-do!**
-
 
 Installation
 ------------
diff --git a/bob/ip/binseg/engine/inferencer.py b/bob/ip/binseg/engine/inferencer.py
index 3dfb77f4e9e3705aa7cae17413d1ed6e5e446307..3221277c11ff8545f0d3182952c329a82488ed50 100644
--- a/bob/ip/binseg/engine/inferencer.py
+++ b/bob/ip/binseg/engine/inferencer.py
@@ -132,7 +132,7 @@ def do_inference(
     data_loader : py:class:torch.torch.utils.data.DataLoader
         PyTorch DataLoader
     device : str
-        device to use ('cpu' or 'cuda')
+        device to use ``'cpu'`` or ``'cuda'``
     output_folder : str
     """
     logger = logging.getLogger("bob.ip.binseg.engine.inference")
diff --git a/bob/ip/binseg/engine/trainer.py b/bob/ip/binseg/engine/trainer.py
index b508cda40c250d59ae46b02464f9ecb6c7737d30..ac24b55b5030ed5a51d2787fae1db5d5ef9af5e0 100644
--- a/bob/ip/binseg/engine/trainer.py
+++ b/bob/ip/binseg/engine/trainer.py
@@ -26,24 +26,24 @@ def do_train(
     output_folder
 ):
     """ 
-    Trains the model
+    Train model and save to disk.
     
     Parameters
     ----------
     model : :py:class:`torch.nn.Module` 
         Network (e.g. DRIU, HED, UNet)
-    data_loader : :py:class:`torch.torch.utils.data.DataLoader`
-    optimizer : :py:class.`torch.torch.optim.Optimizer`
+    data_loader : :py:class:`torch.utils.data.DataLoader`
+    optimizer : :py:class:`torch.optim.Optimizer`
     criterion : :py:class.`torch.nn.modules.loss._Loss`
         loss function
-    scheduler : :py:class.`torch.torch.optim._LRScheduler`
+    scheduler : :py:class:`torch.optim._LRScheduler`
         learning rate scheduler
     checkpointer : :py:class.`bob.ip.binseg.utils.checkpointer.DetectronCheckpointer`
         checkpointer
     checkpoint_period : int
         save a checkpoint every n epochs
     device : str  
-        device to use. 'cpu' or 'cuda'.
+        device to use ``'cpu'`` or ``'cuda'``
     arguments : dict
         start end end epochs
     output_folder : str 
diff --git a/bob/ip/binseg/modeling/losses.py b/bob/ip/binseg/modeling/losses.py
index 809f19728fd31ba8c57ebc0c6ceb8672e17bc96c..c57bf7fe1ab851669de7b878b3f2fa3b5383637b 100644
--- a/bob/ip/binseg/modeling/losses.py
+++ b/bob/ip/binseg/modeling/losses.py
@@ -7,29 +7,8 @@ from torch._jit_internal import weak_script_method
 
 class WeightedBCELogitsLoss(_Loss):
     """ 
-    Implements Equation 1 in [DRIU16]_. Based on :py:class:`torch.torch.nn.modules.loss.BCEWithLogitsLoss`. 
+    Implements Equation 1 in [DRIU16]_. Based on torch.nn.modules.loss.BCEWithLogitsLoss. 
     Calculate sum of weighted cross entropy loss.
-
-    Attributes
-    ----------
-    size_average : bool, optional
-        Deprecated (see :attr:`reduction`). By default, the losses are averaged over each loss element in the batch. Note that for 
-        some losses, there are multiple elements per sample. If the field :attr:`size_average` is set to ``False``, the losses are 
-        instead summed for each minibatch. Ignored when reduce is ``False``. Default: ``True``
-    reduce : bool, optional 
-        Deprecated (see :attr:`reduction`). By default, the
-        losses are averaged or summed over observations for each minibatch depending
-        on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
-        batch element instead and ignores :attr:`size_average`. Default: ``True``
-    reduction : string, optional
-        Specifies the reduction to apply to the output:
-        ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
-        ``'mean'``: the sum of the output will be divided by the number of
-        elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
-        and :attr:`reduce` are in the process of being deprecated, and in the meantime,
-        specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
-    pos_weight : :py:class:`torch.Tensor`, optional
-        a weight of positive examples. Must be a vector with length equal to the number of classes.
     """
     def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
         super(WeightedBCELogitsLoss, self).__init__(size_average, reduce, reduction)
@@ -38,6 +17,17 @@ class WeightedBCELogitsLoss(_Loss):
 
     @weak_script_method
     def forward(self, input, target, masks=None):
+        """
+        Parameters
+        ----------
+        input : :py:class:`torch.Tensor`
+        target : :py:class:`torch.Tensor`
+        masks : :py:class:`torch.Tensor`, optional
+        
+        Returns
+        -------
+        :py:class:`torch.Tensor`
+        """
         n, c, h, w = target.shape
         num_pos = torch.sum(target, dim=[1, 2, 3]).float().reshape(n,1) # torch.Size([n, 1])
         if hasattr(masks,'dtype'):
@@ -54,37 +44,30 @@ class WeightedBCELogitsLoss(_Loss):
 
 class SoftJaccardBCELogitsLoss(_Loss):
     """ 
-    Implements Equation 6 in [SAT17]_. Based on :py:class:`torch.torch.nn.modules.loss.BCEWithLogitsLoss`. 
+    Implements Equation 6 in [SAT17]_. Based on torch.nn.modules.loss.BCEWithLogitsLoss. 
 
     Attributes
     ----------
     alpha : float
-        determines the weighting of SoftJaccard and BCE. Default: ``0.3``
+        determines the weighting of SoftJaccard and BCE. Default: ``0.1``
-    size_average : bool, optional
-        Deprecated (see :attr:`reduction`). By default, the losses are averaged over each loss element in the batch. Note that for 
-        some losses, there are multiple elements per sample. If the field :attr:`size_average` is set to ``False``, the losses are 
-        instead summed for each minibatch. Ignored when reduce is ``False``. Default: ``True``
-    reduce : bool, optional 
-        Deprecated (see :attr:`reduction`). By default, the
-        losses are averaged or summed over observations for each minibatch depending
-        on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
-        batch element instead and ignores :attr:`size_average`. Default: ``True``
-    reduction : string, optional
-        Specifies the reduction to apply to the output:
-        ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
-        ``'mean'``: the sum of the output will be divided by the number of
-        elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
-        and :attr:`reduce` are in the process of being deprecated, and in the meantime,
-        specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
-    pos_weight : :py:class:`torch.Tensor`, optional
-        a weight of positive examples. Must be a vector with length equal to the number of classes.
     """
-    def __init__(self, alpha=0.3, size_average=None, reduce=None, reduction='mean', pos_weight=None):
+    def __init__(self, alpha=0.1, size_average=None, reduce=None, reduction='mean', pos_weight=None):
         super(SoftJaccardBCELogitsLoss, self).__init__(size_average, reduce, reduction) 
         self.alpha = alpha   
 
     @weak_script_method
-    def forward(self, input, target):
+    def forward(self, input, target, masks=None):
+        """
+        Parameters
+        ----------
+        input : :py:class:`torch.Tensor`
+        target : :py:class:`torch.Tensor`
+        masks : :py:class:`torch.Tensor`, optional
+        
+        Returns
+        -------
+        :py:class:`torch.Tensor`
+        """
         eps = 1e-8
         probabilities = torch.sigmoid(input)
         intersection = (probabilities * target).sum()
@@ -99,29 +82,8 @@ class SoftJaccardBCELogitsLoss(_Loss):
 
 class HEDWeightedBCELogitsLoss(_Loss):
     """ 
-    Implements Equation 2 in [HED15]_. Based on :py:class:`torch.torch.nn.modules.loss.BCEWithLogitsLoss`. 
+    Implements Equation 2 in [HED15]_. Based on torch.nn.modules.loss.BCEWithLogitsLoss. 
     Calculate sum of weighted cross entropy loss.
-
-    Attributes
-    ----------
-    size_average : bool, optional
-        Deprecated (see :attr:`reduction`). By default, the losses are averaged over each loss element in the batch. Note that for 
-        some losses, there are multiple elements per sample. If the field :attr:`size_average` is set to ``False``, the losses are 
-        instead summed for each minibatch. Ignored when reduce is ``False``. Default: ``True``
-    reduce : bool, optional 
-        Deprecated (see :attr:`reduction`). By default, the
-        losses are averaged or summed over observations for each minibatch depending
-        on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
-        batch element instead and ignores :attr:`size_average`. Default: ``True``
-    reduction : string, optional
-        Specifies the reduction to apply to the output:
-        ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
-        ``'mean'``: the sum of the output will be divided by the number of
-        elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
-        and :attr:`reduce` are in the process of being deprecated, and in the meantime,
-        specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
-    pos_weight : :py:class:`torch.Tensor`, optional
-        a weight of positive examples. Must be a vector with length equal to the number of classes.
     """
     def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
         super(HEDWeightedBCELogitsLoss, self).__init__(size_average, reduce, reduction)
@@ -130,18 +92,17 @@ class HEDWeightedBCELogitsLoss(_Loss):
 
     @weak_script_method
     def forward(self, inputlist, target, masks=None):
-        """[summary]
-        
+        """
         Parameters
         ----------
         inputlist : list of :py:class:`torch.Tensor`
             HED uses multiple side-output feature maps for the loss calculation
         target : :py:class:`torch.Tensor`
-        
+        masks : :py:class:`torch.Tensor`, optional
+
         Returns
         -------
         :py:class:`torch.Tensor`
-            
         """
         loss_over_all_inputs = []
         for input in inputlist:
diff --git a/doc/api.rst b/doc/api.rst
index 9cfacb8d322ff9b5e9491f6e9d5231b031d17685..103d9622edc930268710ceb7bf668e6d7e54cd4a 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -6,7 +6,7 @@
 ============
 
 This section lists all the functionality available in this library allowing to
-run HED-based experiments.
+run binary-segmentation benchmarks.
 
 
 PyTorch Dataset
@@ -20,5 +20,9 @@ Transforms
 
 .. automodule:: bob.ip.binseg.data.transforms
 
+Losses
+------
+.. automodule:: bob.ip.binseg.modeling.losses
+
 
 .. include:: links.rst
diff --git a/doc/nitpick-exceptions.txt b/doc/nitpick-exceptions.txt
index 8d734225d8ec201efaa64dbaa1f16445fa70e22c..bd53da1a83ba588f4c110926093fc88ba0016501 100644
--- a/doc/nitpick-exceptions.txt
+++ b/doc/nitpick-exceptions.txt
@@ -1,5 +1,6 @@
 py:class torch.nn.modules.module.Module
 py:class torch.nn.modules.loss._Loss
 py:class torch.utils.data.dataset.Dataset
+py:class Module
 py:mod bob.db.base
 py:obj list
diff --git a/doc/references.rst b/doc/references.rst
index 3255b0a9bd3418faf8d5115c0417f101062ac49d..555005887e10b6b386accfb41e3185e2719fcdc6 100644
--- a/doc/references.rst
+++ b/doc/references.rst
@@ -2,4 +2,8 @@
 
 ===========
 References
-===========
\ No newline at end of file
+===========
+
+.. [HED15] *Saining Xie and Zhuowen Tu*, **Holistically-Nested Edge Detection**, in: Proceedings of IEEE International Conference on Computer Vision, 2015
+.. [SAT17] *Alexey Shvets, Vladimir Iglovikov, Alexander Rakhlin and Alexandr A. Kalinin*, in: 17th IEEE International Conference on Machine Learning and Applications (ICMLA), 2017
+.. [DRIU16] *K.K. Maninis, J. Pont-Tuset, P. Arbeláez, and L. Van Gool*, **Deep Retinal Image Understanding**, in: Medical Image Computing and Computer-Assisted Intervention (MICCAI), 2016
\ No newline at end of file