diff --git a/bob/learn/em/plda_base.cpp b/bob/learn/em/plda_base.cpp
index 2de15d854ef808a420c96571119c0532babe5316..839dc9c233ab87d0e187c2493ad8d89f8d490474 100644
--- a/bob/learn/em/plda_base.cpp
+++ b/bob/learn/em/plda_base.cpp
@@ -19,7 +19,7 @@ static auto PLDABase_doc = bob::extension::ClassDoc(
   "This class is a container for the :math:`F` (between class variantion matrix), :math:`G` (within class variantion matrix) and :math:`\\Sigma` "
   "matrices and the mean vector :math:`\\mu` of a PLDA model. This also"
   "precomputes useful matrices to make the model scalable."
-  "References: [ElShafey2014,PrinceElder2007,LiFu2012]",
+  "\n\nReferences: [ElShafey2014]_ [PrinceElder2007]_ [LiFu2012]_ ",
   ""
 ).add_constructor(
   bob::extension::FunctionDoc(
diff --git a/bob/learn/em/plda_machine.cpp b/bob/learn/em/plda_machine.cpp
index 83c9b4223815ee45d06ce60b92d94abaf3369ffc..c85e9d542c3536343385f8c87e01d54098e20001 100644
--- a/bob/learn/em/plda_machine.cpp
+++ b/bob/learn/em/plda_machine.cpp
@@ -20,7 +20,7 @@ static auto PLDAMachine_doc = bob::extension::ClassDoc(
 
   "This class is a container for an enrolled identity/class. It contains information extracted from the enrollment samples. "
   "It should be used in combination with a PLDABase instance.\n\n"
-  "References: [ElShafey2014]_, [PrinceElder2007]_, [LiFu2012]_",
+  "References: [ElShafey2014]_ [PrinceElder2007]_ [LiFu2012]_ ",
   ""
 ).add_constructor(
   bob::extension::FunctionDoc(
diff --git a/bob/learn/em/plda_trainer.cpp b/bob/learn/em/plda_trainer.cpp
index eb04f0949cf16552a172d87e9e87077078523d2f..49842a3b086f4c3e865812a69a71eaedf553924d 100644
--- a/bob/learn/em/plda_trainer.cpp
+++ b/bob/learn/em/plda_trainer.cpp
@@ -106,7 +106,7 @@ static auto PLDATrainer_doc = bob::extension::ClassDoc(
   BOB_EXT_MODULE_PREFIX ".PLDATrainer",
   "This class can be used to train the :math:`F`, :math:`G` and "
   " :math:`\\Sigma` matrices and the mean vector :math:`\\mu` of a PLDA model."
-  "References: [ElShafey2014]_,[PrinceElder2007]_,[LiFu2012]_",
+  "\n\nReferences: [ElShafey2014]_ [PrinceElder2007]_ [LiFu2012]_ ",
   ""
 ).add_constructor(
   bob::extension::FunctionDoc(
diff --git a/doc/guide.rst b/doc/guide.rst
index b495c781d29a257c9ffe45a4e3d2003b79d727c5..ed81db8c6413b274b54faeded35bdcb761648cb8 100644
--- a/doc/guide.rst
+++ b/doc/guide.rst
@@ -121,7 +121,7 @@ Maximum likelihood Estimator (MLE)
 In statistics, maximum likelihood estimation (MLE) is a method of estimating
 the parameters of a statistical model given observations by finding the
 :math:`\Theta` that maximizes :math:`P(x|\Theta)` for all :math:`x` in your
-dataset [10]_. This optimization is done by the **Expectation-Maximization**
+dataset [9]_. This optimization is done by the **Expectation-Maximization**
 (EM) algorithm [8]_ and it is implemented by
 :py:class:`bob.learn.em.ML_GMMTrainer`.
 
@@ -181,7 +181,7 @@ Maximum a posteriori Estimator (MAP)
 
 Closely related to the MLE, Maximum a posteriori probability (MAP) is an
 estimate that equals the mode of the posterior distribution by incorporating in
-its loss function a prior distribution [11]_. Commonly this prior distribution
+its loss function a prior distribution [10]_. Commonly this prior distribution
 (the values of :math:`\Theta`) is estimated with MLE. This optimization is done
 by the **Expectation-Maximization** (EM) algorithm [8]_ and it is implemented
 by :py:class:`bob.learn.em.MAP_GMMTrainer`.
@@ -582,7 +582,7 @@ The snippet bellow shows how to compute scores using this approximation.
 Probabilistic Linear Discriminant Analysis (PLDA)
 -------------------------------------------------
 
-Probabilistic Linear Discriminant Analysis [16]_ is a probabilistic model that
+Probabilistic Linear Discriminant Analysis [5]_ is a probabilistic model that
 incorporates components describing both between-class and within-class
 variations. Given a mean :math:`\mu`, between-class and within-class subspaces
 :math:`F` and :math:`G` and residual noise :math:`\epsilon` with zero mean and
@@ -598,7 +598,7 @@ An Expectation-Maximization algorithm can be used to learn the parameters of
 this model :math:`\mu`, :math:`F` :math:`G` and :math:`\Sigma`. As these
 parameters can be shared between classes, there is a specific container class
 for this purpose, which is :py:class:`bob.learn.em.PLDABase`. The process is
-described in detail in [17]_.
+described in detail in [6]_.
 
 Let us consider a training set of two classes, each with 3 samples of
 dimensionality 3.
@@ -793,9 +793,11 @@ Follow bellow an example of score normalization using
 .. [2] http://publications.idiap.ch/index.php/publications/show/2606
 .. [3] http://dx.doi.org/10.1016/j.csl.2007.05.003
 .. [4] http://dx.doi.org/10.1109/TASL.2010.2064307
+.. [5] http://dx.doi.org/10.1109/ICCV.2007.4409052
+.. [6] http://doi.ieeecomputersociety.org/10.1109/TPAMI.2013.38
 .. [7] http://en.wikipedia.org/wiki/K-means_clustering
 .. [8] http://en.wikipedia.org/wiki/Expectation-maximization_algorithm
-.. [10] http://en.wikipedia.org/wiki/Maximum_likelihood
-.. [11] http://en.wikipedia.org/wiki/Maximum_a_posteriori_estimation
-.. [16] http://dx.doi.org/10.1109/ICCV.2007.4409052
-.. [17] http://doi.ieeecomputersociety.org/10.1109/TPAMI.2013.38
+.. [9] http://en.wikipedia.org/wiki/Maximum_likelihood
+.. [10] http://en.wikipedia.org/wiki/Maximum_a_posteriori_estimation
+
+
diff --git a/doc/index.rst b/doc/index.rst
index 8039c0e067ea5d16484e994138d5aa36d8451cb3..e465e09a82384f4c840f059b1494f39916f35b59 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -32,18 +32,26 @@ References
 -----------
 
 .. [Reynolds2000] *Reynolds, Douglas A., Thomas F. Quatieri, and Robert B. Dunn*. **Speaker Verification Using Adapted Gaussian Mixture Models**, Digital signal processing 10.1 (2000): 19-41.
-..   [Vogt2008]   *R. Vogt, S. Sridharan*. **'Explicit Modelling of Session Variability for Speaker Verification'**, Computer Speech & Language, 2008, vol. 22, no. 1, pp. 17-38
-..   [McCool2013] *C. McCool, R. Wallace, M. McLaren, L. El Shafey, S. Marcel*. **'Session Variability Modelling for Face Authentication'**, IET Biometrics, 2013
-..   [ElShafey2014] *Laurent El Shafey, Chris McCool, Roy Wallace, Sebastien Marcel*. **'A Scalable Formulation of Probabilistic Linear Discriminant Analysis: Applied to Face Recognition'**, TPAMI'2014
-..   [PrinceElder2007] *Prince and Elder*. **'Probabilistic Linear Discriminant Analysis for Inference About Identity'**, ICCV'2007
-..   [LiFu2012] *Li, Fu, Mohammed, Elder and Prince*. **'Probabilistic Models for Inference about Identity'**,  TPAMI'2012
-
-..   [Bishop1999] Tipping, Michael E., and Christopher M. Bishop. "Probabilistic principal component analysis." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 61.3 (1999): 611-622.
-..   [Roweis1998] Roweis, Sam. "EM algorithms for PCA and SPCA." Advances in neural information processing systems (1998): 626-632.
-
-..   [Glembek2009] Glembek, Ondrej, et al. "Comparison of scoring methods used in speaker recognition with joint factor analysis." Acoustics, Speech and Signal Processing, 2009. ICASSP 2009. IEEE International Conference on. IEEE, 2009.
-..   [Auckenthaler2000] Auckenthaler, Roland, Michael Carey, and Harvey Lloyd-Thomas. "Score normalization for text-independent speaker verification systems." Digital Signal Processing 10.1 (2000): 42-54.
-..   [Mariethoz2005] Mariethoz, Johnny, and Samy Bengio. "A unified framework for score normalization techniques applied to text-independent speaker verification." IEEE signal processing letters 12.7 (2005): 532-535.
+
+.. [Vogt2008]   *R. Vogt, S. Sridharan*. **'Explicit Modelling of Session Variability for Speaker Verification'**, Computer Speech & Language, 2008, vol. 22, no. 1, pp. 17-38
+
+.. [McCool2013] *C. McCool, R. Wallace, M. McLaren, L. El Shafey, S. Marcel*. **'Session Variability Modelling for Face Authentication'**, IET Biometrics, 2013
+
+.. [ElShafey2014] *Laurent El Shafey, Chris McCool, Roy Wallace, Sebastien Marcel*. **'A Scalable Formulation of Probabilistic Linear Discriminant Analysis: Applied to Face Recognition'**, TPAMI'2014
+
+.. [PrinceElder2007] *Prince and Elder*. **'Probabilistic Linear Discriminant Analysis for Inference About Identity'**, ICCV'2007
+
+.. [LiFu2012] *Li, Fu, Mohammed, Elder and Prince*. **'Probabilistic Models for Inference about Identity'**,  TPAMI'2012
+
+.. [Bishop1999] Tipping, Michael E., and Christopher M. Bishop. "Probabilistic principal component analysis." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 61.3 (1999): 611-622.
+
+.. [Roweis1998] Roweis, Sam. "EM algorithms for PCA and SPCA." Advances in neural information processing systems (1998): 626-632.
+
+.. [Glembek2009] Glembek, Ondrej, et al. "Comparison of scoring methods used in speaker recognition with joint factor analysis." Acoustics, Speech and Signal Processing, 2009. ICASSP 2009. IEEE International Conference on. IEEE, 2009.
+
+.. [Auckenthaler2000] Auckenthaler, Roland, Michael Carey, and Harvey Lloyd-Thomas. "Score normalization for text-independent speaker verification systems." Digital Signal Processing 10.1 (2000): 42-54.
+
+.. [Mariethoz2005] Mariethoz, Johnny, and Samy Bengio. "A unified framework for score normalization techniques applied to text-independent speaker verification." IEEE signal processing letters 12.7 (2005): 532-535.
 
 
 Indices and tables