diff --git a/bob/learn/tensorflow/dataset/h5.py b/bob/learn/tensorflow/dataset/h5.py
new file mode 100644
index 0000000000000000000000000000000000000000..75dbe76e563b65e0f25e90697d05f88f54673b25
--- /dev/null
+++ b/bob/learn/tensorflow/dataset/h5.py
@@ -0,0 +1,20 @@
+import h5py
+
+
+class MeanNormalizer:
+    """An HDF5 normalizer class. It reads the mean from an HDF5 dataset and
+    normalizes the values by subtracting that mean.
+
+    Attributes
+    ----------
+    mean : array_like
+        A mean value read from the hdf5 file.
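+
+    Examples
+    --------
+    A minimal usage sketch (``stats.hdf5`` and ``sample`` are placeholder
+    names; the file is assumed to store the mean under ``/mean/data_mean``,
+    the path read in ``__init__``)::
+
+        normalizer = MeanNormalizer("stats.hdf5")
+        normalized_sample = normalizer(sample)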
+    """
+
+    def __init__(self, hdf5):
+        with h5py.File(hdf5, "r") as f:
+            self.mean = f["/mean"]["data_mean"][()]
+
+    def __call__(self, data):
+        return data - self.mean
diff --git a/bob/learn/tensorflow/script/cgm.py b/bob/learn/tensorflow/script/cgm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d432ee931deb1c8823fcce130bf7d40c597d84e7
--- /dev/null
+++ b/bob/learn/tensorflow/script/cgm.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+"""Converts datasets to TFRecords file formats.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import logging
+import os
+import sys
+import time
+from glob import glob
+
+import click
+import gridtk.sge
+import h5py
+import numpy
+import scipy.sparse
+import scipy.spatial.distance
+from bob.extension.scripts.click_helper import (
+    verbosity_option,
+    log_parameters,
+    ConfigCommand,
+    ResourceOption,
+)
+from bob.io.base import create_directories_safe
+
+logger = logging.getLogger(__name__)
+
+
+@click.command(
+    epilog="""\b
+$ bob tf cgm-dataset -vvv --hdf5 <hdf5-path> --n-chunks 10 [normalizer.py]
+""",
+    entry_point_group="bob.learn.tensorflow.config",
+    cls=ConfigCommand,
+)
+@click.option(
+    "--hdf5",
+    required=True,
+    cls=ResourceOption,
+    type=click.Path(exists=True, dir_okay=False),
+    help="Path to the hdf5 dataset.",
+)
+@click.option(
+    "--n-chunks",
+    type=click.INT,
+    required=True,
+    cls=ResourceOption,
+    help="Divides the dataset into n_chunks**2 for parallel computation.",
+)
+@click.option(
+    "--normalizer",
+    cls=ResourceOption,
+    entry_point_group="bob.learn.tensorflow.normalizer",
+    help="A function that takes each data and returns a normalized data.",
+)
+@click.option(
+    "--job-name", default="dis-mat", help="The prefix to use for SGE job submissions."
+)
+@click.option(
+    "--queue-array",
+    default="all.q",
+    help="The queue to use for SGE array job submissions.",
+)
+@click.option(
+    "--finalize",
+    is_flag=True,
+    help="If for some reason your array jobs are completed but you do not have"
+    " your final distance matrix, use this option.",
+)
+@verbosity_option()
+@click.option("--n-samples", type=click.INT, help="Internal option. Do not use.")
+def distance_matrix(
+    hdf5, n_chunks, normalizer, n_samples, job_name, queue_array, finalize, **kwargs
+):
+    """Computes distance matrix of an HDF5 dataset
+
+    The data must be normalized -> zero-mean and histogram equalization
+
+    Divides the dataset into N**2 chunks for parallel computation.
+    """
+    log_parameters(logger)
+
+    if finalize:
+        return _finalize(hdf5)
+
+    if n_samples is None:
+        create_directories_safe(_output_folder(hdf5))
+        return _pre_compute(hdf5, n_chunks, job_name, queue_array)
+
+    return _compute(hdf5, n_chunks, normalizer, n_samples)
+
+
+def _pre_compute(hdf5, n_chunks, job_name, queue_array):
+    with h5py.File(hdf5, "r") as f:
+
+        # find the total number of samples in hdf5 file
+        samples = sorted(int(k) for k in f.keys() if k.isdigit())
+        n_samples = len(samples)
+
+        logger.debug("There are %d samples in the hdf5", n_samples)
+
+        # figure out the number of array jobs (upper-triangular chunk pairs)
+        n_jobs = (n_chunks * (n_chunks + 1)) // 2
+
+        # submit the array job with gridtk
+        job_manager = gridtk.sge.JobManagerSGE()
+        command = sys.argv + ["--n-samples", str(n_samples)]
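+        # the submitted command is this same invocation plus --n-samples, so
+        # each array task re-enters this command and runs _compute() for one
+        # block of the distance matrix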
+        job_id = job_manager.submit(
+            command, queue=queue_array, name=job_name + "-ary", array=(1, n_jobs, 1)
+        )
+
+    # wait for the jobs to finish and finalize
+    def get_job():
+        try:
+            job_manager.lock()
+            job_manager.communicate([job_id])
+            job = job_manager.get_jobs([job_id])[0]
+            return job
+        finally:
+            job_manager.unlock()
+
+    job = get_job()
+    while job.status not in ("success", "failure"):
+        time.sleep(10)
+        job = get_job()
+
+    _finalize(hdf5)
+
+
+def _compute(hdf5, n_chunks, normalizer, n_samples):
+    with h5py.File(hdf5, "r") as f:
+
+        # we should be running inside an SGE array job; task ids are 1-based
+        job_id = int(os.getenv("SGE_TASK_ID", 1)) - 1
+
+        output = os.path.join(_output_folder(hdf5), f"{job_id}.npz")
+        logger.info("Saving output in %s", output)
+
+        # calculate chunk size
+        chunk_size = int(numpy.ceil(n_samples / n_chunks))
+
+        # generate upper triangle indices
+        if n_chunks > 100:
+            logger.warning(
+                "Generating the chunk indices below scales roughly with"
+                " n_chunks ** 4 and may take a very long time!"
+            )
+        indices = []
+        for k in range(n_chunks ** 2):
+            i, j = numpy.unravel_index(k, (n_chunks, n_chunks))
+            if i <= j:
+                i, j = i * chunk_size, j * chunk_size
+                i = range(i, min(i + chunk_size, n_samples))
+                j = range(j, min(j + chunk_size, n_samples))
+                if (i, j) not in indices:
+                    indices.append((i, j))
+
+        def get_array(fd, id_range):
+            array = []
+            for i in id_range:
+                data = fd[f"/{i}"]["data"][()].flatten()
+                # apply the optional normalizer to each sample
+                if normalizer is not None:
+                    data = normalizer(data)
+                array.append(data)
+            return array
+
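+        # pick this task's block; len(indices) normally equals
+        # n_chunks * (n_chunks + 1) // 2, the n_jobs submitted in _pre_compute()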
+        i, j = indices[job_id]
+        if i == j:
+            d = scipy.spatial.distance.pdist(get_array(f, i))
+            d = scipy.spatial.distance.squareform(d)
+        else:
+            d = scipy.spatial.distance.cdist(get_array(f, i), get_array(f, j))
+
+        # create sparse matrix
+        sparse_d = scipy.sparse.dok_matrix((n_samples, n_samples))
+
+        def _slice(_range):
+            return slice(_range.start, _range.stop, _range.step)
+
+        # assign this job's distance matrix
+        sparse_d[_slice(i), _slice(j)] = d
+
+        # keep only the strictly upper triangle; the diagonal blocks are
+        # symmetric, so this drops their duplicated lower-triangle distances
+        sparse_d = scipy.sparse.triu(sparse_d, k=1)
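+        # triu returns a COO matrix, a format accepted by scipy.sparse.save_npz
+        # (a DOK matrix could not be saved directly)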
+
+        # save
+        scipy.sparse.save_npz(output, sparse_d)
+
+
+def _finalize(hdf5):
+    # load and sum all sparse matrices
+    # save the final results
+    # delete intermediate matrices
+    out_fldr = _output_folder(hdf5)
+    files = glob(os.path.join(out_fldr, "*.npz"))
+    files = list(
+        filter(lambda x: os.path.splitext(os.path.split(x)[1])[0].isdigit(), files)
+    )
+
+    dist = sum(scipy.sparse.load_npz(f) for f in files)
+    output = os.path.join(out_fldr, "dist.npz")
+    scipy.sparse.save_npz(output, dist)
+
+    for path in files:
+        os.remove(path)
+        logger.debug("Deleted: %s", path)
+
+
+def _output_folder(hdf5):
+    return hdf5.rsplit(".hdf5", 1)[0]
diff --git a/setup.py b/setup.py
index 2b1997fbdcb0bf7d8bbe19fa0280a3f20d01f0e1..ca6a9a0037efbe82c93b9638024146df6aba7d80 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ setup(
         # bob tf scripts
         'bob.learn.tensorflow.cli': [
             'cache-dataset = bob.learn.tensorflow.script.cache_dataset:cache_dataset',
+            'cgm-dataset = bob.learn.tensorflow.script.cgm:distance_matrix',
             'compute-statistics = bob.learn.tensorflow.script.compute_statistics:compute_statistics',
             'dataset-to-hdf5 = bob.learn.tensorflow.script.db_to_tfrecords:dataset_to_hdf5',
             'datasets-to-tfrecords = bob.learn.tensorflow.script.db_to_tfrecords:datasets_to_tfrecords',