diff --git a/bob/ip/binseg/data/utils.py b/bob/ip/binseg/data/utils.py
index d8ab22992b33f0b2e91825cb09e6a19d5584055d..58f39f2fce678b78471b1ea013866747b28b0358 100644
--- a/bob/ip/binseg/data/utils.py
+++ b/bob/ip/binseg/data/utils.py
@@ -4,7 +4,6 @@
 
 """Common utilities"""
 
-import numpy
 import PIL.Image
 import PIL.ImageOps
 import PIL.ImageChops
@@ -15,32 +14,6 @@ import torch.utils.data
 from .transforms import Compose, ToTensor
 
 
-def count_bw(b):
-    """Calculates totals of black and white pixels in a binary image
-
-
-    Parameters
-    ----------
-
-    b : PIL.Image.Image
-        A PIL image in mode "1" to be used for calculating positives and
-        negatives
-
-    Returns
-    -------
-
-    black : int
-        Number of black pixels in the binary image
-
-    white : int
-        Number of white pixels in the binary image
-    """
-
-    boolean_array = numpy.array(b)
-    white = boolean_array.sum()
-    return (boolean_array.size-white), white
-
-
 def invert_mode1_image(img):
     """Inverts a binary PIL image (mode == ``"1"``)"""
 
diff --git a/bob/ip/binseg/test/__init__.py b/bob/ip/binseg/test/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..428bd154ba23e7516809b9bc147f2caf2db953a5 100644
--- a/bob/ip/binseg/test/__init__.py
+++ b/bob/ip/binseg/test/__init__.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+"""Unit tests"""
+
+import tempfile
+import logging
+logger = logging.getLogger(__name__)
+
+TESTDB_TMPDIR = None
+_URL = "http://www.idiap.ch/software/bob/data/bob/bob.ip.binseg/master/_testdb.zip"
+_RCKEY = "bob.ip.binseg.stare.datadir"
+
+
+def teardown_package():  # NOTE(review): presumably run by nose as the package-level teardown - confirm
+    global TESTDB_TMPDIR  # only set by mock_dataset(), when a mock database was downloaded
+    if TESTDB_TMPDIR is not None:
+        logger.info(f"Removing temporary directory {TESTDB_TMPDIR.name}...")
+        TESTDB_TMPDIR.cleanup()  # removes the temporary directory holding the mock database
+
+
+def _mock_test_skipper(name):
+    """
+    Dummy decorator factory: ``name`` is ignored and the test is never skipped
+    """
+    import functools
+    def wrapped_function(test):
+        @functools.wraps(test)
+        def wrapper(*args, **kwargs):
+            return test(*args, **kwargs)  # always run the wrapped test, unchanged
+        return wrapper
+    return wrapped_function
+
+
+def mock_dataset():
+    """Returns the STARE ``(dataset, skip-decorator)`` pair that tests should use"""
+    global TESTDB_TMPDIR
+    from bob.extension import rc
+    if (TESTDB_TMPDIR is not None) or (_RCKEY in rc):
+        logger.info("Test database already set up - not downloading")
+    else:
+        logger.info("Test database not available, downloading...")
+        import zipfile
+        import urllib.request
+        # Download the archive at `_URL` into an anonymous temporary file:
+        with urllib.request.urlopen(_URL) as r, tempfile.TemporaryFile() as f:
+            f.write(r.read())
+            f.flush()
+            f.seek(0)
+            tmpdir = tempfile.TemporaryDirectory(prefix=__name__ + '-')
+            logger.info(f"Creating test database at {tmpdir.name}...")
+            with zipfile.ZipFile(f) as zf:
+                zf.extractall(tmpdir.name)
+            TESTDB_TMPDIR = tmpdir  # publish only once extraction succeeded
+
+    from ..data import stare
+    if TESTDB_TMPDIR is None:
+        # if the user has the STARE directory ready, then we do a normal return
+        from .utils import rc_variable_set
+        return stare.dataset, rc_variable_set
+    # else, we do a "mock" return: mock database plus a decorator that never skips
+    return stare.JSONDataset(stare._protocols, TESTDB_TMPDIR.name,
+            stare._loader), _mock_test_skipper
diff --git a/bob/ip/binseg/data/chasedb1/test.py b/bob/ip/binseg/test/test_chasedb1.py
similarity index 95%
rename from bob/ip/binseg/data/chasedb1/test.py
rename to bob/ip/binseg/test/test_chasedb1.py
index 4841ea7ab35ca9f110c457276b5952ea34e553f2..64e95f4cbd18667a506a18a6e3153c0735b36ca3 100644
--- a/bob/ip/binseg/data/chasedb1/test.py
+++ b/bob/ip/binseg/test/test_chasedb1.py
@@ -9,8 +9,8 @@ import os
 import numpy
 import nose.tools
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.chasedb1 import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -45,7 +45,6 @@ def test_protocol_consistency():
 @rc_variable_set('bob.ip.binseg.chasedb1.datadir')
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (999, 960)
 
     def _check_sample(s, bw_threshold_label):
@@ -74,7 +73,7 @@ def test_loading():
         # to visualize images, uncomment the folowing code
         # it should display an image with a faded background representing the
         # original data, blended with green labels.
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/test/test_config.py b/bob/ip/binseg/test/test_config.py
index 28983f08c2cb19b1050ce77f40eb94408457d6fc..b3bef5b7c5fba224fb77e3acd68994e1ba37c657 100644
--- a/bob/ip/binseg/test/test_config.py
+++ b/bob/ip/binseg/test/test_config.py
@@ -6,6 +6,8 @@ from nose.plugins.attrib import attr
 
 import torch
 
+from . import mock_dataset
+stare_dataset, stare_variable_set = mock_dataset()
 from .utils import rc_variable_set
 
 
@@ -41,10 +43,12 @@ def test_drive_default_test():
         nose.tools.eq_(sample[3].dtype, torch.float32)
 
 
-@rc_variable_set("bob.ip.binseg.stare.datadir")
+@stare_variable_set("bob.ip.binseg.stare.datadir")
 def test_stare_default_train():
 
     from ..configs.datasets.stare import dataset
+    # hack to allow testing on the CI
+    dataset._samples = stare_dataset.subsets("default")["train"]
     nose.tools.eq_(len(dataset), 10)
     for sample in dataset:
         nose.tools.eq_(len(sample), 3)
@@ -55,10 +59,12 @@ def test_stare_default_train():
         nose.tools.eq_(sample[2].dtype, torch.float32)
 
 
-@rc_variable_set("bob.ip.binseg.stare.datadir")
+@stare_variable_set("bob.ip.binseg.stare.datadir")
 def test_stare_default_test():
 
     from ..configs.datasets.stare_test import dataset
+    # hack to allow testing on the CI
+    dataset._samples = stare_dataset.subsets("default")["test"]
     nose.tools.eq_(len(dataset), 10)
     for sample in dataset:
         nose.tools.eq_(len(sample), 3)
diff --git a/bob/ip/binseg/data/drishtigs1/test.py b/bob/ip/binseg/test/test_drishtigs1.py
similarity index 95%
rename from bob/ip/binseg/data/drishtigs1/test.py
rename to bob/ip/binseg/test/test_drishtigs1.py
index e7dbf65753a3861ceef1a4b733bd9b79c45025a9..17e39a2ad455b6f04ed5f9d4798d581bc4b668a1 100644
--- a/bob/ip/binseg/data/drishtigs1/test.py
+++ b/bob/ip/binseg/test/test_drishtigs1.py
@@ -10,8 +10,8 @@ import numpy
 import nose.tools
 from nose.plugins.attrib import attr
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.drishtigs1 import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -39,8 +39,6 @@ def test_protocol_consistency():
 @attr("slow")
 def test_loading():
 
-    from ..utils import count_bw
-
     def _check_sample(s, bw_threshold_label):
 
         data = s.data
@@ -77,8 +75,7 @@ def test_loading():
         # to visualize images, uncomment the folowing code
         # it should display an image with a faded background representing the
         # original data, blended with green labels.
-        #print(f"{s.key}: {data['data'].size}, w/b = {w/b:.3f}")
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/data/drive/test.py b/bob/ip/binseg/test/test_drive.py
similarity index 96%
rename from bob/ip/binseg/data/drive/test.py
rename to bob/ip/binseg/test/test_drive.py
index 4b0ec336a4f5980cf3b676302f043ad35c6da0e6..c18d13416342c9c1b498225405668f8f06cd127b 100644
--- a/bob/ip/binseg/data/drive/test.py
+++ b/bob/ip/binseg/test/test_drive.py
@@ -9,8 +9,8 @@ import os
 import numpy
 import nose.tools
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.drive import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -40,7 +40,6 @@ def test_protocol_consistency():
 @rc_variable_set('bob.ip.binseg.drive.datadir')
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (565, 584)
 
     def _check_sample(s, bw_threshold_label, bw_threshold_mask):
@@ -83,7 +82,7 @@ def test_loading():
         # it should display an image with a faded background representing the
         # original data, blended with green labels and blue area indicating the
         # parts to be masked out.
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"], data["mask"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/data/hrf/test.py b/bob/ip/binseg/test/test_hrf.py
similarity index 96%
rename from bob/ip/binseg/data/hrf/test.py
rename to bob/ip/binseg/test/test_hrf.py
index ac928ecddc6e42ccd9b67d3d08cb1800ab2ce961..fdddfbbc78d79c228e1bca01e3a3ebf9dda84f65 100644
--- a/bob/ip/binseg/data/hrf/test.py
+++ b/bob/ip/binseg/test/test_hrf.py
@@ -9,8 +9,8 @@ import os
 import numpy
 import nose.tools
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.hrf import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -32,7 +32,6 @@ def test_protocol_consistency():
 @rc_variable_set('bob.ip.binseg.hrf.datadir')
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (3504, 2336)
 
     def _check_sample(s, bw_threshold_label, bw_threshold_mask):
@@ -75,7 +74,7 @@ def test_loading():
         # it should display an image with a faded background representing the
         # original data, blended with green labels and blue area indicating the
         # parts to be masked out.
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"], data["mask"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/data/iostar/test.py b/bob/ip/binseg/test/test_iostar.py
similarity index 96%
rename from bob/ip/binseg/data/iostar/test.py
rename to bob/ip/binseg/test/test_iostar.py
index 4655300592422193779518dfd2d7a9eae8aeddbf..9d8946b60d1160fa5b0dca95c39de7badac1a1aa 100644
--- a/bob/ip/binseg/data/iostar/test.py
+++ b/bob/ip/binseg/test/test_iostar.py
@@ -9,8 +9,8 @@ import os
 import numpy
 import nose.tools
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.iostar import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -45,7 +45,6 @@ def test_protocol_consistency():
 @rc_variable_set('bob.ip.binseg.iostar.datadir')
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (1024, 1024)
 
     def _check_sample(s, bw_threshold_label, bw_threshold_mask):
@@ -88,7 +87,7 @@ def test_loading():
         # it should display an image with a faded background representing the
         # original data, blended with green labels and blue area indicating the
         # parts to be masked out.
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"], data["mask"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/data/refuge/test.py b/bob/ip/binseg/test/test_refuge.py
similarity index 90%
rename from bob/ip/binseg/data/refuge/test.py
rename to bob/ip/binseg/test/test_refuge.py
index a69334c08e204a76e63077fc1eece5e06862d477..d4fac282cc210e1f04d09d74ad16d818641cae59 100644
--- a/bob/ip/binseg/data/refuge/test.py
+++ b/bob/ip/binseg/test/test_refuge.py
@@ -10,8 +10,8 @@ import numpy
 import nose.tools
 from nose.plugins.attrib import attr
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.refuge import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -41,8 +41,6 @@ def test_protocol_consistency():
 @attr("slow")
 def test_loading():
 
-    from ..utils import count_bw
-
     def _check_sample(
         s, image_size, glaucoma_label, entries, bw_threshold_label
     ):
@@ -76,11 +74,10 @@ def test_loading():
         # to visualize images, uncomment the folowing code
         # it should display an image with a faded background representing the
         # original data, blended with green labels.
-        # print(f"{s.key}: {data.get('glaucoma')}, w/b = {w/b:.3f}")
-        # from ..utils import overlayed_image
-        # display = overlayed_image(data["data"], data["label"])
-        # display.show()
-        # import ipdb; ipdb.set_trace()
+        #from ..data.utils import overlayed_image
+        #display = overlayed_image(data["data"], data["label"])
+        #display.show()
+        #import ipdb; ipdb.set_trace()
 
         return w/b
 
diff --git a/bob/ip/binseg/data/rimoner3/test.py b/bob/ip/binseg/test/test_rimoner3.py
similarity index 93%
rename from bob/ip/binseg/data/rimoner3/test.py
rename to bob/ip/binseg/test/test_rimoner3.py
index 6a28b668170b58e648b41adba8db05cb1d4c61ba..e6010cf6a3dec0c9b4c5c84dc0857b92c349f174 100644
--- a/bob/ip/binseg/data/rimoner3/test.py
+++ b/bob/ip/binseg/test/test_rimoner3.py
@@ -10,8 +10,8 @@ import numpy
 import nose.tools
 from nose.plugins.attrib import attr
 
-from . import dataset
-from ...test.utils import rc_variable_set
+from ..data.rimoner3 import dataset
+from .utils import rc_variable_set, count_bw
 
 
 def test_protocol_consistency():
@@ -37,7 +37,6 @@ def test_protocol_consistency():
 @attr("slow")
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (1072, 1424)
 
     def _check_sample(s, bw_threshold_label):
@@ -66,10 +65,10 @@ def test_loading():
         # to visualize images, uncomment the folowing code
         # it should display an image with a faded background representing the
         # original data, blended with green labels.
-        from ..utils import overlayed_image
-        display = overlayed_image(data["data"], data["label"])
-        display.show()
-        import ipdb; ipdb.set_trace()
+        #from ..data.utils import overlayed_image
+        #display = overlayed_image(data["data"], data["label"])
+        #display.show()
+        #import ipdb; ipdb.set_trace()
 
         return w/b
 
diff --git a/bob/ip/binseg/data/stare/test.py b/bob/ip/binseg/test/test_stare.py
similarity index 94%
rename from bob/ip/binseg/data/stare/test.py
rename to bob/ip/binseg/test/test_stare.py
index 05358720004eac6f8984f28beca6a8881d885c55..685b6c98d66d8c4bd1f0b1606528ab25252220d7 100644
--- a/bob/ip/binseg/data/stare/test.py
+++ b/bob/ip/binseg/test/test_stare.py
@@ -9,8 +9,11 @@ import os
 import numpy
 import nose.tools
 
-from . import dataset
-from ...test.utils import rc_variable_set
+## special trick for CI builds
+from . import mock_dataset
+dataset, rc_variable_set = mock_dataset()
+
+from .utils import count_bw
 
 
 def test_protocol_consistency():
@@ -45,7 +48,6 @@ def test_protocol_consistency():
 @rc_variable_set('bob.ip.binseg.stare.datadir')
 def test_loading():
 
-    from ..utils import count_bw
     image_size = (700, 605)
 
     def _check_sample(s, bw_threshold_label):
@@ -74,7 +76,7 @@ def test_loading():
         # to visualize images, uncomment the folowing code
         # it should display an image with a faded background representing the
         # original data, blended with green labels.
-        #from ..utils import overlayed_image
+        #from ..data.utils import overlayed_image
         #display = overlayed_image(data["data"], data["label"])
         #display.show()
         #import ipdb; ipdb.set_trace()
diff --git a/bob/ip/binseg/test/utils.py b/bob/ip/binseg/test/utils.py
index b5739240175bb4c718c2ba98693a652a84aef02f..0f2e7fcf62dc8404b88f75a15cb2ab8c96d4992a 100644
--- a/bob/ip/binseg/test/utils.py
+++ b/bob/ip/binseg/test/utils.py
@@ -6,7 +6,10 @@
 
 
 import functools
+
+import numpy
 import nose.plugins.skip
+
 import bob.extension
 
 
@@ -25,3 +28,29 @@ def rc_variable_set(name):
         return wrapper
 
     return wrapped_function
+
+
+def count_bw(b):
+    """Calculates totals of black and white pixels in a binary image
+
+
+    Parameters
+    ----------
+
+    b : PIL.Image.Image
+        A PIL image in mode "1" to be used for calculating positives and
+        negatives
+
+    Returns
+    -------
+
+    black : int
+        Number of black pixels in the binary image
+
+    white : int
+        Number of white pixels in the binary image
+    """
+
+    boolean_array = numpy.array(b)  # presumably a boolean array for mode "1" input - confirm
+    white = boolean_array.sum()  # numpy scalar; each True element adds 1 to the sum
+    return (boolean_array.size-white), white  # black = total elements - white