From cca26728250f0a424dde2fcc0c670d5a4e810423 Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Tue, 21 Apr 2020 09:40:12 +0200
Subject: [PATCH] [data.dataset] Allow limit on dataset checks

---
 bob/ip/binseg/data/dataset.py   | 33 ++++++++++++++++++++++++++++++---
 bob/ip/binseg/script/dataset.py | 13 +++++++++++--
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/bob/ip/binseg/data/dataset.py b/bob/ip/binseg/data/dataset.py
index 3d7bb96e..92599aa8 100644
--- a/bob/ip/binseg/data/dataset.py
+++ b/bob/ip/binseg/data/dataset.py
@@ -87,9 +87,19 @@ class JSONDataset:
         self.loader = loader
         self.keymaker = keymaker
 
-    def check(self):
+    def check(self, limit=0):
         """For each protocol, check if all data can be correctly accessed
 
+
+        Parameters
+        ----------
+
+        limit : int
+            Maximum number of samples to check (in each protocol/subset
+            combination) in this dataset.  If set to zero, then check
+            everything.
+
+
         Returns
         -------
 
@@ -104,6 +114,9 @@ class JSONDataset:
             logger.info(f"Checking protocol '{proto}'...")
             for name, samples in self.subsets(proto).items():
                 logger.info(f"Checking subset '{name}'...")
+                if limit:
+                    logger.info(f"Checking at most first '{limit}' samples...")
+                    samples = samples[:limit]
                 for sample in samples:
                     try:
                         sample.data  # triggers loading
@@ -230,9 +243,19 @@ class CSVDataset:
         self.loader = loader
         self.keymaker = keymaker
 
-    def check(self):
+    def check(self, limit=0):
         """For each subset, check if all data can be correctly accessed
 
+
+        Parameters
+        ----------
+
+        limit : int
+            Maximum number of samples to check (in each subset) in
+            this dataset.  If set to zero, then check
+            everything.
+
+
         Returns
         -------
 
@@ -245,7 +268,11 @@ class CSVDataset:
         errors = 0
         for name in self._subsets.keys():
             logger.info(f"Checking subset '{name}'...")
-            for sample in self.samples(name):
+            samples = self.samples(name)
+            if limit:
+                logger.info(f"Checking at most first '{limit}' samples...")
+                samples = samples[:limit]
+            for sample in samples:
                 try:
                     sample.data  # triggers loading
                     logger.info(f"{sample.key}: OK")
diff --git a/bob/ip/binseg/script/dataset.py b/bob/ip/binseg/script/dataset.py
index c396aa47..c2eb52db 100644
--- a/bob/ip/binseg/script/dataset.py
+++ b/bob/ip/binseg/script/dataset.py
@@ -104,8 +104,17 @@ def list(**kwargs):
         'dataset',
         nargs=-1,
         )
+@click.option(
+    "--limit",
+    "-l",
+    help="Limit check to the first N samples in each dataset, making the "
+            "check sensibly faster.  Set it to zero to check everything.",
+    required=True,
+    type=click.IntRange(0),
+    default=0,
+)
 @verbosity_option()
-def check(dataset, **kwargs):
+def check(dataset, limit, **kwargs):
     """Checks file access on one or more datasets"""
 
     to_check = _get_installed_datasets()
@@ -123,6 +132,6 @@ def check(dataset, **kwargs):
             click.echo(f"Checking \"{k.group('name')}\" dataset...")
             module = importlib.import_module(f"...data.{k.group('name')}",
                     __name__)
-            errors += module.dataset.check()
+            errors += module.dataset.check(limit)
         if not errors:
             click.echo(f"No errors reported")
-- 
GitLab