From 8c198bc2cbd33c68f1524ba6ac873f5a5c7843cc Mon Sep 17 00:00:00 2001
From: Andre Anjos <andre.dos.anjos@gmail.com>
Date: Fri, 10 Apr 2020 09:35:10 +0200
Subject: [PATCH] [dataset] New CLI script to list and check datasets

---
 bob/ip/binseg/data/__init__.py       |   2 +-
 bob/ip/binseg/data/drive/__init__.py |  33 +++++---
 bob/ip/binseg/data/stare/__init__.py |  42 ++++++-----
 bob/ip/binseg/script/dataset.py      | 108 +++++++++++++++++++++++++++
 conda/meta.yaml                      |   5 ++
 setup.py                             |   1 +
 6 files changed, 161 insertions(+), 30 deletions(-)
 create mode 100644 bob/ip/binseg/script/dataset.py

diff --git a/bob/ip/binseg/data/__init__.py b/bob/ip/binseg/data/__init__.py
index d9854dc8..93e77e17 100644
--- a/bob/ip/binseg/data/__init__.py
+++ b/bob/ip/binseg/data/__init__.py
@@ -1 +1 @@
-from .binsegdataset import BinSegDataset
+"""Data manipulation and raw dataset definitions"""
diff --git a/bob/ip/binseg/data/drive/__init__.py b/bob/ip/binseg/data/drive/__init__.py
index 2e6e8aef..f5531603 100644
--- a/bob/ip/binseg/data/drive/__init__.py
+++ b/bob/ip/binseg/data/drive/__init__.py
@@ -1,6 +1,25 @@
 #!/usr/bin/env python
 # coding=utf-8
 
+"""DRIVE dataset for Vessel Segmentation
+
+The DRIVE database has been established to enable comparative studies on
+segmentation of blood vessels in retinal images.
+
+* Reference: [DRIVE-2004]_
+* Original resolution (height x width): 584 x 565
+* Split reference: [DRIVE-2004]_
+* Protocol ``default``:
+
+  * Training samples: 20 (including labels and masks)
+  * Test samples: 20 (including labels from annotator 1 and masks)
+
+* Protocol ``second-annotation``:
+
+  * Test samples: 20 (including labels from annotator 2 and masks)
+
+"""
+
 import os
 import pkg_resources
 
@@ -14,7 +33,7 @@ _protocols = [
         pkg_resources.resource_filename(__name__, "second-annotation.json"),
         ]
 
-_root_path = bob.extension.rc.get('bob.db.drive.datadir',
+_root_path = bob.extension.rc.get('bob.ip.binseg.drive.datadir',
         os.path.realpath(os.curdir))
 
 def _loader(s):
@@ -25,14 +44,4 @@ def _loader(s):
             )
 
 dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader)
-"""DRIVE dataset for Vessel Segmentation
-
-The DRIVE database has been established to enable comparative studies on
-segmentation of blood vessels in retinal images.
-
-* Reference: [DRIVE-2004]_
-* Original resolution (height x width): 584 x 565
-* Training samples: 20 (including labels and masks)
-* Test samples: 20 (including labels from 2 annotators and masks)
-* Split reference: [DRIVE-2004]_
-"""
+"""DRIVE dataset object"""
diff --git a/bob/ip/binseg/data/stare/__init__.py b/bob/ip/binseg/data/stare/__init__.py
index 62354bb9..3aa38840 100644
--- a/bob/ip/binseg/data/stare/__init__.py
+++ b/bob/ip/binseg/data/stare/__init__.py
@@ -1,6 +1,29 @@
 #!/usr/bin/env python
 # coding=utf-8
 
+"""STARE dataset for Vessel Segmentation
+
+A subset of the original STARE dataset contains 20 annotated eye fundus images
+with a resolution of 700 x 605 (width x height). Two sets of ground-truth
+vessel annotations are available. The first set by Adam Hoover ("ah") is
+commonly used for training and testing. The second set by Valentina Kouznetsova
+("vk") is typically used as a “human” baseline.
+
+* Reference: [STARE-2000]_
+* Original resolution (width x height): 700 x 605
+* Split reference: [MANINIS-2016]_
+* Protocol ``default``:
+
+  * Training samples: 10 (including labels from annotator "ah")
+  * Test samples: 10 (including labels from annotator "ah")
+
+* Protocol ``second-annotation``:
+
+  * Training samples: 10 (including labels from annotator "vk")
+  * Test samples: 10 (including labels from annotator "vk")
+
+"""
+
 import os
 import pkg_resources
 
@@ -14,7 +37,7 @@ _protocols = [
         pkg_resources.resource_filename(__name__, "second-annotation.json"),
         ]
 
-_root_path = bob.extension.rc.get('bob.db.stare.datadir',
+_root_path = bob.extension.rc.get('bob.ip.binseg.stare.datadir',
         os.path.realpath(os.curdir))
 
 def _loader(s):
@@ -24,19 +47,4 @@ def _loader(s):
             )
 
 dataset = JSONDataset(protocols=_protocols, root_path=_root_path, loader=_loader)
-"""STARE (training set) for Vessel Segmentation
-
-A subset of the original STARE dataset contains 20 annotated eye fundus images
-with a resolution of 700 x 605 (width x height). Two sets of ground-truth
-vessel annotations are available. The first set by Adam Hoover is commonly used
-for training and testing. The second set by Valentina Kouznetsova acts as a
-“human” baseline.
-
-* Reference: [STARE-2000]_
-* Original resolution (width x height): 700 x 605
-* Training samples: 10
-* Test samples: 10
-* Samples include labels from 2 annotators (AH, default and VK, seconda
-  annotator)
-* Split reference: [MANINIS-2016]_
-"""
+"""STARE dataset object"""
diff --git a/bob/ip/binseg/script/dataset.py b/bob/ip/binseg/script/dataset.py
new file mode 100644
index 00000000..ecbdc05f
--- /dev/null
+++ b/bob/ip/binseg/script/dataset.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+import importlib
+import click
+
+from bob.extension.scripts.click_helper import (
+    verbosity_option,
+    AliasedGroup,
+)
+
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def _get_installed_datasets():
+    """Returns one ``re`` match object per dataset key in ``bob.extension.rc``
+
+    * group(0): the name of the key for the dataset directory
+    * group("name"): the short name for the dataset
+
+    """
+
+    import re
+    from bob.extension import rc
+    dataset_re = re.compile(r'^bob\.ip\.binseg\.(?P<name>[^\.]+)\.datadir$')
+    return [m for m in map(dataset_re.match, rc.keys()) if m is not None]
+
+
+@click.group(cls=AliasedGroup)
+def dataset():
+    """Commands for listing and checking configured datasets"""
+    pass
+
+
+@dataset.command(
+    epilog="""Examples:
+
+\b
+    1. To install a dataset, set up its data directory ("datadir").  For
+       example, to setup access to DRIVE files you downloaded locally at
+       the directory "/path/to/drive/files", do the following:
+\b
+       $ bob config set "bob.ip.binseg.drive.datadir" "/path/to/drive/files"
+
+       Notice this setting is **NOT** case-insensitive.
+
+    2. List all raw datasets available (and configured):
+
+       $ bob binseg dataset list -vv
+
+""",
+)
+@verbosity_option()
+def list(**kwargs):
+    """Lists all installed datasets"""
+    from bob.extension import rc  # "bob" itself is never bound in this module
+    installed = _get_installed_datasets()
+    if installed:
+        click.echo("Configured datasets:")
+        for k in installed:
+            value = rc.get(k.group(0))  # k.group(0) is the full rc key
+            click.echo(f"- {k.group('name')}: {k.group(0)} = \"{value}\"")
+    else:
+        click.echo("No configured datasets")
+        click.echo("Try --help to get help in configuring a dataset")
+
+
+@dataset.command(
+    epilog="""Examples:
+
+    1. Check if all files of the DRIVE dataset can be loaded:
+
+       $ bob binseg dataset check -vv drive
+
+    2. Check if all files of multiple installed datasets can be loaded:
+
+       $ bob binseg dataset check -vv drive stare
+
+    3. Check if all files of all installed datasets can be loaded:
+
+       $ bob binseg dataset check
+""",
+)
+@click.argument(
+        'dataset',
+        nargs=-1,
+        )
+@verbosity_option()
+def check(dataset, **kwargs):
+    """Checks file access on one or more datasets"""
+
+    to_check = _get_installed_datasets()
+
+    if dataset:  # restrict to the datasets named on the command-line
+        to_check = [k for k in to_check if k.group("name") in dataset]
+
+    if not to_check:  # test the selection: no arguments means "check all"
+        click.echo("No configured datasets matching specifications")
+        click.echo("Try bob binseg dataset list --help to get help in "
+                "configuring a dataset")
+    else:
+        for k in to_check:
+            click.echo(f"Checking \"{k.group('name')}\" dataset...")
+            module = importlib.import_module(f"...data.{k.group('name')}",
+                    __name__)
+            module.dataset.check()
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 7b69b581..133f7c29 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -58,6 +58,11 @@ test:
     - bob binseg config describe drive -v
     - bob binseg config copy --help
     - bob binseg config copy drive /tmp/test.py
+    - bob binseg dataset --help
+    - bob binseg dataset list --help
+    - bob binseg dataset list
+    - bob binseg dataset check --help
+    - bob binseg dataset check
     - bob binseg train --help
     - bob binseg predict --help
     - bob binseg evaluate --help
diff --git a/setup.py b/setup.py
index 5a234f7c..8eb42600 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@ setup(
         # bob binseg sub-commands
         "bob.ip.binseg.cli": [
             "config = bob.ip.binseg.script.config:config",
+            "dataset =  bob.ip.binseg.script.dataset:dataset",
             "train = bob.ip.binseg.script.train:train",
             "predict = bob.ip.binseg.script.predict:predict",
             "evaluate = bob.ip.binseg.script.evaluate:evaluate",
-- 
GitLab