Add mnist/3 (api change: beat.backend.python v1.4.2)

37e29ed5 · Philip ABBET · caca14b8 · 37e29ed5 · 37e29ed5 · 37e29ed5
Commit 37e29ed5 authored 7 years ago by Philip ABBET
--- a/advanced/databases/mnist/3.json
+++ b/advanced/databases/mnist/3.json
+{
+    "description": "The MNIST Database of Handwritten Digits",
+    "root_folder": "/idiap/group/biometric/databases/mnist",
+    "protocols": [
+        {
+            "name": "idiap",
+            "template": "simple_digit_recognition",
+            "sets": [
+                {
+                    "name": "train",
+                    "template": "train",
+                    "view": "View",
+                    "parameters": {
+                        "group": "train"
+                    },
+                    "outputs": {
+                        "id": "{{ system_user.username }}/uint64/1",
+                        "class_id": "{{ system_user.username }}/uint64/1",
+                        "image": "{{ system_user.username }}/array_2d_uint8/1"
+                    }
+                },
+                {
+                    "name": "test",
+                    "template": "test",
+                    "view": "View",
+                    "parameters": {
+                        "group": "test"
+                    },
+                    "outputs": {
+                        "id": "{{ system_user.username }}/uint64/1",
+                        "class_id": "{{ system_user.username }}/uint64/1",
+                        "image": "{{ system_user.username }}/array_2d_uint8/1"
+                    }
+                }
+            ]
+        }
+    ]
+}
--- a/advanced/databases/mnist/3.py
+++ b/advanced/databases/mnist/3.py
+###############################################################################
+#                                                                             #
+# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/           #
+# Contact: beat.support@idiap.ch                                              #
+#                                                                             #
+# This file is part of the beat.examples module of the BEAT platform.         #
+#                                                                             #
+# Commercial License Usage                                                    #
+# Licensees holding valid commercial BEAT licenses may use this file in       #
+# accordance with the terms contained in a written agreement between you      #
+# and Idiap. For further information contact tto@idiap.ch                     #
+#                                                                             #
+# Alternatively, this file may be used under the terms of the GNU Affero      #
+# Public License version 3 as published by the Free Software and appearing    #
+# in the file LICENSE.AGPL included in the packaging of this file.            #
+# The BEAT platform is distributed in the hope that it will be useful, but    #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
+#                                                                             #
+# You should have received a copy of the GNU Affero Public License along      #
+# with the BEAT platform. If not, see http://www.gnu.org/licenses/.           #
+#                                                                             #
+###############################################################################
+
+import numpy as np
+import bob.db.mnist
+
+
+#----------------------------------------------------------
+
+
+def get_label_end_index(labels, label, label_start_index,
+                        start_index, end_index):
+    """Return the last data index of the run of ``label`` that begins at
+    ``label_start_index``.
+
+    ``labels`` is indexed relative to ``start_index`` (i.e. the label of data
+    index ``i`` is ``labels[i - start_index]``). The scan stops at the first
+    following entry whose label differs from ``label``; if none differs up to
+    and including ``end_index``, ``end_index`` is returned.
+    """
+    for candidate in range(label_start_index + 1, end_index + 1):
+        # The run ends just before the first differing label
+        if labels[candidate - start_index] != label:
+            return candidate - 1
+
+    # The run extends to the end of the requested range
+    return end_index
+
+
+#----------------------------------------------------------
+
+
+class View:
+    """Database view providing the MNIST samples of one group, ordered by label.
+
+    Outputs:
+        - image: "{{ system_user.username }}/array_2d_uint8/1"
+        - id: "{{ system_user.username }}/uint64/1"
+        - class_id: "{{ system_user.username }}/uint64/1"
+
+    One "id" is associated with a given "image".
+    Several "image" are associated with a given "class_id".
+
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    |    image    | |    image    | |    image    | |    image    | |    image    | |    image    |
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    |     id      | |     id      | |     id      | |     id      | |     id      | |     id      |
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    ----------------------------------------------- -----------------------------------------------
+    |                   class_id                  | |                   class_id                  |
+    ----------------------------------------------- -----------------------------------------------
+    """
+
+    def setup(self, root_folder, outputs, parameters, force_start_index=None,
+              force_end_index=None):
+        """Load the samples of the requested group and prepare the iteration.
+
+        Parameters:
+            root_folder: directory handed to ``bob.db.mnist.Database`` as
+                ``data_dir``.
+            outputs: container of the view outputs; must support lookup by
+                name (``outputs['id']``) and iteration.
+            parameters: dict of view parameters; ``parameters['group']``
+                selects the group to load.
+            force_start_index: first data index to provide (defaults to 0).
+            force_end_index: last data index to provide, inclusive (defaults
+                to the index of the last sample).
+
+        Returns:
+            True
+        """
+
+        # Initialisations
+        self.root_folder = root_folder
+        self.outputs     = outputs
+        self.parameters  = parameters
+
+        # Open the database and load the objects to provide via the outputs
+        self.db = bob.db.mnist.Database(data_dir=self.root_folder)
+
+        # data() is unpacked as a (features, labels) pair, one row of
+        # 'features' per entry of 'labels'
+        features, labels = self.db.data(groups=parameters['group'])
+        features = np.asarray(features)
+        labels = np.asarray(labels)
+
+        # Order the samples by label so that each class_id covers one
+        # contiguous range of indices. A stable sort keeps the original order
+        # of the samples inside each class, which makes the ids reproducible.
+        order = np.argsort(labels, kind='mergesort')
+        features = features[order]
+        labels = labels[order]
+
+        # Determine the range of indices that must be provided
+        self.start_index = force_start_index if force_start_index is not None else 0
+        self.end_index = force_end_index if force_end_index is not None else len(labels) - 1
+
+        # Only keep the requested range: next() and get_label_end_index()
+        # address these arrays relative to 'start_index'
+        self.features = features[self.start_index : self.end_index + 1]
+        self.labels = labels[self.start_index : self.end_index + 1]
+
+        self.next_index = self.start_index
+
+        return True
+
+
+    def done(self, last_data_index):
+        """Tell whether ``last_data_index`` reaches the end of the range."""
+        return last_data_index >= self.end_index
+
+
+    def next(self):
+        """Write the data of the current index on the connected outputs and
+        advance to the next index to provide.
+
+        Returns:
+            True
+        """
+        offset = self.next_index - self.start_index
+
+        features = self.features[offset, :]
+        label = self.labels[offset]
+
+        # Output: class_id (only provide data when the class_id changes)
+        if self.outputs['class_id'].isConnected() and \
+           self.outputs['class_id'].last_written_data_index < self.next_index:
+
+            label_end_index = get_label_end_index(self.labels, label,
+                                                  self.next_index,
+                                                  self.start_index,
+                                                  self.end_index)
+
+            self.outputs['class_id'].write(
+                {
+                    'value': np.uint64(label)
+                },
+                label_end_index
+            )
+
+        # Output: id (provide data at each iteration)
+        if self.outputs['id'].isConnected():
+            self.outputs['id'].write(
+                {
+                    'value': np.uint64(self.next_index)
+                },
+                self.next_index
+            )
+
+        # Output: image (provide data at each iteration)
+        if self.outputs['image'].isConnected():
+            self.outputs['image'].write(
+                {
+                    'value': features.reshape((28, 28))
+                },
+                self.next_index
+            )
+
+        # Determine the next data index that must be provided: the one right
+        # after the least advanced connected output.
+        # NOTE(review): assumes iterating 'outputs' yields the output objects
+        # (not their names) - confirm against beat.backend.python
+        self.next_index = 1 + min([ x.last_written_data_index for x in self.outputs
+                                                              if x.isConnected() ]
+        )
+
+        return True
+
+
+#----------------------------------------------------------
+
+
+def setup_tests():
+    """Prepare the environment for the self-test below; nothing is needed
+    for this database, so this does nothing and returns None."""
+
+
+#----------------------------------------------------------
+
+
+# Self-test entry point: checks the behavior of the view with the BEAT
+# database tester
+if __name__ == '__main__':
+
+    setup_tests()
+
+    # Note: This database can't be tested without the actual data, since
+    # the actual files are needed by this implementation
+
+    from beat.backend.python.database import DatabaseTester
+
+    # Names of the outputs the view is expected to feed
+    expected_outputs = ['class_id', 'id', 'image']
+
+    # Parameters handed to the view for the test run
+    view_parameters = {'group': 'train'}
+
+    DatabaseTester('View', View, expected_outputs, parameters=view_parameters)
--- a/advanced/databases/mnist/3.rst
+++ b/advanced/databases/mnist/3.rst
+.. Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/          ..
+.. Contact: beat.support@idiap.ch                                             ..
+..                                                                            ..
+.. This file is part of the beat.examples module of the BEAT platform.        ..
+..                                                                            ..
+.. Commercial License Usage                                                   ..
+.. Licensees holding valid commercial BEAT licenses may use this file in      ..
+.. accordance with the terms contained in a written agreement between you     ..
+.. and Idiap. For further information contact tto@idiap.ch                    ..
+..                                                                            ..
+.. Alternatively, this file may be used under the terms of the GNU Affero     ..
+.. Public License version 3 as published by the Free Software and appearing   ..
+.. in the file LICENSE.AGPL included in the packaging of this file.           ..
+.. The BEAT platform is distributed in the hope that it will be useful, but   ..
+.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ..
+.. or FITNESS FOR A PARTICULAR PURPOSE.                                       ..
+..                                                                            ..
+.. You should have received a copy of the GNU Affero Public License along     ..
+.. with the BEAT platform. If not, see http://www.gnu.org/licenses/.          ..
+
+
+The MNIST Database of Handwritten Digits
+----------------------------------------
+
+Changelog
+=========
+
+* **Version 3**, 31/Oct/2017:
+
+  - Port to beat.backend.python v1.4.2
+
+* **Version 2**, 20/Jan/2016:
+
+  - Port to Bob v2
+
+* **Version 1**, 21/Oct/2014:
+
+  - Initial release
+
+
+Description
+===========
+
+The `MNIST database of handwritten digits <http://yann.lecun.com/exdb/mnist/>`_,
+available from this page, has a training set of 60,000 examples, and a test
+set of 10,000 examples. It is a subset of a larger set available from NIST.
+The digits have been size-normalized and centered in a fixed-size image.
+
+It is a good database for people who want to try learning techniques and
+pattern recognition methods on real-world data while spending minimal effort
+on preprocessing and formatting.
+
+The original black and white (bilevel) images from NIST were size normalized
+to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting
+images contain grey levels as a result of the anti-aliasing technique used by
+the normalization algorithm. The images were centered in a 28x28 image by
+computing the center of mass of the pixels, and translating the image so as to
+position this point at the center of the 28x28 field.
+
+The MNIST database was constructed from NIST's Special Database 3 and Special
+Database 1 which contain binary images of handwritten digits. NIST originally
+designated SD-3 as their training set and SD-1 as their test set. However,
+SD-3 is much cleaner and easier to recognize than SD-1. The reason for this
+can be found in the fact that SD-3 was collected among Census Bureau
+employees, while SD-1 was collected among high-school students. Drawing
+sensible conclusions from learning experiments requires that the result be
+independent of the choice of training set and test among the complete set of
+samples. Therefore it was necessary to build a new database by mixing NIST's
+datasets.
+
+The MNIST training set is composed of 30,000 patterns from SD-3 and 30,000
+patterns from SD-1. Our test set was composed of 5,000 patterns from SD-3 and
+5,000 patterns from SD-1. The 60,000 pattern training set contained examples
+from approximately 250 writers. We made sure that the sets of writers of the
+training set and test set were disjoint.
+
+SD-1 contains 58,527 digit images written by 500 different writers. In
+contrast to SD-3, where blocks of data from each writer appeared in sequence,
+the data in SD-1 is scrambled. Writer identities for SD-1 are available and we
+used this information to unscramble the writers. We then split SD-1 in two:
+characters written by the first 250 writers went into our new training set.
+The remaining 250 writers were placed in our test set. Thus we had two sets
+with nearly 30,000 examples each. The new training set was completed with
+enough examples from SD-3, starting at pattern # 0, to make a full set of
+60,000 training patterns. Similarly, the new test set was completed with
+SD-3 examples starting at pattern # 35,000 to make a full set with 60,000
+test patterns. Only a subset of 10,000 test images (5,000 from SD-1 and
+5,000 from SD-3) is available on this site. The full 60,000 sample training
+set is available.