From 37e29ed58abf57ee2a0a068e758ee51061c180de Mon Sep 17 00:00:00 2001 From: Philip ABBET <philip.abbet@idiap.ch> Date: Tue, 31 Oct 2017 11:42:20 +0100 Subject: [PATCH] Add mnist/3 (api change: beat.backend.python v1.4.2) --- advanced/databases/mnist/3.json | 38 +++++++ advanced/databases/mnist/3.py | 174 ++++++++++++++++++++++++++++++++ advanced/databases/mnist/3.rst | 89 ++++++++++++++++ 3 files changed, 301 insertions(+) create mode 100644 advanced/databases/mnist/3.json create mode 100644 advanced/databases/mnist/3.py create mode 100644 advanced/databases/mnist/3.rst diff --git a/advanced/databases/mnist/3.json b/advanced/databases/mnist/3.json new file mode 100644 index 0000000..46d01c2 --- /dev/null +++ b/advanced/databases/mnist/3.json @@ -0,0 +1,38 @@ +{ + "description": "The MNIST Database of Handwritten Digits", + "root_folder": "/idiap/group/biometric/databases/mnist", + "protocols": [ + { + "name": "idiap", + "template": "simple_digit_recognition", + "sets": [ + { + "name": "train", + "template": "train", + "view": "View", + "parameters": { + "group": "train" + }, + "outputs": { + "id": "{{ system_user.username }}/uint64/1", + "class_id": "{{ system_user.username }}/uint64/1", + "image": "{{ system_user.username }}/array_2d_uint8/1" + } + }, + { + "name": "test", + "template": "test", + "view": "View", + "parameters": { + "group": "test" + }, + "outputs": { + "id": "{{ system_user.username }}/uint64/1", + "class_id": "{{ system_user.username }}/uint64/1", + "image": "{{ system_user.username }}/array_2d_uint8/1" + } + } + ] + } + ] +} diff --git a/advanced/databases/mnist/3.py b/advanced/databases/mnist/3.py new file mode 100644 index 0000000..571c07e --- /dev/null +++ b/advanced/databases/mnist/3.py @@ -0,0 +1,174 @@ +############################################################################### +# # +# Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/ # +# Contact: beat.support@idiap.ch # +# # +# This file is part of the beat.examples 
module of the BEAT platform.         #
+#                                                                             #
+# Commercial License Usage                                                    #
+# Licensees holding valid commercial BEAT licenses may use this file in       #
+# accordance with the terms contained in a written agreement between you      #
+# and Idiap. For further information contact tto@idiap.ch                     #
+#                                                                             #
+# Alternatively, this file may be used under the terms of the GNU Affero      #
+# Public License version 3 as published by the Free Software and appearing    #
+# in the file LICENSE.AGPL included in the packaging of this file.            #
+# The BEAT platform is distributed in the hope that it will be useful, but    #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE.                                        #
+#                                                                             #
+# You should have received a copy of the GNU Affero Public License along      #
+# with the BEAT platform. If not, see http://www.gnu.org/licenses/.           #
+#                                                                             #
+###############################################################################
+
+import numpy as np
+import bob.db.mnist
+
+
+#----------------------------------------------------------
+
+
+def get_label_end_index(labels, label, label_start_index,
+                        start_index, end_index):
+    """Return the last data index of the run of `label` that starts at
+    `label_start_index` (capped at `end_index`); `labels[i - start_index]`
+    must be the label of data index `i`.
+    """
+    label_end_index = label_start_index
+
+    while label_end_index + 1 <= end_index:
+        if labels[label_end_index + 1 - start_index] != label:
+            return label_end_index
+
+        label_end_index += 1
+
+    return end_index
+
+
+#----------------------------------------------------------
+
+
+class View:
+    """Outputs:
+        - image: "{{ system_user.username }}/array_2d_uint8/1"
+        - id: "{{ system_user.username }}/uint64/1"
+        - class_id: "{{ system_user.username }}/uint64/1"
+
+    One "id" is associated with a given "image".
+    Several "image" are associated with a given "class_id".
+
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    |    image    | |    image    | |    image    | |    image    | |    image    | |    image    |
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    |     id      | |     id      | |     id      | |     id      | |     id      | |     id      |
+    --------------- --------------- --------------- --------------- --------------- ---------------
+    ----------------------------------------------- -----------------------------------------------
+    |                   class_id                  | |                   class_id                  |
+    ----------------------------------------------- -----------------------------------------------
+    """
+
+    def setup(self, root_folder, outputs, parameters, force_start_index=None,
+              force_end_index=None):
+        """Load the samples of `parameters['group']`, ordered by label.
+
+        `force_start_index` / `force_end_index` (both inclusive) restrict
+        the data indices served; `None` means no restriction.
+        """
+        # Initialisations
+        self.root_folder = root_folder
+        self.outputs = outputs
+        self.parameters = parameters
+
+        # Open the database and load the objects to provide via the outputs
+        self.db = bob.db.mnist.Database(data_dir=self.root_folder)
+
+        # data() yields two parallel arrays (samples, labels): order them by
+        # label with a stable sort so each class_id forms one contiguous run
+        features, labels = self.db.data(groups=parameters['group'])
+        order = np.argsort(labels, kind='mergesort')
+
+        # Determine the range of indices that must be provided
+        self.start_index = force_start_index if force_start_index is not None else 0
+        self.end_index = force_end_index if force_end_index is not None else len(labels) - 1
+
+        # Keep only that range: next() indexes relative to start_index
+        order = order[self.start_index : self.end_index + 1]
+        self.features, self.labels = features[order], labels[order]
+
+        self.next_index = self.start_index
+
+        return True
+
+
+    def done(self, last_data_index):
+        """Tell whether `last_data_index` reaches the last index to serve."""
+        return last_data_index >= self.end_index
+
+
+    def next(self):
+        """Write the data of index `self.next_index` on the connected
+        outputs, then advance `self.next_index`. Always returns True.
+        """
+        features = self.features[self.next_index - self.start_index, :]
+        label = self.labels[self.next_index - self.start_index]
+
+        # Output: class_id (only provide data when the class_id change)
+        if self.outputs['class_id'].isConnected() and \
+           self.outputs['class_id'].last_written_data_index < self.next_index:
+
+            label_end_index = get_label_end_index(
+                self.labels, label, self.next_index,
+                self.start_index, self.end_index)
+
+            self.outputs['class_id'].write(
+                {'value': np.uint64(label)}, label_end_index)
+
+        # Output: id (provide data at each iteration)
+        if self.outputs['id'].isConnected():
+            self.outputs['id'].write(
+                {'value': np.uint64(self.next_index)}, self.next_index)
+
+        # Output: image (provide data at each iteration)
+        if self.outputs['image'].isConnected():
+            self.outputs['image'].write(
+                {'value': features.reshape((28, 28))}, self.next_index)
+
+        # Determine the next data index that must be provided
+        self.next_index = 1 + min([ x.last_written_data_index for x in self.outputs
+                                    if x.isConnected() ])
+
+        return True
+
+
+#----------------------------------------------------------
+
+
+def setup_tests():
+    pass
+
+
+#----------------------------------------------------------
+
+
+# Test the behavior of the views (on fake data)
+if __name__ == '__main__':
+
+    setup_tests()
+
+    # Note: This database can't be tested without the actual data, since
+    # the actual files are needed by this implementation
+
+    from beat.backend.python.database import DatabaseTester
+
+    DatabaseTester('View', View,
+        [
+            'class_id',
+            'id',
+            'image',
+        ],
+        parameters=dict(
+            group='train',
+        ),
+    )
diff --git a/advanced/databases/mnist/3.rst b/advanced/databases/mnist/3.rst
new file mode 100644
index 0000000..0c136b9
--- /dev/null
+++ b/advanced/databases/mnist/3.rst
@@ -0,0 +1,89 @@
+.. Copyright (c) 2017 Idiap Research Institute, http://www.idiap.ch/ ..
+.. Contact: beat.support@idiap.ch ..
+.. ..
+.. This file is part of the beat.examples module of the BEAT platform. ..
+.. ..
+.. Commercial License Usage ..
+.. Licensees holding valid commercial BEAT licenses may use this file in ..
+.. accordance with the terms contained in a written agreement between you ..
+.. and Idiap. For further information contact tto@idiap.ch ..
+.. ..
+.. 
Alternatively, this file may be used under the terms of the GNU Affero .. +.. Public License version 3 as published by the Free Software and appearing .. +.. in the file LICENSE.AGPL included in the packaging of this file. .. +.. The BEAT platform is distributed in the hope that it will be useful, but .. +.. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY .. +.. or FITNESS FOR A PARTICULAR PURPOSE. .. +.. .. +.. You should have received a copy of the GNU Affero Public License along .. +.. with the BEAT platform. If not, see http://www.gnu.org/licenses/. .. + + +The MNIST Database of Handwritten Digits +---------------------------------------- + +Changelog +========= + +* **Version 3**, 31/Oct/2017: + + - Port to beat.backend.python v1.4.2 + +* **Version 2**, 20/Jan/2016: + + - Port to Bob v2 + +* **Version 1**, 21/Oct/2014: + + - Initial release + + +Description +=========== + +The `MNIST database of handwritten digits <http://yann.lecun.com/exdb/mnist/>`_, +available from this page, has a training set of 60,000 examples, and a test +set of 10,000 examples. It is a subset of a larger set available from NIST. +The digits have been size-normalized and centered in a fixed-size image. + +It is a good database for people who want to try learning techniques and +pattern recognition methods on real-world data while spending minimal effort +on preprocessing and formatting. + +The original black and white (bilevel) images from NIST were size normalized +to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting +images contain grey levels as a result of the anti-aliasing technique used by +the normalization algorithm. The images were centered in a 28x28 image by +computing the center of mass of the pixels, and translating the image so as to +position this point at the center of the 28x28 field. 
+ +The MNIST database was constructed from NIST's Special Database 3 and Special +Database 1 which contain binary images of handwritten digits. NIST originally +designated SD-3 as their training set and SD-1 as their test set. However, +SD-3 is much cleaner and easier to recognize than SD-1. The reason for this +can be found in the fact that SD-3 was collected among Census Bureau +employees, while SD-1 was collected among high-school students. Drawing +sensible conclusions from learning experiments requires that the result be +independent of the choice of training set and test set among the complete set of +samples. Therefore it was necessary to build a new database by mixing NIST's +datasets. + +The MNIST training set is composed of 30,000 patterns from SD-3 and 30,000 +patterns from SD-1. Our test set was composed of 5,000 patterns from SD-3 and +5,000 patterns from SD-1. The 60,000 pattern training set contained examples +from approximately 250 writers. We made sure that the sets of writers of the +training set and test set were disjoint. + +SD-1 contains 58,527 digit images written by 500 different writers. In +contrast to SD-3, where blocks of data from each writer appeared in sequence, +the data in SD-1 is scrambled. Writer identities for SD-1 are available and we +used this information to unscramble the writers. We then split SD-1 in two: +characters written by the first 250 writers went into our new training set. +The remaining 250 writers were placed in our test set. Thus we had two sets +with nearly 30,000 examples each. The new training set was completed with +enough examples from SD-3, starting at pattern # 0, to make a full set of +60,000 training patterns. Similarly, the new test set was completed with +SD-3 examples starting at pattern # 35,000 to make a full set with 60,000 +test patterns. Only a subset of 10,000 test images (5,000 from SD-1 and +5,000 from SD-3) is available on this site. The full 60,000 sample training +set is available. -- GitLab