Commit 5b109f10 authored by Tiago de Freitas Pereira

Merge branch 'predict' into 'master'

Add prediction script for bob.bio.base databases

Closes #42

See merge request !26
parents aeaf3303 e69b911c
Pipeline #13465 passed with stages in 19 minutes and 22 seconds
import os
import six
import tensorflow as tf
from bob.bio.base.tools.grid import indices
from bob.bio.base import read_original_data as _read_original_data
def make_output_path(output_dir, key):
"""Returns an output path used for saving keys. You need to make sure the
directories leading to this output path exist.
Parameters
----------
output_dir : str
The root directory to save the results
key : str
The key of the sample. Usually biofile.make_path("", "")
Returns
-------
str
The path for the provided key.
"""
return os.path.join(output_dir, key + '.hdf5')
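# Usage sketch (hypothetical paths): the caller is expected to create the
# leading directories itself, e.g. with ``bob.io.base.create_directories_safe``
# as the prediction script below does:
#
#   from bob.io.base import create_directories_safe
#   outpath = make_output_path('/tmp/predictions', 'dev/client1/sample1')
#   create_directories_safe(os.path.dirname(outpath))
#   # outpath == '/tmp/predictions/dev/client1/sample1.hdf5'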
def bio_generator(database, groups, number_of_parallel_jobs, output_dir,
read_original_data=None, biofile_to_label=None,
multiple_samples=False, force=False):
"""Returns a generator and its output types and shapes based on
bob.bio.base databases.
Parameters
----------
database : :any:`bob.bio.base.database.BioDatabase`
The database that you want to use.
groups : [str]
List of groups. Can be any permutation of ``('world', 'dev', 'eval')``
number_of_parallel_jobs : int
The number of parallel jobs that the script is run with. This is used
to split the list of files into array jobs.
output_dir : str
The root directory where the data will be saved.
read_original_data : :obj:`object`, optional
A callable with the signature of
``data = read_original_data(biofile, directory, extension)``.
:any:`bob.bio.base.read_original_data` is used by default.
biofile_to_label : :obj:`object`, optional
A callable with the signature of ``label = biofile_to_label(biofile)``.
By default, -1 is returned as the label.
multiple_samples : bool, optional
If true, it assumes that each sample of the bio database actually contains
multiple samples. This is useful when you want to treat video
databases as image databases.
force : bool, optional
If true, all files will be overwritten.
Returns
-------
generator : object
A generator function that, when called, yields the samples as
``(data, label, key)`` tuples.
output_types : (object, object, object)
The types of the returned samples.
output_shapes : (tf.TensorShape, tf.TensorShape, tf.TensorShape)
The shapes of the returned samples.
"""
if read_original_data is None:
read_original_data = _read_original_data
if biofile_to_label is None:
def biofile_to_label(biofile):
return -1
biofiles = list(database.all_files(groups))
if number_of_parallel_jobs > 1:
start, end = indices(biofiles, number_of_parallel_jobs)
biofiles = biofiles[start:end]
labels = (biofile_to_label(f) for f in biofiles)
keys = (str(f.make_path("", "")) for f in biofiles)
def generator():
for f, label, key in six.moves.zip(biofiles, labels, keys):
outpath = make_output_path(output_dir, key)
if not force and os.path.isfile(outpath):
continue
data = read_original_data(f, database.original_directory,
database.original_extension)
# yield the data (one or several samples) together with its label and key
if multiple_samples:
for d in data:
yield (d, label, key)
else:
yield (data, label, key)
# load one sample to get its type and shape
data = read_original_data(biofiles[0], database.original_directory,
database.original_extension)
if multiple_samples:
try:
data = data[0]
except TypeError:
# if the data is a generator
data = six.next(data)
data = tf.convert_to_tensor(data)
output_types = (data.dtype, tf.int64, tf.string)
output_shapes = (data.shape, tf.TensorShape([]), tf.TensorShape([]))
return (generator, output_types, output_shapes)
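# Usage sketch (hypothetical paths; the dummy database is only used for
# illustration): the returned triple is meant to be handed to
# ``tf.data.Dataset.from_generator`` inside an input function for
# ``tf.estimator.Estimator.predict``, as the predict_bio script below does:
#
#   from bob.bio.base.test.dummy.database import database
#   generator, output_types, output_shapes = bio_generator(
#       database, ['dev'], number_of_parallel_jobs=1, output_dir='/tmp/out')
#
#   def input_fn():
#       dataset = tf.data.Dataset.from_generator(
#           generator, output_types, output_shapes)
#       data, labels, keys = dataset.batch(32).make_one_shot_iterator().get_next()
#       return {'data': data, 'keys': keys}, labels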
@@ -25,8 +25,6 @@ def architecture(input_layer, mode=tf.estimator.ModeKeys.TRAIN,
# Pooling Layer #1
# First max pooling layer with a 2x2 filter and stride of 2
# Input Tensor Shape: [batch_size, 50, 1024, 32]
# Output Tensor Shape: [batch_size, 25, 512, 32]
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2,
data_format=data_format)
@@ -47,7 +45,7 @@ def architecture(input_layer, mode=tf.estimator.ModeKeys.TRAIN,
data_format=data_format)
# Flatten tensor into a batch of vectors
# TODO: use tf.layers.flatten in tensorflow 1.4 above
# TODO: use tf.layers.flatten in tensorflow 1.4 and above
pool2_flat = tf.contrib.layers.flatten(pool2)
# Dense Layer
......
@@ -36,7 +36,7 @@ The configuration files should have the following objects totally::
samples : a list of all samples that you want to write in the tfrecords
file. Whatever is inside this list is passed to the reader.
reader : a function with the signature of
`data, label, key = reader(sample)` which takes a sample and
``data, label, key = reader(sample)`` which takes a sample and
returns the loaded data, the label of the data, and a key which
is unique for every sample.
@@ -91,7 +91,6 @@ from __future__ import print_function
import random
# import pkg_resources so that bob imports work properly:
import pkg_resources
import six
import tensorflow as tf
from bob.io.base import create_directories_safe
from bob.bio.base.utils import read_config_file
......
#!/usr/bin/env python
"""Saves predictions or embeddings of tf.estimators. This script works with
bob.bio.base databases. To use it see the configuration details below. This
script works with tensorflow 1.4 and above.
Usage:
%(prog)s [-v...] [-k KEY]... [options] <config_files>...
%(prog)s --help
%(prog)s --version
Arguments:
<config_files> The configuration files. They are
loaded in order and, in total, they
need to define several objects. See
below for an explanation.
Options:
-h --help Show this help message and exit
--version Show version and exit
-o PATH, --output-dir PATH The directory to save the predictions.
-k KEY, --predict-keys KEY List of `str`, names of the keys to
predict; may be given multiple times. It
is used if `EstimatorSpec.predictions`
is a `dict`. If `predict_keys` is used,
then the rest of the predictions will be
filtered from the dictionary. If `None`,
all predictions are returned.
--checkpoint-path=<path> Path of a specific checkpoint to
predict. If `None`, the latest
checkpoint in `model_dir` is used.
--multiple-samples If provided, it assumes that the db
interface returns several samples from a
biofile. This option can be used when
you are working with sequences.
-p N, --number-of-parallel-jobs N The number of parallel jobs that this
script is run with in the SGE grid. You
should use this option together with
``jman submit -t N``.
-f, --force If provided, it will overwrite the
existing predictions.
-v, --verbose Increases the output verbosity level
The options above can also be supplied through configuration files. You just
need to create a variable whose name is the option name with ``-`` replaced by
``_``. For example, use ``multiple_samples`` instead of ``--multiple-samples``.
In total, the configuration files should provide the following objects:
# Required objects:
estimator : :any:`tf.estimator.Estimator`
An estimator instance that represents the neural network.
database : :any:`bob.bio.base.database.BioDatabase`
A bio database. Its original_directory must point to the correct path.
groups : [str]
A list of groups to evaluate. Can be any permutation of
``('world', 'dev', 'eval')``.
bio_predict_input_fn : callable
A callable with the signature of
``input_fn = bio_predict_input_fn(generator, output_types, output_shapes)``
Its inputs are documented in :any:`tf.data.Dataset.from_generator` and
its output should be a function with no arguments that is passed to
:any:`tf.estimator.Estimator.predict`.
# Optional objects:
read_original_data : callable
A callable with the signature of
``data = read_original_data(biofile, directory, extension)``.
:any:`bob.bio.base.read_original_data` is used by default.
hooks : [:any:`tf.train.SessionRunHook`]
Optional hooks that you may want to attach to the predictions.
An example configuration for a trained model and its evaluation could be::
import tensorflow as tf
# define the database:
from bob.bio.base.test.dummy.database import database
# load the estimator model
estimator = tf.estimator.Estimator(model_fn, model_dir)
groups = ['dev']
# the ``dataset = tf.data.Dataset.from_generator(generator, output_types,
# output_shapes)`` line is mandatory in the function below. You have to
# create it in your configuration file since you want it to be created in
# the same graph as your model.
def bio_predict_input_fn(generator, output_types, output_shapes):
def input_fn():
dataset = tf.data.Dataset.from_generator(generator, output_types,
output_shapes)
# apply all kinds of transformations here, process the data even
# further if you want.
dataset = dataset.prefetch(1)
dataset = dataset.batch(10**3)
images, labels, keys = dataset.make_one_shot_iterator().get_next()
return {'data': images, 'keys': keys}, labels
return input_fn
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# import pkg_resources so that bob imports work properly:
import pkg_resources
import os
from multiprocessing import Pool
from collections import defaultdict
import numpy as np
from bob.io.base import create_directories_safe
from bob.bio.base.utils import read_config_file, save
from bob.learn.tensorflow.utils.commandline import \
get_from_config_or_commandline
from bob.learn.tensorflow.dataset.bio import make_output_path, bio_generator
from bob.core.log import setup, set_verbosity_level
logger = setup(__name__)
def save_predictions(pool, output_dir, key, pred_buffer):
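"""Averages the predictions buffered for ``key`` and saves them asynchronously."""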
outpath = make_output_path(output_dir, key)
create_directories_safe(os.path.dirname(outpath))
pool.apply_async(save, (np.mean(pred_buffer[key], axis=0), outpath))
def main(argv=None):
from docopt import docopt
import sys
docs = __doc__ % {'prog': os.path.basename(sys.argv[0])}
version = pkg_resources.require('bob.learn.tensorflow')[0].version
defaults = docopt(docs, argv=[""])
args = docopt(docs, argv=argv, version=version)
config_files = args['<config_files>']
config = read_config_file(config_files)
# optional arguments
verbosity = get_from_config_or_commandline(
config, 'verbose', args, defaults)
predict_keys = get_from_config_or_commandline(
config, 'predict_keys', args, defaults)
checkpoint_path = get_from_config_or_commandline(
config, 'checkpoint_path', args, defaults)
multiple_samples = get_from_config_or_commandline(
config, 'multiple_samples', args, defaults)
number_of_parallel_jobs = get_from_config_or_commandline(
config, 'number_of_parallel_jobs', args, defaults)
force = get_from_config_or_commandline(
config, 'force', args, defaults)
hooks = getattr(config, 'hooks', None)
read_original_data = getattr(config, 'read_original_data', None)
# Sets-up logging
set_verbosity_level(logger, verbosity)
# required arguments
estimator = config.estimator
database = config.database
groups = config.groups
bio_predict_input_fn = config.bio_predict_input_fn
output_dir = get_from_config_or_commandline(
config, 'output_dir', args, defaults, False)
generator, output_types, output_shapes = bio_generator(
database, groups, number_of_parallel_jobs, output_dir,
read_original_data=read_original_data, biofile_to_label=None,
multiple_samples=multiple_samples, force=force)
predict_input_fn = bio_predict_input_fn(generator,
output_types, output_shapes)
predictions = estimator.predict(
predict_input_fn,
predict_keys=predict_keys,
hooks=hooks,
checkpoint_path=checkpoint_path,
)
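# Predictions arrive one sample at a time and are grouped by key; samples
# that share a key (e.g. the frames of one video when --multiple-samples is
# used) are buffered and their mean is saved once the key changes.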
pool = Pool()
try:
pred_buffer = defaultdict(list)
for i, pred in enumerate(predictions):
key = pred['key']
prob = pred.get('probabilities', pred.get('embeddings'))
pred_buffer[key].append(prob)
if i == 0:
last_key = key
if last_key == key:
continue
else:
save_predictions(pool, output_dir, last_key, pred_buffer)
last_key = key
# the for loop's else clause below saves the predictions buffered for the last key
else:
save_predictions(pool, output_dir, key, pred_buffer)
finally:
pool.close()
pool.join()
if __name__ == '__main__':
main()
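# Output sketch (hypothetical path): every biofile key ends up as one HDF5
# file under --output-dir holding the mean of its predictions (probabilities
# or embeddings), which can be read back with bob.io.base:
#
#   from bob.io.base import load
#   prediction = load('/tmp/predictions/dev/client1/sample1.hdf5')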
@@ -105,7 +105,7 @@ def main(argv=None):
try:
pred_buffer = defaultdict(list)
for i, pred in enumerate(predictions):
key = pred['keys']
key = pred['key']
prob = pred.get('probabilities', pred.get('embeddings'))
pred_buffer[key].append(prob)
if i == 0:
......
import os
from bob.bio.base.test.dummy.database import database
from bob.bio.base.test.dummy.preprocessor import preprocessor
from bob.bio.base.utils import read_original_data
groups = 'dev'
groups = ['dev']
files = database.all_files(groups=groups)
samples = database.all_files(groups=groups)
output = os.path.join('TEST_DIR', 'dev.tfrecords')
CLIENT_IDS = (str(f.client_id) for f in database.all_files(groups=groups))
CLIENT_IDS = list(set(CLIENT_IDS))
@@ -15,8 +18,8 @@ def file_to_label(f):
def reader(biofile):
data = preprocessor.read_original_data(
data = read_original_data(
biofile, database.original_directory, database.original_extension)
label = file_to_label(biofile)
key = biofile.path
key = str(biofile.path).encode("utf-8")
return (data, label, key)
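# Quick check of the reader contract ``data, label, key = reader(sample)``
# expected by db_to_tfrecords (a sketch; ``samples`` is defined earlier in
# this configuration file):
#
#   data, label, key = reader(samples[0])
#   assert isinstance(key, bytes)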
@@ -4,7 +4,6 @@ import pkg_resources
import tempfile
from bob.learn.tensorflow.script.db_to_tfrecords import main as tfrecords
from bob.bio.base.script.verify import main as verify
regenerate_reference = False
@@ -21,9 +20,7 @@ def test_verify_and_tfrecords():
parameters = [config_path]
try:
#verify(parameters)
#tfrecords(parameters)
pass
tfrecords(parameters)
# TODO: test if tfrecords are equal
# tfrecords_path = os.path.join(test_dir, 'sub_directory', 'dev.tfrecords')
......
@@ -7,7 +7,6 @@ logging.getLogger("tensorflow").setLevel(logging.WARNING)
from bob.io.base.test_utils import datafile
from bob.learn.tensorflow.script.db_to_tfrecords import main as tfrecords
from bob.bio.base.script.verify import main as verify
from bob.learn.tensorflow.script.train_generic import main as train_generic
from bob.learn.tensorflow.script.eval_generic import main as eval_generic
@@ -44,6 +43,9 @@ def architecture(images):
def model_fn(features, labels, mode, params, config):
key = features['key']
features = features['data']
logits = architecture(features)
predictions = {
@@ -51,7 +53,8 @@ def model_fn(features, labels, mode, params, config):
"classes": tf.argmax(input=logits, axis=1),
# Add `softmax_tensor` to the graph. It is used for PREDICT and by the
# `logging_hook`.
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
"probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
"key": key,
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
@@ -82,9 +85,8 @@ def _create_tfrecord(test_dir):
config_path = os.path.join(test_dir, 'tfrecordconfig.py')
with open(dummy_tfrecord_config) as f, open(config_path, 'w') as f2:
f2.write(f.read().replace('TEST_DIR', test_dir))
#verify([config_path])
tfrecords([config_path])
return os.path.join(test_dir, 'sub_directory', 'dev.tfrecords')
return os.path.join(test_dir, 'dev.tfrecords')
def _create_checkpoint(tmpdir, model_dir, dummy_tfrecord):
@@ -112,21 +114,21 @@ def test_eval_once():
eval_dir = os.path.join(model_dir, 'eval')
print('\nCreating a dummy tfrecord')
#dummy_tfrecord = _create_tfrecord(tmpdir)
dummy_tfrecord = _create_tfrecord(tmpdir)
print('Training a dummy network')
#_create_checkpoint(tmpdir, model_dir, dummy_tfrecord)
_create_checkpoint(tmpdir, model_dir, dummy_tfrecord)
print('Evaluating a dummy network')
#_eval(tmpdir, model_dir, dummy_tfrecord)
_eval(tmpdir, model_dir, dummy_tfrecord)
#evaluated_path = os.path.join(eval_dir, 'evaluated')
#assert os.path.exists(evaluated_path), evaluated_path
#with open(evaluated_path) as f:
# doc = f.read()
evaluated_path = os.path.join(eval_dir, 'evaluated')
assert os.path.exists(evaluated_path), evaluated_path
with open(evaluated_path) as f:
doc = f.read()
# assert '1' in doc, doc
# assert '100' in doc, doc
assert '1' in doc, doc
assert '100' in doc, doc
finally:
try:
shutil.rmtree(tmpdir)
......
@@ -54,6 +54,7 @@ setup(
'bob_tf_train_generic = bob.learn.tensorflow.script.train_generic:main',
'bob_tf_eval_generic = bob.learn.tensorflow.script.eval_generic:main',
'bob_tf_predict_generic = bob.learn.tensorflow.script.predict_generic:main',
'bob_tf_predict_bio = bob.learn.tensorflow.script.predict_bio:main',
],
},
......