diff --git a/bob/learn/tensorflow/dataset/tfrecords.py b/bob/learn/tensorflow/dataset/tfrecords.py index 2c6aea41ae48f7d3f53d67c406df7ba397c0b006..f31ad7a9bdf29a0e1c0cb71d4fb2fd56e1829afb 100644 --- a/bob/learn/tensorflow/dataset/tfrecords.py +++ b/bob/learn/tensorflow/dataset/tfrecords.py @@ -102,7 +102,7 @@ def create_dataset_from_records(tfrecord_filenames, if feature is None: feature = DEFAULT_FEATURE - dataset = tf.contrib.data.TFRecordDataset(tfrecord_filenames) + dataset = tf.data.TFRecordDataset(tfrecord_filenames) parser = partial( example_parser, feature=feature, @@ -144,7 +144,7 @@ def create_dataset_from_records_with_augmentation( if feature is None: feature = DEFAULT_FEATURE - dataset = tf.contrib.data.TFRecordDataset(tfrecord_filenames) + dataset = tf.data.TFRecordDataset(tfrecord_filenames) parser = partial( image_augmentation_parser, feature=feature, diff --git a/bob/learn/tensorflow/examples/mnist/mnist_config.py b/bob/learn/tensorflow/examples/mnist/mnist_config.py index fbed1f5f5972742967c752372e5ec04e132ec860..b6780ff74d73fce28e387161de0b11413c6a798d 100644 --- a/bob/learn/tensorflow/examples/mnist/mnist_config.py +++ b/bob/learn/tensorflow/examples/mnist/mnist_config.py @@ -67,7 +67,7 @@ def input_fn(mode, batch_size=1): 'convert_to_records.py first to convert the MNIST data to ' 'TFRecord file format.') - dataset = tf.contrib.data.TFRecordDataset(tfrecords_files) + dataset = tf.data.TFRecordDataset(tfrecords_files) # For training, repeat the dataset forever if mode == tf.estimator.ModeKeys.TRAIN: diff --git a/bob/learn/tensorflow/network/SimpleCNN.py b/bob/learn/tensorflow/network/SimpleCNN.py index 2be23869113ace75058f700d5e98f6d3b074e624..667a9465eb51dd72cd1acc70177a385d473407b1 100644 --- a/bob/learn/tensorflow/network/SimpleCNN.py +++ b/bob/learn/tensorflow/network/SimpleCNN.py @@ -153,6 +153,7 @@ def model_fn(features, labels, mode, params=None, config=None): extra_checkpoint = params.get('extra_checkpoint', None) trainable_variables = 
def save_n_best_models(train_dir, save_dir, evaluated_file,
                       keep_n_best_models):
    """Mirrors the N best evaluated checkpoints of ``train_dir`` in ``save_dir``.

    The evaluations recorded in ``evaluated_file`` are ranked, the files of
    the best ``keep_n_best_models`` global steps are copied from ``train_dir``
    to ``save_dir``, checkpoint files in ``save_dir`` that no longer belong to
    the best ones are deleted, and a ``checkpoint`` file listing the kept
    models (best one first) is written to ``save_dir``.

    Ranking: an evaluation that has an ``accuracy`` is ranked by descending
    accuracy; otherwise it is ranked by ascending ``loss`` (a missing loss
    counts as 0).

    Parameters
    ----------
    train_dir : str
        Directory searched for ``model.ckpt-<global_step>.*`` files.
    save_dir : str
        Directory receiving the copies and the ``checkpoint`` file. It is
        passed to ``create_directories_safe`` before anything else happens.
    evaluated_file : str
        Path of the file written by :any:`append_evaluated_file`.
    keep_n_best_models : int
        Number of models to keep. Values below 1 keep no model at all.
    """
    create_directories_safe(save_dir)
    evaluated = read_evaluated_file(evaluated_file)

    def _rank(item):
        metrics = item[1]
        accuracy = metrics.get('accuracy')
        if accuracy is not None:
            # negate so that a higher accuracy sorts first
            return accuracy * -1
        return metrics.get('loss') or 0

    # A negative N would make the slice drop models from the *end* of the
    # ranking and keep almost everything; treat it like 0 instead.
    n_best = max(keep_n_best_models, 0)
    best_steps = [
        step for step, _ in sorted(evaluated.items(), key=_rank)[:n_best]]
    best_lookup = set(best_steps)

    # delete the old saved models that are not in top N best anymore
    saved_models = defaultdict(list)
    for path in glob('{}/model.ckpt-*'.format(save_dir)):
        # parse the step from the file name only, so that a directory whose
        # name contains 'model.ckpt-' cannot confuse the split
        name = os.path.basename(path)
        global_step = name.split('model.ckpt-')[1].split('.')[0]
        saved_models[global_step].append(path)

    for global_step, paths in saved_models.items():
        if global_step in best_lookup:
            continue
        for path in paths:
            logger.info("Deleting `%s'", path)
            os.remove(path)

    # copy over the best models if not already there
    for global_step in best_steps:
        for path in glob('{}/model.ckpt-{}.*'.format(train_dir, global_step)):
            dst = os.path.join(save_dir, os.path.basename(path))
            if os.path.isfile(dst):
                continue
            logger.info("Copying `%s' over to `%s'", path, dst)
            shutil.copy(path, dst)

    # create a checkpoint file pointing to the best existing model:
    # 1. keep only the models whose files really exist in save_dir
    kept_steps = [
        step for step in best_steps
        if glob('{}/model.ckpt-{}.*'.format(save_dir, step))]

    # 2. create the checkpoint file (the best model is the main entry)
    with open(os.path.join(save_dir, 'checkpoint'), 'wt') as f:
        if kept_steps:
            f.write('model_checkpoint_path: "model.ckpt-{}"\n'.format(
                kept_steps[0]))
        for global_step in kept_steps:
            f.write('all_model_checkpoint_paths: "model.ckpt-{}"\n'.format(
                global_step))


def read_evaluated_file(path):
    """Parses a file written by :any:`append_evaluated_file`.

    Every non-blank line is expected to look like
    ``<global_step> <key> = <value>, <key> = <value>, ...``.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps the global step (kept as the string found at the start of the
        line) to a dict of the parsed values. Values are floats, except for
        keys containing ``global_step`` which are converted to int. If a
        global step occurs several times, the last line wins.
    """
    evaluated = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of failing to unpack below
                continue
            global_step, metrics = line.split(' ', 1)
            parsed = {}
            for k_v in metrics.split(', '):
                k, v = k_v.split(' = ')
                v = float(v)
                if 'global_step' in k:
                    v = int(v)
                parsed[k] = v
            evaluated[global_step] = parsed
    return evaluated


def append_evaluated_file(path, evaluations):
    """Appends one evaluation result as a line to the file at ``path``.

    Parameters
    ----------
    path : str
        File to append to; opened in append mode.
    evaluations : dict
        The evaluation values. Must contain the ``global_step`` key, which is
        used as the prefix of the written line.

    Returns
    -------
    str
        The ``key = value`` pairs sorted by key and joined with ``', '``, i.e.
        the written line without its global step prefix.
    """
    str_evaluations = ', '.join(
        '%s = %s' % (k, v) for k, v in sorted(evaluations.items()))
    with open(path, 'a') as f:
        f.write('{} {}\n'.format(evaluations['global_step'],
                                 str_evaluations))
    return str_evaluations
+109,14 @@ logger = logging.getLogger(__name__) entry_point_group='bob.learn.tensorflow.hook') @click.option('--run-once', cls=ResourceOption, default=False, show_default=True) -@click.option('--eval-interval-secs', cls=ResourceOption, type=click.types.INT, +@click.option('--eval-interval-secs', cls=ResourceOption, type=click.INT, default=60, show_default=True) @click.option('--name', cls=ResourceOption) +@click.option('--keep-n-best-models', '-K', type=click.INT, cls=ResourceOption, + default=0, show_default=True) @verbosity_option(cls=ResourceOption) def eval(estimator, eval_input_fn, hooks, run_once, eval_interval_secs, name, - **kwargs): + keep_n_best_models, **kwargs): """Evaluates networks using Tensorflow estimators. \b @@ -76,18 +159,20 @@ def eval(estimator, eval_input_fn, hooks, run_once, eval_interval_secs, name, logger.debug('run_once: %s', run_once) logger.debug('eval_interval_secs: %s', eval_interval_secs) logger.debug('name: %s', name) + logger.debug('keep_n_best_models: %s', keep_n_best_models) logger.debug('kwargs: %s', kwargs) - if name: - real_name = 'eval_' + name - else: - real_name = 'eval' - evaluated_file = os.path.join(estimator.model_dir, real_name, 'evaluated') + real_name = 'eval_' + name if name else 'eval' + eval_dir = os.path.join(estimator.model_dir, real_name) + evaluated_file = os.path.join(eval_dir, 'evaluated') while True: - evaluated_steps = [] + evaluated_steps = {} if os.path.exists(evaluated_file): - with open(evaluated_file) as f: - evaluated_steps = [line.split()[0] for line in f] + evaluated_steps = read_evaluated_file(evaluated_file) + + # Save the best N models into the eval directory + save_n_best_models(estimator.model_dir, eval_dir, evaluated_file, + keep_n_best_models) ckpt = tf.train.get_checkpoint_state(estimator.model_dir) if (not ckpt) or (not ckpt.model_checkpoint_path): @@ -113,14 +198,15 @@ def eval(estimator, eval_input_fn, hooks, run_once, eval_interval_secs, name, name=name, ) - str_evaluations = ', '.join( 
- '%s = %s' % (k, v) - for k, v in sorted(six.iteritems(evaluations))) - print(str_evaluations) + str_evaluations = append_evaluated_file( + evaluated_file, evaluations) + click.echo(str_evaluations) sys.stdout.flush() - with open(evaluated_file, 'a') as f: - f.write('{} {}\n'.format(evaluations['global_step'], - str_evaluations)) + + # Save the best N models into the eval directory + save_n_best_models(estimator.model_dir, eval_dir, evaluated_file, + keep_n_best_models) + if run_once: break time.sleep(eval_interval_secs) diff --git a/bob/learn/tensorflow/script/predict_bio.py b/bob/learn/tensorflow/script/predict_bio.py index 9f981d0bdbb74f983fea1db37fca87ad00bced13..c4b396340a27ee1aad19243140c0b23a75cd33da 100644 --- a/bob/learn/tensorflow/script/predict_bio.py +++ b/bob/learn/tensorflow/script/predict_bio.py @@ -13,6 +13,7 @@ from bob.extension.scripts.click_helper import ( from multiprocessing import Pool from collections import defaultdict import numpy as np +import tensorflow as tf from bob.io.base import create_directories_safe from bob.bio.base.utils import save from bob.bio.base.tools.grid import indices @@ -74,8 +75,8 @@ def save_predictions(pool, output_dir, key, pred_buffer): entry_point_group='bob.learn.tensorflow.hook') @click.option('--predict-keys', '-k', multiple=True, default=None, cls=ResourceOption) -@click.option('--checkpoint-path', cls=ResourceOption) -@click.option('--multiple-samples', is_flag=True, cls=ResourceOption) +@click.option('--checkpoint-path', '-c', cls=ResourceOption) +@click.option('--multiple-samples', '-m', is_flag=True, cls=ResourceOption) @click.option('--array', '-t', type=click.INT, default=1, cls=ResourceOption) @click.option('--force', '-f', is_flag=True, cls=ResourceOption) @verbosity_option(cls=ResourceOption) @@ -121,7 +122,9 @@ def predict_bio(estimator, database, biofiles, bio_predict_input_fn, `None`, returns all. checkpoint_path : str, optional Path of a specific checkpoint to predict. 
If `None`, the latest - checkpoint in `model_dir` is used. + checkpoint in `model_dir` is used. This can also be a folder which + contains a "checkpoint" file where the latest checkpoint from inside + this file will be used as checkpoint_path. multiple_samples : bool, optional If provided, it assumes that the db interface returns several samples from a biofile. This option can be used when you are working with @@ -216,6 +219,11 @@ def predict_bio(estimator, database, biofiles, bio_predict_input_fn, generator.output_shapes) if checkpoint_path: + if os.path.isdir(checkpoint_path): + ckpt = tf.train.get_checkpoint_state(estimator.model_dir) + if ckpt and ckpt.model_checkpoint_path: + checkpoint_path = ckpt.model_checkpoint_path + logger.info("Restoring the model from %s", checkpoint_path) predictions = estimator.predict( diff --git a/bob/learn/tensorflow/test/test_estimator_scripts.py b/bob/learn/tensorflow/test/test_estimator_scripts.py index 7e69f8decc42532811f5d6b11da1e17da3b1c4f1..625c69620d8de2d11e1cafcfcf960900a4892eba 100644 --- a/bob/learn/tensorflow/test/test_estimator_scripts.py +++ b/bob/learn/tensorflow/test/test_estimator_scripts.py @@ -1,6 +1,7 @@ from __future__ import print_function import os import shutil +from glob import glob from tempfile import mkdtemp from click.testing import CliRunner from bob.io.base.test_utils import datafile @@ -122,7 +123,7 @@ def _create_checkpoint(tmpdir, model_dir, dummy_tfrecord): result.exc_info, result.output, result.exception) -def _eval(tmpdir, model_dir, dummy_tfrecord): +def _eval(tmpdir, model_dir, dummy_tfrecord, extra_args=[]): config = CONFIG % { 'model_dir': model_dir, 'tfrecord_filenames': dummy_tfrecord @@ -131,7 +132,7 @@ def _eval(tmpdir, model_dir, dummy_tfrecord): with open(config_path, 'w') as f: f.write(config) runner = CliRunner() - result = runner.invoke(eval_script, args=[config_path]) + result = runner.invoke(eval_script, args=[config_path] + extra_args) assert result.exit_code == 0, '%s\n%s\n%s' 
def test_eval_keep_n_model():
    """Runs the eval script with ``-K 1`` and checks what lands in eval_dir.

    A dummy tfrecord is created, a dummy network is trained and then
    evaluated. Afterwards the ``evaluated`` file must exist and mention the
    steps ``1`` and ``200``, and the eval directory must contain exactly three
    ``model.ckpt-*`` files.
    """
    tmpdir = mkdtemp(prefix='bob_')
    try:
        model_dir = os.path.join(tmpdir, 'model_dir')
        eval_dir = os.path.join(model_dir, 'eval')

        print('\nCreating a dummy tfrecord')
        dummy_tfrecord = _create_tfrecord(tmpdir)

        print('Training a dummy network')
        _create_checkpoint(tmpdir, model_dir, dummy_tfrecord)

        print('Evaluating a dummy network')
        _eval(tmpdir, model_dir, dummy_tfrecord, ['-K', '1'])

        evaluated_path = os.path.join(eval_dir, 'evaluated')
        assert os.path.exists(evaluated_path), evaluated_path
        with open(evaluated_path) as f:
            doc = f.read()

        assert '1 ' in doc, doc
        assert '200 ' in doc, doc
        # NOTE(review): presumably the three files of the single kept
        # checkpoint (data, index, meta) -- confirm against the TF version.
        assert len(glob('{}/model.ckpt-*'.format(eval_dir))) == 3, \
            os.listdir(eval_dir)

    finally:
        # best-effort cleanup; a failure to delete must not fail the test
        shutil.rmtree(tmpdir, ignore_errors=True)