Added prefetching and text loading

parent 65a91493
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
import tensorflow as tf
class BaseDataShuffler(object):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        Base class with the common machinery to shuffle a dataset and split
        it into train and validation partitions.

        **Parameters**

        data: the input samples, or references to them (e.g. file names)
        labels: integer class labels; assumed to go from 0 to N-1 (not checked)
        input_shape: shape of one sample, without the batch dimension
        perc_train: fraction of the samples assigned to the train partition
        scale: whether/how the data should be scaled (interpretation is left
            to the subclasses)
        train_batch_size: number of samples per training batch
        validation_batch_size: number of samples per validation batch
        """
        self.scale = scale
        self.scale_value = 0.00390625  # 1/256: maps 8-bit pixel values into [0, 1)

        # TODO: Check if the batch size is higher than the input data
        self.train_batch_size = train_batch_size
        self.validation_batch_size = validation_batch_size

        self.data = data
        # Full batch shapes: (batch_size,) + input_shape
        self.train_shape = tuple([train_batch_size] + input_shape)
        self.validation_shape = tuple([validation_batch_size] + input_shape)

        # TODO: Check if the labels go from 0 to N-1
        self.labels = labels
        self.total_labels = max(labels) + 1

        # Splitting in train and validation
        self.n_samples = len(self.labels)
        self.n_train_samples = int(round(self.n_samples * perc_train))
        self.n_validation_samples = self.n_samples - self.n_train_samples

        # Shuffling all the indexes once; subclasses use them for the split
        self.indexes = numpy.arange(self.n_samples)
        numpy.random.shuffle(self.indexes)

    def get_placeholders_forprefetch(self, name="", train_dataset=True):
        """
        Return data/label placeholders with a dynamic (None) batch dimension,
        suitable for feeding a prefetch queue.
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32, shape=tuple([None] + list(shape[1:])), name=name)
        labels = tf.placeholder(tf.int64, shape=[None, ])
        return data, labels

    def get_placeholders(self, name="", train_dataset=True):
        """
        Return data/label placeholders sized for a full train or validation
        batch.
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32, shape=shape, name=name)
        labels = tf.placeholder(tf.int64, shape=shape[0])
        return data, labels
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
import numpy import numpy
import tensorflow as tf import tensorflow as tf
from .BaseDataShuffler import BaseDataShuffler
def scale_mean_norm(data, scale=0.00390625):
    """
    Mean-center *data* and multiply it by *scale*.

    **Parameters**

    data: numpy array with the samples
    scale: multiplicative factor applied after mean removal
        (default 1/256, mapping 8-bit pixel ranges into [-0.5, 0.5))

    **Returns** the normalized data and the mean that was subtracted, so the
    same mean can be reused on a held-out partition.
    """
    mean = numpy.mean(data)
    data = (data - mean) * scale
    return data, mean
class MemoryDataShuffler(BaseDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        Shuffler that deals with datasets fully loaded in memory.

        **Parameters**

        data: numpy array with all the samples (first axis indexes samples)
        labels: integer class labels, one per sample
        input_shape: shape of one sample, without the batch dimension
        perc_train: fraction of the samples assigned to the train partition
        scale: if truthy, mean-normalize the train data and apply the same
            mean/scale to the validation data
        train_batch_size: number of samples per training batch
        validation_batch_size: number of samples per validation batch
        """
        # NOTE(review): this span was reconstructed from a corrupted
        # side-by-side diff rendering; it keeps only the "new" side.
        super(MemoryDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # Splitting between train and validation using the shuffled indexes
        self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]

        self.validation_data = self.data[self.indexes[self.n_train_samples:
                                                      self.n_train_samples + self.n_validation_samples], ...]
        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
                                                          self.n_train_samples + self.n_validation_samples]]

        if self.scale:
            # Normalize with the train-set mean; reuse it for validation so
            # both partitions share the same transformation.
            self.train_data, self.mean = scale_mean_norm(self.train_data)
            self.validation_data = (self.validation_data - self.mean) * self.scale_value
...@@ -80,13 +55,10 @@ class DataShuffler(object): ...@@ -80,13 +55,10 @@ class DataShuffler(object):
if train_dataset: if train_dataset:
n_samples = self.train_batch_size n_samples = self.train_batch_size
else:
n_samples = self.validation_batch_size
if train_dataset:
data = self.train_data data = self.train_data
label = self.train_labels label = self.train_labels
else: else:
n_samples = self.validation_batch_size
data = self.validation_data data = self.validation_data
label = self.validation_labels label = self.validation_labels
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
from .MemoryDataShuffler import MemoryDataShuffler
class MemoryPairDataShuffler(MemoryDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        In-memory shuffler that serves pairs (for siamese training) and
        triplets of samples.

        **Parameters**

        data: numpy array with all the samples (first axis indexes samples)
        labels: integer class labels, one per sample
        input_shape: shape of one sample, without the batch dimension
        perc_train: fraction of the samples assigned to the train partition
        scale: if truthy, mean-normalize the data (see MemoryDataShuffler)
        train_batch_size: number of PAIRS per training batch
        validation_batch_size: number of samples per validation batch
        """
        # The train batch size is doubled because each training element is a
        # pair of samples.
        super(MemoryPairDataShuffler, self).__init__(data, labels,
                                                     input_shape=input_shape,
                                                     perc_train=perc_train,
                                                     scale=scale,
                                                     train_batch_size=train_batch_size * 2,
                                                     validation_batch_size=validation_batch_size)

    def get_pair(self, train_dataset=True, zero_one_labels=True):
        """
        Get a batch of random pairs of samples, alternating genuine
        (same-client) and impostor (different-client) pairs.

        **Parameters**

        train_dataset: if True draw from the train partition, else validation
        zero_one_labels: if True, labels are 0 for genuine and 1 for impostor
            pairs; otherwise -1 for genuine and +1 for impostor

        **Return** (data, data_p, labels_siamese)
        """
        def get_genuine_or_not(input_data, input_labels, genuine=True):
            # Draw one pair of samples, either from the same client
            # (genuine) or from two different clients (impostor).
            if genuine:
                # TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
                # Getting a client
                index = numpy.random.randint(self.total_labels)

                # Getting the indexes of the data from a particular client
                indexes = numpy.where(input_labels == index)[0]
                numpy.random.shuffle(indexes)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes[1], ...]
            else:
                # Picking a pair from different clients
                index = numpy.random.choice(self.total_labels, 2, replace=False)

                # Getting the indexes of the two clients
                indexes = numpy.where(input_labels == index[0])[0]
                indexes_p = numpy.where(input_labels == index[1])[0]
                numpy.random.shuffle(indexes)
                numpy.random.shuffle(indexes_p)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes_p[0], ...]

            return data, data_p

        if train_dataset:
            target_data = self.train_data
            target_labels = self.train_labels
            shape = self.train_shape
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels
            shape = self.validation_shape

        data = numpy.zeros(shape=shape, dtype='float32')
        data_p = numpy.zeros(shape=shape, dtype='float32')
        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')

        # Alternate genuine/impostor pairs so the batch is balanced
        genuine = True
        for i in range(shape[0]):
            data[i, ...], data_p[i, ...] = get_genuine_or_not(target_data, target_labels, genuine=genuine)
            if zero_one_labels:
                labels_siamese[i] = not genuine
            else:
                labels_siamese[i] = -1 if genuine else +1
            genuine = not genuine

        return data, data_p, labels_siamese

    def get_triplet(self, n_labels, n_triplets=1, is_target_set_train=True):
        """
        Get a batch of random (anchor, positive, negative) triplets.

        **Parameters**

        n_labels: number of distinct class labels to sample clients from
        n_triplets: number of triplets to return
        is_target_set_train: if True draw from the train partition

        **Return** (data_a, data_p, data_n, labels_a, labels_p, labels_n)
        """
        def get_one_triplet(input_data, input_labels):
            # Getting a pair of clients
            index = numpy.random.choice(n_labels, 2, replace=False)
            label_positive = index[0]
            label_negative = index[1]

            # Getting the indexes of the data from a particular client
            indexes = numpy.where(input_labels == index[0])[0]
            numpy.random.shuffle(indexes)

            # Picking a positive pair
            data_anchor = input_data[indexes[0], :, :, :]
            data_positive = input_data[indexes[1], :, :, :]

            # Picking a negative sample
            indexes = numpy.where(input_labels == index[1])[0]
            numpy.random.shuffle(indexes)
            data_negative = input_data[indexes[0], :, :, :]

            # Anchor and positive share the same client, hence the same label
            return data_anchor, data_positive, data_negative, label_positive, label_positive, label_negative

        if is_target_set_train:
            target_data = self.train_data
            target_labels = self.train_labels
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels

        c = target_data.shape[3]
        w = target_data.shape[1]
        h = target_data.shape[2]

        data_a = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_p = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_n = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        labels_a = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_p = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_n = numpy.zeros(shape=n_triplets, dtype='float32')

        for i in range(n_triplets):
            data_a[i, :, :, :], data_p[i, :, :, :], data_n[i, :, :, :], \
                labels_a[i], labels_p[i], labels_n[i] = \
                get_one_triplet(target_data, target_labels)

        return data_a, data_p, data_n, labels_a, labels_p, labels_n
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
import bob.io.base
import bob.io.image
import tensorflow as tf
from .BaseDataShuffler import BaseDataShuffler
#def scale_mean_norm(data, scale=0.00390625):
# mean = numpy.mean(data)
# data = (data - mean) * scale
# return data, mean
class TextDataShuffler(BaseDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300):
        """
        Shuffler that deals with file lists: samples are loaded from disk on
        every batch request instead of being kept in memory.

        **Parameters**

        data: list or numpy array of file names loadable by bob.io.base.load
        labels: integer class labels, one per file
        input_shape: shape of one sample, without the batch dimension
        perc_train: fraction of the samples assigned to the train partition
        scale: multiplicative factor applied to each loaded sample, or None
            to disable scaling.
            NOTE(review): unlike MemoryDataShuffler, `scale` here is a
            number, not a boolean -- confirm callers pass the right type.
        train_batch_size: number of samples per training batch
        validation_batch_size: number of samples per validation batch
        """
        super(TextDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # File lists may arrive as plain Python lists; convert to numpy
        # arrays so the shuffled index array can be used for fancy indexing.
        if isinstance(self.data, list):
            self.data = numpy.array(self.data)

        if isinstance(self.labels, list):
            self.labels = numpy.array(self.labels)

        # Splitting between train and validation (file names and labels only;
        # no pixel data is loaded here)
        self.train_data = self.data[self.indexes[0:self.n_train_samples]]
        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]

        self.validation_data = self.data[self.indexes[self.n_train_samples:
                                                      self.n_train_samples + self.n_validation_samples]]
        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
                                                          self.n_train_samples + self.n_validation_samples]]

    def get_batch(self, train_dataset=True):
        """
        Load one random batch of samples from disk.

        **Parameters**

        train_dataset: if True draw from the train partition, else validation

        **Return** (selected_data, selected_labels) where selected_data is a
        float32 array shaped like the corresponding batch shape
        """
        if train_dataset:
            batch_size = self.train_batch_size
            shape = self.train_shape
            files_names = self.train_data
            label = self.train_labels
        else:
            batch_size = self.validation_batch_size
            shape = self.validation_shape
            files_names = self.validation_data
            label = self.validation_labels

        # Shuffling samples
        indexes = numpy.array(range(files_names.shape[0]))
        numpy.random.shuffle(indexes)

        selected_data = numpy.zeros(shape=shape)
        for i in range(batch_size):
            file_name = files_names[indexes[i]]
            d = bob.io.base.load(file_name)
            if len(d.shape) == 2:
                # 2D (grayscale) image: place it in the single channel of the
                # expected (height, width, channels) sample shape
                data = numpy.zeros(shape=tuple(shape[1:]))
                data[:, :, 0] = d
            else:
                # assumes d already matches the sample shape -- TODO confirm
                data = d

            selected_data[i, ...] = data
            if self.scale is not None:
                # scale acts as a multiplier here (e.g. 1/256)
                selected_data[i, ...] *= self.scale

        selected_labels = label[indexes[0:batch_size]]

        return selected_data.astype("float32"), selected_labels
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
from .TextDataShuffler import TextDataShuffler
class TextPairDataShuffler(TextDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        File-list shuffler intended to serve pairs (siamese) and triplets.

        **Parameters**

        data: list or numpy array of file names
        labels: integer class labels, one per file
        input_shape: shape of one sample, without the batch dimension
        perc_train: fraction of the samples assigned to the train partition
        scale: scaling setting forwarded to TextDataShuffler
        train_batch_size: number of PAIRS per training batch
        validation_batch_size: number of samples per validation batch
        """
        # The train batch size is doubled because each training element is a
        # pair of samples.
        super(TextPairDataShuffler, self).__init__(data, labels,
                                                   input_shape=input_shape,
                                                   perc_train=perc_train,
                                                   scale=scale,
                                                   train_batch_size=train_batch_size * 2,
                                                   validation_batch_size=validation_batch_size)

    def get_pair(self, train_dataset=True, zero_one_labels=True):
        """
        Get a batch of random pairs of samples, alternating genuine
        (same-client) and impostor (different-client) pairs.

        NOTE(review): this method appears copy-pasted from the in-memory
        shuffler -- here train_data/validation_data hold FILE NAMES, so
        assigning them into a float32 array below will fail. The files are
        never loaded; this needs a bob.io.base.load step like get_batch.

        **Parameters**

        train_dataset: if True draw from the train partition, else validation
        zero_one_labels: if True, labels are 0 for genuine and 1 for impostor
            pairs; otherwise -1 for genuine and +1 for impostor

        **Return** (data, data_p, labels_siamese)
        """
        def get_genuine_or_not(input_data, input_labels, genuine=True):
            # Draw one pair, either from the same client (genuine) or from
            # two different clients (impostor).
            if genuine:
                # TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
                # Getting a client
                index = numpy.random.randint(self.total_labels)

                # Getting the indexes of the data from a particular client
                indexes = numpy.where(input_labels == index)[0]
                numpy.random.shuffle(indexes)

                # Picking a pair
                data = input_data[indexes[0]]
                data_p = input_data[indexes[1]]
            else:
                # Picking a pair from different clients
                index = numpy.random.choice(self.total_labels, 2, replace=False)

                # Getting the indexes of the two clients
                indexes = numpy.where(input_labels == index[0])[0]
                indexes_p = numpy.where(input_labels == index[1])[0]
                numpy.random.shuffle(indexes)
                numpy.random.shuffle(indexes_p)

                # Picking a pair
                data = input_data[indexes[0]]
                data_p = input_data[indexes_p[0]]

            return data, data_p

        if train_dataset:
            target_data = self.train_data
            target_labels = self.train_labels
            shape = self.train_shape
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels
            shape = self.validation_shape

        data = numpy.zeros(shape=shape, dtype='float32')
        data_p = numpy.zeros(shape=shape, dtype='float32')
        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')

        # Alternate genuine/impostor pairs so the batch is balanced
        genuine = True
        for i in range(shape[0]):
            data[i, ...], data_p[i, ...] = get_genuine_or_not(target_data, target_labels, genuine=genuine)
            if zero_one_labels:
                labels_siamese[i] = not genuine
            else:
                labels_siamese[i] = -1 if genuine else +1
            genuine = not genuine

        return data, data_p, labels_siamese

    def get_triplet(self, n_labels, n_triplets=1, is_target_set_train=True):
        """
        Get a batch of random (anchor, positive, negative) triplets.

        NOTE(review): like get_pair, this was copy-pasted from the in-memory
        variant -- it indexes target_data as a 4-D array (`[..., :, :, :]`,
        `.shape[3]`) while this class stores 1-D arrays of file names, so it
        will raise as written. Needs a file-loading step.

        **Parameters**

        n_labels: number of distinct class labels to sample clients from
        n_triplets: number of triplets to return
        is_target_set_train: if True draw from the train partition

        **Return** (data_a, data_p, data_n, labels_a, labels_p, labels_n)
        """
        def get_one_triplet(input_data, input_labels):
            # Getting a pair of clients
            index = numpy.random.choice(n_labels, 2, replace=False)
            label_positive = index[0]
            label_negative = index[1]

            # Getting the indexes of the data from a particular client
            indexes = numpy.where(input_labels == index[0])[0]
            numpy.random.shuffle(indexes)

            # Picking a positive pair
            data_anchor = input_data[indexes[0], :, :, :]
            data_positive = input_data[indexes[1], :, :, :]

            # Picking a negative sample
            indexes = numpy.where(input_labels == index[1])[0]
            numpy.random.shuffle(indexes)
            data_negative = input_data[indexes[0], :, :, :]

            # Anchor and positive share the same client, hence the same label
            return data_anchor, data_positive, data_negative, label_positive, label_positive, label_negative

        if is_target_set_train:
            target_data = self.train_data
            target_labels = self.train_labels
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels

        c = target_data.shape[3]
        w = target_data.shape[1]
        h = target_data.shape[2]

        data_a = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_p = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_n = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        labels_a = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_p = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_n = numpy.zeros(shape=n_triplets, dtype='float32')

        for i in range(n_triplets):
            data_a[i, :, :, :], data_p[i, :, :, :], data_n[i, :, :, :], \
                labels_a[i], labels_p[i], labels_n[i] = \
                get_one_triplet(target_data, target_labels)

        return data_a, data_p, data_n, labels_a, labels_p, labels_n
...@@ -2,8 +2,10 @@ ...@@ -2,8 +2,10 @@
from pkgutil import extend_path from pkgutil import extend_path
__path__ = extend_path(__path__, __name__) __path__ = extend_path(__path__, __name__)
from .DataShuffler import DataShuffler from .BaseDataShuffler import BaseDataShuffler
from .PairDataShuffler import PairDataShuffler from .MemoryDataShuffler import MemoryDataShuffler
from .MemoryPairDataShuffler import MemoryPairDataShuffler
from .TextDataShuffler import TextDataShuffler
# gets sphinx autodoc done right - don't remove it # gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')] __all__ = [_ for _ in dir() if not _.startswith('_')]
...@@ -21,10 +21,11 @@ from docopt import docopt ...@@ -21,10 +21,11 @@ from docopt import docopt
import tensorflow as tf import tensorflow as tf
from .. import util from .. import util
SEED = 10 SEED = 10
from bob.learn.tensorflow.data import DataShuffler from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
from bob.learn.tensorflow.network import Lenet from bob.learn.tensorflow.network import Lenet
from bob.learn.tensorflow.trainers import Trainer from bob.learn.tensorflow.trainers import Trainer
from bob.learn.tensorflow.loss import BaseLoss from bob.learn.tensorflow.loss import BaseLoss
import bob.db.mobio
import numpy import numpy
...@@ -40,7 +41,24 @@ def main(): ...@@ -40,7 +41,24 @@ def main():
# Loading data # Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/") data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1)) data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = DataShuffler(data, labels, train_batch_size=BATCH_SIZE, validation_batch_size=BATCH_SIZE*100) data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
#labels = [o.client_id for o in objects]
#file_names = [o.make_path(
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
# Preparing the architecture # Preparing the architecture
lenet = Lenet() lenet = Lenet()
......
...@@ -21,11 +21,11 @@ from docopt import docopt ...@@ -21,11 +21,11 @@ from docopt import docopt
import tensorflow as tf import tensorflow as tf
from .. import util from .. import util
SEED = 10