Added prefetching and text loading

parent 65a91493
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
import tensorflow as tf
class BaseDataShuffler(object):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        Base functionality to shuffle data and split it into train/validation sets.

        **Parameters**

        data: Input samples (indexable along the first axis).
        labels: One integer label per sample; assumed to run from 0 to N-1.
        input_shape: Shape of one sample as a list, e.g. [28, 28, 1].
        perc_train: Fraction of the samples assigned to the training split.
        scale: Whether subclasses should scale/normalize the data.
        train_batch_size: Batch size used for the training placeholders.
        validation_batch_size: Batch size used for the validation placeholders.
        """
        self.scale = scale
        self.scale_value = 0.00390625  # 1/256: maps 8-bit pixel values into [0, 1)

        # TODO: Check if the batch size is higher than the input data
        self.train_batch_size = train_batch_size
        self.validation_batch_size = validation_batch_size

        self.data = data
        self.train_shape = tuple([train_batch_size] + input_shape)
        self.validation_shape = tuple([validation_batch_size] + input_shape)

        # TODO: Check if the labels go from 0 to N-1
        self.labels = labels
        self.total_labels = max(labels) + 1

        # Splitting in train and validation
        self.n_samples = len(self.labels)
        self.n_train_samples = int(round(self.n_samples * perc_train))
        self.n_validation_samples = self.n_samples - self.n_train_samples

        # Shuffling all the indexes
        self.indexes = numpy.arange(self.n_samples)
        numpy.random.shuffle(self.indexes)

    def get_placeholders_forprefetch(self, name="", train_dataset=True):
        """
        Return data/label placeholders with a variable (None) batch dimension,
        suitable for feeding a prefetch queue.

        **Parameters**

        name: Name given to the data placeholder.
        train_dataset: If True use the train shape, otherwise the validation shape.
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32, shape=tuple([None] + list(shape[1:])), name=name)
        labels = tf.placeholder(tf.int64, shape=[None, ])
        return data, labels

    def get_placeholders(self, name="", train_dataset=True):
        """
        Return data/label placeholders whose batch dimension is fixed to the
        configured batch size.

        **Parameters**

        name: Name given to the data placeholder.
        train_dataset: If True use the train shape, otherwise the validation shape.
        """
        shape = self.train_shape if train_dataset else self.validation_shape
        data = tf.placeholder(tf.float32, shape=shape, name=name)
        labels = tf.placeholder(tf.int64, shape=shape[0])
        return data, labels
......@@ -6,6 +6,8 @@
import numpy
import tensorflow as tf
from .BaseDataShuffler import BaseDataShuffler
def scale_mean_norm(data, scale=0.00390625):
    """
    Remove the mean of *data* and scale the result.

    **Parameters**

    data: Numeric numpy array.
    scale: Multiplicative factor applied after mean removal (default 1/256).

    **Return** (normalized_data, mean): the normalized array and the mean
    that was subtracted, so callers can apply the same shift to other sets.
    """
    mean = numpy.mean(data)
    data = (data - mean) * scale
    return data, mean
class DataShuffler(object):
def __init__(self, data, labels, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
"""
The class provide some functionalities for shuffling data
**Parameters**
data:
"""
self.perc_train = perc_train
self.scale = scale
self.scale_value = 0.00390625
self.train_batch_size = train_batch_size
self.validation_batch_size = validation_batch_size
self.data = data
self.labels = labels # From O to N-1
self.total_labels = max(labels) + 1
self.n_samples = self.data.shape[0]
self.width = self.data.shape[1]
self.height = self.data.shape[2]
self.channels = self.data.shape[3]
self.start_shuffler()
def get_placeholders(self, name="", train_dataset=True):
"""
"""
batch = self.train_batch_size if train_dataset else self.validation_batch_size
data = tf.placeholder(tf.float32, shape=(batch, self.width,
self.height, self.channels), name=name)
labels = tf.placeholder(tf.int64, shape=batch)
return data, labels
def start_shuffler(self):
class MemoryDataShuffler(BaseDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        Shuffler that deals with in-memory datasets: the whole data array is
        split into train/validation views at construction time.

        **Parameters**

        data: Input samples (numpy array, samples along the first axis).
        labels: One integer label per sample.
        input_shape: Shape of one sample as a list, e.g. [28, 28, 1].
        perc_train: Fraction of the samples assigned to the training split.
        scale: If truthy, mean-normalize the train set and apply the same
            mean/scale to the validation set.
        train_batch_size: Batch size for training.
        validation_batch_size: Batch size for validation.
        """
        super(MemoryDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # Splitting between train and test using the shuffled indexes
        self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]

        self.validation_data = self.data[self.indexes[self.n_train_samples:
                                                      self.n_train_samples + self.n_validation_samples], ...]
        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
                                                          self.n_train_samples + self.n_validation_samples]]

        if self.scale:
            # data = scale_minmax_norm(data, lower_bound=-1, upper_bound=1)
            # The validation set reuses the train mean so both splits live in
            # the same normalized space.
            self.train_data, self.mean = scale_mean_norm(self.train_data)
            self.validation_data = (self.validation_data - self.mean) * self.scale_value
......@@ -80,13 +55,10 @@ class DataShuffler(object):
if train_dataset:
n_samples = self.train_batch_size
else:
n_samples = self.validation_batch_size
if train_dataset:
data = self.train_data
label = self.train_labels
else:
n_samples = self.validation_batch_size
data = self.validation_data
label = self.validation_labels
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
from .MemoryDataShuffler import MemoryDataShuffler
class MemoryPairDataShuffler(MemoryDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        In-memory shuffler that serves pairs (for siamese training) and
        triplets of samples.

        **Parameters**

        data: Input samples (numpy array, samples along the first axis).
        labels: One integer label per sample.
        input_shape: Shape of one sample as a list, e.g. [28, 28, 1].
        perc_train: Fraction of the samples assigned to the training split.
        scale: Whether the data should be mean-normalized and scaled.
        train_batch_size: Requested train batch size; doubled internally
            because every siamese entry consumes two samples.
        validation_batch_size: Batch size for validation.
        """
        super(MemoryPairDataShuffler, self).__init__(data, labels,
                                                     input_shape=input_shape,
                                                     perc_train=perc_train,
                                                     scale=scale,
                                                     train_batch_size=train_batch_size * 2,
                                                     validation_batch_size=validation_batch_size)

    def get_pair(self, train_dataset=True, zero_one_labels=True):
        """
        Build a batch of random sample pairs, alternating genuine (same
        client) and impostor (different clients) pairs.

        **Parameters**

        train_dataset: If True draw pairs from the train split, otherwise
            from the validation split.
        zero_one_labels: If True genuine pairs are labeled 0 and impostor
            pairs 1; otherwise genuine pairs are -1 and impostor pairs +1.

        **Return** (data, data_p, labels_siamese)
        """
        def get_genuine_or_not(input_data, input_labels, genuine=True):
            if genuine:
                # TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
                # Getting a client
                index = numpy.random.randint(self.total_labels)

                # Getting the indexes of the data from a particular client
                indexes = numpy.where(input_labels == index)[0]
                numpy.random.shuffle(indexes)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes[1], ...]
            else:
                # Picking a pair from different clients
                index = numpy.random.choice(self.total_labels, 2, replace=False)

                # Getting the indexes of the two clients
                indexes = numpy.where(input_labels == index[0])[0]
                indexes_p = numpy.where(input_labels == index[1])[0]
                numpy.random.shuffle(indexes)
                numpy.random.shuffle(indexes_p)

                # Picking a pair
                data = input_data[indexes[0], ...]
                data_p = input_data[indexes_p[0], ...]

            return data, data_p

        if train_dataset:
            target_data = self.train_data
            target_labels = self.train_labels
            shape = self.train_shape
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels
            shape = self.validation_shape

        data = numpy.zeros(shape=shape, dtype='float32')
        data_p = numpy.zeros(shape=shape, dtype='float32')
        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')

        genuine = True
        for i in range(shape[0]):
            data[i, ...], data_p[i, ...] = get_genuine_or_not(target_data, target_labels, genuine=genuine)
            if zero_one_labels:
                labels_siamese[i] = not genuine
            else:
                labels_siamese[i] = -1 if genuine else +1
            genuine = not genuine

        return data, data_p, labels_siamese

    def get_triplet(self, n_labels, n_triplets=1, is_target_set_train=True):
        """
        Build a batch of (anchor, positive, negative) triplets.

        **Parameters**

        n_labels: Number of distinct clients to draw from.
        n_triplets: Number of triplets in the batch.
        is_target_set_train: If True draw from the train split, otherwise
            from the validation split.

        **Return** (data_a, data_p, data_n, labels_a, labels_p, labels_n)
        """
        def get_one_triplet(input_data, input_labels):
            # Getting a pair of clients
            index = numpy.random.choice(n_labels, 2, replace=False)
            label_positive = index[0]
            label_negative = index[1]

            # Getting the indexes of the data from a particular client
            indexes = numpy.where(input_labels == index[0])[0]
            numpy.random.shuffle(indexes)

            # Picking a positive pair
            data_anchor = input_data[indexes[0], :, :, :]
            data_positive = input_data[indexes[1], :, :, :]

            # Picking a negative sample
            indexes = numpy.where(input_labels == index[1])[0]
            numpy.random.shuffle(indexes)
            data_negative = input_data[indexes[0], :, :, :]

            # Anchor and positive intentionally share the same client label
            return data_anchor, data_positive, data_negative, label_positive, label_positive, label_negative

        if is_target_set_train:
            target_data = self.train_data
            target_labels = self.train_labels
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels

        c = target_data.shape[3]
        w = target_data.shape[1]
        h = target_data.shape[2]

        data_a = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_p = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_n = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        labels_a = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_p = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_n = numpy.zeros(shape=n_triplets, dtype='float32')

        for i in range(n_triplets):
            data_a[i, :, :, :], data_p[i, :, :, :], data_n[i, :, :, :], \
                labels_a[i], labels_p[i], labels_n[i] = \
                get_one_triplet(target_data, target_labels)

        return data_a, data_p, data_n, labels_a, labels_p, labels_n
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
import bob.io.base
import bob.io.image
import tensorflow as tf
from .BaseDataShuffler import BaseDataShuffler
#def scale_mean_norm(data, scale=0.00390625):
# mean = numpy.mean(data)
# data = (data - mean) * scale
# return data, mean
class TextDataShuffler(BaseDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300):
        """
        Shuffler that deals with file lists: samples are loaded from disk
        on demand when a batch is requested.

        **Parameters**

        data: List/array of file names loadable with bob.io.base.load.
        labels: One integer label per file.
        input_shape: Shape of one loaded sample as a list, e.g. [80, 64, 1].
        perc_train: Fraction of the files assigned to the training split.
        scale: Multiplicative factor applied to each loaded sample, or None
            to disable scaling.
            NOTE(review): sibling shufflers receive a boolean here; a True
            value makes the multiply a no-op — confirm intended semantics.
        train_batch_size: Batch size for training.
        validation_batch_size: Batch size for validation.
        """
        super(TextDataShuffler, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            perc_train=perc_train,
            scale=scale,
            train_batch_size=train_batch_size,
            validation_batch_size=validation_batch_size
        )

        # Convert to numpy arrays so fancy indexing with the shuffled
        # index array works
        if isinstance(self.data, list):
            self.data = numpy.array(self.data)
        if isinstance(self.labels, list):
            self.labels = numpy.array(self.labels)

        # Splitting between train and test using the shuffled indexes
        self.train_data = self.data[self.indexes[0:self.n_train_samples]]
        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]

        self.validation_data = self.data[self.indexes[self.n_train_samples:
                                                      self.n_train_samples + self.n_validation_samples]]
        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
                                                          self.n_train_samples + self.n_validation_samples]]

    def get_batch(self, train_dataset=True):
        """
        Load a random batch of files from disk.

        **Parameters**

        train_dataset: If True sample from the train split, otherwise from
            the validation split.

        **Return** (selected_data, selected_labels) where selected_data is a
        float32 array with the configured batch shape.
        """
        if train_dataset:
            batch_size = self.train_batch_size
            shape = self.train_shape
            files_names = self.train_data
            label = self.train_labels
        else:
            batch_size = self.validation_batch_size
            shape = self.validation_shape
            files_names = self.validation_data
            label = self.validation_labels

        # Shuffling samples
        indexes = numpy.arange(files_names.shape[0])
        numpy.random.shuffle(indexes)

        selected_data = numpy.zeros(shape=shape)
        for i in range(batch_size):
            file_name = files_names[indexes[i]]
            d = bob.io.base.load(file_name)
            if len(d.shape) == 2:
                # Gray-scale image: add the trailing channel axis
                data = numpy.zeros(shape=tuple(shape[1:]))
                data[:, :, 0] = d
            else:
                data = d
            selected_data[i, ...] = data
            if self.scale is not None:
                selected_data[i, ...] *= self.scale

        selected_labels = label[indexes[0:batch_size]]
        return selected_data.astype("float32"), selected_labels
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
from .TextDataShuffler import TextDataShuffler
class TextPairDataShuffler(TextDataShuffler):
    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
        """
        File-list shuffler that serves pairs (for siamese training) and
        triplets of samples.

        **Parameters**

        data: List/array of file names.
        labels: One integer label per file.
        input_shape: Shape of one loaded sample as a list, e.g. [80, 64, 1].
        perc_train: Fraction of the files assigned to the training split.
        scale: Scaling flag/value forwarded to TextDataShuffler.
            NOTE(review): the parent expects a numeric factor; True makes
            the scaling a no-op — confirm intended semantics.
        train_batch_size: Requested train batch size; doubled internally
            because every siamese entry consumes two samples.
        validation_batch_size: Batch size for validation.
        """
        super(TextPairDataShuffler, self).__init__(data, labels,
                                                   input_shape=input_shape,
                                                   perc_train=perc_train,
                                                   scale=scale,
                                                   train_batch_size=train_batch_size * 2,
                                                   validation_batch_size=validation_batch_size)

    def get_pair(self, train_dataset=True, zero_one_labels=True):
        """
        Build a batch of random sample pairs, alternating genuine (same
        client) and impostor (different clients) pairs.

        NOTE(review): here train_data/validation_data hold file names, yet
        the pairs are copied into float32 arrays with the image batch shape;
        this mirrors MemoryPairDataShuffler and looks broken for file lists —
        confirm against callers.

        **Parameters**

        train_dataset: If True draw pairs from the train split, otherwise
            from the validation split.
        zero_one_labels: If True genuine pairs are labeled 0 and impostor
            pairs 1; otherwise genuine pairs are -1 and impostor pairs +1.

        **Return** (data, data_p, labels_siamese)
        """
        def get_genuine_or_not(input_data, input_labels, genuine=True):
            if genuine:
                # TODO: THIS KEY SELECTION NEEDS TO BE MORE EFFICIENT
                # Getting a client
                index = numpy.random.randint(self.total_labels)

                # Getting the indexes of the data from a particular client
                indexes = numpy.where(input_labels == index)[0]
                numpy.random.shuffle(indexes)

                # Picking a pair
                data = input_data[indexes[0]]
                data_p = input_data[indexes[1]]
            else:
                # Picking a pair from different clients
                index = numpy.random.choice(self.total_labels, 2, replace=False)

                # Getting the indexes of the two clients
                indexes = numpy.where(input_labels == index[0])[0]
                indexes_p = numpy.where(input_labels == index[1])[0]
                numpy.random.shuffle(indexes)
                numpy.random.shuffle(indexes_p)

                # Picking a pair
                data = input_data[indexes[0]]
                data_p = input_data[indexes_p[0]]

            return data, data_p

        if train_dataset:
            target_data = self.train_data
            target_labels = self.train_labels
            shape = self.train_shape
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels
            shape = self.validation_shape

        data = numpy.zeros(shape=shape, dtype='float32')
        data_p = numpy.zeros(shape=shape, dtype='float32')
        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')

        genuine = True
        for i in range(shape[0]):
            data[i, ...], data_p[i, ...] = get_genuine_or_not(target_data, target_labels, genuine=genuine)
            if zero_one_labels:
                labels_siamese[i] = not genuine
            else:
                labels_siamese[i] = -1 if genuine else +1
            genuine = not genuine

        return data, data_p, labels_siamese

    def get_triplet(self, n_labels, n_triplets=1, is_target_set_train=True):
        """
        Build a batch of (anchor, positive, negative) triplets.

        NOTE(review): indexes the target data as 4-D image arrays
        (input_data[i, :, :, :]) although this shuffler stores file names;
        appears copy-pasted from the memory version — confirm before use.

        **Parameters**

        n_labels: Number of distinct clients to draw from.
        n_triplets: Number of triplets in the batch.
        is_target_set_train: If True draw from the train split, otherwise
            from the validation split.

        **Return** (data_a, data_p, data_n, labels_a, labels_p, labels_n)
        """
        def get_one_triplet(input_data, input_labels):
            # Getting a pair of clients
            index = numpy.random.choice(n_labels, 2, replace=False)
            label_positive = index[0]
            label_negative = index[1]

            # Getting the indexes of the data from a particular client
            indexes = numpy.where(input_labels == index[0])[0]
            numpy.random.shuffle(indexes)

            # Picking a positive pair
            data_anchor = input_data[indexes[0], :, :, :]
            data_positive = input_data[indexes[1], :, :, :]

            # Picking a negative sample
            indexes = numpy.where(input_labels == index[1])[0]
            numpy.random.shuffle(indexes)
            data_negative = input_data[indexes[0], :, :, :]

            # Anchor and positive intentionally share the same client label
            return data_anchor, data_positive, data_negative, label_positive, label_positive, label_negative

        if is_target_set_train:
            target_data = self.train_data
            target_labels = self.train_labels
        else:
            target_data = self.validation_data
            target_labels = self.validation_labels

        c = target_data.shape[3]
        w = target_data.shape[1]
        h = target_data.shape[2]

        data_a = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_p = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        data_n = numpy.zeros(shape=(n_triplets, w, h, c), dtype='float32')
        labels_a = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_p = numpy.zeros(shape=n_triplets, dtype='float32')
        labels_n = numpy.zeros(shape=n_triplets, dtype='float32')

        for i in range(n_triplets):
            data_a[i, :, :, :], data_p[i, :, :, :], data_n[i, :, :, :], \
                labels_a[i], labels_p[i], labels_n[i] = \
                get_one_triplet(target_data, target_labels)

        return data_a, data_p, data_n, labels_a, labels_p, labels_n
......@@ -2,8 +2,10 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
from .DataShuffler import DataShuffler
from .PairDataShuffler import PairDataShuffler
from .BaseDataShuffler import BaseDataShuffler
from .MemoryDataShuffler import MemoryDataShuffler
from .MemoryPairDataShuffler import MemoryPairDataShuffler
from .TextDataShuffler import TextDataShuffler
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
......@@ -21,10 +21,11 @@ from docopt import docopt
import tensorflow as tf
from .. import util
SEED = 10
from bob.learn.tensorflow.data import DataShuffler
from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
from bob.learn.tensorflow.network import Lenet
from bob.learn.tensorflow.trainers import Trainer
from bob.learn.tensorflow.loss import BaseLoss
import bob.db.mobio
import numpy
......@@ -40,7 +41,24 @@ def main():
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = DataShuffler(data, labels, train_batch_size=BATCH_SIZE, validation_batch_size=BATCH_SIZE*100)
data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
#labels = [o.client_id for o in objects]
#file_names = [o.make_path(
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
# Preparing the architecture
lenet = Lenet()
......
......@@ -21,11 +21,11 @@ from docopt import docopt
import tensorflow as tf
from .. import util
SEED = 10
from bob.learn.tensorflow.data import PairDataShuffler
from bob.learn.tensorflow.data import MemoryPairDataShuffler, TextDataShuffler
from bob.learn.tensorflow.network import Lenet
from bob.learn.tensorflow.trainers import SiameseTrainer
from bob.learn.tensorflow.loss import ContrastiveLoss
import bob.db.mobio
import numpy
def main():
......@@ -40,8 +40,25 @@ def main():
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = MemoryPairDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*1000
)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
#labels = [o.client_id for o in objects]
#file_names = [o.make_path(
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],