Reorganized shufflers

parent 5884afff
@@ -49,6 +49,8 @@ class Analizer:
         enroll_features = self.machine(enroll_data, session=self.session)
         del enroll_data
 
+        #import ipdb; ipdb.set_trace();
+
         # Extracting features for probing
         probe_data, probe_labels = self.data_shuffler.get_batch(train_dataset=False)
         probe_features = self.machine(probe_data, session=self.session)
@@ -56,22 +58,23 @@ class Analizer:
         # Creating models
         models = []
-        for i in range(self.data_shuffler.total_labels):
-            indexes_model = numpy.where(enroll_labels == i)[0]
+        for i in range(len(self.data_shuffler.possible_labels)):
+            indexes_model = numpy.where(enroll_labels == self.data_shuffler.possible_labels[i])[0]
             models.append(numpy.mean(enroll_features[indexes_model, :], axis=0))
 
         # Probing
         positive_scores = numpy.zeros(shape=0)
         negative_scores = numpy.zeros(shape=0)
-        for i in range(self.data_shuffler.total_labels):
+        for i in range(len(self.data_shuffler.possible_labels)):
+        #for i in self.data_shuffler.possible_labels:
             # Positive scoring
-            indexes = probe_labels == i
+            indexes = probe_labels == self.data_shuffler.possible_labels[i]
             positive_data = probe_features[indexes, :]
             p = [cosine(models[i], positive_data[j]) for j in range(positive_data.shape[0])]
             positive_scores = numpy.hstack((positive_scores, p))
 
             # negative scoring
-            indexes = probe_labels != i
+            indexes = probe_labels != self.data_shuffler.possible_labels[i]
             negative_data = probe_features[indexes, :]
             n = [cosine(models[i], negative_data[j]) for j in range(negative_data.shape[0])]
             negative_scores = numpy.hstack((negative_scores, n))
...
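The switch from total_labels = max(labels) + 1 to possible_labels = list(set(labels)) removes the assumption that labels are contiguous integers starting at zero, so database client ids can be used directly. A minimal standalone sketch of the new indexing (the toy ids below are hypothetical, not from this commit):

    import numpy

    # Hypothetical non-contiguous client ids, as a database might return them.
    enroll_labels = numpy.array([17, 23, 23, 17, 42, 42])

    # Old scheme: range(max(labels) + 1) would iterate 43 mostly-empty "classes".
    # New scheme: enumerate only the labels that actually occur.
    possible_labels = list(set(enroll_labels))

    for i in range(len(possible_labels)):
        # Indexes of every enrollment sample belonging to client possible_labels[i]
        indexes_model = numpy.where(enroll_labels == possible_labels[i])[0]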
@@ -8,7 +8,13 @@ import tensorflow as tf
 class BaseDataShuffler(object):
-    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
+    def __init__(self, data, labels,
+                 input_shape,
+                 input_dtype="float64",
+                 perc_train=0.9,
+                 scale=True,
+                 train_batch_size=1,
+                 validation_batch_size=300):
         """
         The class provides base functionalities to shuffle the data
@@ -23,6 +29,7 @@ class BaseDataShuffler(object):
         self.scale = scale
         self.scale_value = 0.00390625
+        self.input_dtype = input_dtype
 
         # TODO: Check if the batch size is higher than the input data
         self.train_batch_size = train_batch_size
@@ -34,9 +41,9 @@ class BaseDataShuffler(object):
         # TODO: Check if the labels go from 0 to N-1
         self.labels = labels
-        self.total_labels = max(labels) + 1
+        self.possible_labels = list(set(self.labels))
 
-        # Spliting in train and validation
+        # Computing the data samples for train and validation
         self.n_samples = len(self.labels)
         self.n_train_samples = int(round(self.n_samples * perc_train))
         self.n_validation_samples = self.n_samples - self.n_train_samples
@@ -45,6 +52,15 @@ class BaseDataShuffler(object):
         self.indexes = numpy.array(range(self.n_samples))
         numpy.random.shuffle(self.indexes)
 
+        # Splitting the data between train and validation
+        self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
+        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
+
+        self.validation_data = self.data[self.indexes[self.n_train_samples:
+                                                      self.n_train_samples + self.n_validation_samples], ...]
+        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
+                                                          self.n_train_samples + self.n_validation_samples]]
+
     def get_placeholders_forprefetch(self, name="", train_dataset=True):
         """
         Returns a placeholder with the size of your batch
@@ -66,3 +82,35 @@ class BaseDataShuffler(object):
         labels = tf.placeholder(tf.int64, shape=shape[0])
 
         return data, labels
+
+    def get_genuine_or_not(self, input_data, input_labels, genuine=True):
+
+        if genuine:
+            # Getting a client
+            index = numpy.random.randint(len(self.possible_labels))
+            index = self.possible_labels[index]
+
+            # Getting the indexes of the data from a particular client
+            indexes = numpy.where(input_labels == index)[0]
+            numpy.random.shuffle(indexes)
+
+            # Picking a pair
+            data = input_data[indexes[0], ...]
+            data_p = input_data[indexes[1], ...]
+
+        else:
+            # Picking a pair of labels from different clients
+            index = numpy.random.choice(len(self.possible_labels), 2, replace=False)
+            index[0] = self.possible_labels[index[0]]
+            index[1] = self.possible_labels[index[1]]
+
+            # Getting the indexes of the two clients
+            indexes = numpy.where(input_labels == index[0])[0]
+            indexes_p = numpy.where(input_labels == index[1])[0]
+            numpy.random.shuffle(indexes)
+            numpy.random.shuffle(indexes_p)
+
+            # Picking a pair
+            data = input_data[indexes[0], ...]
+            data_p = input_data[indexes_p[0], ...]
+
+        return data, data_p
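The new get_genuine_or_not helper is the building block for the get_pair methods below: a genuine pair is two distinct samples of one randomly drawn client, an impostor pair is one sample each from two distinct clients. A usage sketch, assuming the defaults above (the toy data is hypothetical; the import path is the one used by the training scripts further down):

    import numpy
    from bob.learn.tensorflow.data import MemoryDataShuffler

    data = numpy.random.rand(100, 28, 28, 1)
    labels = numpy.array([i // 10 for i in range(100)])  # ten clients, ten samples each

    shuffler = MemoryDataShuffler(data, labels, input_shape=[28, 28, 1])

    # Two different samples of the same randomly chosen client.
    a, b = shuffler.get_genuine_or_not(shuffler.train_data, shuffler.train_labels, genuine=True)

    # One sample each from two different clients.
    a, b = shuffler.get_genuine_or_not(shuffler.train_data, shuffler.train_labels, genuine=False)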
@@ -16,7 +16,14 @@ def scale_mean_norm(data, scale=0.00390625):
 class MemoryDataShuffler(BaseDataShuffler):
-    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
+
+    def __init__(self, data, labels,
+                 input_shape,
+                 input_dtype="float64",
+                 perc_train=0.9,
+                 scale=True,
+                 train_batch_size=1,
+                 validation_batch_size=300):
         """
         Shuffler that deals with in-memory datasets
@@ -33,23 +40,19 @@ class MemoryDataShuffler(BaseDataShuffler):
             data=data,
             labels=labels,
             input_shape=input_shape,
+            input_dtype=input_dtype,
             perc_train=perc_train,
             scale=scale,
             train_batch_size=train_batch_size,
             validation_batch_size=validation_batch_size
         )
 
-        # Spliting between train and test
-        self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
-        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
-
-        self.validation_data = self.data[self.indexes[self.n_train_samples:
-                                                      self.n_train_samples + self.n_validation_samples], ...]
-        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
-                                                          self.n_train_samples + self.n_validation_samples]]
+        self.train_data = self.train_data.astype(input_dtype)
+        self.validation_data = self.validation_data.astype(input_dtype)
 
         if self.scale:
-            self.train_data, self.mean = scale_mean_norm(self.train_data)
-            self.validation_data = (self.validation_data - self.mean) * self.scale_value
+            self.train_data *= self.scale_value
+            self.validation_data *= self.scale_value
 
     def get_batch(self, train_dataset=True):
@@ -70,3 +73,37 @@ class MemoryDataShuffler(BaseDataShuffler):
         selected_labels = label[indexes[0:n_samples]]
 
         return selected_data.astype("float32"), selected_labels
+
+    def get_pair(self, train_dataset=True, zero_one_labels=True):
+        """
+        Get a random pair of samples
+
+        **Parameters**
+            train_dataset: Defines whether the pair is drawn from the train or the validation set
+
+        **Return**
+        """
+        if train_dataset:
+            target_data = self.train_data
+            target_labels = self.train_labels
+            shape = self.train_shape
+        else:
+            target_data = self.validation_data
+            target_labels = self.validation_labels
+            shape = self.validation_shape
+
+        data = numpy.zeros(shape=shape, dtype='float32')
+        data_p = numpy.zeros(shape=shape, dtype='float32')
+        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')
+
+        genuine = True
+        for i in range(shape[0]):
+            data[i, ...], data_p[i, ...] = self.get_genuine_or_not(target_data, target_labels, genuine=genuine)
+            if zero_one_labels:
+                labels_siamese[i] = not genuine
+            else:
+                labels_siamese[i] = -1 if genuine else +1
+            genuine = not genuine
+
+        return data, data_p, labels_siamese
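get_pair assembles a whole siamese batch by alternating genuine and impostor pairs. With zero_one_labels=True a genuine pair is labeled 0 and an impostor pair 1; with zero_one_labels=False the convention is -1/+1. A sketch under the same hypothetical-data assumptions as above:

    import numpy
    from bob.learn.tensorflow.data import MemoryDataShuffler

    data = numpy.random.rand(500, 28, 28, 1)
    labels = numpy.array([i % 10 for i in range(500)])

    shuffler = MemoryDataShuffler(data, labels,
                                  input_shape=[28, 28, 1],
                                  train_batch_size=16)

    # left[i] and right[i] form a pair; pair_labels alternates 0 (genuine), 1 (impostor)
    left, right, pair_labels = shuffler.get_pair(train_dataset=True)

    # Same pairing with the -1/+1 label convention
    left, right, pair_labels = shuffler.get_pair(zero_one_labels=False)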
@@ -18,7 +18,13 @@ from .BaseDataShuffler import BaseDataShuffler
 class TextDataShuffler(BaseDataShuffler):
-    def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300):
+    def __init__(self, data, labels,
+                 input_shape,
+                 input_dtype="float64",
+                 perc_train=0.9,
+                 scale=True,
+                 train_batch_size=1,
+                 validation_batch_size=300):
         """
         Shuffler that deals with file lists
@@ -31,30 +37,32 @@ class TextDataShuffler(BaseDataShuffler):
             validation_batch_size:
 
         """
+        if isinstance(data, list):
+            data = numpy.array(data)
+
+        if isinstance(labels, list):
+            labels = numpy.array(labels)
+
         super(TextDataShuffler, self).__init__(
             data=data,
             labels=labels,
             input_shape=input_shape,
+            input_dtype=input_dtype,
             perc_train=perc_train,
             scale=scale,
             train_batch_size=train_batch_size,
             validation_batch_size=validation_batch_size
         )
 
-        if isinstance(self.data, list):
-            self.data = numpy.array(self.data)
-
-        if isinstance(self.labels, list):
-            self.labels = numpy.array(self.labels)
-
-        # Spliting between train and test
-        self.train_data = self.data[self.indexes[0:self.n_train_samples]]
-        self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
-
-        self.validation_data = self.data[self.indexes[self.n_train_samples:
-                                                      self.n_train_samples + self.n_validation_samples]]
-        self.validation_labels = self.labels[self.indexes[self.n_train_samples:
-                                                          self.n_train_samples + self.n_validation_samples]]
+    def load_from_file(self, file_name, shape):
+        d = bob.io.base.load(file_name)
+        if len(d.shape) == 2:
+            data = numpy.zeros(shape=tuple(shape[1:]))
+            data[:, :, 0] = d
+        else:
+            data = d
+
+        return data
 
     def get_batch(self, train_dataset=True):
@@ -77,20 +85,54 @@ class TextDataShuffler(BaseDataShuffler):
         for i in range(batch_size):
             file_name = files_names[indexes[i]]
-
-            d = bob.io.base.load(file_name)
-            if len(d.shape) == 2:
-                data = numpy.zeros(shape=tuple(shape[1:]))
-                data[:, :, 0] = d
-            else:
-                data = d
+            data = self.load_from_file(file_name, shape)
 
             selected_data[i, ...] = data
-            if self.scale is not None:
-                selected_data[i, ...] *= self.scale
+            if self.scale:
+                selected_data[i, ...] *= self.scale_value
 
         selected_labels = label[indexes[0:batch_size]]
 
         return selected_data.astype("float32"), selected_labels
+
+    def get_pair(self, train_dataset=True, zero_one_labels=True):
+        """
+        Get a random pair of samples
+
+        **Parameters**
+            train_dataset: Defines whether the pair is drawn from the train or the validation set
+
+        **Return**
+        """
+        if train_dataset:
+            target_data = self.train_data
+            target_labels = self.train_labels
+            shape = self.train_shape
+        else:
+            target_data = self.validation_data
+            target_labels = self.validation_labels
+            shape = self.validation_shape
+
+        data = numpy.zeros(shape=shape, dtype='float32')
+        data_p = numpy.zeros(shape=shape, dtype='float32')
+        labels_siamese = numpy.zeros(shape=shape[0], dtype='float32')
+
+        genuine = True
+        for i in range(shape[0]):
+            file_name, file_name_p = self.get_genuine_or_not(target_data, target_labels, genuine=genuine)
+            data[i, ...] = self.load_from_file(str(file_name), shape)
+            data_p[i, ...] = self.load_from_file(str(file_name_p), shape)
+
+            if zero_one_labels:
+                labels_siamese[i] = not genuine
+            else:
+                labels_siamese[i] = -1 if genuine else +1
+            genuine = not genuine
+
+        if self.scale:
+            data *= self.scale_value
+            data_p *= self.scale_value
+
+        return data, data_p, labels_siamese
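Two changes in this file are easy to miss: the inline bob.io.base.load block moved into the shared load_from_file helper (2D grayscale arrays are expanded to height x width x 1 to match input_shape), and scale changed meaning, from being the multiplier itself (0.00390625) to a boolean gating the fixed self.scale_value. A standalone sketch of the channel expansion (as_channeled is a hypothetical stand-in that mirrors load_from_file minus the file I/O):

    import numpy

    def as_channeled(d, shape):
        # Mirrors load_from_file without the bob.io.base.load call
        if len(d.shape) == 2:
            data = numpy.zeros(shape=tuple(shape[1:]))
            data[:, :, 0] = d
            return data
        return d

    img = numpy.random.rand(80, 64)                # hypothetical grayscale image
    out = as_channeled(img, shape=(1, 80, 64, 1))  # shape[1:] -> (80, 64, 1)
    assert out.shape == (80, 64, 1)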
@@ -39,26 +39,28 @@ def main():
     perc_train = 0.9
 
     # Loading data
-    data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
-    data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
-
-    data_shuffler = MemoryDataShuffler(data, labels,
-                                       input_shape=[28, 28, 1],
-                                       train_batch_size=BATCH_SIZE,
-                                       validation_batch_size=BATCH_SIZE*100)
-
-    #db = bob.db.mobio.Database()
-    #objects = db.objects(protocol="male")
-
-    #labels = [o.client_id for o in objects]
-    #file_names = [o.make_path(
-    #    directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
-    #    extension=".hdf5")
-    #    for o in objects]
-    #data_shuffler = TextDataShuffler(file_names, labels,
-    #                                 input_shape=[80, 64, 1],
-    #                                 train_batch_size=BATCH_SIZE,
-    #                                 validation_batch_size=BATCH_SIZE*100)
+    #data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
+    #data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
+
+    #data_shuffler = MemoryDataShuffler(data, labels,
+    #                                   input_shape=[28, 28, 1],
+    #                                   train_batch_size=BATCH_SIZE,
+    #                                   validation_batch_size=BATCH_SIZE*100)
+
+    db = bob.db.mobio.Database()
+    objects = db.objects(protocol="male")
+
+    labels = [o.client_id for o in objects]
+    file_names = [o.make_path(
+        directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
+        extension=".hdf5")
+        for o in objects]
+
+    data_shuffler = TextDataShuffler(file_names, labels,
+                                     input_shape=[80, 64, 1],
+                                     train_batch_size=BATCH_SIZE,
+                                     validation_batch_size=BATCH_SIZE*100)
 
     # Preparing the architecture
     lenet = Lenet()
...
@@ -21,7 +21,7 @@ from docopt import docopt
 import tensorflow as tf
 from .. import util
 SEED = 10
-from bob.learn.tensorflow.data import MemoryPairDataShuffler, TextDataShuffler
+from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
 from bob.learn.tensorflow.network import Lenet
 from bob.learn.tensorflow.trainers import SiameseTrainer
 from bob.learn.tensorflow.loss import ContrastiveLoss
@@ -40,11 +40,11 @@ def main():
     # Loading data
     data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
     data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
-    data_shuffler = MemoryPairDataShuffler(data, labels,
-                                           input_shape=[28, 28, 1],
-                                           train_batch_size=BATCH_SIZE,
-                                           validation_batch_size=BATCH_SIZE*1000
-                                           )
+    data_shuffler = MemoryDataShuffler(data, labels,
+                                       input_shape=[28, 28, 1],
+                                       scale=True,
+                                       train_batch_size=BATCH_SIZE,
+                                       validation_batch_size=BATCH_SIZE*1000)
 
     #db = bob.db.mobio.Database()
     #objects = db.objects(protocol="male")
@@ -54,11 +54,11 @@ def main():
     #    directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
     #    extension=".hdf5")
     #    for o in objects]
 
     #data_shuffler = TextDataShuffler(file_names, labels,
     #                                 input_shape=[80, 64, 1],
     #                                 train_batch_size=BATCH_SIZE,
-    #                                 validation_batch_size=BATCH_SIZE*100)
+    #                                 validation_batch_size=BATCH_SIZE*500)
 
     # Preparing the architecture
     lenet = Lenet(default_feature_layer="fc2")
...
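With MemoryPairDataShuffler folded into MemoryDataShuffler, one shuffler now feeds both trainers: get_batch for softmax-style training and get_pair for the SiameseTrainer with ContrastiveLoss. A minimal consolidation sketch (hypothetical toy data; class and import path as in the script above):

    import numpy
    from bob.learn.tensorflow.data import MemoryDataShuffler

    data = numpy.random.rand(1000, 28, 28, 1)
    labels = numpy.array([i % 10 for i in range(1000)])

    data_shuffler = MemoryDataShuffler(data, labels,
                                       input_shape=[28, 28, 1],
                                       scale=True,
                                       train_batch_size=16)

    x, y = data_shuffler.get_batch()    # classification-style batch
    l, r, t = data_shuffler.get_pair()  # siamese pairs for a contrastive loss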