Skip to content
Snippets Groups Projects
Commit 86f2c9cf authored by Tiago de Freitas Pereira's avatar Tiago de Freitas Pereira
Browse files

Reorganized shufflers

parent 5884afff
Branches
Tags
No related merge requests found
......@@ -49,6 +49,8 @@ class Analizer:
enroll_features = self.machine(enroll_data, session=self.session)
del enroll_data
#import ipdb; ipdb.set_trace();
# Extracting features for probing
probe_data, probe_labels = self.data_shuffler.get_batch(train_dataset=False)
probe_features = self.machine(probe_data, session=self.session)
......@@ -56,22 +58,23 @@ class Analizer:
# Creating models
models = []
for i in range(self.data_shuffler.total_labels):
indexes_model = numpy.where(enroll_labels == i)[0]
for i in range(len(self.data_shuffler.possible_labels)):
indexes_model = numpy.where(enroll_labels == self.data_shuffler.possible_labels[i])[0]
models.append(numpy.mean(enroll_features[indexes_model, :], axis=0))
# Probing
positive_scores = numpy.zeros(shape=0)
negative_scores = numpy.zeros(shape=0)
for i in range(self.data_shuffler.total_labels):
for i in range(len(self.data_shuffler.possible_labels)):
#for i in self.data_shuffler.possible_labels:
# Positive scoring
indexes = probe_labels == i
indexes = probe_labels == self.data_shuffler.possible_labels[i]
positive_data = probe_features[indexes, :]
p = [cosine(models[i], positive_data[j]) for j in range(positive_data.shape[0])]
positive_scores = numpy.hstack((positive_scores, p))
# negative scoring
indexes = probe_labels != i
indexes = probe_labels != self.data_shuffler.possible_labels[i]
negative_data = probe_features[indexes, :]
n = [cosine(models[i], negative_data[j]) for j in range(negative_data.shape[0])]
negative_scores = numpy.hstack((negative_scores, n))
......
......@@ -8,7 +8,13 @@ import tensorflow as tf
class BaseDataShuffler(object):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
The class provides base functionalities to shuffle the data
......@@ -23,6 +29,7 @@ class BaseDataShuffler(object):
self.scale = scale
self.scale_value = 0.00390625
self.input_dtype = input_dtype
# TODO: Check if the bacth size is higher than the input data
self.train_batch_size = train_batch_size
......@@ -34,9 +41,9 @@ class BaseDataShuffler(object):
# TODO: Check if the labels goes from O to N-1
self.labels = labels
self.total_labels = max(labels) + 1
self.possible_labels = list(set(self.labels))
# Spliting in train and validation
# Computing the data samples for train and validation
self.n_samples = len(self.labels)
self.n_train_samples = int(round(self.n_samples * perc_train))
self.n_validation_samples = self.n_samples - self.n_train_samples
......@@ -45,6 +52,15 @@ class BaseDataShuffler(object):
self.indexes = numpy.array(range(self.n_samples))
numpy.random.shuffle(self.indexes)
# Spliting the data between train and validation
self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
self.validation_data = self.data[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples], ...]
self.validation_labels = self.labels[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples]]
def get_placeholders_forprefetch(self, name="", train_dataset=True):
"""
Returns a place holder with the size of your batch
......@@ -66,3 +82,35 @@ class BaseDataShuffler(object):
labels = tf.placeholder(tf.int64, shape=shape[0])
return data, labels
def get_genuine_or_not(self, input_data, input_labels, genuine=True):
    """
    Draw a pair of samples from ``input_data``.

    **Parameters**

    input_data: array of samples (first axis indexes samples)
    input_labels: array of labels aligned with ``input_data``
    genuine: if True both samples come from the same client,
        otherwise from two different clients

    **Return**
    (data, data_p): the two selected samples
    """
    if genuine:
        # Pick one client at random
        label = self.possible_labels[numpy.random.randint(len(self.possible_labels))]

        # Indexes of the samples belonging to that client
        indexes = numpy.where(input_labels == label)[0]
        numpy.random.shuffle(indexes)

        # Pick two different samples of the same client
        # NOTE(review): assumes every client has at least 2 samples -- confirm
        data = input_data[indexes[0], ...]
        data_p = input_data[indexes[1], ...]

    else:
        # Pick two *different* clients. Keep the labels in plain Python
        # variables: the previous code wrote them back into the integer
        # ndarray returned by numpy.random.choice, which breaks (raises or
        # truncates) whenever labels are not integers (e.g. string client ids).
        i, j = numpy.random.choice(len(self.possible_labels), 2, replace=False)
        label = self.possible_labels[i]
        label_p = self.possible_labels[j]

        # Indexes of the samples of each of the two clients
        indexes = numpy.where(input_labels == label)[0]
        indexes_p = numpy.where(input_labels == label_p)[0]
        numpy.random.shuffle(indexes)
        numpy.random.shuffle(indexes_p)

        # Pick one sample per client
        data = input_data[indexes[0], ...]
        data_p = input_data[indexes_p[0], ...]

    return data, data_p
......@@ -16,7 +16,14 @@ def scale_mean_norm(data, scale=0.00390625):
class MemoryDataShuffler(BaseDataShuffler):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
Shuffler that deal with memory datasets
......@@ -33,23 +40,19 @@ class MemoryDataShuffler(BaseDataShuffler):
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
perc_train=perc_train,
scale=scale,
train_batch_size=train_batch_size,
validation_batch_size=validation_batch_size
)
# Spliting between train and test
self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
self.train_data = self.train_data.astype(input_dtype)
self.validation_data = self.validation_data.astype(input_dtype)
self.validation_data = self.data[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples], ...]
self.validation_labels = self.labels[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples]]
if self.scale:
self.train_data, self.mean = scale_mean_norm(self.train_data)
self.validation_data = (self.validation_data - self.mean) * self.scale_value
self.train_data *= self.scale_value
self.validation_data *= self.scale_value
def get_batch(self, train_dataset=True):
......@@ -70,3 +73,37 @@ class MemoryDataShuffler(BaseDataShuffler):
selected_labels = label[indexes[0:n_samples]]
return selected_data.astype("float32"), selected_labels
def get_pair(self, train_dataset=True, zero_one_labels=True):
    """
    Build a batch of sample pairs for siamese training.

    **Parameters**

    train_dataset: if True draw pairs from the training split,
        otherwise from the validation split
    zero_one_labels: if True genuine pairs are labelled 0 and impostor
        pairs 1; otherwise genuine pairs get -1 and impostor pairs +1

    **Return**
    (data, data_p, labels_siamese)
    """
    if train_dataset:
        source, source_labels, batch_shape = self.train_data, self.train_labels, self.train_shape
    else:
        source, source_labels, batch_shape = self.validation_data, self.validation_labels, self.validation_shape

    left = numpy.zeros(shape=batch_shape, dtype='float32')
    right = numpy.zeros(shape=batch_shape, dtype='float32')
    pair_labels = numpy.zeros(shape=batch_shape[0], dtype='float32')

    for k in range(batch_shape[0]):
        # Alternate genuine / impostor pairs: even positions are genuine
        is_genuine = (k % 2 == 0)
        left[k, ...], right[k, ...] = self.get_genuine_or_not(source, source_labels, genuine=is_genuine)
        if zero_one_labels:
            pair_labels[k] = 0.0 if is_genuine else 1.0
        else:
            pair_labels[k] = -1.0 if is_genuine else 1.0

    return left, right, pair_labels
......@@ -18,7 +18,13 @@ from .BaseDataShuffler import BaseDataShuffler
class TextDataShuffler(BaseDataShuffler):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
Shuffler that deal with file list
......@@ -31,30 +37,32 @@ class TextDataShuffler(BaseDataShuffler):
validation_batch_size:
"""
if isinstance(data, list):
data = numpy.array(data)
if isinstance(labels, list):
labels = numpy.array(labels)
super(TextDataShuffler, self).__init__(
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
perc_train=perc_train,
scale=scale,
train_batch_size=train_batch_size,
validation_batch_size=validation_batch_size
)
if isinstance(self.data, list):
self.data = numpy.array(self.data)
if isinstance(self.labels, list):
self.labels = numpy.array(self.labels)
# Spliting between train and test
self.train_data = self.data[self.indexes[0:self.n_train_samples]]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
def load_from_file(self, file_name, shape):
    """
    Load a single sample from ``file_name``.

    **Parameters**

    file_name: path understood by ``bob.io.base.load``
    shape: batch shape; ``shape[1:]`` is the per-sample shape

    **Return**
    Array with the per-sample shape. A 2D (gray-scale) image is copied
    into channel 0 of a zero-initialized ``shape[1:]`` array; any other
    rank is returned exactly as loaded.
    """
    # NOTE(review): removed stray validation-split assignment lines that
    # were left interleaved in this body (leftover diff residue from the
    # old __init__) -- they do not belong in a per-file loader.
    d = bob.io.base.load(file_name)
    if len(d.shape) == 2:
        # Gray-scale image: add the trailing channel axis expected downstream
        data = numpy.zeros(shape=tuple(shape[1:]))
        data[:, :, 0] = d
    else:
        data = d

    return data
def get_batch(self, train_dataset=True):
......@@ -77,20 +85,54 @@ class TextDataShuffler(BaseDataShuffler):
for i in range(batch_size):
file_name = files_names[indexes[i]]
d = bob.io.base.load(file_name)
if len(d.shape) == 2:
data = numpy.zeros(shape=tuple(shape[1:]))
data[:, :, 0] = d
else:
data = d
data = self.load_from_file(file_name, shape)
selected_data[i, ...] = data
if self.scale is not None:
selected_data[i, ...] *= self.scale
if self.scale:
selected_data[i, ...] *= self.scale_value
selected_labels = label[indexes[0:batch_size]]
return selected_data.astype("float32"), selected_labels
def get_pair(self, train_dataset=True, zero_one_labels=True):
    """
    Build a batch of sample pairs, loading each sample from disk.

    **Parameters**

    train_dataset: if True draw pairs from the training split,
        otherwise from the validation split
    zero_one_labels: if True genuine pairs are labelled 0 and impostor
        pairs 1; otherwise genuine pairs get -1 and impostor pairs +1

    **Return**
    (data, data_p, labels_siamese)
    """
    if train_dataset:
        file_list, file_labels, batch_shape = self.train_data, self.train_labels, self.train_shape
    else:
        file_list, file_labels, batch_shape = self.validation_data, self.validation_labels, self.validation_shape

    left = numpy.zeros(shape=batch_shape, dtype='float32')
    right = numpy.zeros(shape=batch_shape, dtype='float32')
    pair_labels = numpy.zeros(shape=batch_shape[0], dtype='float32')

    for k in range(batch_shape[0]):
        # Alternate genuine / impostor pairs: even positions are genuine
        is_genuine = (k % 2 == 0)
        path, path_p = self.get_genuine_or_not(file_list, file_labels, genuine=is_genuine)
        left[k, ...] = self.load_from_file(str(path), batch_shape)
        right[k, ...] = self.load_from_file(str(path_p), batch_shape)
        if zero_one_labels:
            pair_labels[k] = 0.0 if is_genuine else 1.0
        else:
            pair_labels[k] = -1.0 if is_genuine else 1.0

    if self.scale:
        # Same scaling applied to single batches elsewhere in the class
        left *= self.scale_value
        right *= self.scale_value

    return left, right, pair_labels
......@@ -39,26 +39,28 @@ def main():
perc_train = 0.9
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
#labels = [o.client_id for o in objects]
#file_names = [o.make_path(
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
#data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
#data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
#data_shuffler = MemoryDataShuffler(data, labels,
# input_shape=[28, 28, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
db = bob.db.mobio.Database()
objects = db.objects(protocol="male")
labels = [o.client_id for o in objects]
file_names = [o.make_path(
directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
extension=".hdf5")
for o in objects]
data_shuffler = TextDataShuffler(file_names, labels,
input_shape=[80, 64, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
# Preparing the architecture
lenet = Lenet()
......
......@@ -21,7 +21,7 @@ from docopt import docopt
import tensorflow as tf
from .. import util
SEED = 10
from bob.learn.tensorflow.data import MemoryPairDataShuffler, TextDataShuffler
from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
from bob.learn.tensorflow.network import Lenet
from bob.learn.tensorflow.trainers import SiameseTrainer
from bob.learn.tensorflow.loss import ContrastiveLoss
......@@ -40,11 +40,11 @@ def main():
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = MemoryPairDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*1000
)
data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
scale=True,
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*1000)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
......@@ -54,11 +54,11 @@ def main():
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
# validation_batch_size=BATCH_SIZE*500)
# Preparing the architecture
lenet = Lenet(default_feature_layer="fc2")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment