Reorganized shufflers

parent 5884afff
......@@ -49,6 +49,8 @@ class Analizer:
enroll_features = self.machine(enroll_data, session=self.session)
del enroll_data
#import ipdb; ipdb.set_trace();
# Extracting features for probing
probe_data, probe_labels = self.data_shuffler.get_batch(train_dataset=False)
probe_features = self.machine(probe_data, session=self.session)
......@@ -56,22 +58,23 @@ class Analizer:
# Creating models
models = []
for i in range(self.data_shuffler.total_labels):
indexes_model = numpy.where(enroll_labels == i)[0]
for i in range(len(self.data_shuffler.possible_labels)):
indexes_model = numpy.where(enroll_labels == self.data_shuffler.possible_labels[i])[0]
models.append(numpy.mean(enroll_features[indexes_model, :], axis=0))
# Probing
positive_scores = numpy.zeros(shape=0)
negative_scores = numpy.zeros(shape=0)
for i in range(self.data_shuffler.total_labels):
for i in range(len(self.data_shuffler.possible_labels)):
#for i in self.data_shuffler.possible_labels:
# Positive scoring
indexes = probe_labels == i
indexes = probe_labels == self.data_shuffler.possible_labels[i]
positive_data = probe_features[indexes, :]
p = [cosine(models[i], positive_data[j]) for j in range(positive_data.shape[0])]
positive_scores = numpy.hstack((positive_scores, p))
# negative scoring
indexes = probe_labels != i
indexes = probe_labels != self.data_shuffler.possible_labels[i]
negative_data = probe_features[indexes, :]
n = [cosine(models[i], negative_data[j]) for j in range(negative_data.shape[0])]
negative_scores = numpy.hstack((negative_scores, n))
......
......@@ -8,7 +8,13 @@ import tensorflow as tf
class BaseDataShuffler(object):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
The class provides base functionalities to shuffle the data
......@@ -23,6 +29,7 @@ class BaseDataShuffler(object):
self.scale = scale
self.scale_value = 0.00390625
self.input_dtype = input_dtype
# TODO: Check if the batch size is higher than the input data
self.train_batch_size = train_batch_size
......@@ -34,9 +41,9 @@ class BaseDataShuffler(object):
# TODO: Check if the labels goes from O to N-1
self.labels = labels
self.total_labels = max(labels) + 1
self.possible_labels = list(set(self.labels))
# Spliting in train and validation
# Computing the data samples for train and validation
self.n_samples = len(self.labels)
self.n_train_samples = int(round(self.n_samples * perc_train))
self.n_validation_samples = self.n_samples - self.n_train_samples
......@@ -45,6 +52,15 @@ class BaseDataShuffler(object):
self.indexes = numpy.array(range(self.n_samples))
numpy.random.shuffle(self.indexes)
# Spliting the data between train and validation
self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
self.validation_data = self.data[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples], ...]
self.validation_labels = self.labels[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples]]
def get_placeholders_forprefetch(self, name="", train_dataset=True):
"""
Returns a place holder with the size of your batch
......@@ -66,3 +82,35 @@ class BaseDataShuffler(object):
labels = tf.placeholder(tf.int64, shape=shape[0])
return data, labels
def get_genuine_or_not(self, input_data, input_labels, genuine=True):
    """
    Sample a pair of data points: either two samples from the same client
    (genuine pair) or one sample from each of two different clients
    (impostor pair).

    **Parameters**

    input_data: Array of samples, indexed along the first axis.
    input_labels: Array of labels aligned with ``input_data``.
    genuine: If ``True`` both samples come from the same client,
        otherwise from two distinct clients.

    **Return**
    Tuple ``(data, data_p)`` with the two sampled items.
    """
    if genuine:
        # Getting a client at random
        index = numpy.random.randint(len(self.possible_labels))
        label = self.possible_labels[index]

        # Getting the indexes of the data from a particular client
        indexes = numpy.where(input_labels == label)[0]
        numpy.random.shuffle(indexes)

        # Picking a pair from the same client.
        # NOTE(review): assumes every client has at least two samples — confirm.
        data = input_data[indexes[0], ...]
        data_p = input_data[indexes[1], ...]

    else:
        # Picking a pair of labels from different clients.
        # Keep the sampled labels in plain variables instead of writing them
        # back into the integer array returned by numpy.random.choice: that
        # write silently coerced the label dtype and broke non-integer labels.
        picked = numpy.random.choice(len(self.possible_labels), 2, replace=False)
        label = self.possible_labels[picked[0]]
        label_p = self.possible_labels[picked[1]]

        # Getting the indexes of the two clients
        indexes = numpy.where(input_labels == label)[0]
        indexes_p = numpy.where(input_labels == label_p)[0]
        numpy.random.shuffle(indexes)
        numpy.random.shuffle(indexes_p)

        # Picking one sample from each client
        data = input_data[indexes[0], ...]
        data_p = input_data[indexes_p[0], ...]

    return data, data_p
......@@ -16,7 +16,14 @@ def scale_mean_norm(data, scale=0.00390625):
class MemoryDataShuffler(BaseDataShuffler):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
Shuffler that deal with memory datasets
......@@ -33,23 +40,19 @@ class MemoryDataShuffler(BaseDataShuffler):
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
perc_train=perc_train,
scale=scale,
train_batch_size=train_batch_size,
validation_batch_size=validation_batch_size
)
# Spliting between train and test
self.train_data = self.data[self.indexes[0:self.n_train_samples], ...]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
self.train_data = self.train_data.astype(input_dtype)
self.validation_data = self.validation_data.astype(input_dtype)
self.validation_data = self.data[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples], ...]
self.validation_labels = self.labels[self.indexes[self.n_train_samples:
self.n_train_samples + self.n_validation_samples]]
if self.scale:
self.train_data, self.mean = scale_mean_norm(self.train_data)
self.validation_data = (self.validation_data - self.mean) * self.scale_value
self.train_data *= self.scale_value
self.validation_data *= self.scale_value
def get_batch(self, train_dataset=True):
......@@ -70,3 +73,37 @@ class MemoryDataShuffler(BaseDataShuffler):
selected_labels = label[indexes[0:n_samples]]
return selected_data.astype("float32"), selected_labels
def get_pair(self, train_dataset=True, zero_one_labels=True):
    """
    Build a batch of random sample pairs for siamese training.

    **Parameters**

    train_dataset: If ``True`` draw the pairs from the training split,
        otherwise from the validation split.
    zero_one_labels: If ``True`` genuine pairs are labelled ``0`` and
        impostor pairs ``1``; otherwise ``-1`` / ``+1`` are used.

    **Return**
    Tuple ``(data, data_p, labels_siamese)``.
    """
    # Select the split to draw from
    if train_dataset:
        target_data, target_labels, shape = \
            self.train_data, self.train_labels, self.train_shape
    else:
        target_data, target_labels, shape = \
            self.validation_data, self.validation_labels, self.validation_shape

    batch_size = shape[0]
    data = numpy.zeros(shape=shape, dtype='float32')
    data_p = numpy.zeros(shape=shape, dtype='float32')
    labels_siamese = numpy.zeros(shape=batch_size, dtype='float32')

    # Alternate genuine / impostor pairs across the batch, starting genuine
    for i in range(batch_size):
        genuine = (i % 2 == 0)
        data[i, ...], data_p[i, ...] = self.get_genuine_or_not(
            target_data, target_labels, genuine=genuine)
        if zero_one_labels:
            labels_siamese[i] = not genuine
        else:
            labels_siamese[i] = -1 if genuine else +1

    return data, data_p, labels_siamese
......@@ -18,7 +18,13 @@ from .BaseDataShuffler import BaseDataShuffler
class TextDataShuffler(BaseDataShuffler):
def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
perc_train=0.9,
scale=True,
train_batch_size=1,
validation_batch_size=300):
"""
Shuffler that deal with file list
......@@ -31,30 +37,32 @@ class TextDataShuffler(BaseDataShuffler):
validation_batch_size:
"""
if isinstance(data, list):
data = numpy.array(data)
if isinstance(labels, list):
labels = numpy.array(labels)
super(TextDataShuffler, self).__init__(
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
perc_train=perc_train,
scale=scale,
train_batch_size=train_batch_size,
validation_batch_size=validation_batch_size
)
if isinstance(self.data, list):
self.data = numpy.array(self.data)
if isinstance(self.labels, list):
self.labels = numpy.array(self.labels)
# Spliting between train and test
self.train_data = self.data[self.indexes[0:self.n_train_samples]]
self.train_labels = self.labels[self.indexes[0:self.n_train_samples]]
def load_from_file(self, file_name, shape):
    """
    Load one sample from disk and arrange it as ``shape[1:]``.

    The span in the diff interleaved removed split-code lines inside this
    method; this is the reconstructed method body.

    **Parameters**

    file_name: Path of the file to load via ``bob.io.base.load``.
    shape: Batch shape; ``shape[1:]`` is the per-sample shape
        (height, width, channels).

    **Return**
    The loaded sample with a trailing channel axis.
    """
    d = bob.io.base.load(file_name)
    if len(d.shape) == 2:
        # Gray-scale image: add the trailing channel axis expected downstream
        data = numpy.zeros(shape=tuple(shape[1:]))
        data[:, :, 0] = d
    else:
        data = d

    return data
def get_batch(self, train_dataset=True):
......@@ -77,20 +85,54 @@ class TextDataShuffler(BaseDataShuffler):
for i in range(batch_size):
file_name = files_names[indexes[i]]
d = bob.io.base.load(file_name)
if len(d.shape) == 2:
data = numpy.zeros(shape=tuple(shape[1:]))
data[:, :, 0] = d
else:
data = d
data = self.load_from_file(file_name, shape)
selected_data[i, ...] = data
if self.scale is not None:
selected_data[i, ...] *= self.scale
if self.scale:
selected_data[i, ...] *= self.scale_value
selected_labels = label[indexes[0:batch_size]]
return selected_data.astype("float32"), selected_labels
def get_pair(self, train_dataset=True, zero_one_labels=True):
    """
    Build a batch of random sample pairs, loading each sample from its file.

    **Parameters**

    train_dataset: If ``True`` draw the pairs from the training split,
        otherwise from the validation split.
    zero_one_labels: If ``True`` genuine pairs are labelled ``0`` and
        impostor pairs ``1``; otherwise ``-1`` / ``+1`` are used.

    **Return**
    Tuple ``(data, data_p, labels_siamese)``.
    """
    # Select the split to draw from
    if train_dataset:
        target_data, target_labels, shape = \
            self.train_data, self.train_labels, self.train_shape
    else:
        target_data, target_labels, shape = \
            self.validation_data, self.validation_labels, self.validation_shape

    batch_size = shape[0]
    data = numpy.zeros(shape=shape, dtype='float32')
    data_p = numpy.zeros(shape=shape, dtype='float32')
    labels_siamese = numpy.zeros(shape=batch_size, dtype='float32')

    # Alternate genuine / impostor pairs across the batch, starting genuine.
    # Here the shuffler holds file names, so each pick is loaded from disk.
    for i in range(batch_size):
        genuine = (i % 2 == 0)
        file_name, file_name_p = self.get_genuine_or_not(
            target_data, target_labels, genuine=genuine)
        data[i, ...] = self.load_from_file(str(file_name), shape)
        data_p[i, ...] = self.load_from_file(str(file_name_p), shape)

        if zero_one_labels:
            labels_siamese[i] = not genuine
        else:
            labels_siamese[i] = -1 if genuine else +1

    if self.scale:
        data *= self.scale_value
        data_p *= self.scale_value

    return data, data_p, labels_siamese
......@@ -39,26 +39,28 @@ def main():
perc_train = 0.9
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
#labels = [o.client_id for o in objects]
#file_names = [o.make_path(
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
#data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
#data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
#data_shuffler = MemoryDataShuffler(data, labels,
# input_shape=[28, 28, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
db = bob.db.mobio.Database()
objects = db.objects(protocol="male")
labels = [o.client_id for o in objects]
file_names = [o.make_path(
directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
extension=".hdf5")
for o in objects]
data_shuffler = TextDataShuffler(file_names, labels,
input_shape=[80, 64, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*100)
# Preparing the architecture
lenet = Lenet()
......
......@@ -21,7 +21,7 @@ from docopt import docopt
import tensorflow as tf
from .. import util
SEED = 10
from bob.learn.tensorflow.data import MemoryPairDataShuffler, TextDataShuffler
from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
from bob.learn.tensorflow.network import Lenet
from bob.learn.tensorflow.trainers import SiameseTrainer
from bob.learn.tensorflow.loss import ContrastiveLoss
......@@ -40,11 +40,11 @@ def main():
# Loading data
data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
data = numpy.reshape(data, (data.shape[0], 28, 28, 1))
data_shuffler = MemoryPairDataShuffler(data, labels,
input_shape=[28, 28, 1],
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*1000
)
data_shuffler = MemoryDataShuffler(data, labels,
input_shape=[28, 28, 1],
scale=True,
train_batch_size=BATCH_SIZE,
validation_batch_size=BATCH_SIZE*1000)
#db = bob.db.mobio.Database()
#objects = db.objects(protocol="male")
......@@ -54,11 +54,11 @@ def main():
# directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed",
# extension=".hdf5")
# for o in objects]
#data_shuffler = TextDataShuffler(file_names, labels,
# input_shape=[80, 64, 1],
# train_batch_size=BATCH_SIZE,
# validation_batch_size=BATCH_SIZE*100)
# validation_batch_size=BATCH_SIZE*500)
# Preparing the architecture
lenet = Lenet(default_feature_layer="fc2")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment