diff --git a/bob/learn/tensorflow/analyzers/Analizer.py b/bob/learn/tensorflow/analyzers/Analizer.py index 08ba2abf7dd4dd20963ac6be0d1598e0ee93576d..cf1499685f67f0ef311bf1ac5bd6438fb21218b0 100644 --- a/bob/learn/tensorflow/analyzers/Analizer.py +++ b/bob/learn/tensorflow/analyzers/Analizer.py @@ -49,6 +49,8 @@ class Analizer: enroll_features = self.machine(enroll_data, session=self.session) del enroll_data + #import ipdb; ipdb.set_trace(); + # Extracting features for probing probe_data, probe_labels = self.data_shuffler.get_batch(train_dataset=False) probe_features = self.machine(probe_data, session=self.session) @@ -56,22 +58,23 @@ class Analizer: # Creating models models = [] - for i in range(self.data_shuffler.total_labels): - indexes_model = numpy.where(enroll_labels == i)[0] + for i in range(len(self.data_shuffler.possible_labels)): + indexes_model = numpy.where(enroll_labels == self.data_shuffler.possible_labels[i])[0] models.append(numpy.mean(enroll_features[indexes_model, :], axis=0)) # Probing positive_scores = numpy.zeros(shape=0) negative_scores = numpy.zeros(shape=0) - for i in range(self.data_shuffler.total_labels): + for i in range(len(self.data_shuffler.possible_labels)): + #for i in self.data_shuffler.possible_labels: # Positive scoring - indexes = probe_labels == i + indexes = probe_labels == self.data_shuffler.possible_labels[i] positive_data = probe_features[indexes, :] p = [cosine(models[i], positive_data[j]) for j in range(positive_data.shape[0])] positive_scores = numpy.hstack((positive_scores, p)) # negative scoring - indexes = probe_labels != i + indexes = probe_labels != self.data_shuffler.possible_labels[i] negative_data = probe_features[indexes, :] n = [cosine(models[i], negative_data[j]) for j in range(negative_data.shape[0])] negative_scores = numpy.hstack((negative_scores, n)) diff --git a/bob/learn/tensorflow/data/BaseDataShuffler.py b/bob/learn/tensorflow/data/BaseDataShuffler.py index 
c6e53c7c4f12f899d37e86ccea2074e955fdb255..cc60caaef48837761e5498f32b719b48668a8565 100644 --- a/bob/learn/tensorflow/data/BaseDataShuffler.py +++ b/bob/learn/tensorflow/data/BaseDataShuffler.py @@ -8,7 +8,13 @@ import tensorflow as tf class BaseDataShuffler(object): - def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300): + def __init__(self, data, labels, + input_shape, + input_dtype="float64", + perc_train=0.9, + scale=True, + train_batch_size=1, + validation_batch_size=300): """ The class provide base functionoalies to shuffle the data @@ -23,6 +29,7 @@ class BaseDataShuffler(object): self.scale = scale self.scale_value = 0.00390625 + self.input_dtype = input_dtype # TODO: Check if the bacth size is higher than the input data self.train_batch_size = train_batch_size @@ -34,9 +41,9 @@ class BaseDataShuffler(object): # TODO: Check if the labels goes from O to N-1 self.labels = labels - self.total_labels = max(labels) + 1 + self.possible_labels = list(set(self.labels)) - # Spliting in train and validation + # Computing the data samples for train and validation self.n_samples = len(self.labels) self.n_train_samples = int(round(self.n_samples * perc_train)) self.n_validation_samples = self.n_samples - self.n_train_samples @@ -45,6 +52,15 @@ class BaseDataShuffler(object): self.indexes = numpy.array(range(self.n_samples)) numpy.random.shuffle(self.indexes) + # Splitting the data between train and validation + self.train_data = self.data[self.indexes[0:self.n_train_samples], ...] + self.train_labels = self.labels[self.indexes[0:self.n_train_samples]] + + self.validation_data = self.data[self.indexes[self.n_train_samples: + self.n_train_samples + self.n_validation_samples], ...] 
+ self.validation_labels = self.labels[self.indexes[self.n_train_samples: + self.n_train_samples + self.n_validation_samples]] + def get_placeholders_forprefetch(self, name="", train_dataset=True): """ Returns a place holder with the size of your batch @@ -66,3 +82,35 @@ class BaseDataShuffler(object): labels = tf.placeholder(tf.int64, shape=shape[0]) return data, labels + + def get_genuine_or_not(self, input_data, input_labels, genuine=True): + if genuine: + # Getting a client + index = numpy.random.randint(len(self.possible_labels)) + index = self.possible_labels[index] + + # Getting the indexes of the data from a particular client + indexes = numpy.where(input_labels == index)[0] + numpy.random.shuffle(indexes) + + # Picking a pair + data = input_data[indexes[0], ...] + data_p = input_data[indexes[1], ...] + + else: + # Picking a pair of labels from different clients + index = numpy.random.choice(len(self.possible_labels), 2, replace=False) + index[0] = self.possible_labels[index[0]] + index[1] = self.possible_labels[index[1]] + + # Getting the indexes of the two clients + indexes = numpy.where(input_labels == index[0])[0] + indexes_p = numpy.where(input_labels == index[1])[0] + numpy.random.shuffle(indexes) + numpy.random.shuffle(indexes_p) + + # Picking a pair + data = input_data[indexes[0], ...] + data_p = input_data[indexes_p[0], ...] 
+ + return data, data_p diff --git a/bob/learn/tensorflow/data/MemoryDataShuffler.py b/bob/learn/tensorflow/data/MemoryDataShuffler.py index 36b671f966c9d8487429d5e6e68da457e42af0b8..5bd4945915ada4688cac7f44b5a87b338731869d 100644 --- a/bob/learn/tensorflow/data/MemoryDataShuffler.py +++ b/bob/learn/tensorflow/data/MemoryDataShuffler.py @@ -16,7 +16,14 @@ def scale_mean_norm(data, scale=0.00390625): class MemoryDataShuffler(BaseDataShuffler): - def __init__(self, data, labels, input_shape, perc_train=0.9, scale=True, train_batch_size=1, validation_batch_size=300): + + def __init__(self, data, labels, + input_shape, + input_dtype="float64", + perc_train=0.9, + scale=True, + train_batch_size=1, + validation_batch_size=300): """ Shuffler that deal with memory datasets @@ -33,23 +40,19 @@ class MemoryDataShuffler(BaseDataShuffler): data=data, labels=labels, input_shape=input_shape, + input_dtype=input_dtype, perc_train=perc_train, scale=scale, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size ) - # Spliting between train and test - self.train_data = self.data[self.indexes[0:self.n_train_samples], ...] - self.train_labels = self.labels[self.indexes[0:self.n_train_samples]] + self.train_data = self.train_data.astype(input_dtype) + self.validation_data = self.validation_data.astype(input_dtype) - self.validation_data = self.data[self.indexes[self.n_train_samples: - self.n_train_samples + self.n_validation_samples], ...] 
- self.validation_labels = self.labels[self.indexes[self.n_train_samples: - self.n_train_samples + self.n_validation_samples]] if self.scale: - self.train_data, self.mean = scale_mean_norm(self.train_data) - self.validation_data = (self.validation_data - self.mean) * self.scale_value + self.train_data *= self.scale_value + self.validation_data *= self.scale_value def get_batch(self, train_dataset=True): @@ -70,3 +73,37 @@ class MemoryDataShuffler(BaseDataShuffler): selected_labels = label[indexes[0:n_samples]] return selected_data.astype("float32"), selected_labels + + def get_pair(self, train_dataset=True, zero_one_labels=True): + """ + Get a random pair of samples + + **Parameters** + is_target_set_train: Defining the target set to get the batch + + **Return** + """ + + if train_dataset: + target_data = self.train_data + target_labels = self.train_labels + shape = self.train_shape + else: + target_data = self.validation_data + target_labels = self.validation_labels + shape = self.validation_shape + + data = numpy.zeros(shape=shape, dtype='float32') + data_p = numpy.zeros(shape=shape, dtype='float32') + labels_siamese = numpy.zeros(shape=shape[0], dtype='float32') + + genuine = True + for i in range(shape[0]): + data[i, ...], data_p[i, ...] 
= self.get_genuine_or_not(target_data, target_labels, genuine=genuine) + if zero_one_labels: + labels_siamese[i] = not genuine + else: + labels_siamese[i] = -1 if genuine else +1 + genuine = not genuine + + return data, data_p, labels_siamese diff --git a/bob/learn/tensorflow/data/TextDataShuffler.py b/bob/learn/tensorflow/data/TextDataShuffler.py index e5472d682de9db4c9271c24c473752e133f14aed..2bcec45f7e7e756dba345a0bb2bfc2fba3a97d4b 100644 --- a/bob/learn/tensorflow/data/TextDataShuffler.py +++ b/bob/learn/tensorflow/data/TextDataShuffler.py @@ -18,7 +18,13 @@ from .BaseDataShuffler import BaseDataShuffler class TextDataShuffler(BaseDataShuffler): - def __init__(self, data, labels, input_shape, perc_train=0.9, scale=0.00390625, train_batch_size=1, validation_batch_size=300): + def __init__(self, data, labels, + input_shape, + input_dtype="float64", + perc_train=0.9, + scale=True, + train_batch_size=1, + validation_batch_size=300): """ Shuffler that deal with file list @@ -31,30 +37,32 @@ class TextDataShuffler(BaseDataShuffler): validation_batch_size: """ + if isinstance(data, list): + data = numpy.array(data) + + if isinstance(labels, list): + labels = numpy.array(labels) + super(TextDataShuffler, self).__init__( data=data, labels=labels, input_shape=input_shape, + input_dtype=input_dtype, perc_train=perc_train, scale=scale, train_batch_size=train_batch_size, validation_batch_size=validation_batch_size ) - if isinstance(self.data, list): - self.data = numpy.array(self.data) - - if isinstance(self.labels, list): - self.labels = numpy.array(self.labels) - - # Spliting between train and test - self.train_data = self.data[self.indexes[0:self.n_train_samples]] - self.train_labels = self.labels[self.indexes[0:self.n_train_samples]] + def load_from_file(self, file_name, shape): + d = bob.io.base.load(file_name) + if len(d.shape) == 2: + data = numpy.zeros(shape=tuple(shape[1:])) + data[:, :, 0] = d + else: + data = d - self.validation_data = 
self.data[self.indexes[self.n_train_samples: - self.n_train_samples + self.n_validation_samples]] - self.validation_labels = self.labels[self.indexes[self.n_train_samples: - self.n_train_samples + self.n_validation_samples]] + return data def get_batch(self, train_dataset=True): @@ -77,20 +85,54 @@ class TextDataShuffler(BaseDataShuffler): for i in range(batch_size): file_name = files_names[indexes[i]] - - d = bob.io.base.load(file_name) - if len(d.shape) == 2: - data = numpy.zeros(shape=tuple(shape[1:])) - data[:, :, 0] = d - else: - data = d + data = self.load_from_file(file_name, shape) selected_data[i, ...] = data - if self.scale is not None: - selected_data[i, ...] *= self.scale - - + if self.scale: + selected_data[i, ...] *= self.scale_value selected_labels = label[indexes[0:batch_size]] return selected_data.astype("float32"), selected_labels + + def get_pair(self, train_dataset=True, zero_one_labels=True): + """ + Get a random pair of samples + + **Parameters** + is_target_set_train: Defining the target set to get the batch + + **Return** + """ + + if train_dataset: + target_data = self.train_data + target_labels = self.train_labels + shape = self.train_shape + else: + target_data = self.validation_data + target_labels = self.validation_labels + shape = self.validation_shape + + data = numpy.zeros(shape=shape, dtype='float32') + data_p = numpy.zeros(shape=shape, dtype='float32') + labels_siamese = numpy.zeros(shape=shape[0], dtype='float32') + + genuine = True + for i in range(shape[0]): + file_name, file_name_p = self.get_genuine_or_not(target_data, target_labels, genuine=genuine) + data[i, ...] = self.load_from_file(str(file_name), shape) + data_p[i, ...] 
= self.load_from_file(str(file_name_p), shape) + + if zero_one_labels: + labels_siamese[i] = not genuine + else: + labels_siamese[i] = -1 if genuine else +1 + genuine = not genuine + + if self.scale: + data *= self.scale_value + data_p *= self.scale_value + + return data, data_p, labels_siamese + diff --git a/bob/learn/tensorflow/script/train_mnist.py b/bob/learn/tensorflow/script/train_mnist.py index 04b139c19b3f8bbe2da5d875bc2645b3fbdfdcfd..5a1683069976033a0d27efc762965952af2d87f6 100644 --- a/bob/learn/tensorflow/script/train_mnist.py +++ b/bob/learn/tensorflow/script/train_mnist.py @@ -39,26 +39,28 @@ def main(): perc_train = 0.9 # Loading data - data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/") - data = numpy.reshape(data, (data.shape[0], 28, 28, 1)) - data_shuffler = MemoryDataShuffler(data, labels, - input_shape=[28, 28, 1], - train_batch_size=BATCH_SIZE, - validation_batch_size=BATCH_SIZE*100) - - - #db = bob.db.mobio.Database() - #objects = db.objects(protocol="male") - - #labels = [o.client_id for o in objects] - #file_names = [o.make_path( - # directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed", - # extension=".hdf5") - # for o in objects] - #data_shuffler = TextDataShuffler(file_names, labels, - # input_shape=[80, 64, 1], - # train_batch_size=BATCH_SIZE, - # validation_batch_size=BATCH_SIZE*100) + #data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/") + #data = numpy.reshape(data, (data.shape[0], 28, 28, 1)) + + #data_shuffler = MemoryDataShuffler(data, labels, + # input_shape=[28, 28, 1], + # train_batch_size=BATCH_SIZE, + # validation_batch_size=BATCH_SIZE*100) + + + db = bob.db.mobio.Database() + objects = db.objects(protocol="male") + + labels = [o.client_id for o in objects] + file_names = [o.make_path( + directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed", + extension=".hdf5") + for o in objects] + + data_shuffler = 
TextDataShuffler(file_names, labels, + input_shape=[80, 64, 1], + train_batch_size=BATCH_SIZE, + validation_batch_size=BATCH_SIZE*100) # Preparing the architecture lenet = Lenet() diff --git a/bob/learn/tensorflow/script/train_mnist_siamese.py b/bob/learn/tensorflow/script/train_mnist_siamese.py index c4085a948a390204f9ab0fa84c76ead36bd73f85..7bb54b34ac419361a6ab91e694a1cc72c576b8ae 100644 --- a/bob/learn/tensorflow/script/train_mnist_siamese.py +++ b/bob/learn/tensorflow/script/train_mnist_siamese.py @@ -21,7 +21,7 @@ from docopt import docopt import tensorflow as tf from .. import util SEED = 10 -from bob.learn.tensorflow.data import MemoryPairDataShuffler, TextDataShuffler +from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler from bob.learn.tensorflow.network import Lenet from bob.learn.tensorflow.trainers import SiameseTrainer from bob.learn.tensorflow.loss import ContrastiveLoss @@ -40,11 +40,11 @@ def main(): # Loading data data, labels = util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/") data = numpy.reshape(data, (data.shape[0], 28, 28, 1)) - data_shuffler = MemoryPairDataShuffler(data, labels, - input_shape=[28, 28, 1], - train_batch_size=BATCH_SIZE, - validation_batch_size=BATCH_SIZE*1000 - ) + data_shuffler = MemoryDataShuffler(data, labels, + input_shape=[28, 28, 1], + scale=True, + train_batch_size=BATCH_SIZE, + validation_batch_size=BATCH_SIZE*1000) #db = bob.db.mobio.Database() #objects = db.objects(protocol="male") @@ -54,11 +54,11 @@ def main(): # directory="/remote/lustre/2/temp/tpereira/FACEREC_EXPERIMENTS/mobio_male/lda/preprocessed", # extension=".hdf5") # for o in objects] + #data_shuffler = TextDataShuffler(file_names, labels, # input_shape=[80, 64, 1], # train_batch_size=BATCH_SIZE, - # validation_batch_size=BATCH_SIZE*100) - + # validation_batch_size=BATCH_SIZE*500) # Preparing the architecture lenet = Lenet(default_feature_layer="fc2")