Redesign the data shuffler

parent ee0db09e
......@@ -6,6 +6,7 @@
import numpy
import tensorflow as tf
import bob.ip.base
import numpy
class Base(object):
......@@ -13,9 +14,10 @@ class Base(object):
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
The class provide base functionoalies to shuffle the data
This class provides the base functionality to shuffle the data before training a neural network
**Parameters**
data:
......@@ -24,12 +26,16 @@ class Base(object):
scale:
train_batch_size:
validation_batch_size:
seed: Seed for the random number generator
"""
self.seed = seed
numpy.random.seed(seed)
self.scale = scale
self.scale_value = 0.00390625
self.input_dtype = input_dtype
# TODO: Check if the batch size is higher than the input data
self.batch_size = batch_size
......
......@@ -18,9 +18,11 @@ class Disk(Base):
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with file list
This datashuffler deals with databases that are stored on disk.
The data is loaded on the fly.
**Parameters**
data:
......@@ -43,13 +45,16 @@ class Disk(Base):
input_shape=input_shape,
input_dtype=input_dtype,
scale=scale,
batch_size=batch_size
batch_size=batch_size,
seed=seed
)
# Setting the seed
numpy.random.seed(seed)
# TODO: very bad solution to deal with bob shape images and tf shape images
self.bob_shape = tuple([input_shape[2]] + list(input_shape[0:2]))
def load_from_file(self, file_name, shape):
def load_from_file(self, file_name):
d = bob.io.base.load(file_name)
if d.shape[0] != 3 and self.input_shape[2] != 3: # GRAY SCALE IMAGE
data = numpy.zeros(shape=(d.shape[0], d.shape[1], 1))
......@@ -61,7 +66,7 @@ class Disk(Base):
# Checking NaN
if numpy.sum(numpy.isnan(data)) > 0:
logger.warning("######### Image {0} has noise #########".format(file_name))
logger.warning("######### Sample {0} has noise #########".format(file_name))
return data
......
......@@ -14,9 +14,10 @@ class Memory(Base):
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with memory datasets
This datashuffler deals with databases that are stored in a :py:class:`numpy.array`
**Parameters**
data:
......@@ -33,8 +34,11 @@ class Memory(Base):
input_shape=input_shape,
input_dtype=input_dtype,
scale=scale,
batch_size=batch_size
batch_size=batch_size,
seed=seed
)
# Setting the seed
numpy.random.seed(seed)
self.data = self.data.astype(input_dtype)
if self.scale:
......
......@@ -10,7 +10,11 @@ import tensorflow as tf
class Siamese(Base):
"""
Siamese Shuffler base class
This datashuffler deals with databases that provide data to Siamese networks.
Basically the :py:meth:`get_batch` method provides you 3 elements in the returned list.
The first two are the batch data, and the last is the label: either `0` for samples from the same class or `1`
for samples from different classes.
"""
def __init__(self, **kwargs):
......@@ -32,6 +36,21 @@ class Siamese(Base):
return [self.data_placeholder, self.data2_placeholder, self.label_placeholder]
def get_placeholders_forprefetch(self, name=""):
    """
    Return the two data placeholders and the label placeholder, creating
    every one of them that is still ``None``.

    Data placeholders are ``tf.float32`` with ``None`` as first dimension
    followed by ``self.shape[1:]``; the label placeholder is ``tf.int64``
    with shape ``[None]``. Placeholders that already exist are left untouched.

    **Parameters**
      name: Name handed to both data placeholders

    **Return**
      ``[data_placeholder, data2_placeholder, label_placeholder]``
    """
    def make_data_placeholder():
        # First dimension is left open, the remaining ones come from self.shape
        open_shape = tuple([None] + list(self.shape[1:]))
        return tf.placeholder(tf.float32, shape=open_shape, name=name)

    if self.data_placeholder is None:
        self.data_placeholder = make_data_placeholder()

    if self.data2_placeholder is None:
        self.data2_placeholder = make_data_placeholder()

    if self.label_placeholder is None:
        self.label_placeholder = tf.placeholder(tf.int64, shape=[None, ])

    return [self.data_placeholder, self.data2_placeholder, self.label_placeholder]
def get_genuine_or_not(self, input_data, input_labels, genuine=True):
if genuine:
......
......@@ -4,29 +4,20 @@
# @date: Wed 11 May 2016 09:39:36 CEST
import numpy
import bob.io.base
import bob.io.image
import bob.ip.base
import bob.core
logger = bob.core.log.setup("bob.learn.tensorflow")
import tensorflow as tf
from .Disk import Disk
from .Siamese import Siamese
from .BaseDataShuffler import BaseDataShuffler
#def scale_mean_norm(data, scale=0.00390625):
# mean = numpy.mean(data)
# data = (data - mean) * scale
# return data, mean
class TextDataShuffler(BaseDataShuffler):
class SiameseDisk(Siamese, Disk):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with file list
......@@ -45,101 +36,22 @@ class TextDataShuffler(BaseDataShuffler):
if isinstance(labels, list):
labels = numpy.array(labels)
super(TextDataShuffler, self).__init__(
super(SiameseDisk, self).__init__(
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
scale=scale,
batch_size=batch_size
batch_size=batch_size,
seed=seed
)
# Setting the seed
numpy.random.seed(seed)
# TODO: very bad solution to deal with bob shape images and tf shape images
self.bob_shape = tuple([input_shape[2]] + list(input_shape[0:2]))
def load_from_file(self, file_name, shape):
    """
    Load one sample with ``bob.io.base.load`` and prepare it for a batch.

    When neither the first dimension of the loaded array nor
    ``self.input_shape[2]`` equals 3, the sample is treated as gray scale:
    a trailing channel axis is appended and the result goes through
    :py:meth:`rescale`. Otherwise the loaded array goes through
    :py:meth:`rescale` and then :py:meth:`bob2skimage`.

    A warning is logged when the resulting array contains NaN values.

    **Parameters**
      file_name: Path handed to ``bob.io.base.load``
      shape: Not used by this method

    **Return**
      The prepared sample
    """
    raw = bob.io.base.load(file_name)

    if raw.shape[0] == 3 or self.input_shape[2] == 3:
        data = self.bob2skimage(self.rescale(raw))
    else:
        # GRAY SCALE IMAGE: add the channel axis before rescaling
        with_channel = numpy.zeros(shape=(raw.shape[0], raw.shape[1], 1))
        with_channel[:, :, 0] = raw
        data = self.rescale(with_channel)

    # Checking NaN
    nan_count = numpy.sum(numpy.isnan(data))
    if nan_count > 0:
        logger.warning("######### Image {0} has noise #########".format(file_name))

    return data
def bob2skimage(self, bob_image):
    """
    Convert a bob color image to the scikit-image layout.

    The first three planes along the first axis of ``bob_image`` are copied
    into the last axis of a freshly allocated array.

    **Parameters**
      bob_image: Array indexed as ``[channel, row, column]``

    **Return**
      New array of shape ``(bob_image.shape[1], bob_image.shape[2], 3)``
    """
    rows, columns = bob_image.shape[1], bob_image.shape[2]
    skimage = numpy.zeros(shape=(rows, columns, 3))

    # Channel 0 is red, 1 is green and 2 is blue
    for channel in range(3):
        skimage[:, :, channel] = bob_image[channel, :, :]

    return skimage
def get_batch(self):
    """
    Draw a random batch of ``self.batch_size`` samples and their labels.

    The sample indexes are shuffled with ``numpy.random.shuffle``; the first
    ``self.batch_size`` of them are loaded through :py:meth:`load_from_file`
    and multiplied by ``self.scale_value`` when ``self.scale`` is set.

    **Return**
      The batch converted to ``float32`` and the labels of the selected samples
    """
    # Shuffling samples
    order = numpy.array(range(self.data.shape[0]))
    numpy.random.shuffle(order)

    batch = numpy.zeros(shape=self.shape)
    for row in range(self.batch_size):
        batch[row, ...] = self.load_from_file(self.data[order[row]], self.shape)
        if self.scale:
            batch[row, ...] *= self.scale_value

    chosen = order[0:self.batch_size]
    return batch.astype("float32"), self.labels[chosen]
def rescale(self, data):
    """
    Rescale a single sample whose shape differs from ``self.bob_shape``.

    A sample that already has ``self.bob_shape`` is returned as is. Otherwise
    the sample is handed to ``bob.ip.base.scale`` together with a
    zero-initialised destination buffer, and that buffer is returned.

    **Parameters**
      data: The sample to rescale

    **Return**
      ``data`` itself when no rescaling is needed, the destination buffer otherwise
    """
    if self.bob_shape == data.shape:
        return data

    # TODO: Implement a better way to do this rescaling
    if self.input_shape[2] == 1:
        # Gray scale: scale the single plane and restore the channel axis afterwards
        plane = data[:, :, 0].copy()
        dst = numpy.zeros(shape=self.input_shape[0:2])
        bob.ip.base.scale(plane, dst)
        return numpy.reshape(dst, self.input_shape)

    dst = numpy.zeros(shape=self.bob_shape)

    # TODO: LAME SOLUTION
    if data.shape[0] != 3:  # GRAY SCALE IMAGES IN A RGB DATABASE
        # Repeat the gray plane over the three color channels
        replicated = numpy.zeros(shape=(3, data.shape[0], data.shape[1]))
        for channel in range(3):
            replicated[channel, ...] = data[:, :]
        data = replicated

    bob.ip.base.scale(data, dst)
    return dst
def get_pair(self, zero_one_labels=True):
"""
Get a random pair of samples
......@@ -159,10 +71,7 @@ class TextDataShuffler(BaseDataShuffler):
data[i, ...] = self.load_from_file(str(file_name), self.shape)
data_p[i, ...] = self.load_from_file(str(file_name_p), self.shape)
if zero_one_labels:
labels_siamese[i] = not genuine
else:
labels_siamese[i] = -1 if genuine else +1
labels_siamese[i] = not genuine
genuine = not genuine
if self.scale:
......@@ -170,30 +79,3 @@ class TextDataShuffler(BaseDataShuffler):
data_p *= self.scale_value
return data, data_p, labels_siamese
def get_random_triplet(self):
    """
    Build one batch of triplets.

    For every row of the batch, :py:meth:`get_one_triplet` supplies three
    file names, which are loaded through :py:meth:`load_from_file`. The three
    batches are multiplied by ``self.scale_value`` when ``self.scale`` is set.

    **Return**
      Three ``float32`` arrays of shape ``self.shape``, filled from the first,
      second and third file name of each triplet respectively
    """
    data_a, data_p, data_n = [numpy.zeros(shape=self.shape, dtype='float32') for _ in range(3)]

    for row in range(self.shape[0]):
        name_a, name_p, name_n = self.get_one_triplet(self.data, self.labels)
        data_a[row, ...] = self.load_from_file(str(name_a), self.shape)
        data_p[row, ...] = self.load_from_file(str(name_p), self.shape)
        data_n[row, ...] = self.load_from_file(str(name_n), self.shape)

    if self.scale:
        for batch in (data_a, data_p, data_n):
            # In-place multiplication, so the arrays returned below are the scaled ones
            batch *= self.scale_value

    return data_a, data_p, data_n
......@@ -16,7 +16,8 @@ class SiameseMemory(Siamese, Memory):
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with memory datasets
......@@ -35,8 +36,11 @@ class SiameseMemory(Siamese, Memory):
input_shape=input_shape,
input_dtype=input_dtype,
scale=scale,
batch_size=batch_size
batch_size=batch_size,
seed=seed
)
# Setting the seed
numpy.random.seed(seed)
self.data = self.data.astype(input_dtype)
if self.scale:
......
......@@ -5,10 +5,16 @@
import numpy
from .Base import Base
import tensorflow as tf
class Triplet(Base):
"""
Triplet Shuffler base class
This datashuffler deals with databases that provide data to Triplet networks.
Basically the :py:meth:`get_batch` method provides you 3 elements in the returned list.
The first element is the batch for the anchor, the second one is the batch for the positive class, w.r.t. the
anchor, and the last one is the batch for the negative class, w.r.t. the anchor.
"""
def __init__(self, **kwargs):
......@@ -16,6 +22,36 @@ class Triplet(Base):
self.data2_placeholder = None
self.data3_placeholder = None
def get_placeholders(self, name=""):
    """
    Return the anchor, positive and negative placeholders, creating every
    one of them that is still ``None``.

    New placeholders are ``tf.float32`` with shape ``self.shape`` and are
    named ``name`` plus ``"_anchor"``, ``"_positive"`` or ``"_negative"``.

    **Parameters**
      name: Prefix of the placeholder names

    **Return**
      ``[data_placeholder, data2_placeholder, data3_placeholder]``
    """
    slots = (("data_placeholder", "_anchor"),
             ("data2_placeholder", "_positive"),
             ("data3_placeholder", "_negative"))

    for attribute, suffix in slots:
        if getattr(self, attribute) is None:
            setattr(self, attribute, tf.placeholder(tf.float32, shape=self.shape, name=name+suffix))

    return [self.data_placeholder, self.data2_placeholder, self.data3_placeholder]
def get_placeholders_triplet_forprefetch(self, name=""):
    """
    Return the three data placeholders, creating every one of them that is
    still ``None``.

    New placeholders are ``tf.float32`` with ``None`` as first dimension
    followed by ``self.shape[1:]``. Placeholders that already exist are left
    untouched.

    **Parameters**
      name: Name handed to every placeholder created here

    **Return**
      ``[data_placeholder, data2_placeholder, data3_placeholder]``
    """
    def make_placeholder():
        # First dimension is left open, the remaining ones come from self.shape
        open_shape = tuple([None] + list(self.shape[1:]))
        return tf.placeholder(tf.float32, shape=open_shape, name=name)

    if self.data_placeholder is None:
        self.data_placeholder = make_placeholder()

    if self.data2_placeholder is None:
        self.data2_placeholder = make_placeholder()

    if self.data3_placeholder is None:
        self.data3_placeholder = make_placeholder()

    return [self.data_placeholder, self.data2_placeholder, self.data3_placeholder]
def get_one_triplet(self, input_data, input_labels):
# Getting a pair of clients
index = numpy.random.choice(len(self.possible_labels), 2, replace=False)
......
......@@ -12,21 +12,17 @@ logger = bob.core.log.setup("bob.learn.tensorflow")
import tensorflow as tf
from .BaseDataShuffler import BaseDataShuffler
from .Disk import Disk
from .Triplet import Triplet
#def scale_mean_norm(data, scale=0.00390625):
# mean = numpy.mean(data)
# data = (data - mean) * scale
# return data, mean
class TextDataShuffler(BaseDataShuffler):
class TripletDisk(Triplet, Disk):
def __init__(self, data, labels,
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with file list
......@@ -45,7 +41,7 @@ class TextDataShuffler(BaseDataShuffler):
if isinstance(labels, list):
labels = numpy.array(labels)
super(TextDataShuffler, self).__init__(
super(TripletDisk, self).__init__(
data=data,
labels=labels,
input_shape=input_shape,
......@@ -53,125 +49,13 @@ class TextDataShuffler(BaseDataShuffler):
scale=scale,
batch_size=batch_size
)
# Setting the seed
numpy.random.seed(seed)
# TODO: very bad solution to deal with bob shape images and tf shape images
self.bob_shape = tuple([input_shape[2]] + list(input_shape[0:2]))
def load_from_file(self, file_name, shape):
    """
    Read one sample with ``bob.io.base.load`` and prepare it for a batch.

    A sample is handled as gray scale when neither the first dimension of
    the loaded array nor ``self.input_shape[2]`` equals 3: it receives a
    trailing channel axis and goes through :py:meth:`rescale`. Any other
    sample goes through :py:meth:`rescale` followed by :py:meth:`bob2skimage`.

    A warning is logged when the resulting array contains NaN values.

    **Parameters**
      file_name: Path handed to ``bob.io.base.load``
      shape: Not used by this method

    **Return**
      The prepared sample
    """
    loaded = bob.io.base.load(file_name)

    gray_scale = loaded.shape[0] != 3 and self.input_shape[2] != 3
    if not gray_scale:
        rescaled = self.rescale(loaded)
        data = self.bob2skimage(rescaled)
    else:
        # GRAY SCALE IMAGE: add the channel axis before rescaling
        data = numpy.zeros(shape=(loaded.shape[0], loaded.shape[1], 1))
        data[:, :, 0] = loaded
        data = self.rescale(data)

    # Checking NaN
    if numpy.sum(numpy.isnan(data)) > 0:
        message = "######### Image {0} has noise #########".format(file_name)
        logger.warning(message)

    return data
def bob2skimage(self, bob_image):
    """
    Convert a bob color image to the scikit-image layout.

    The red, green and blue planes, taken from positions 0, 1 and 2 of the
    first axis of ``bob_image``, are copied into the last axis of a new array.

    **Parameters**
      bob_image: Array indexed as ``[channel, row, column]``

    **Return**
      New array of shape ``(bob_image.shape[1], bob_image.shape[2], 3)``
    """
    target_shape = (bob_image.shape[1], bob_image.shape[2], 3)
    skimage = numpy.zeros(shape=target_shape)

    for position in (0, 1, 2):  # red, green, blue
        skimage[:, :, position] = bob_image[position, :, :]

    return skimage
def get_batch(self):
    """
    Draw a random batch of ``self.batch_size`` samples and their labels.

    All sample indexes are shuffled with ``numpy.random.shuffle``. The first
    ``self.batch_size`` of them select the files, which are loaded through
    :py:meth:`load_from_file`, and the labels. Each loaded sample is
    multiplied by ``self.scale_value`` when ``self.scale`` is set.

    **Return**
      The batch converted to ``float32`` and the labels of the selected samples
    """
    # Shuffling samples
    shuffled = numpy.array(range(self.data.shape[0]))
    numpy.random.shuffle(shuffled)

    selected_data = numpy.zeros(shape=self.shape)
    for position in range(self.batch_size):
        sample = self.load_from_file(self.data[shuffled[position]], self.shape)
        selected_data[position, ...] = sample
        if self.scale:
            selected_data[position, ...] *= self.scale_value

    selected_labels = self.labels[shuffled[0:self.batch_size]]
    selected_data = selected_data.astype("float32")

    return selected_data, selected_labels
def rescale(self, data):
    """
    Rescale a single sample whose shape differs from ``self.bob_shape``.

    Nothing is done to a sample that already has ``self.bob_shape``. Any
    other sample is handed to ``bob.ip.base.scale`` together with a
    zero-initialised destination buffer, and that buffer is returned.

    **Parameters**
      data: The sample to rescale

    **Return**
      ``data`` itself when no rescaling is needed, the destination buffer otherwise
    """
    needs_rescaling = self.bob_shape != data.shape
    if not needs_rescaling:
        return data

    # TODO: Implement a better way to do this rescaling
    single_channel = self.input_shape[2] == 1
    if single_channel:
        # Gray scale: scale the only plane, then put the channel axis back
        source = data[:, :, 0].copy()
        dst = numpy.zeros(shape=self.input_shape[0:2])
        bob.ip.base.scale(source, dst)
        dst = numpy.reshape(dst, self.input_shape)
    else:
        dst = numpy.zeros(shape=self.bob_shape)

        # TODO: LAME SOLUTION
        if data.shape[0] != 3:  # GRAY SCALE IMAGES IN A RGB DATABASE
            # The same gray plane is written to each of the three channels
            expanded = numpy.zeros(shape=(3, data.shape[0], data.shape[1]))
            for channel in (0, 1, 2):
                expanded[channel, ...] = data[:, :]
            data = expanded

        bob.ip.base.scale(data, dst)

    return dst
def get_pair(self, zero_one_labels=True):
    """
    Build one batch of sample pairs with their labels.

    Rows alternate between a pair requested with ``genuine=True`` and one
    requested with ``genuine=False``, starting with ``genuine=True``. File
    names come from :py:meth:`get_genuine_or_not` and are loaded through
    :py:meth:`load_from_file`. Both batches are multiplied by
    ``self.scale_value`` when ``self.scale`` is set.

    **Parameters**
      zero_one_labels: If true, label rows requested with ``genuine=True`` as
        ``0`` and the others as ``1``; otherwise use ``-1`` and ``+1``

    **Return**
      The two ``float32`` data batches and the ``float32`` label vector
    """
    batch_rows = self.shape[0]
    data = numpy.zeros(shape=self.shape, dtype='float32')
    data_p = numpy.zeros(shape=self.shape, dtype='float32')
    labels_siamese = numpy.zeros(shape=batch_rows, dtype='float32')

    for row in range(batch_rows):
        # Even rows hold genuine pairs, odd rows hold the other kind
        genuine = (row % 2 == 0)

        first_name, second_name = self.get_genuine_or_not(self.data, self.labels, genuine=genuine)
        data[row, ...] = self.load_from_file(str(first_name), self.shape)
        data_p[row, ...] = self.load_from_file(str(second_name), self.shape)

        if zero_one_labels:
            labels_siamese[row] = 0 if genuine else 1
        elif genuine:
            labels_siamese[row] = -1
        else:
            labels_siamese[row] = +1

    if self.scale:
        data *= self.scale_value
        data_p *= self.scale_value

    return data, data_p, labels_siamese
def get_random_triplet(self):
"""
Get a random pair of samples
......@@ -196,4 +80,4 @@ class TextDataShuffler(BaseDataShuffler):
data_p *= self.scale_value
data_n *= self.scale_value
return data_a, data_p, data_n
return [data_a, data_p, data_n]
......@@ -16,7 +16,8 @@ class TripletMemory(Triplet, Memory):
input_shape,
input_dtype="float64",
scale=True,
batch_size=1):
batch_size=1,
seed=10):
"""
Shuffler that deal with memory datasets
......@@ -35,8 +36,11 @@ class TripletMemory(Triplet, Memory):
input_shape=input_shape,
input_dtype=input_dtype,
scale=scale,
batch_size=batch_size
batch_size=batch_size,
seed=seed
)
# Setting the seed
numpy.random.seed(seed)
self.data = self.data.astype(input_dtype)
if self.scale:
......
......@@ -5,8 +5,13 @@ __path__ = extend_path(__path__, __name__)
from .Base import Base
from .Siamese import Siamese
from .Memory import Memory
from .Disk import Disk
from .SiameseMemory import SiameseMemory
from .TripletMemory import TripletMemory
from .SiameseDisk import SiameseDisk
from .TripletDisk import TripletDisk
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
......@@ -22,10 +22,11 @@ from docopt import docopt
import tensorflow as tf
from .. import util
SEED = 10
from bob.learn.tensorflow.data import MemoryDataShuffler, TextDataShuffler
from bob.learn.tensorflow.datashuffler import Memory, SiameseMemory, TripletMemory
from bob.learn.tensorflow.network import Lenet, MLP, Dummy, Chopra
from bob.learn.tensorflow.trainers import Trainer
from bob.learn.tensorflow.loss import BaseLoss
import bob.io.base
from ..analyzers import ExperimentAnalizer, SoftmaxAnalizer
import numpy
......@@ -42,64 +43,37 @@ def main():
mnist = True
# Loading data
if mnist:
train_data, train_labels, validation_data, validation_labels = \
util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
train_data, train_labels, validation_data, validation_labels = \
util.load_mnist(data_dir="./src/bob.db.mnist/bob/db/mnist/")
train_data = numpy.reshape(train_data, (train_data.shape[0], 28, 28, 1))
validation_data = numpy.reshape(validation_data, (validation_data.shape[0], 28, 28, 1))
train_data = numpy.reshape(train_data, (train_data.shape[0], 28, 28, 1))
validation_data = numpy.reshape(validation_data, (validation_data.shape[0], 28, 28, 1))
# Creating datashufflers
train_data_shuffler = Memory(train_data, train_labels,
input_shape=[28, 28, 1],
batch_size=BATCH_SIZE)
validation_data_shuffler = Memory(validation_data, validation_labels,
input_shape=[28, 28, 1],
batch_size=VALIDATION_BATCH_SIZE)
train_data_shuffler = MemoryDataShuffler(train_data, train_labels,
input_shape=[28, 28, 1],
batch_size=BATCH_SIZE)
validation_data_shuffler = MemoryDataShuffler(validation_data, validation_labels,
input_shape=[28, 28, 1],
batch_size=VALIDATION_BATCH_SIZE)
else:
import bob.db.mobio
db = bob.db.mobio.Database()
# Preparing train set
train_objects = db.objects(protocol="male", groups="world")
train_labels = [o.client_id for o in train_objects]
train_file_names = [o.make_path(
directory="/idiap/user/tpereira/face/baselines/eigenface/preprocessed",
extension=".hdf5")
for o in train_objects]
train_data_shuffler = TextDataShuffler(train_file_names, train_labels,
scale=False,
input_shape=[80, 64, 1],
batch_size=BATCH_SIZE)
# Preparing validation set
validation_objects = db.objects(protocol="male", groups="dev")
validation_labels = [o.client_id for o in validation_objects]
validation_file_names = [o.make_path(
directory="/idiap/user/tpereira/face/baselines/eigenface/preprocessed",
extension=".hdf5")
for o in validation_objects]
validation_data_shuffler = TextDataShuffler(validation_file_names, validation_labels,
input_shape=[80, 64, 1],
scale=False,
batch_size=VALIDATION_BATCH_SIZE)
# Preparing the architecture
cnn = True
if cnn:
architecture = Chopra(seed=SEED)
architecture = Chopra(seed=SEED, fc1_output=10)
#architecture = Lenet(seed=SEED)
#architecture = Dummy(seed=SEED)