Commit 16c9feff authored by Tiago de Freitas Pereira's avatar Tiago de Freitas Pereira

Merge branch 'audio-clean' into 'master'

Added support for audio databases

A cleaner version of the code for audio support

See merge request !2
parents 6183b5f0 e9f97a17
include README.rst bootstrap-buildout.py buildout.cfg COPYING version.txt requirements.txt
recursive-include doc *.py *.rst
recursive-include bob *.wav *.hdf5
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 19 Oct 23:43:22 2016
import numpy
import bob.core
from .Base import Base
from scipy.io.wavfile import read as readWAV
logger = bob.core.log.setup("bob.learn.tensorflow")
logger.propagate = False
class DiskAudio(Base):
def __init__(self, data, labels,
input_dtype="float64",
batch_size=1,
seed=10,
data_augmentation=None,
context_size=20,
win_length_ms=10,
rate=16000,
out_file=""
):
"""
This datashuffler deals with speech databases that are stored in the disk.
The data is loaded and preprocessed on the fly.
"""
self.out_file = out_file
self.context_size = context_size
self.win_length_ms = win_length_ms
        self.m_win_length = int(self.win_length_ms * rate / 1000)  # number of samples in a given window
        # a frame is the center window plus context_size windows on each side
        self.m_frame_length = self.m_win_length * (2 * self.context_size + 1)
input_shape = [self.m_frame_length, 1]
if isinstance(data, list):
data = numpy.array(data)
if isinstance(labels, list):
labels = numpy.array(labels)
super(DiskAudio, self).__init__(
data=data,
labels=labels,
input_shape=input_shape,
input_dtype=input_dtype,
batch_size=batch_size,
seed=seed,
data_augmentation=data_augmentation
)
        # Setting the seed
numpy.random.seed(seed)
# a flexible queue that stores audio frames extracted from files
self.frames_storage = []
# a similar queue for the corresponding labels
self.labels_storage = []
# if self.out_file != "":
# bob.io.base.create_directories_safe(os.path.dirname(self.out_file))
# f = open(self.out_file, "w")
# for i in range(0, self.data.shape[0]):
# f.write("%d %s\n" % (self.labels[i], str(self.data[i])))
# f.close()
def load_from_file(self, file_name):
rate, audio = readWAV(file_name)
        # scipy's read() returns an integer (e.g. int16) array for PCM WAV files;
        # we assume the audio file has a single channel and cast the samples to float32
        data = numpy.cast['float32'](audio)
return rate, data
def get_batch(self, noise=False):
# Shuffling samples
        indexes = numpy.arange(self.data.shape[0])
numpy.random.shuffle(indexes)
f = None
if self.out_file != "":
f = open(self.out_file, "a")
i = 0
        # if the storage does not hold enough frames, pre-load more from the audio files
while len(self.frames_storage) < self.batch_size:
if f is not None:
f.write("%s\n" % self.data[indexes[i]])
frames, labels = self.extract_frames_from_file(self.data[indexes[i]], self.labels[indexes[i]])
self.frames_storage.extend(frames)
self.labels_storage.extend(labels)
i += 1
# our temp frame queue should have enough data
selected_data = numpy.asarray(self.frames_storage[:self.batch_size])
selected_labels = numpy.asarray(self.labels_storage[:self.batch_size])
# remove them from the list
del self.frames_storage[:self.batch_size]
del self.labels_storage[:self.batch_size]
selected_data = numpy.reshape(selected_data, (self.batch_size, -1, 1))
if f is not None:
f.close()
return [selected_data.astype("float32"), selected_labels.astype("int64")]
def extract_frames_from_file(self, filename, label):
rate, wav_signal = self.load_from_file(filename)
return self.extract_frames_from_wav(wav_signal, label)
def extract_frames_from_wav(self, wav_signal, label):
m_total_length = len(wav_signal)
m_num_win = int(m_total_length / self.m_win_length) # discard the tail of the signal
# normalize the signal first
wav_signal -= numpy.mean(wav_signal)
wav_signal /= numpy.std(wav_signal)
# make sure the array is divided into equal chunks
windows = numpy.split(wav_signal[:self.m_win_length * m_num_win], m_num_win)
final_frames = []
final_labels = [label] * m_num_win
        # loop through the windows
        for i in range(m_num_win):
            # each window together with its surrounding context forms the frame we seek;
            # if there are not enough windows for the left context,
            # replicate the first window the necessary number of times
if i < self.context_size:
left_context = numpy.tile(windows[0], self.context_size - i)
final_frames.append(numpy.append(left_context, windows[:i + self.context_size + 1]))
elif (i + self.context_size) > (m_num_win - 1):
right_context = numpy.tile(windows[-1], i + self.context_size - m_num_win + 1)
final_frames.append(numpy.append(windows[i - self.context_size:], right_context))
else:
final_frames.append(numpy.ravel(windows[i - self.context_size:i + self.context_size + 1]))
return final_frames, final_labels
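# --- Usage sketch (illustrative only, not part of this commit; the WAV file
# names below are hypothetical). With the defaults (rate=16000,
# win_length_ms=10, context_size=20), each window holds 160 samples and each
# frame stacks 2 * 20 + 1 = 41 windows, i.e. 6560 values:
#
# shuffler = DiskAudio(data=["sample1.wav", "sample2.wav"], labels=[0, 1],
#                      batch_size=32)
# batch_data, batch_labels = shuffler.get_batch()
# assert batch_data.shape == (32, 6560, 1)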
@@ -20,6 +20,7 @@ from .ImageAugmentation import ImageAugmentation
from .Normalizer import ScaleFactor, MeanOffset, Linear
from .DiskAudio import DiskAudio
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
@@ -51,7 +52,7 @@ __appropriate__(
    TripletWithSelectionDisk,
    DataAugmentation,
    ImageAugmentation,
    ScaleFactor, MeanOffset, Linear,
    DiskAudio,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
\ No newline at end of file
@@ -27,7 +27,7 @@ class Constant(Initialization):
        self.constant_value = constant_value
        super(Constant, self).__init__(seed=None, use_gpu=use_gpu)
    def __call__(self, shape, name, scope, init_value=None):
        initializer = tf.constant(self.constant_value, shape=shape)
        try:
...
@@ -31,7 +31,7 @@ class Gaussian(Initialization):
        self.std = std
        super(Gaussian, self).__init__(seed, use_gpu=use_gpu)
    def __call__(self, shape, name, scope, init_value=None):
        """
        Create the gaussian initialized variables
...
@@ -31,5 +31,5 @@ class Initialization(object):
    def variable_exist(self, var):
        return var in [v.name.split("/")[0] for v in tf.all_variables()]
    def __call__(self, shape, name, scope, init_value=None):
        raise NotImplementedError("Please implement this function in derived classes")
@@ -26,7 +26,7 @@ class SimplerXavier(Initialization):
    def __init__(self, seed=10., use_gpu=False):
        super(SimplerXavier, self).__init__(seed, use_gpu=use_gpu)
    def __call__(self, shape, name, scope, init_value=None):
        """
        Create the gaussian initialized variables
...
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 09 Nov 2016 13:55:22 CEST
import logging
import math
logger = logging.getLogger("bob.learn.tensorflow")
from .Initialization import Initialization
import tensorflow as tf
class Uniform(Initialization):
"""
Implements Random Uniform initialization
"""
def __init__(self, seed=10., use_gpu=False):
super(Uniform, self).__init__(seed, use_gpu=use_gpu)
    def __call__(self, shape, name, scope, init_value=None):
        # init_value is used as the normalization (fan-in) value here, but it can
        # be used differently by other initializations
        if init_value is None:
            init_value = shape[0]
        stddev = 1.0 / math.sqrt(init_value)  # bound for the random uniform initialization
initializer = tf.random_uniform(shape,
minval=-stddev,
maxval=stddev,
seed=self.seed)
try:
with tf.variable_scope(scope):
if self.use_gpu:
with tf.device("/gpu:0"):
return tf.get_variable(name, initializer=initializer, dtype=tf.float32)
else:
with tf.device("/cpu"):
return tf.get_variable(name, initializer=initializer, dtype=tf.float32)
except ValueError:
with tf.variable_scope(scope, reuse=True):
if self.use_gpu:
with tf.device("/gpu:0"):
return tf.get_variable(name, initializer=initializer, dtype=tf.float32)
else:
with tf.device("/cpu"):
return tf.get_variable(name, initializer=initializer, dtype=tf.float32)
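# --- Usage sketch (illustrative only, not part of this commit). init_value
# acts as the fan-in of the layer: for a [300, 1, 20] kernel with
# init_value=300, weights are drawn from U(-1/sqrt(300), +1/sqrt(300)):
#
# w = Uniform(seed=10)(shape=[300, 1, 20], name="w_conv1", scope="w_conv1",
#                      init_value=300)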
@@ -27,7 +27,7 @@ class Xavier(Initialization):
        super(Xavier, self).__init__(seed, use_gpu=use_gpu)
    def __call__(self, shape, name, scope, init_value=None):
        """
        Create the gaussian initialized variables
...
@@ -3,6 +3,7 @@ from .Xavier import Xavier
from .SimplerXavier import SimplerXavier
from .Gaussian import Gaussian
from .Constant import Constant
from .Uniform import Uniform
# gets sphinx autodoc done right - don't remove it
@@ -25,6 +26,7 @@ __appropriate__(
    SimplerXavier,
    Gaussian,
    Constant,
    Uniform,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
...
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 09 Nov 2016 13:55:22 CEST
import tensorflow as tf
from .Layer import Layer
from bob.learn.tensorflow.initialization import Xavier
from bob.learn.tensorflow.initialization import Constant
class Conv1D(Layer):
"""
1D Convolution
**Parameters**
name: str
The name of the layer
activation:
Tensor Flow activation
kernel_size: int
Size of the convolutional kernel
filters: int
Number of filters
stride:
Shape of the stride
weights_initialization: py:class:`bob.learn.tensorflow.initialization.Initialization`
Initialization type for the weights
bias_initialization: py:class:`bob.learn.tensorflow.initialization.Initialization`
Initialization type for the weights
batch_norm: bool
Do batch norm?
use_gpu: bool
Store data in the GPU
"""
def __init__(self, name, activation=None,
kernel_size=300,
filters=20,
stride=100,
weights_initialization=Xavier(),
init_value=None,
bias_initialization=Constant(),
batch_norm=False,
use_gpu=False
):
super(Conv1D, self).__init__(name=name,
activation=activation,
weights_initialization=weights_initialization,
bias_initialization=bias_initialization,
batch_norm=batch_norm,
use_gpu=use_gpu,
)
self.kernel_size = kernel_size
self.filters = filters
self.W = None
self.b = None
self.stride = stride
self.init_value = init_value
def create_variables(self, input_layer):
self.input_layer = input_layer
        # the input must be a rank-3 tensor: [batch, time, channels]
        if len(input_layer.get_shape().as_list()) != 3:
            raise ValueError("The input of a 1D convolutional layer must have 3 dimensions, "
                             "but {0} were provided".format(len(input_layer.get_shape().as_list())))
n_channels = input_layer.get_shape().as_list()[2]
if self.W is None:
if self.init_value is None:
self.init_value = self.kernel_size * n_channels
self.W = self.weights_initialization(shape=[self.kernel_size, n_channels, self.filters],
name="w_" + str(self.name),
scope="w_" + str(self.name),
init_value=self.init_value
)
self.b = self.bias_initialization(shape=[self.filters],
name="b_" + str(self.name) + "bias",
scope="b_" + str(self.name),
init_value=self.init_value
)
def get_graph(self, training_phase=True):
with tf.name_scope(str(self.name)):
conv1d = tf.nn.conv1d(self.input_layer, self.W, stride=self.stride, padding='VALID')
if self.batch_norm:
conv1d = self.batch_normalize(conv1d, training_phase)
if self.activation is not None:
output = self.activation(tf.nn.bias_add(conv1d, self.b))
else:
output = tf.nn.bias_add(conv1d, self.b)
return output
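# --- Shape sketch (illustrative only, not part of this commit). With 'VALID'
# padding, tf.nn.conv1d yields floor((T - kernel_size) / stride) + 1 output
# steps; for the defaults (kernel_size=300, stride=100) on a 6560-sample frame
# produced by DiskAudio:
#
# n_steps = (6560 - 300) // 100 + 1  # == 63
# # so the layer maps a [batch, 6560, 1] input to a [batch, 63, 20] output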
@@ -46,6 +46,7 @@ class FullyConnected(Layer):
                 weights_initialization=Xavier(),
                 bias_initialization=Constant(),
                 batch_norm=False,
                 init_value=None,
                 use_gpu=False,
                 ):
@@ -61,11 +62,14 @@ class FullyConnected(Layer):
        self.W = None
        self.b = None
        self.shape = None
        self.init_value = init_value
    def create_variables(self, input_layer):
        self.input_layer = input_layer
        if self.W is None:
            input_dim = reduce(mul, self.input_layer.get_shape().as_list()[1:])
            if self.init_value is None:
                self.init_value = input_dim
            variable = "W_" + str(self.name)
            if self.get_varible_by_name(variable) is not None:
@@ -73,7 +77,8 @@ class FullyConnected(Layer):
            else:
                self.W = self.weights_initialization(shape=[input_dim, self.output_dim],
                                                     name="W_" + str(self.name),
                                                     scope="W_" + str(self.name),
                                                     init_value=self.init_value
                                                     )
            # if self.activation is not None:
            variable = "b_" + str(self.name)
@@ -82,14 +87,15 @@ class FullyConnected(Layer):
            else:
                self.b = self.bias_initialization(shape=[self.output_dim],
                                                  name="b_" + str(self.name),
                                                  scope="b_" + str(self.name),
                                                  init_value=self.init_value
                                                  )
    def get_graph(self, training_phase=True):
        with tf.name_scope(str(self.name)):
            if len(self.input_layer.get_shape()) == 4 or len(self.input_layer.get_shape()) == 3:
                shape = self.input_layer.get_shape().as_list()
                fc = tf.reshape(self.input_layer, [-1, numpy.prod(shape[1:])])
            else:
...
#from DataShuffler import *
from .Layer import Layer
from .Conv1D import Conv1D
from .Conv2D import Conv2D
from .FullyConnected import FullyConnected
from .MaxPooling import MaxPooling
@@ -24,12 +25,13 @@ def __appropriate__(*args):
__appropriate__(
    Layer,
    Conv1D,
    Conv2D,
    FullyConnected,
    MaxPooling,
    AveragePooling,
    Dropout,
    InputLayer,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 19 Oct 23:43:22 2016
import logging
logger = logging.getLogger("bob.learn.tensorflow")
import tensorflow as tf
from .BaseLoss import BaseLoss
class NegLogLoss(BaseLoss):
"""
Compute the negative log likelihood loss
This is similar to the combination of LogSoftMax layer and ClassNLLCriterion in Torch7
"""
def __init__(self, operation):
# loss function is None since we compute the custom one inside __call__()
super(NegLogLoss, self).__init__(None, operation)
def gather_nd(self, params, indices, name=None):
shape = params.get_shape().as_list()
rank = len(shape)
flat_params = tf.reshape(params, [-1])
        if rank > 2:
            indices_unpacked = tf.unpack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1)), name))
        elif rank == 2:
            indices_unpacked = tf.unpack(indices)
        else:
            indices_unpacked = indices
        # element (i, label_i) of a 2-D params tensor sits at i * shape[1] + label_i
        # once params is flattened row-major
        flat_indices = [i * shape[1] + indices_unpacked[i] for i in range(0, len(indices_unpacked))]
return tf.gather(flat_params, flat_indices, name=name)
def __call__(self, graph, label):
# get the log-probabilities with log softmax
log_probabilities = tf.nn.log_softmax(graph)
        # negative of the log-probability that corresponds to the correct label
correct_probabilities = self.gather_nd(log_probabilities, label)
neg_log_prob = tf.neg(correct_probabilities)
# use negative log likelihood as the loss
return self.operation(neg_log_prob)
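# --- Usage sketch (illustrative only, not part of this commit). For 2-D logits
# and 1-D integer labels this is roughly the mean softmax cross-entropy, i.e.
# tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)):
#
# loss = NegLogLoss(tf.reduce_mean)
# cost = loss(logits, labels)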
@@ -2,6 +2,7 @@
from .BaseLoss import BaseLoss
from .ContrastiveLoss import ContrastiveLoss
from .TripletLoss import TripletLoss
from .NegLogLoss import NegLogLoss
# gets sphinx autodoc done right - don't remove it
@@ -22,6 +23,7 @@ __appropriate__(
    BaseLoss,
    ContrastiveLoss,
    TripletLoss,
    NegLogLoss,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
...
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
"""
Class that creates the lenet architecture
"""
import tensorflow as tf
from .SequenceNetwork import SequenceNetwork
from ..layers import Conv1D, FullyConnected
from bob.learn.tensorflow.initialization import Uniform
# HardTanh activation function: clamps the input element-wise to [-1, 1]
def hard_tanh(x, name=None):
    one = tf.constant(1, dtype=tf.float32)
    neg_one = tf.constant(-1, dtype=tf.float32)
    return tf.minimum(tf.maximum(x, neg_one), one, name=name)
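# Quick check of the clamping behaviour (illustrative only, not part of this
# commit):
# with tf.Session() as sess:
#     print(sess.run(hard_tanh(tf.constant([-2.0, 0.5, 3.0]))))  # [-1.  0.5  1.]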
class SimpleAudio(SequenceNetwork):
def __init__(self,
conv1_kernel_size=300,
conv1_output=20,
conv1_stride=100,
fc1_output=40,
n_classes=2,
default_feature_layer="fc2",
seed=10,
use_gpu=False
):
super(SimpleAudio, self).__init__(default_feature_layer=default_feature_layer,
use_gpu=use_gpu)
self.add(Conv1D(name="conv1", kernel_size=conv1_kernel_size,
filters=conv1_output,
stride=conv1_stride,
activation=hard_tanh,
weights_initialization=Uniform(seed=seed, use_gpu=use_gpu),
bias_initialization=Uniform(seed=seed, use_gpu=use_gpu),
use_gpu=use_gpu
))
self.add(FullyConnected(name="fc1", output_dim=fc1_output,
activation=hard_tanh,
weights_initialization=Uniform(seed=seed, use_gpu=use_gpu),
bias_initialization=Uniform(seed=seed, use_gpu=use_gpu),
use_gpu=use_gpu
))
self.add(FullyConnected(name="fc2", output_dim=n_classes,
activation=None,
weights_initialization=Uniform(seed=seed, use_gpu=use_gpu),
bias_initialization=Uniform(seed=seed, use_gpu=use_gpu),
use_gpu=use_gpu
))
\ No newline at end of file
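# --- Wiring sketch (illustrative only, not part of this commit; it combines
# the DiskAudio shuffler and NegLogLoss added by this merge request):
#
# architecture = SimpleAudio(n_classes=2, seed=10)
# loss = NegLogLoss(tf.reduce_mean)
# # batches from DiskAudio.get_batch() then feed the conv1 -> fc1 -> fc2 graph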
@@ -9,7 +9,7 @@ from .FaceNet import FaceNet
from .FaceNetSimple import FaceNetSimple
from .VGG16 import VGG16
from .VGG16_mod import VGG16_mod
from .SimpleAudio import SimpleAudio
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
@@ -36,6 +36,7 @@ __appropriate__(
    FaceNetSimple,
    VGG16,