Commit 16c9feff authored by Tiago de Freitas Pereira

Merge branch 'audio-clean' into 'master'

Added support for audio databases

A cleaner version of the code for audio support.

See merge request !2
parents 6183b5f0 e9f97a17
include README.rst bootstrap-buildout.py buildout.cfg COPYING version.txt requirements.txt
recursive-include doc *.py *.rst
recursive-include bob *.wav *.hdf5
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 19 Oct 23:43:22 2016
import numpy
import bob.core
from .Base import Base
from scipy.io.wavfile import read as readWAV
# Module-level logger, configured through bob.core's logging setup under the package name.
logger = bob.core.log.setup("bob.learn.tensorflow")
# Do not hand records over to ancestor loggers.
# NOTE(review): assumes setup() returns a standard logging.Logger - confirm.
logger.propagate = False
class DiskAudio(Base):
    """
    Datashuffler for speech databases stored on disk as WAV files.

    Files are read and pre-processed on the fly.  Each file is normalised,
    cut into fixed-size windows and every window is emitted together with
    ``context_size`` windows on each side (a "frame").  Frames that are not
    consumed by the current batch are kept in an internal queue and served by
    the next calls to :py:meth:`get_batch`.
    """

    def __init__(self, data, labels,
                 input_dtype="float64",
                 batch_size=1,
                 seed=10,
                 data_augmentation=None,
                 context_size=20,
                 win_length_ms=10,
                 rate=16000,
                 out_file=""
                 ):
        """
        **Parameters**

        data:
          List or numpy array with the paths of the audio files
        labels:
          List or numpy array with one label per audio file
        input_dtype: str
          Forwarded to the base datashuffler
        batch_size: int
          Number of frames returned by :py:meth:`get_batch`
        seed: int
          Seed forwarded to the base datashuffler and used for ``numpy.random``
        data_augmentation:
          Forwarded to the base datashuffler
        context_size: int
          Number of windows taken on each side of the central window
        win_length_ms: int
          Length of one window in milliseconds
        rate: int
          Sampling rate (Hz) used to convert ``win_length_ms`` into samples
        out_file: str
          If not empty, the name of every file loaded is appended to this file
        """
        self.out_file = out_file
        self.context_size = context_size
        self.win_length_ms = win_length_ms

        # Number of samples in one window.  It is used for slicing and for
        # shapes, so it must be an int (true division would yield a float on
        # Python 3).
        self.m_win_length = int(self.win_length_ms * rate // 1000)
        if self.m_win_length <= 0:
            raise ValueError("win_length_ms={0} at rate={1} does not cover a single sample".format(
                win_length_ms, rate))

        # A frame is the central window plus `context_size` windows on each side
        self.m_frame_length = self.m_win_length * (2 * self.context_size + 1)
        input_shape = [self.m_frame_length, 1]

        if isinstance(data, list):
            data = numpy.array(data)

        if isinstance(labels, list):
            labels = numpy.array(labels)

        super(DiskAudio, self).__init__(
            data=data,
            labels=labels,
            input_shape=input_shape,
            input_dtype=input_dtype,
            batch_size=batch_size,
            seed=seed,
            data_augmentation=data_augmentation
        )
        # Setting the seed
        numpy.random.seed(seed)

        # a flexible queue that stores audio frames extracted from files
        self.frames_storage = []
        # a similar queue for the corresponding labels
        self.labels_storage = []

    def load_from_file(self, file_name):
        """
        Read a WAV file.

        Returns the tuple ``(rate, data)`` where ``data`` is a fresh float32
        copy of the samples.
        """
        rate, audio = readWAV(file_name)

        # NOTE(review): the rest of the class assumes a single channel; no
        # channel selection is done here - confirm for multi-channel files.
        # `astype` always copies, so later in-place work never touches `audio`.
        data = numpy.asarray(audio).astype('float32')

        return rate, data

    def get_batch(self, noise=False):
        """
        Return ``[data, labels]`` for one batch.

        ``data`` is a float32 array of shape ``(batch_size, frame_length, 1)``
        and ``labels`` an int64 array with ``batch_size`` entries.  Files are
        visited in a freshly shuffled order until the internal queue holds at
        least ``batch_size`` frames.

        ``noise`` is accepted for interface compatibility and is not used.

        Raises ``IndexError`` when all the files together cannot provide
        ``batch_size`` frames.
        """
        # Shuffling samples
        indexes = numpy.arange(self.data.shape[0])
        numpy.random.shuffle(indexes)

        log_file = open(self.out_file, "a") if self.out_file != "" else None
        try:
            # if not enough in the storage, we pre-load frames from the audio files
            for index in indexes:
                if len(self.frames_storage) >= self.batch_size:
                    break
                if log_file is not None:
                    log_file.write("%s\n" % self.data[index])
                frames, labels = self.extract_frames_from_file(self.data[index], self.labels[index])
                self.frames_storage.extend(frames)
                self.labels_storage.extend(labels)
        finally:
            # close the log even when loading a file fails
            if log_file is not None:
                log_file.close()

        if len(self.frames_storage) < self.batch_size:
            raise IndexError("Only {0} frames could be extracted from {1} files, "
                             "but batch_size is {2}".format(len(self.frames_storage),
                                                            len(indexes), self.batch_size))

        # our temp frame queue should have enough data
        selected_data = numpy.asarray(self.frames_storage[:self.batch_size])
        selected_labels = numpy.asarray(self.labels_storage[:self.batch_size])
        # remove them from the list
        del self.frames_storage[:self.batch_size]
        del self.labels_storage[:self.batch_size]

        selected_data = numpy.reshape(selected_data, (self.batch_size, -1, 1))

        return [selected_data.astype("float32"), selected_labels.astype("int64")]

    def extract_frames_from_file(self, filename, label):
        """Load ``filename`` and return its ``(frames, labels)`` lists."""
        _, wav_signal = self.load_from_file(filename)
        return self.extract_frames_from_wav(wav_signal, label)

    def extract_frames_from_wav(self, wav_signal, label):
        """
        Cut a signal into frames.

        The signal is normalised to zero mean and unit standard deviation and
        split into windows of ``m_win_length`` samples (the incomplete tail is
        discarded).  One frame is produced per window: the window itself
        surrounded by ``context_size`` windows on each side, where missing
        context at the borders is filled by repeating the first/last window.

        The input array is left untouched.  Returns ``(frames, labels)``: a
        list of 1D arrays of identical length and a list repeating ``label``
        once per frame.  Both are empty if the signal is shorter than a window.
        """
        m_total_length = len(wav_signal)
        m_num_win = int(m_total_length / self.m_win_length)  # discard the tail of the signal
        if m_num_win == 0:
            return [], []

        # work on a floating point copy so the caller's array is not modified
        signal = numpy.array(wav_signal)
        if not numpy.issubdtype(signal.dtype, numpy.floating):
            signal = signal.astype("float32")

        # normalize the signal first
        signal -= numpy.mean(signal)
        std = numpy.std(signal)
        if std > 0:  # a constant signal stays at zero instead of becoming NaN
            signal /= std

        # one row per window
        windows = signal[:self.m_win_length * m_num_win].reshape(m_num_win, -1)

        # Repeat the first and last windows `context_size` times so that every
        # window, including those at the borders and those of very short
        # files, gets a full context.
        padded = numpy.pad(windows, ((self.context_size, self.context_size), (0, 0)), mode="edge")

        span = 2 * self.context_size + 1
        # row i of `padded` is the left-most context window of window i
        final_frames = [padded[i:i + span].flatten() for i in range(m_num_win)]
        final_labels = [label] * m_num_win

        return final_frames, final_labels
......@@ -20,6 +20,7 @@ from .ImageAugmentation import ImageAugmentation
from .Normalizer import ScaleFactor, MeanOffset, Linear
from .DiskAudio import DiskAudio
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
......@@ -51,7 +52,7 @@ __appropriate__(
TripletWithSelectionDisk,
DataAugmentation,
ImageAugmentation,
ScaleFactor, MeanOffset, Linear
ScaleFactor, MeanOffset, Linear,
DiskAudio,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
\ No newline at end of file
......@@ -27,7 +27,7 @@ class Constant(Initialization):
self.constant_value = constant_value
super(Constant, self).__init__(seed=None, use_gpu=use_gpu)
def __call__(self, shape, name, scope):
def __call__(self, shape, name, scope, init_value=None):
initializer = tf.constant(self.constant_value, shape=shape)
try:
......
......@@ -31,7 +31,7 @@ class Gaussian(Initialization):
self.std = std
super(Gaussian, self).__init__(seed, use_gpu=use_gpu)
def __call__(self, shape, name, scope):
def __call__(self, shape, name, scope, init_value=None):
"""
Create the gaussian initialized variables
......
......@@ -31,5 +31,5 @@ class Initialization(object):
def variable_exist(self, var):
    """
    Tell whether ``var`` equals the leading ``/``-separated component of the
    name of any variable listed by ``tf.all_variables()``.
    """
    top_level_names = [variable.name.split("/")[0] for variable in tf.all_variables()]
    return var in top_level_names
def __call__(self, shape, name, scope):
def __call__(self, shape, name, scope, init_value=None):
    """
    Create the initialized variable; must be overridden by derived classes.

    **Parameters**

    shape: Shape of the variable to create
    name: Name of the variable
    scope: Variable scope
    init_value: Optional value that derived classes may use to scale the initialization

    Raises ``NotImplementedError`` always: the base class has no initialization of its own.
    """
    # The exception has to be raised; merely building it would return None silently
    raise NotImplementedError("Please implement this function in derived classes")
......@@ -26,7 +26,7 @@ class SimplerXavier(Initialization):
def __init__(self, seed=10., use_gpu=False):
    """Forward ``seed`` and ``use_gpu`` unchanged to the base initialization."""
    parent = super(SimplerXavier, self)
    parent.__init__(seed, use_gpu=use_gpu)
def __call__(self, shape, name, scope):
def __call__(self, shape, name, scope, init_value=None):
"""
Create the gaussian initialized variables
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 09 Nov 2016 13:55:22 CEST
import logging
logger = logging.getLogger("bob.learn.tensorflow")
from .Initialization import Initialization
import tensorflow as tf
class Uniform(Initialization):
    """
    Random uniform initialization.

    Values are drawn uniformly from ``[-1/sqrt(init_value), 1/sqrt(init_value)]``.
    """

    def __init__(self, seed=10., use_gpu=False):
        super(Uniform, self).__init__(seed, use_gpu=use_gpu)

    def __call__(self, shape, name, scope, init_value=None):
        """
        Create (or fetch, if it already exists) the uniformly initialized variable.

        **Parameters**

        shape: Shape of the variable
        name: Name of the variable
        scope: Variable scope the variable lives in
        init_value: Normalization value; defaults to ``shape[0]``
        """
        import math

        # We use init_value as normalization value, but it can be used differently in different initializations
        norm = shape[0] if init_value is None else init_value
        limit = 1.0 / math.sqrt(norm)  # RANDOM UNIFORM INITIALIZATION

        initializer = tf.random_uniform(shape, minval=-limit, maxval=limit, seed=self.seed)
        device_name = "/gpu:0" if self.use_gpu else "/cpu"

        def fetch(variable_scope):
            # Enter the given scope and device, then get the variable
            with variable_scope:
                with tf.device(device_name):
                    return tf.get_variable(name, initializer=initializer, dtype=tf.float32)

        try:
            return fetch(tf.variable_scope(scope))
        except ValueError:
            # first attempt failed: retry with variable reuse enabled
            return fetch(tf.variable_scope(scope, reuse=True))
......@@ -27,7 +27,7 @@ class Xavier(Initialization):
super(Xavier, self).__init__(seed, use_gpu=use_gpu)
def __call__(self, shape, name, scope):
def __call__(self, shape, name, scope, init_value=None):
"""
Create the gaussian initialized variables
......
......@@ -3,6 +3,7 @@ from .Xavier import Xavier
from .SimplerXavier import SimplerXavier
from .Gaussian import Gaussian
from .Constant import Constant
from .Uniform import Uniform
# gets sphinx autodoc done right - don't remove it
......@@ -25,6 +26,7 @@ __appropriate__(
SimplerXavier,
Gaussian,
Constant,
Uniform,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 09 Nov 2016 13:55:22 CEST
import tensorflow as tf
from .Layer import Layer
from bob.learn.tensorflow.initialization import Xavier
from bob.learn.tensorflow.initialization import Constant
class Conv1D(Layer):
    """
    1D Convolution

    **Parameters**

    name: str
      The name of the layer

    activation:
      Tensor Flow activation applied after the bias is added; ``None`` for no activation

    kernel_size: int
      Size of the convolutional kernel

    filters: int
      Number of filters

    stride:
      Stride passed to ``tf.nn.conv1d``

    weights_initialization: py:class:`bob.learn.tensorflow.initialization.Initialization`
      Initialization type for the weights

    init_value:
      Value forwarded to both initializations as ``init_value``;
      when ``None`` it is set to ``kernel_size * n_channels`` once the input is known

    bias_initialization: py:class:`bob.learn.tensorflow.initialization.Initialization`
      Initialization type for the biases

    batch_norm: bool
      Do batch norm?

    use_gpu: bool
      Store data in the GPU

    """

    def __init__(self, name, activation=None,
                 kernel_size=300,
                 filters=20,
                 stride=100,
                 weights_initialization=Xavier(),
                 init_value=None,
                 bias_initialization=Constant(),
                 batch_norm=False,
                 use_gpu=False
                 ):
        super(Conv1D, self).__init__(name=name,
                                     activation=activation,
                                     weights_initialization=weights_initialization,
                                     bias_initialization=bias_initialization,
                                     batch_norm=batch_norm,
                                     use_gpu=use_gpu,
                                     )
        # variables are created lazily by create_variables()
        self.W = None
        self.b = None

        self.kernel_size = kernel_size
        self.filters = filters
        self.stride = stride
        self.init_value = init_value

    def create_variables(self, input_layer):
        """
        Bind the layer to ``input_layer`` and create the kernel and bias variables.

        Raises ``ValueError`` if the input does not have exactly 3 dimensions.
        """
        self.input_layer = input_layer

        # TODO: Do an assert here
        dims = input_layer.get_shape().as_list()
        if len(dims) != 3:
            raise ValueError("The input as a convolutional layer must have 3 dimensions, "
                             "but {0} were provided".format(len(dims)))
        n_channels = dims[2]

        # NOTE(review): both variables are created only while W is unset - confirm
        # against the original indentation of this method.
        if self.W is not None:
            return

        if self.init_value is None:
            self.init_value = self.kernel_size * n_channels

        layer_name = str(self.name)
        self.W = self.weights_initialization(shape=[self.kernel_size, n_channels, self.filters],
                                             name="w_" + layer_name,
                                             scope="w_" + layer_name,
                                             init_value=self.init_value
                                             )

        self.b = self.bias_initialization(shape=[self.filters],
                                          name="b_" + layer_name + "bias",
                                          scope="b_" + layer_name,
                                          init_value=self.init_value
                                          )

    def get_graph(self, training_phase=True):
        """
        Build the layer output: convolution, optional batch norm, bias and optional activation.
        """
        with tf.name_scope(str(self.name)):
            convolved = tf.nn.conv1d(self.input_layer, self.W, stride=self.stride, padding='VALID')

            if self.batch_norm:
                convolved = self.batch_normalize(convolved, training_phase)

            biased = tf.nn.bias_add(convolved, self.b)
            if self.activation is None:
                return biased
            return self.activation(biased)
......@@ -46,6 +46,7 @@ class FullyConnected(Layer):
weights_initialization=Xavier(),
bias_initialization=Constant(),
batch_norm=False,
init_value=None,
use_gpu=False,
):
......@@ -61,11 +62,14 @@ class FullyConnected(Layer):
self.W = None
self.b = None
self.shape = None
self.init_value = init_value
def create_variables(self, input_layer):
self.input_layer = input_layer
if self.W is None:
input_dim = reduce(mul, self.input_layer.get_shape().as_list()[1:])
if self.init_value is None:
self.init_value = input_dim
variable = "W_" + str(self.name)
if self.get_varible_by_name(variable) is not None:
......@@ -73,7 +77,8 @@ class FullyConnected(Layer):
else:
self.W = self.weights_initialization(shape=[input_dim, self.output_dim],
name="W_" + str(self.name),
scope="W_" +str(self.name)
scope="W_" +str(self.name),
init_value=self.init_value
)
# if self.activation is not None:
variable = "b_" + str(self.name)
......@@ -82,14 +87,15 @@ class FullyConnected(Layer):
else:
self.b = self.bias_initialization(shape=[self.output_dim],
name="b_" + str(self.name),
scope="b_" + str(self.name)
scope="b_" + str(self.name),
init_value=self.init_value
)
def get_graph(self, training_phase=True):
with tf.name_scope(str(self.name)):
if len(self.input_layer.get_shape()) == 4:
if len(self.input_layer.get_shape()) == 4 or len(self.input_layer.get_shape()) == 3:
shape = self.input_layer.get_shape().as_list()
fc = tf.reshape(self.input_layer, [-1, numpy.prod(shape[1:])])
else:
......
#from DataShuffler import *
from .Layer import Layer
from .Conv1D import Conv1D
from .Conv2D import Conv2D
from .FullyConnected import FullyConnected
from .MaxPooling import MaxPooling
......@@ -24,12 +25,13 @@ def __appropriate__(*args):
__appropriate__(
Layer,
Conv1D,
Conv2D,
FullyConnected,
MaxPooling,
AveragePooling,
Dropout,
InputLayer
InputLayer,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Pavel Korshunov <pavel.korshunov@idiap.ch>
# @date: Wed 19 Oct 23:43:22 2016
import logging
logger = logging.getLogger("bob.learn.tensorflow")
import tensorflow as tf
from .BaseLoss import BaseLoss
class NegLogLoss(BaseLoss):
    """
    Compute the negative log likelihood loss
    This is similar to the combination of LogSoftMax layer and ClassNLLCriterion in Torch7
    """

    def __init__(self, operation):
        """
        **Parameters**

        operation: Reduction applied to the per-sample losses in ``__call__``
        """
        # loss function is None since we compute the custom one inside __call__()
        super(NegLogLoss, self).__init__(None, operation)

    def gather_nd(self, params, indices, name=None):
        """
        Pick one entry of ``params`` per row: row ``i`` contributes
        ``params[i, indices[i]]``.

        **Parameters**

        params: Tensor with a statically known shape
        indices: Tensor with one index per row of ``params``
        name: Optional name for the resulting op
        """
        shape = params.get_shape().as_list()
        rank = len(shape)
        flat_params = tf.reshape(params, [-1])

        if rank > 2:
            # list(...) is required: a list cannot be concatenated with a range on Python 3
            permutation = [rank - 1] + list(range(0, rank - 1))
            indices_unpacked = tf.unpack(tf.transpose(indices, permutation, name))
        elif rank == 2:
            indices_unpacked = tf.unpack(indices)
        else:
            indices_unpacked = indices

        # Number of entries of `flat_params` taken by one row of `params`.
        # This - and not the rank - is the offset between two consecutive rows;
        # both only coincide for 2 columns.
        # NOTE(review): only the rank == 2 case (batch x classes) is exercised
        # by __call__ - the other branches should be confirmed.
        row_stride = 1
        for dim in shape[1:]:
            row_stride *= dim

        flat_indices = [i * row_stride + indices_unpacked[i] for i in range(0, len(indices_unpacked))]
        return tf.gather(flat_params, flat_indices, name=name)

    def __call__(self, graph, label):
        """
        Return ``operation`` applied to the negative log-probabilities of the correct labels.
        """
        # get the log-probabilities with log softmax
        log_probabilities = tf.nn.log_softmax(graph)
        # negative of the log-probability that correspond to the correct label
        correct_probabilities = self.gather_nd(log_probabilities, label)
        neg_log_prob = tf.neg(correct_probabilities)
        # use negative log likelihood as the loss
        return self.operation(neg_log_prob)
......@@ -2,6 +2,7 @@
from .BaseLoss import BaseLoss
from .ContrastiveLoss import ContrastiveLoss
from .TripletLoss import TripletLoss
# explicit relative import, like its siblings (an implicit one fails on Python 3)
from .NegLogLoss import NegLogLoss
# gets sphinx autodoc done right - don't remove it
......@@ -22,6 +23,7 @@ __appropriate__(
BaseLoss,
ContrastiveLoss,
TripletLoss,
NegLogLoss,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
......
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
# @date: Wed 11 May 2016 09:39:36 CEST
"""
Class that creates the SimpleAudio architecture
"""
import tensorflow as tf
from .SequenceNetwork import SequenceNetwork
from ..layers import Conv1D, FullyConnected
from bob.learn.tensorflow.initialization import Uniform
# construct HardTanh activation function
def hard_tanh(x, name=None):
    """
    HardTanh activation: clamp ``x`` element-wise to the interval [-1, 1].

    **Parameters**

    x: Input tensor
    name: Optional name given to the resulting op
    """
    one = tf.constant(1, dtype=tf.float32)
    neg_one = tf.constant(-1, dtype=tf.float32)
    # `name` is attached to the final op instead of being silently dropped
    return tf.minimum(tf.maximum(x, neg_one), one, name=name)
class SimpleAudio(SequenceNetwork):
    """
    Small network for audio frames: one 1D convolution ("conv1") followed by
    two fully connected layers ("fc1" and "fc2").

    "conv1" and "fc1" use the ``hard_tanh`` activation, "fc2" has none.
    Weights and biases of all layers use :py:class:`Uniform` initialization.
    """

    def __init__(self,
                 conv1_kernel_size=300,
                 conv1_output=20,
                 conv1_stride=100,

                 fc1_output=40,

                 n_classes=2,
                 default_feature_layer="fc2",

                 seed=10,
                 use_gpu=False
                 ):
        """
        **Parameters**

        conv1_kernel_size: Kernel size of "conv1"
        conv1_output: Number of filters of "conv1"
        conv1_stride: Stride of "conv1"
        fc1_output: Output dimension of "fc1"
        n_classes: Output dimension of "fc2"
        default_feature_layer: Forwarded to the base network
        seed: Seed given to every initialization
        use_gpu: Forwarded to the base network, the layers and the initializations
        """
        super(SimpleAudio, self).__init__(default_feature_layer=default_feature_layer,
                                          use_gpu=use_gpu)

        def new_init():
            # every weight/bias gets its own initialization object
            return Uniform(seed=seed, use_gpu=use_gpu)

        self.add(Conv1D(name="conv1",
                        kernel_size=conv1_kernel_size,
                        filters=conv1_output,
                        stride=conv1_stride,
                        activation=hard_tanh,
                        weights_initialization=new_init(),
                        bias_initialization=new_init(),
                        use_gpu=use_gpu))

        dense_layers = (("fc1", fc1_output, hard_tanh),
                        ("fc2", n_classes, None))
        for layer_name, width, activation in dense_layers:
            self.add(FullyConnected(name=layer_name,
                                    output_dim=width,
                                    activation=activation,
                                    weights_initialization=new_init(),
                                    bias_initialization=new_init(),
                                    use_gpu=use_gpu))
\ No newline at end of file
......@@ -9,7 +9,7 @@ from .FaceNet import FaceNet
from .FaceNetSimple import FaceNetSimple
from .VGG16 import VGG16
from .VGG16_mod import VGG16_mod
# explicit relative import, like its siblings (an implicit one fails on Python 3)
from .SimpleAudio import SimpleAudio
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
......@@ -36,6 +36,7 @@ __appropriate__(
FaceNetSimple,
VGG16,
VGG16_mod,
SimpleAudio,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
......@@ -5,7 +5,7 @@
import numpy
from bob.learn.tensorflow.datashuffler import Memory, SiameseMemory, TripletMemory, Disk, SiameseDisk, TripletDisk, \
TripletWithFastSelectionDisk, TripletWithSelectionDisk
TripletWithFastSelectionDisk, TripletWithSelectionDisk, DiskAudio
import pkg_resources
from bob.learn.tensorflow.utils import load_mnist
import os
......@@ -27,6 +27,19 @@ def get_dummy_files():
return files, clients
def get_dummy_audiofiles():
    """
    List the dummy WAV files shipped in ``data/dummy_audio``.

    Returns ``(files, labels)``: the full paths of the ``.wav`` files, in
    alphabetical order, and one label per file (1 if the file name contains
    'attack', 0 otherwise).
    """
    base_path = pkg_resources.resource_filename(__name__, 'data/dummy_audio')

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # (seeded) shufflers fed by this list reproducible across platforms
    wav_names = sorted(f for f in os.listdir(base_path) if f.endswith(".wav"))

    files = [os.path.join(base_path, f) for f in wav_names]
    labels = [1 if 'attack' in f else 0 for f in wav_names]
    return files, labels
def test_memory_shuffler():
train_data, train_labels, validation_data, validation_labels = load_mnist()
train_data = numpy.reshape(train_data, (train_data.shape[0], 28, 28, 1))
......@@ -202,3 +215,22 @@ def test_triplet_selection_disk_shuffler():
assert placeholders[0].get_shape().as_list() == batch_shape
assert placeholders[1].get_shape().as_list() == batch_shape
assert placeholders[2].get_shape().as_list() == batch_shape
def test_diskaudio_shuffler():
    """Check the shapes of a DiskAudio batch and of its placeholders on the dummy audio files."""
    audio_files, audio_labels = get_dummy_audiofiles()

    expected_shape = [582, 6560, 1]
    n_frames = expected_shape[0]

    shuffler = DiskAudio(audio_files, audio_labels, batch_size=n_frames)

    # the batch is the pair (frames, labels)
    batch = shuffler.get_batch()
    assert len(batch) == 2
    frames, labels = batch
    assert frames.shape == tuple(expected_shape)
    assert labels.shape[0] == n_frames

    # the placeholders must agree with the batch
    placeholders = shuffler.get_placeholders(name="train")
    data_placeholder = placeholders[0]
    label_placeholder = placeholders[1]
    assert data_placeholder.get_shape().as_list() == expected_shape
    assert label_placeholder.get_shape().as_list()[0] == n_frames
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment