Commit baa22518 authored by Saeed SARFJOO
add audio extractor
parent f1fe9f47
Merge request !19: Add audio extractor
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
model_urls = {
'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
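# A minimal shape-check sketch (not part of the original commit): a Bottleneck
# expands channels by `expansion` = 4, so a 1x1 downsample branch is needed
# whenever inplanes != planes * 4 or stride != 1.
def _demo_bottleneck_shapes():
    down = nn.Sequential(
        nn.Conv2d(64, 256, kernel_size=1, stride=2, bias=False),
        nn.BatchNorm2d(256),
    )
    block = Bottleneck(64, 64, stride=2, downsample=down)
    block.eval()
    with torch.no_grad():
        y = block(torch.randn(1, 64, 32, 32))
    assert y.shape == (1, 256, 16, 16)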
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, tp='cls', bn_dim=128, bn_dim2=128):
self.net_type = tp
self.inplanes = 64
super(ResNet, self).__init__()
        # the input is a single-channel spectrogram laid out as (batch, 1, time, freq)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2, padding=2,
                               bias=False)
self.bn1 = nn.BatchNorm2d(16)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(16, 32, kernel_size=(5,1), stride=(1,1), padding=(4,0),
bias=False, dilation=(2,1))
self.bn2 = nn.BatchNorm2d(32)
self.maxpool = nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=0)
self.conv3 = nn.Conv2d(32, 64, kernel_size=(3,1), stride=(1,1), padding=(3,0),
bias=False, dilation=(3,1))
self.bn3 = nn.BatchNorm2d(64)
self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # note: layer4 is built here but never used by forward_cnn
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.conv4 = nn.Conv2d(256, 256, kernel_size=(1, 9), stride=1, padding=0,
bias=False)
self.bn4 = nn.BatchNorm2d(256)
self.conv5 = nn.Conv2d(256, 512, kernel_size=(1, 9), stride=1, padding=0,
bias=False)
self.bn5 = nn.BatchNorm2d(512)
        # forward_cnn yields a 1024-d vector (mean and std of 512 channels concatenated)
        self.fc = nn.Linear(1024, num_classes)
self.fc1 = nn.Linear(bn_dim, num_classes)
self.fc2 = nn.Linear(bn_dim2, num_classes)
self.em = nn.Linear(1024, bn_dim)
self.em2 = nn.Linear(bn_dim, bn_dim2)
self.bn = nn.BatchNorm2d(bn_dim2)
self.drp = nn.Dropout(p=0.2)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
if self.net_type == 'prefc':
return self.forward_cnn(x)
if self.net_type == 'cls': # without segment layer
return self.forward_cls(x)
if self.net_type == 'cls2': # with one segment layer
return self.forward_cls2(x)
if self.net_type == 'cls3': # with two segment layers
return self.forward_cls3(x)
if self.net_type == 'emb': # embedding from first segment layer
return self.forward_emb(x)
        if self.net_type == 'emb2': # embedding from second segment layer
            return self.forward_emb2(x)
        raise ValueError("unknown net_type: {}".format(self.net_type))
def forward_cnn(self, x):
# x = x.unsqueeze(1)
        try:
            x = self.conv1(x)
        except (ValueError, RuntimeError):
            # shape errors from conv layers raise RuntimeError; log and re-raise
            # rather than silently continuing with the unconvolved tensor
            print("Error with ", x.size())
            raise
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.conv3(x)
x = self.bn3(x)
x = self.relu(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.conv4(x)
x = self.bn4(x)
x = self.relu(x)
x = self.conv5(x)
x = self.bn5(x)
x = self.relu(x)
        # statistics pooling over time: concatenate per-channel mean and std
        t = torch.std(x, 2)
        m = torch.mean(x, 2)
        x = torch.cat((m, t), 1)
x = x.view(x.size(0), -1)
return x
def forward_cls(self, x):
x = self.forward_cnn(x)
x = self.fc(x)
return x
def forward_cls2(self, x):
x = self.forward_cnn(x)
x = self.drp(x)
x = self.em(x)
x = self.drp(x)
x = self.fc1(x)
return x
def forward_cls3(self, x):
x = self.forward_cnn(x)
x = self.drp(x)
x = self.em(x)
x = self.em2(x)
x = self.bn(x)
x = self.relu(x)
x = self.drp(x)
x = self.fc2(x)
return x
def forward_emb(self, x):
x = self.forward_cnn(x)
x = self.em(x)
x = torch.div(x, torch.norm(x, 2, 1).unsqueeze(1).expand_as(x))
return x
def forward_emb2(self, x):
x = self.forward_cnn(x)
x = self.em(x)
x = self.em2(x)
x = torch.div(x, torch.norm(x, 2, 1).unsqueeze(1).expand_as(x))
return x
def forward_emb_test(self, x, step=1):
x = self.conv1(x)
if step == 0:
return x
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
if step == 1:
return x
x = self.maxpool(x)
if step == 2:
return x
x = self.conv3(x)
x = self.bn3(x)
x = self.relu(x)
if step == 3:
return x
x = self.layer1(x)
if step == 4:
return x
x = self.layer2(x)
if step == 5:
return x
x = self.layer3(x)
if step == 6:
return x
x = self.conv4(x)
if step == 7:
return x
x = self.bn4(x)
x = self.relu(x)
x = self.conv5(x)
x = self.bn5(x)
x = self.relu(x)
if step == 8:
return x
t = torch.std(x, 2)
m = torch.mean(x, 2)
x = torch.cat((m, t), 1)
x = x.view(x.size(0), -1)
return x
def resnet18(pretrained=False):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2])
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, num_classes=1251, tp='cls', bn_dim=128, bn_dim2=128):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes, tp=tp, bn_dim=bn_dim, bn_dim2=bn_dim2)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, num_classes=1000):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
    model = ResNet(Bottleneck, [3, 4, 14, 3], num_classes=num_classes)  # note: 14 blocks in layer3, unlike the standard ResNet-50's 6
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3])
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3])
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
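# A usage sketch under assumed input conventions (not part of the original
# commit): spectrogram chunks are laid out as (batch, 1, time, 257), matching
# the unsqueeze done by the extractor below. The `tp` switch selects the head:
# 'cls' returns logits, 'emb' returns L2-normalized 128-d embeddings. The
# shapes here are illustrative, not normative.
def _demo_dltresnet_heads():
    x = torch.randn(2, 1, 200, 257)  # two chunks of 200 frames, 257 FFT bins
    emb_net = resnet34(num_classes=10, tp='emb', bn_dim=128)
    emb_net.eval()
    with torch.no_grad():
        emb = emb_net(x)
    assert emb.shape == (2, 128)
    cls_net = resnet34(num_classes=10, tp='cls')
    cls_net.eval()
    with torch.no_grad():
        logits = cls_net(x)
    assert logits.shape == (2, 10)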
import numpy
import torch
import torch.backends.cudnn as cudnn

from bob.learn.pytorch.architectures.DltResNet import resnet34
from bob.bio.base.preprocessor import Preprocessor
import bob.io.base

from .spectrogram_utils import spectrogram_computation, read
class DltResNetExtractor(Preprocessor):
""" The class implementing the feature extraction of DltResNet embeddings.
Attributes
----------
network: :py:class:`torch.nn.Module`
The network architecture
"""
def __init__(self, model_file=None, num_classes=1211, bn_dim=128, cuda_flag=0):
""" Init method
Parameters
----------
model_file: str
The path of the trained network to load
        num_classes: int
            The number of classes (Default: 1211).
bn_dim: int32
The embedding dimension (Default: 128).
cuda_flag: int32
Use gpu for extracting the embeddings (Default: 0).
"""
Preprocessor.__init__(self, min_preprocessed_file_size=bn_dim)
# model
model_type = "emb"
self.network = resnet34(num_classes=num_classes, tp=model_type, bn_dim=bn_dim)
self.cuda_flag = cuda_flag
if model_file is None:
# do nothing (used mainly for unit testing)
pass
else:
if cuda_flag == 1:
self.network = torch.nn.DataParallel(self.network).cuda()
cudnn.benchmark = True
checkpoint = torch.load(model_file)
else:
self.network = torch.nn.DataParallel(self.network)
checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
            if 'state_dict' in checkpoint:
                self.network.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                # assume the file is a bare state dict
                self.network.load_state_dict(checkpoint, strict=False)
self.network.net_type = model_type
self.network.module.net_type = self.network.net_type
self.network.eval()
def extract_embeddings(self, model, specs, L, batch_size=1, cuda_flag=0, min_length=30):
""" Extract features from spectrogram
Parameters
----------
model : :py:class:`torch.nn.Module`
The trained pytorch model.
specs : :py:class:`numpy.ndarray` (floats)
The spectrogram of audio file.
        L : int32
            Maximum chunk size (in frames) for embedding extraction; 0 uses all the data.
batch_size : int32
Batch size for extracting the embedding (Default: 1).
cuda_flag : int32
Use gpu for extracting the embeddings
min_length : int32
Minimum chunk size for embedding extraction (Default: 30).
Returns
-------
        feature : 2D :py:class:`numpy.ndarray` (floats)
            The extracted features, one 128-dimensional embedding per chunk
"""
feats = []
batch = []
if L == 0 or L > specs.shape[0]:
L = specs.shape[0]
embedding_count = specs.shape[0] // L
        if embedding_count == 0:
            embedding_count = 1
        elif specs.shape[0] % L > min_length:
            # keep leftover frames as an extra (shorter) chunk when long enough
            embedding_count += 1
for i in range(embedding_count):
if (i+1)* L < specs.shape[0]:
k = specs[i*L:(i+1)*L,:]
else:
k = specs[i*L:,:]
            # per-chunk mean/variance normalization along the time axis
            X = (k - k.mean(axis=0)) / (k.std(axis=0) + 1e-7)
            batch.append(X)
            # flush when the batch is full, and always before and at the final
            # chunk (the last chunk may be shorter and cannot be stacked)
            if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
with torch.no_grad():
                    in_batch = numpy.array(batch)
                    input_var = torch.Tensor(in_batch).unsqueeze(1)
                    if cuda_flag == 1:
                        # .cuda(async=True) is invalid in Python >= 3.7 ('async'
                        # became a keyword); non_blocking is the replacement
                        input_var = input_var.cuda(non_blocking=True)
fX = model(input_var).data.cpu().numpy()
feats.append(fX)
batch = []
return numpy.vstack(feats)
def __call__(self, audio, annotations=None):
""" Extract features from an audio file
Parameters
----------
audio : :py:class:`numpy.ndarray` (floats)
The audio file to extract the features from.
annotations : None
Apply annotations if needed
Returns
-------
        feature : 2D :py:class:`numpy.ndarray` (floats)
            The extracted features, one 128-dimensional embedding per chunk
"""
        data = audio[1]
        if data.dtype == 'int16':
            data = numpy.cast['float'](data)
        if numpy.max(numpy.abs(data)) < 1:
            # float input in [-1, 1]: rescale to the int16 range expected downstream
            data = data * 2 ** 15
        rate = audio[0]
input_feat = spectrogram_computation(rate, data)
features = self.extract_embeddings(self.network, input_feat, 0, batch_size=1, cuda_flag=self.cuda_flag)
return features
    def write_data(self, data, data_file, compression=0):
        """Writes the given *preprocessed* data to a file with the given name."""
        f = bob.io.base.HDF5File(data_file, 'w')
        f.set("feats", data.astype('float'), compression=compression)
        del f  # ensure the HDF5 file is flushed and closed
    def read_data(self, data_file):
        """Reads the *preprocessed* data back from file."""
        f = bob.io.base.HDF5File(data_file)
        feats = f.read("feats")
        return feats
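# A hedged end-to-end sketch (not part of the original commit): the wav path
# and checkpoint are hypothetical; without a model_file the extractor runs
# with random weights, as in the unit tests, so the embeddings are only
# meaningful with a trained checkpoint.
def _demo_extractor_usage():
    extractor = DltResNetExtractor()  # or DltResNetExtractor(model_file='...')
    rate, data = read('/path/to/audio.wav')  # hypothetical 16 kHz wav file
    embeddings = extractor((rate, data))
    print(embeddings.shape)  # (n_chunks, 128)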
from .DltResNet import DltResNetExtractor
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
"""Says object was actually declared here, and not in the import module.
Fixing sphinx warnings of not being able to find classes, when path is
shortened. Parameters:
*args: An iterable of objects to modify
Resolves `Sphinx referencing issues
<https://github.com/sphinx-doc/sphinx/issues/3048>`
"""
for obj in args:
obj.__module__ = __name__
__appropriate__(
DltResNetExtractor,
)
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
import os
import math
import numpy
import scipy.io.wavfile
import sys
from numpy.fft import fft
def spectrogram_computation(rate, data, win_length_ms=25, win_shift_ms=10, n_filters=24,
f_min=0., f_max=4000., pre_emphasis_coef=0.97, mel_scale=True):
    win_length = int(rate * win_length_ms / 1000)
    win_shift = int(rate * win_shift_ms / 1000)
    # round the window up to the next power of two for the FFT
    win_size = int(2.0 ** math.ceil(math.log(win_length) / math.log(2)))
# Hamming initialisation
hamming_kernel = init_hamming_kernel(win_length)
# Compute cut-off frequencies
p_index = init_freqfilter(rate, win_size, mel_scale, n_filters, f_min, f_max)
data_size = data.shape[0]
n_frames = int(1 + (data_size - win_length) / win_shift)
    # create the feature matrix: one row of win_size/2 + 1 spectrum bins per frame
    if n_frames <= 0:
        raise ValueError("input signal too short: {} samples".format(data_size))
    features = numpy.zeros([n_frames, int(win_size / 2) + 1], dtype=numpy.float64)
last_frame_elem = 0
    # compute the magnitude spectrum frame by frame
for i in range(n_frames):
# create a frame
frame = numpy.zeros(win_size, dtype=numpy.float64)
vec = numpy.arange(win_length)
frame[vec] = data[vec + i * win_shift]
som = numpy.sum(frame)
som = som / win_size
frame[vec] -= som # normalization by mean here
frame_, last_frame_elem = pre_emphasis(frame[vec], win_shift, pre_emphasis_coef, last_frame_elem)
frame[vec] = frame_
# Hamming windowing
frame = hamming_window(frame, hamming_kernel, win_length)
filters, spec_row = log_filter_bank(frame, n_filters, p_index, win_size)
features[i] = spec_row[0:int(win_size/2)+1]
return numpy.array(features)
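# A quick self-check sketch assuming 16 kHz input (an assumption, not part of
# the original commit): win_length is then 400 samples, win_size rounds up to
# 512, so each frame yields 512/2 + 1 = 257 bins, and one second of audio
# gives int(1 + (16000 - 400) / 160) = 98 frames.
def _demo_spectrogram_shape():
    rate = 16000
    t = numpy.arange(rate) / float(rate)
    data = 0.5 * numpy.sin(2 * numpy.pi * 440.0 * t) * 2 ** 15  # 440 Hz tone
    feats = spectrogram_computation(rate, data)
    assert feats.shape == (98, 257)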
def init_hamming_kernel(win_length):
# Hamming initialisation
cst = 2 * math.pi / (win_length - 1.0)
hamming_kernel = numpy.zeros(win_length)
for i in range(win_length):
hamming_kernel[i] = (0.54 - 0.46 * math.cos(i * cst))
return hamming_kernel
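# Sanity-check sketch: the loop above builds the classic Hamming window
# 0.54 - 0.46 * cos(2*pi*i / (N-1)), which matches numpy.hamming exactly.
def _demo_hamming_matches_numpy():
    assert numpy.allclose(init_hamming_kernel(400), numpy.hamming(400))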
def init_freqfilter(rate, win_size, mel_scale, n_filters, f_min, f_max):
# Compute cut-off frequencies
p_index = numpy.array(numpy.zeros(n_filters + 2), dtype=numpy.float64)
if (mel_scale):
# Mel scale
m_max = mel_python(f_max)
m_min = mel_python(f_min)
for i in range(n_filters + 2):
alpha = float(i) / (n_filters+1)
f = mel_inv_python(m_min * (1 - alpha) + m_max * alpha)
factor = float(f) / rate
p_index[i] = win_size * factor
else:
# linear scale
for i in range(n_filters + 2):
alpha = float(i) / (n_filters+1)
f = f_min * (1.0 - alpha) + f_max * alpha
p_index[i] = float(win_size) / rate * f
return p_index
def init_dct_kernel(n_filters, n_ceps, dct_norm):
dct_kernel = numpy.zeros([n_ceps, n_filters], dtype=numpy.float64)
dct_coeff = 1.0
if dct_norm:
dct_coeff = math.sqrt(2.0/n_filters)
for i in range(0, n_ceps):
for j in range(0, n_filters ):
dct_kernel[i][j] = dct_coeff * math.cos(math.pi * i * (j + 0.5) / float(n_filters))
if dct_norm:
column_multiplier = numpy.ones(n_ceps, dtype=numpy.float64)
column_multiplier[0] = math.sqrt(0.5) # first element sqrt(0.5), the rest are 1.
for j in range(0, n_filters):
dct_kernel[:, j] = column_multiplier * dct_kernel[:, j]
return dct_kernel
def read(filename):
    """Read a wav file and return [rate, data], casting int16 samples to float"""
fileName, fileExtension = os.path.splitext(filename)
wav_filename = filename
rate, data = scipy.io.wavfile.read(str(wav_filename)) # the data is read in its native format
if data.dtype =='int16':
data = numpy.cast['float'](data)
return [rate,data]
def compare(v1, v2, width):
return abs(v1-v2) <= width
def mel_python(f):
return 2595.0*math.log10(1.+f/700.0)
def mel_inv_python(value):
return 700.0 * (10 ** (value / 2595.0) - 1)
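# Round-trip sketch: mel_python implements the standard mel mapping
# 2595 * log10(1 + f/700) and mel_inv_python is its closed-form inverse,
# so composing the two should return the original frequency.
def _demo_mel_roundtrip():
    for f in (0.0, 300.0, 1000.0, 4000.0):
        assert abs(mel_inv_python(mel_python(f)) - f) < 1e-6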
def sig_norm(win_length, frame, flag):
gain = 0.0
for i in range(win_length):
gain = gain + frame[i] * frame[i]
ENERGY_FLOOR = 1.0
if gain < ENERGY_FLOOR:
gain = math.log(ENERGY_FLOOR)
else:
gain = math.log(gain)
if(flag and gain != 0.0):
for i in range(win_length):
frame[i] = frame[i] / gain
return gain
def pre_emphasis(frame, win_shift, coef, last_frame_elem):
    if (coef <= 0.0) or (coef > 1.0):
        # raise instead of returning None: the caller unpacks two return values
        raise ValueError("the pre-emphasis coefficient should be between 0 and 1")
    last_element = frame[win_shift - 1]
    return numpy.append(frame[0] - coef * last_frame_elem, frame[1:] - coef * frame[:-1]), last_element
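# Illustration sketch: pre-emphasis is the first-order high-pass filter
# y[n] = x[n] - coef * x[n-1]; last_frame_elem carries the sample preceding
# the frame so that consecutive frames are filtered consistently.
def _demo_pre_emphasis():
    frame = numpy.array([1.0, 2.0, 3.0, 4.0])
    out, last = pre_emphasis(frame, win_shift=2, coef=0.97, last_frame_elem=0.0)
    assert numpy.allclose(out, [1.0, 2.0 - 0.97 * 1.0, 3.0 - 0.97 * 2.0, 4.0 - 0.97 * 3.0])
    assert last == 2.0  # frame[win_shift - 1], handed to the next frame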
def hamming_window(vector, hamming_kernel, win_length):
for i in range(win_length):
vector[i] = vector[i] * hamming_kernel[i]
return vector
def log_filter_bank(frame, n_filters, p_index, win_size):
x1 = numpy.array(frame, dtype=numpy.complex128)
complex_ = fft(x1)
abscomplex = numpy.absolute(complex_)
frame[0:int(win_size / 2) + 1] = abscomplex[0:int(win_size / 2) + 1]
filters = log_triangular_bank(frame, n_filters, p_index)
return filters, frame
def log_triangular_bank(data, n_filters, p_index):
res_ = numpy.zeros(n_filters, dtype=numpy.float64)
denominator = 1.0 / (p_index[1:n_filters+2] - p_index[0:n_filters+1])
for i in range(0, n_filters):
li = int(math.floor(p_index[i] + 1))
mi = int(math.floor(p_index[i+1]))
ri = int(math.floor(p_index[i+2]))
if i == 0 or li == ri:
li -= 1
vec_left = numpy.arange(li, mi+1)
vec_right = numpy.arange(mi+1, ri+1)
res_[i] = numpy.sum(data[vec_left] * denominator[i] * (vec_left-p_index[i])) + \
numpy.sum(data[vec_right] * denominator[i+1] * (p_index[i+2]-vec_right))
# alternative but equivalent implementation:
# filt = numpy.zeros(ri-li+1, dtype=numpy.float64)
# filt_l = denominator[i] * (vec_left-p_index[i])
# filt_p = denominator[i+1] * (p_index[i+2]-vec_right)
# filt = numpy.append(filt_l, filt_p)
# vect_full = numpy.arange(li, ri+1)
# res_[i] = numpy.sum(data[vect_full] * filt)
FBANK_OUT_FLOOR = sys.float_info.epsilon
return numpy.log(numpy.where(res_ < FBANK_OUT_FLOOR, FBANK_OUT_FLOOR, res_))
def dct_transform(filters, n_filters, dct_kernel, n_ceps):
ceps = numpy.zeros(n_ceps)
vec = numpy.array(range(0, n_filters))
for i in range(0, n_ceps):
ceps[i] = numpy.sum(filters[vec] * dct_kernel[i])
return ceps
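# Equivalence sketch (an observation, not part of the original commit): with
# dct_norm=True, init_dct_kernel builds the orthonormal DCT-II matrix, so
# dct_transform should agree with scipy.fftpack.dct(..., type=2, norm='ortho')
# truncated to the first n_ceps coefficients.
def _demo_dct_matches_scipy():
    import scipy.fftpack
    n_filters, n_ceps = 24, 13
    filters = numpy.random.rand(n_filters)
    kernel = init_dct_kernel(n_filters, n_ceps, dct_norm=True)
    ceps = dct_transform(filters, n_filters, kernel, n_ceps)
    ref = scipy.fftpack.dct(filters, type=2, norm='ortho')[:n_ceps]
    assert numpy.allclose(ceps, ref)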
#!/usr/bin/env python
# encoding: utf-8
import pkg_resources
import numpy
numpy.random.seed(10)
import os
def test_drn():
    """ Test for the DltResNet architecture.

    This architecture takes a Tx257 audio chunk as input, where T is the
    number of frames and 257 is the number of unique bins of a spectrogram
    computed with 512-point FFT resolution. The output is an embedding of
    dimension 128.
    """
from bob.learn.pytorch.preprocessor.audio import DltResNetExtractor
extractor = DltResNetExtractor()
# this architecture expects Tx257
data = numpy.random.rand(512, 257).astype("float32")
output = extractor.extract_embeddings(extractor.network, data, 0)
assert output.shape[1] == 128