Skip to content
Snippets Groups Projects

DltResNet audio extractor improvements

2 files
+ 38
27
Compare changes
  • Side-by-side
  • Inline

Files

@@ -13,6 +13,8 @@ from bob.extension import rc
from bob.extension.download import download_file
import os
from bob.ap import Spectrogram
from scipy.signal import resample
class DltResNetExtractor(Preprocessor):
""" The class for implementing the feature extraction of DltResNet embeddings.
@@ -25,7 +27,7 @@ class DltResNetExtractor(Preprocessor):
Use gpu for extracting the embeddings.
"""
def __init__(self, model_file=None, num_classes=1211, bn_dim=128, cuda_flag=0, test=False, use_res=False):
""" Init method
@@ -43,11 +45,11 @@ class DltResNetExtractor(Preprocessor):
Called for unit test (Default: False).
use_res: bool
Use ResNet model instead of DltResNet model (Default: False).
"""
Preprocessor.__init__(self, min_preprocessed_file_size=bn_dim)
# model
model_type = "emb"
if use_res:
@@ -77,9 +79,9 @@ class DltResNetExtractor(Preprocessor):
model_file = model_path
else:
# do nothing (used mainly for unit testing)
# do nothing (used mainly for unit testing)
pass
if test:
self.network = torch.nn.DataParallel(self.network)
else:
@@ -118,8 +120,8 @@ class DltResNetExtractor(Preprocessor):
Returns
-------
feature : 2D :py:class:`numpy.ndarray` (floats)
The extracted features as a 2D array (embeddings stacked row-wise)
The extracted features as a 2D array (embeddings stacked row-wise)
"""
feats = []
@@ -139,10 +141,10 @@ class DltResNetExtractor(Preprocessor):
k = specs[i*L:(i+1)*L,:]
else:
k = specs[i*L:,:]
X = (k - k.mean(axis=0)) / (k.std(axis=0) + 1e-7)
batch.append(X)
if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
with torch.no_grad():
in_batch = numpy.array(batch)
if cuda_flag == 1:
@@ -152,7 +154,7 @@ class DltResNetExtractor(Preprocessor):
fX = model(input_var).data.cpu().numpy()
feats.append(fX)
batch = []
return numpy.vstack(feats)
def __call__(self, audio, annotations=None):
@@ -161,22 +163,29 @@ class DltResNetExtractor(Preprocessor):
Parameters
----------
audio : :py:class:`numpy.ndarray` (floats)
The audio file to extract the features from.
The audio file to extract the features from.
annotations : None
Apply annotations if needed
Returns
-------
feature : 2D :py:class:`numpy.ndarray` (floats)
The extracted features as a 1d array of size 128
The extracted features as a 1d array of size 128
"""
data = audio[1]
if data.dtype =='int16':
data = numpy.cast['float'](data)
if numpy.max(numpy.abs(data)) < 1:
data = data * 2**15
rate = audio[0]
# resample audio to 16 kHz so it can work with the model
if rate != 16000:
samps = round(len(data) * 16000 / rate) # Number of samples to resample
data = resample(data, samps)
rate = 16000
win_length_ms = 25
win_shift_ms = 10
normalize_mean = True
@@ -204,4 +213,4 @@ class DltResNetExtractor(Preprocessor):
"""
f= bob.io.base.HDF5File(data_file)
feats = f.read("feats")
return feats
\ No newline at end of file
return feats
Loading