Commit 2c3117ed authored by Amir MOHAMMADI's avatar Amir MOHAMMADI

DltResNet audio extractor improvements

parent 975c12bf
Pipeline #31362 passed with stage
in 77 minutes and 54 seconds
......@@ -13,6 +13,8 @@ from bob.extension import rc
from bob.extension.download import download_file
import os
from bob.ap import Spectrogram
from scipy.signal import resample
class DltResNetExtractor(Preprocessor):
""" The class for implementing the feature extraction of DltResNet embeddings.
......@@ -25,7 +27,7 @@ class DltResNetExtractor(Preprocessor):
Use gpu for extracting the embeddings.
"""
def __init__(self, model_file=None, num_classes=1211, bn_dim=128, cuda_flag=0, test=False, use_res=False):
""" Init method
......@@ -43,11 +45,11 @@ class DltResNetExtractor(Preprocessor):
Called for unit test (Default: False).
use_res: bool
Use ResNet model instead of DltResNet model (Default: False).
"""
Preprocessor.__init__(self, min_preprocessed_file_size=bn_dim)
# model
model_type = "emb"
if use_res:
......@@ -77,9 +79,9 @@ class DltResNetExtractor(Preprocessor):
model_file = model_path
else:
# do nothing (used mainly for unit testing)
# do nothing (used mainly for unit testing)
pass
if test:
self.network = torch.nn.DataParallel(self.network)
else:
......@@ -118,8 +120,8 @@ class DltResNetExtractor(Preprocessor):
Returns
-------
feature : 2D :py:class:`numpy.ndarray` (floats)
The extracted features as a 1d array of size 128
The extracted features as a 1d array of size 128
"""
feats = []
......@@ -139,10 +141,10 @@ class DltResNetExtractor(Preprocessor):
k = specs[i*L:(i+1)*L,:]
else:
k = specs[i*L:,:]
X = (k - k.mean(axis=0)) / (k.std(axis=0) + 1e-7)
batch.append(X)
if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
if len(batch) == batch_size or i == embedding_count - 2 or i == embedding_count - 1:
with torch.no_grad():
in_batch = numpy.array(batch)
if cuda_flag == 1:
......@@ -152,7 +154,7 @@ class DltResNetExtractor(Preprocessor):
fX = model(input_var).data.cpu().numpy()
feats.append(fX)
batch = []
return numpy.vstack(feats)
def __call__(self, audio, annotations=None):
......@@ -161,22 +163,29 @@ class DltResNetExtractor(Preprocessor):
Parameters
----------
audio : :py:class:`numpy.ndarray` (floats)
The audio file to extract the features from.
The audio file to extract the features from.
annotations : None
Apply annotations if needed
Returns
-------
feature : 2D :py:class:`numpy.ndarray` (floats)
The extracted features as a 1d array of size 128
The extracted features as a 1d array of size 128
"""
data = audio[1]
if data.dtype =='int16':
data = numpy.cast['float'](data)
if numpy.max(numpy.abs(data)) < 1:
data = data * 2**15
rate = audio[0]
# resample audio to 16KHz so it can work with the model
if rate != 16000:
samps = round(len(data) * 16000 / rate) # Number of samples to resample
data = resample(data, samps)
rate = 16000
win_length_ms = 25
win_shift_ms = 10
normalize_mean = True
......@@ -204,4 +213,4 @@ class DltResNetExtractor(Preprocessor):
"""
f= bob.io.base.HDF5File(data_file)
feats = f.read("feats")
return feats
\ No newline at end of file
return feats
......@@ -2,7 +2,7 @@
Audio Embedding Extractor
=========================
This subpackage is part of ``bob.learn.pytorch`` package to extract features from an input audio using CNN models which
This subpackage is part of ``bob.learn.pytorch`` package to extract features from an input audio using CNN models which
were trained with pytorch_.
For this purpose, you can specify your feature extractor in configuration
......@@ -18,10 +18,10 @@ DltResNet Model
.. note::
The models will automatically download to the data folder of this package and save it in
The model will be downloaded automatically to the data folder of this package and saved in
``[env-path]./bob/learn/pytorch/preprocessor/audio/data/drn34.pth.tar``.
If you want to set another path for this model do::
$ bob config set bob.learn.pytorch.extractor.audio.drn_modelpath /path/to/mymodel
......@@ -34,18 +34,18 @@ ResNet Model
.. note::
The models will automatically download to the data folder of this package and save it in
The model will be downloaded automatically to the data folder of this package and saved in
``[env-path]./bob/learn/pytorch/preprocessor/audio/data/rn34.pth.tar``.
If you want to set another path for this model do::
$ bob config set bob.learn.pytorch.extractor.audio.rn_modelpath /path/to/mymodel
A concrete example
------------------
Imagine that you have the DltResNet model and you would
like to use the embedding layer as a feature to encode identity.
Imagine that you have the DltResNet model and you would
like to use the embedding layer as a feature to encode identity.
Your ``preprocessor`` in bob_ pipe-lines should be defined this way in the configuration file:
......@@ -56,16 +56,18 @@ Your ``preprocessor`` in bob_ pipe-lines should be defined this way in the confi
_model = 'path/to/your/model.pth'
_num_classes = 1211
_use_res = False
preprocessor = ResNetDltExtractor(_model, _num_classes, use_res=_use_res)
preprocessor = DltResNetExtractor(_model, _num_classes, use_res=_use_res)
Note that the number of classes is irrelevant here, but is required to build the
Note that the number of classes is irrelevant here, but is required to build the
network (before loading it). ``_model``, ``_num_classes`` and ``_use_res`` are optional input arguments and will be set automatically when omitted. If you want to use the ResNet model instead of the DltResNet model, set the ``use_res`` input argument to ``True``. This class is the embedding extractor; it is exposed under the name ``preprocessor`` only for compatibility with the bob_ framework. In this setup, we just need a dummy ``extractor`` in the bob_ framework, which can be defined in the configuration file in this way:
.. code:: python
from bob.learn.pytorch.extractor.audio import DummyExtractor
from bob.bio.base.extractor import CallableExtractor
extractor = DummyExtractor()
extractor = CallableExtractor(lambda x: x)
extracted_directory = "preprocessed"
skip_extraction = True
You can easily implement your own extractor based on your own network too. Just have
......@@ -73,4 +75,4 @@ a look at the code in ``bob/learn/pytorch/preprocessor/audio``.
.. _bob: http://idiap.github.io/bob/
.. _pytorch: http://pytorch.org/
\ No newline at end of file
.. _pytorch: http://pytorch.org/
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment