Commit cc6a6e11 authored by Pavel KORSHUNOV's avatar Pavel KORSHUNOV

tensorflow based audio extractor

parent 2cc51060
Pipeline #7283 failed with stages
in 16 minutes and 23 seconds
......@@ -5,6 +5,7 @@ from .glcms import GLCMs
from .spectrogram_extended import SpectrogramExtended
from .lbp_histograms import LBPHistograms
from .dummy_tensorflow import DummyTF
from .audio_tensorflow import AudioTFExtractor
# gets sphinx autodoc done right - don't remove it
def __appropriate__(*args):
......@@ -28,5 +29,6 @@ __appropriate__(
SpectrogramExtended,
LBPHistograms,
DummyTF,
AudioTFExtractor,
)
__all__ = [_ for _ in dir() if not _.startswith('_')]
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# Tiago de Freitas Pereira <tiago.pereira@idiap.ch>
"""DNN-based features for voice presentation attack detection"""
import numpy
import bob.io.base
from bob.bio.base.extractor import Extractor
from bob.learn.tensorflow.network import SequenceNetwork
from bob.learn.tensorflow.utils import Session
from bob.learn.tensorflow.datashuffler import DiskAudio
import bob.io.base
import bob.ip.base
import logging
logger = logging.getLogger("bob.pad.voice")
class AudioTFExtractor(Extractor):
    """Audio feature extractor backed by a pretrained TensorFlow DNN.

    Splits a raw audio sample into frames and projects each frame through a
    :py:class:`bob.learn.tensorflow.network.SequenceNetwork`, returning the
    activations of the requested layer as feature vectors.

    **Parameters:**

    feature_layer: The layer to be used as features. Possible values are `fc1` or 'fc2'.
    """

    def __init__(
            self,
            feature_layer="fc1",
            **kwargs
    ):
        # requires_training=True so that the framework passes the pretrained
        # model file to load(); skip_extractor_training=True prevents any
        # actual training from being attempted.
        Extractor.__init__(self, requires_training=True,
                           split_training_data_by_client=False,
                           skip_extractor_training=True, **kwargs)

        # Imported lazily so the module can be imported without TensorFlow
        # being available at import time.
        import tensorflow as tf
        self.session = tf.Session()

        self.feature_layer = feature_layer
        # Dummy shuffler instance: only its extract_frames_from_wav()
        # utility is used, the [0] file/label lists are placeholders.
        self.data_reader = DiskAudio([0], [0])
        self.dnn_model = SequenceNetwork(default_feature_layer=feature_layer)

    def __call__(self, input_data):
        """Computes DNN-based features for the given audio sample.

        **Parameters:**

        input_data: A ``(rate, wav_sample)`` pair; only the waveform
          (``input_data[1]``) is used, the sampling rate is ignored.

        **Returns:**

        A :py:class:`numpy.ndarray` of ``float64`` with one feature vector
        per extracted frame.
        """
        wav_sample = input_data[1]
        logger.debug(" .... Extracting frames on the fly from %d length sample" % wav_sample.shape[0])

        # The second argument is a dummy label; the returned labels are not
        # needed here.
        frames, _ = self.data_reader.extract_frames_from_wav(wav_sample, 0)
        frames = numpy.asarray(frames)
        logger.debug(" .... And %d frames are extracted to pass into DNN model" % frames.shape[0])

        # Reshape to (num_frames, frame_length, 1) — presumably the input
        # layout the SequenceNetwork expects; confirm against the model.
        frames = numpy.reshape(frames, (frames.shape[0], -1, 1))

        projection_on_dnn = self.dnn_model(frames, self.feature_layer)
        return numpy.asarray(projection_on_dnn, dtype=numpy.float64)

    def train(self, *args, **kwargs):
        """Not supported: the model is pretrained and only loaded via :py:meth:`load`."""
        raise NotImplementedError("This function is not implemented and should not be called.")

    def load(self, extractor_file):
        """Loads the pretrained DNN model from the given HDF5 file.

        **Parameters:**

        extractor_file: Path to the HDF5 file with the serialized network.
        """
        logger.info("Loading pretrained model from {0}".format(extractor_file))
        self.dnn_model = SequenceNetwork()
        # NOTE(review): the input shape [1, 6560, 1] is hard-coded and must
        # match the frame size produced by DiskAudio — confirm if the frame
        # extraction parameters ever change.
        self.dnn_model.load_hdf5(bob.io.base.HDF5File(extractor_file), shape=[1, 6560, 1])
# Default extractor instance, registered in setup.py under the
# 'bob.pad.extractor' and 'bob.bio.extractor' entry points as
# 'bob.pad.voice.extractor.audio_tensorflow:audiotf'.
audiotf = AudioTFExtractor()
......@@ -142,6 +142,7 @@ setup(
'bob.pad.extractor': [
'cqcc20e = bob.bio.spear.config.extractor.cqcc20:cqcc20', # Extractor (reads Matlab files) for CQCC features
'audiotf = bob.pad.voice.extractor.audio_tensorflow:audiotf', # For audio tensorflow
'dummytfe = bob.pad.voice.extractor.dummy_tensorflow:dummytf', # For tensorflow
'glcms = bob.pad.voice.extractor.glcms:extractor',
'lbp-hist = bob.pad.voice.extractor.lbp_histograms:extractor',
......@@ -167,6 +168,10 @@ setup(
],
'bob.bio.extractor': [
'audiotf = bob.pad.voice.extractor.audio_tensorflow:audiotf', # For audio tensorflow
],
'bob.pad.grid': [
'modest = bob.bio.spear.config.grid.modest:grid',
],
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment