Commit c2b2c55e authored by Yannick DAYER

test pipeline, and delayed audio sample loading.

parent de05cd1d
Pipeline #52701 failed with stage
in 25 minutes and 42 seconds
......@@ -35,4 +35,6 @@ if "protocol" not in locals():
database = VoxforgeBioDatabase(
protocol=protocol,
dataset_protocol_path="/idiap/temp/ydayer/bob_data/datasets/database-protocols-voxforge-short.tar.gz",
annotations_path="results~/annotations", # TODO remove (and add annotations to server?)
)
......@@ -12,6 +12,7 @@ from bob.bio.base.pipelines.vanilla_biometrics.legacy import get_temp_directory
from bob.bio.base.transformers import AlgorithmTransformer
from bob.bio.gmm.algorithm import GMM
from bob.bio.spear.extractor import Cepstral
from bob.pipelines import wrap
from bob.pipelines.sample_loaders import AnnotationsLoader
temp_dir = get_temp_directory("spear_mfcc60_voxforgegmm")
......@@ -22,18 +23,21 @@ annotations_loader = AnnotationsLoader(
annotation_directory="results~/annotations",
)
extractor_transformer = Cepstral()
legacy_algorithm = GMM(
number_of_gaussians=256,
training_threshold=0.0, # Maximum number of iterations as stopping criterion
)
algorithm_transformer = AlgorithmTransformer(
algorithm_transformer = wrap(
["sample"],
AlgorithmTransformer(
legacy_algorithm, projector_file=os.path.join(temp_dir, "projector.hdf5")
),
)
transformer = Pipeline(
[
("load_annotations", annotations_loader),
("extractor", extractor_transformer),
("algorithm_transformer", algorithm_transformer),
]
......
......@@ -20,7 +20,6 @@ from bob.extension.download import download_and_unzip
from bob.extension.download import get_file
from bob.extension.download import search_file
from bob.extension.scripts.click_helper import verbosity_option
from bob.io.audio import reader as AudioReader
from bob.pipelines.sample_loaders import AnnotationsLoader
logger = logging.getLogger(__name__)
......@@ -48,6 +47,11 @@ def get_voxforge_protocol_file():
)
def path_loader(path):
    """Return ``path`` unchanged, so the sample data is the file path itself.

    Used as the ``data_loader`` of the CSV sample loader: nothing is read from
    disk here, the call is only logged at debug level.
    """
    # Lazy %-style arguments: the message is only formatted when debug logging
    # is enabled, which matters because this runs once per CSV row.
    logger.debug("Reading CSV row for %s", path)
    return path
def VoxforgeBioDatabase(
protocol="Default",
dataset_protocol_path=None,
......@@ -118,8 +122,8 @@ def VoxforgeBioDatabase(
# Define the data loading transformers
# Keeps the path of each wav file as the sample data (see path_loader); the audio is not read at this stage
reader_loader = CSVToSampleLoaderBiometrics(
data_loader=AudioReader,
path_to_data_loader = CSVToSampleLoaderBiometrics(
data_loader=path_loader,
dataset_original_directory=data_path,
extension=".wav",
)
......@@ -140,7 +144,7 @@ def VoxforgeBioDatabase(
# Build the data loading pipeline
sample_loader = Pipeline(
[
("db:reader_loader", reader_loader),
("db:reader_loader", path_to_data_loader),
("db:reader_to_sample", reader_to_sample),
("db:annotations_loader", annotations_loader),
]
......
......@@ -28,12 +28,11 @@ from sklearn.base import TransformerMixin
import bob.ap
from bob.bio.base.extractor import Extractor
from bob.pipelines import Sample
from bob.pipelines import SampleSet
from .. import utils
logger = logging.getLogger("bob.bio.spear")
logger = logging.getLogger(__name__)
class Cepstral(BaseEstimator, TransformerMixin):
......@@ -54,30 +53,12 @@ class Cepstral(BaseEstimator, TransformerMixin):
with_delta_delta=True,
n_ceps=19, # 0-->18
pre_emphasis_coef=0.95,
features_mask=numpy.arange(0, 60),
features_mask=None,
# Normalization
normalize_flag=True,
**kwargs
**kwargs,
):
# call base class constructor with its set of parameters
Extractor.__init__(
self,
win_length_ms=win_length_ms,
win_shift_ms=win_shift_ms,
n_filters=n_filters,
dct_norm=dct_norm,
f_min=f_min,
f_max=f_max,
delta_win=delta_win,
mel_scale=mel_scale,
with_energy=with_energy,
with_delta=with_delta,
with_delta_delta=with_delta_delta,
n_ceps=n_ceps,
pre_emphasis_coef=pre_emphasis_coef,
features_mask=features_mask,
normalize_flag=normalize_flag,
)
super().__init__(**kwargs)
# copy parameters
self.win_length_ms = win_length_ms
self.win_shift_ms = win_shift_ms
......@@ -96,10 +77,6 @@ class Cepstral(BaseEstimator, TransformerMixin):
self.normalize_flag = normalize_flag
def normalize_features(self, params):
#########################
# Initialisation part #
#########################
normalized_vector = [
[0 for i in range(params.shape[1])] for j in range(params.shape[0])
]
......@@ -113,12 +90,12 @@ class Cepstral(BaseEstimator, TransformerMixin):
data = numpy.array(normalized_vector)
return data
def __call__(self, sample: Sample):
def transform_one(self, sample, **kwargs):
"""Computes and returns normalized cepstral features for the given input data"""
rate = getattr(sample, "sample_rate")
wavsample = getattr(sample, "data")
vad_labels = getattr(sample, "annotations")
logger.debug(f"Cepstral transform of {sample}")
rate = getattr(sample, "rate")
wavsample = getattr(sample, "data")[0]
vad_labels = numpy.array(getattr(sample, "annotations"))
# Set parameters
wl = self.win_length_ms
......@@ -138,6 +115,9 @@ class Cepstral(BaseEstimator, TransformerMixin):
ceps.with_delta_delta = self.with_delta_delta
cepstral_features = ceps(wavsample)
if self.features_mask is None:
features_mask = numpy.arange(0, 60)
else:
features_mask = self.features_mask
if vad_labels is not None: # don't apply VAD
filtered_features = numpy.ndarray(
......@@ -167,16 +147,21 @@ class Cepstral(BaseEstimator, TransformerMixin):
else:
normalized_features = filtered_features
if normalized_features.shape[0] == 0:
logger.warn("No speech found for this utterance")
logger.warning("No speech found for this utterance")
# But do not keep it empty!!! This avoids errors in next steps
normalized_features = numpy.array([numpy.zeros(len(features_mask))])
return normalized_features
def transform(self, samples):
    """Compute cepstral features for every element of ``samples``.

    Each element that is a ``SampleSet`` is mapped to a new ``SampleSet``
    (created with the original set as ``parent``) containing the result of
    ``transform_one`` for each of its members, in the original order. Any
    other element is passed to ``transform_one`` directly.

    Returns a list with one entry per element of ``samples``.
    """
    result = []
    for sample in samples:
        if isinstance(sample, SampleSet):
            # Build the members in iteration order first. Inserting at index
            # -1 would place every new item *before* the current last one and
            # scramble the order of sets with more than one member.
            features = [self.transform_one(s) for s in sample]
            result.append(SampleSet(samples=features, parent=sample))
        else:
            result.append(self.transform_one(sample))
    return result
def fit(self, X, y=None, **fit_params):
    """Do nothing and return ``self``; ``X``, ``y`` and ``fit_params`` are ignored."""
    return self
......
......@@ -2,49 +2,72 @@
# @author: Yannick Dayer <yannick.dayer@idiap.ch>
# @date: Thu 01 Jul 2021 10:41:55 UTC+02
import logging
from functools import lru_cache
from functools import partial
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from bob.io.audio import reader as AudioReader
from bob.pipelines import Sample
from bob.pipelines.sample import SampleBatch
from bob.pipelines import DelayedSample
logger = logging.getLogger(__name__)
audio_reader_keys = [
"rate",
"number_of_samples",
"number_of_channels",
"bits_per_sample",
"duration",
"encoding",
"type",
"compression_factor",
]
@lru_cache()
def load_metadata_from_file(filename: str) -> dict:
    """Return the metadata fields of the audio file ``filename``.

    Opens the file with ``AudioReader`` and returns a dict that maps every name
    listed in ``audio_reader_keys`` to the reader attribute of the same name.
    The audio data itself is not loaded here.

    Results are memoized per file name by ``lru_cache``, so repeated requests
    for the same file return the same dict object without reopening the file.
    """
    # Lazy %-style arguments: only formatted when debug logging is enabled.
    logger.debug("Reading metadata from audio file %s", filename)
    reader = AudioReader(filename)
    return {key: getattr(reader, key) for key in audio_reader_keys}
def load_data_from_file(filename: str):
    """Open ``filename`` with ``AudioReader`` and return the result of its ``load()``."""
    # Lazy %-style arguments: only formatted when debug logging is enabled.
    logger.debug("Reading data from audio file %s", filename)
    reader = AudioReader(filename)
    return reader.load()
def get_audio_attribute(sample, key):
    """Fetch one attribute of the audio file referenced by ``sample.data``.

    ``sample.data`` is passed as the file name. The key ``"data"`` returns the
    result of ``load_data_from_file``; any other key is looked up in the dict
    returned by ``load_metadata_from_file``.
    """
    audio_path = sample.data
    if key != "data":
        metadata = load_metadata_from_file(audio_path)
        return metadata[key]
    return load_data_from_file(audio_path)
class AudioReaderToSample(BaseEstimator, TransformerMixin):
"""Transforms a Sample's data containing an audio reader to an audio signal.
"""Transforms a Sample's data containing a path to an audio signal.
The Sample's metadata are updated.
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
# dict[Sample attribute name, AudioReader field name]
self.metadata_keys = {
"sample_rate": "rate",
"number_of_samples": "number_of_samples",
"number_of_channels": "number_of_channels",
"bits_per_sample": "bits_per_sample",
"audio_duration": "duration",
"audio_encoding": "encoding",
"audio_sample_type": "type",
"audio_compression_factor": "compression_factor",
def populate_from_reader(self, sample: DelayedSample) -> DelayedSample:
"""Assigns the Sample's data and metadata."""
delayed_attr = {
key: partial(get_audio_attribute, sample, key) for key in audio_reader_keys
}
def extract_from_reader(self, reader: AudioReader) -> dict:
"""Extracts a set of metadata from a reader object"""
results = {"data": reader.load()}
for metadata, reader_key in self.metadata_keys.items():
results[metadata] = getattr(reader, reader_key)
return results
def populate_from_reader(self, sample: Sample) -> Sample:
"""Loads from the AudioReader of the Sample and set its fields accordingly."""
extracted = self.extract_from_reader(sample.data)
kwargs = {e: extracted[e] for e in extracted if e != "data"}
new_sample = Sample(data=extracted["data"], parent=sample, **kwargs)
new_sample = DelayedSample(
load=partial(get_audio_attribute, sample, "data"),
parent=sample,
delayed_attributes=delayed_attr,
)
return new_sample
def transform(self, samples: SampleBatch):
def transform(self, samples: list):
output = []
for sample in samples:
output.append(self.populate_from_reader(sample))
......
......@@ -156,6 +156,13 @@ setup(
# the same as above but with smaller thresholds
"gmm-tomi-scfc = bob.bio.spear.config.algorithm.gmm_tomi_scfc:algorithm",
],
"bob.bio.pipeline": [
"test_pipeline = bob.bio.spear.config.pipeline.mfcc60_voxforgegmm:pipeline"
],
"bob.bio.config": [
"voxforge = bob.bio.spear.config.database.voxforge:database"
"test_pipeline = bob.bio.spear.config.pipeline.mfcc60_voxforgegmm:pipeline"
],
"bob.bio.grid": [
"modest = bob.bio.spear.config.grid.modest:grid",
],
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment