Commit c8d6a529 authored by Pavel KORSHUNOV's avatar Pavel KORSHUNOV

Merge branch 'new-features' into 'master'

Added RFCC, IMFCC, SCFC, SCMC, SSFC features, revamped tests



See merge request !1
parents 750df966 e9ada31f
Pipeline #3915 passed with stages
in 47 minutes and 42 seconds
......@@ -20,8 +20,14 @@
==========================
This package is part of the signal-processing and machine learning toolbox
Bob_. It contains basic audio processing utilities.
Bob_. It contains basic audio processing utilities. Currently, the following cepstral-based features are available:
using rectangular (RFCC), mel-scaled triangular (MFCC) [Davis1980]_, inverted mel-scaled triangular (IMFCC),
and linear triangular (LFCC) filters [Furui1981]_, spectral flux-based features (SSFC) [Scheirer1997]_,
subband centroid frequency (SCFC) [Le2011]_. We are planning to update and add more features in the
near future.
*Please note that the implementation of MFCC and LFCC features has changed compared to an earlier version of the package,
as we corrected pre-emphasis and DCT computations. Delta and delta-delta computations were slightly changed too.*
Installation
------------
......@@ -39,8 +45,18 @@ Contact
For questions or reporting issues to this software package, contact our
development `mailing list`_.
.. [Davis1980] S. Davis and P. Mermelstein, "Comparison of parametric representations for monosyllabic
word recognition in continuously spoken sentences", in IEEE Transactions on Acoustics, Speech, and Signal Processing,
1980, num 4, vol. 28, pages 357-366.
.. [Furui1981] S. Furui, Cepstral analysis technique for automatic speaker verification, in
IEEE Transactions on Acoustics, Speech, and Signal Processing, 1981, num 2 vol 29, pages 254-272.
.. [Scheirer1997] E. Scheirer and M. Slaney, Construction and evaluation of a robust multifeature speech/music discriminator,
in IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP, 1997, vol 2, pages 1331-1334.
.. [Le2011] P. N. Le, E. Ambikairajah, J. Epps, V. Sethu, E. H. C. Choi, Investigation of Spectral Centroid Features for Cognitive Load Classification,
in Speech Commun., April, 2011, num 4, vol 53, pages 540--551.
.. Place your references here:
.. _bob: https://www.idiap.ch/software/bob
.. _installation: https://gitlab.idiap.ch/bob/bob/wikis/Installation
.. _mailing list: https://groups.google.com/forum/?fromgroups#!forum/bob-devel
/**
* @author Andre Anjos <andre.anjos@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
* @date Thu 6 Feb 09:00:05 2014
*
* @brief Bindings to the base class bob::ap::Ceps
......@@ -13,7 +14,7 @@
PyDoc_STRVAR(s_ceps_str, BOB_EXT_MODULE_PREFIX ".Ceps");
PyDoc_STRVAR(s_ceps_doc,
"Ceps(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [n_filters=24, [n_ceps=19, [f_min=0., [f_max=4000., [delta_win=2, [pre_emphasis_coeff=0.95, [mel_scale=True, [dct_norm=True]]]]]]]]]]) -> new Ceps\n\
"Ceps(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [n_filters=24, [n_ceps=19, [f_min=0., [f_max=4000., [delta_win=2, [pre_emphasis_coeff=0.95, [mel_scale=True, [dct_norm=True, [normalize_mean=True, [rect_filter=False, [inverse_filter=False, [normalize_spectrum=False, [ssfc_features=False, [scfc_features=False, [scmc_features=False]]]]]]]]]]]]]]]]]) -> new Ceps\n\
Ceps(other) -> new Ceps\n\
\n\
Objects of this class, after configuration, can extract the\n\
......@@ -57,6 +58,39 @@ mel_scale\n\
dct_norm\n\
[bool] A factor by which the cepstral coefficients are\n\
multiplied\n\
normalize_mean\n\
[bool] Tells whether frame should be normalized \n\
by subtracting mean (True) or dividing by max_range (False)\n\
``True`` is the default value.\n\
\n\
rect_filter\n\
[bool] tells whether to apply the filter in the\n\
inversed order, i.e., from high frequencies to low\n\
(set it to ``True''). ``False`` is the default value.\n\
\n\
inverse_filter\n\
[bool] tells whether cepstral features are extracted\n\
using a rectungular filter (set it to ``True``), i.e., RFCC features,\n\
instead of the default filter (the default value is ``False``)\n\
\n\
normalize_spectrum\n\
[bool] Tells whether to normalize the power spectrum of the signal.\n\
The default value is ``False``.\n\
\n\
ssfc_features\n\
[bool] Set to true if you want to compute\n\
Subband Spectral Flux Coefficients (SSFC), which measures\n\
the frame-by-frame change in the power spectrum\n\
\n\
scfc_features\n\
[bool] Set to true if you want to compute\n\
Spectral Centroid Frequency Coefficients (SCFC), which\n\
capture detailed information about subbands similar to formant frequencies\n\
\n\
scmc_features\n\
[bool] Set to true if you want to compute\n\
Spectral Centroid Magnitude Coefficients (SCMC), which\n\
capture detailed information about subbands similar to SCFC features\n\
\n\
other\n\
[Ceps] an object of which is or inherits from ``Ceps``\n\
......@@ -132,6 +166,13 @@ static int PyBobApCeps_InitParameters
"pre_emphasis_coeff",
"mel_scale",
"dct_norm",
"normalize_mean",
"rect_filter",
"inverse_filter",
"normalize_spectrum",
"ssfc_features",
"scfc_features",
"scmc_features",
0};
static char** kwlist = const_cast<char**>(const_kwlist);
......@@ -141,24 +182,42 @@ static int PyBobApCeps_InitParameters
Py_ssize_t n_filters = 24;
Py_ssize_t n_ceps = 19;
double f_min = 0.;
double f_max = 4000.;
double f_max = 8000.;
Py_ssize_t delta_win = 2;
double pre_emphasis_coeff = 0.95;
PyObject* mel_scale = Py_True;
PyObject* dct_norm = Py_True;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddnnddndOO", kwlist,
PyObject* dct_norm = Py_False;
PyObject* normalize_mean = Py_True;
PyObject* rect_filter = Py_False;
PyObject* inverse_filter = Py_False;
PyObject* normalize_spectrum = Py_False;
PyObject* ssfc_features = Py_False;
PyObject* scfc_features = Py_False;
PyObject* scmc_features = Py_False;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddnnddndOOOOOOOOO", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms, &n_filters,
&n_ceps, &f_min, &f_max, &delta_win, &pre_emphasis_coeff,
&mel_scale, &dct_norm))
&mel_scale, &dct_norm, &normalize_mean, &rect_filter,
&inverse_filter, &normalize_spectrum,
&ssfc_features, &scfc_features, &scmc_features))
return -1;
bool mel_scale_ = PyObject_IsTrue(mel_scale);
bool dct_norm_ = PyObject_IsTrue(dct_norm);
bool normalize_mean_ = PyObject_IsTrue(normalize_mean);
bool rect_filter_ = PyObject_IsTrue(rect_filter);
bool inverse_filter_ = PyObject_IsTrue(inverse_filter);
bool normalize_spectrum_ = PyObject_IsTrue(normalize_spectrum);
bool ssfc_features_ = PyObject_IsTrue(ssfc_features);
bool scfc_features_ = PyObject_IsTrue(scfc_features);
bool scmc_features_ = PyObject_IsTrue(scmc_features);
try {
self->cxx = new bob::ap::Ceps(sampling_frequency,
win_length_ms, win_shift_ms, n_filters, n_ceps, f_min, f_max,
delta_win, pre_emphasis_coeff, mel_scale_, dct_norm_);
delta_win, pre_emphasis_coeff, mel_scale_, dct_norm_, normalize_mean_,
rect_filter_, inverse_filter_, normalize_spectrum_,
ssfc_features_, scfc_features_, scmc_features_);
if (!self->cxx) {
PyErr_Format(PyExc_MemoryError, "cannot create new object of type `%s' - no more memory", Py_TYPE(self)->tp_name);
return -1;
......@@ -229,7 +288,7 @@ static PyObject* PyBobApCeps_Repr(PyBobApCepsObject* self) {
Py_ssize_t n_filters = self->cxx->getNFilters();
Py_ssize_t n_ceps = self->cxx->getNCeps();
Py_ssize_t delta_win = self->cxx->getDeltaWin();
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, n_filters=%" PY_FORMAT_SIZE_T "d, n_ceps=%" PY_FORMAT_SIZE_T "d, f_min=%f, f_max=%f, delta_win=%" PY_FORMAT_SIZE_T "d, pre_emphasis_coeff=%f, mel_scale=%s, dct_norm=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), n_filters, n_ceps, self->cxx->getFMin(), self->cxx->getFMax(), delta_win, self->cxx->getPreEmphasisCoeff(), self->cxx->getMelScale()?"True":"False", self->cxx->getDctNorm()?"True":"False");
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, n_filters=%" PY_FORMAT_SIZE_T "d, n_ceps=%" PY_FORMAT_SIZE_T "d, f_min=%f, f_max=%f, delta_win=%" PY_FORMAT_SIZE_T "d, pre_emphasis_coeff=%f, mel_scale=%s, dct_norm=%s, normalize_mean=%s, rect_filter=%s, inverse_filter=%s, normalize_spectrum=%s, ssfc_features=%s, scfc_features=%s, scmc_features=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), n_filters, n_ceps, self->cxx->getFMin(), self->cxx->getFMax(), delta_win, self->cxx->getPreEmphasisCoeff(), self->cxx->getMelScale()?"True":"False", self->cxx->getDctNorm()?"True":"False", self->cxx->getNormalizeMean()?"True":"False", self->cxx->getRectangularFilter()?"True":"False", self->cxx->getInverseFilter()?"True":"False", self->cxx->getNormalizeSpectrum()?"True":"False", self->cxx->getSSFCFeatures()?"True":"False", self->cxx->getSCFCFeatures()?"True":"False", self->cxx->getSCMCFeatures()?"True":"False");
return
# if PY_VERSION_HEX >= 0x03000000
PyUnicode_FromStringAndSize
......
......@@ -2,6 +2,7 @@
* @date Wed Jan 11:09:30 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
* @brief Implement Linear and Mel Frequency Cepstral Coefficients
* functions (MFCC and LFCC)
......@@ -16,9 +17,12 @@ bob::ap::Ceps::Ceps(const double sampling_frequency,
const double win_length_ms, const double win_shift_ms,
const size_t n_filters, const size_t n_ceps, const double f_min,
const double f_max, const size_t delta_win, const double pre_emphasis_coeff,
const bool mel_scale, const bool dct_norm):
const bool mel_scale, const bool dct_norm, const bool normalize_mean,
const bool rect_filter, const bool inverse_filter, const bool normalize_spectrum,
const bool ssfc_features, const bool scfc_features, const bool scmc_features):
bob::ap::Spectrogram(sampling_frequency, win_length_ms, win_shift_ms,
n_filters, f_min, f_max, pre_emphasis_coeff, mel_scale),
n_filters, f_min, f_max, pre_emphasis_coeff, mel_scale, normalize_mean, rect_filter, inverse_filter,
normalize_spectrum, ssfc_features, scfc_features, scmc_features),
m_n_ceps(n_ceps), m_delta_win(delta_win), m_dct_norm(dct_norm),
m_with_energy(false), m_with_delta(false), m_with_delta_delta(false)
{
......@@ -95,12 +99,21 @@ void bob::ap::Ceps::setDctNorm(bool dct_norm)
void bob::ap::Ceps::initCacheDctKernel()
{
// Dct Kernel initialization
// Dct Kernel initialization, we implement DCT-II variant here
m_dct_kernel.resize(m_n_ceps,m_n_filters);
blitz::firstIndex i;
blitz::secondIndex j;
// If normalize, use the Matlab-based implementation
double dct_coeff = m_dct_norm ? (double)sqrt(2./(double)(m_n_filters)) : 1.;
m_dct_kernel = dct_coeff * blitz::cos(M_PI*(i+1)*(j+0.5)/(double)(m_n_filters));
m_dct_kernel = dct_coeff * blitz::cos(M_PI*(i)*(j+0.5)/(double)(m_n_filters));
// Finish normalization: multiple first row by sqrt(0.5), as per Matlab implementation of DCT-II
if (m_dct_norm) {
blitz::Array<double,1> firstIndex_coeff (m_n_ceps);
firstIndex_coeff = blitz::where(i == 0, sqrt(0.5), 1.); //first element is sqrt(0.5), the rest are 1.
m_dct_kernel = firstIndex_coeff(i) * m_dct_kernel(i,j); // elementwise multiplication
}
}
......@@ -112,6 +125,10 @@ blitz::TinyVector<int,2> bob::ap::Ceps::getShape(const size_t input_size) const
// 1. Number of frames
res(0) = 1+((input_size-m_win_length)/m_win_shift);
//reduce the number of frames by 1 for SSFC features, so the resulted matrix is of correct size
if (m_ssfc_features)
res(0) -= 1;
// 2. Dimension of the feature vector
int dim0=m_n_ceps;
if (m_with_energy) dim0 += 1;
......@@ -134,33 +151,80 @@ blitz::TinyVector<int,2> bob::ap::Ceps::getShape(const blitz::Array<double,1>& i
void bob::ap::Ceps::operator()(const blitz::Array<double,1>& input,
blitz::Array<double,2>& ceps_matrix)
{
// Get expected dimensionality of output array
blitz::TinyVector<int,2> feature_shape = bob::ap::Ceps::getShape(input);
// Check dimensionality of output array
bob::core::array::assertSameShape(ceps_matrix, feature_shape);
int n_frames=feature_shape(0);
int shift_frame=0;
double last_frame_elem=0;
// Create the holder for the previous frame and make sure it's the same as the current frame
// Used by SSFC features computation
blitz::Array<double,1> _prev_frame_d;
_prev_frame_d.resize(m_cache_frame_d.shape());
// Create the temporary holder for SSFC features computation
blitz::Array<double,1> _temp_frame_d;
_temp_frame_d.resize(m_cache_frame_d.shape());
if (m_ssfc_features) {
//we are going to always process the next frame within the loop
shift_frame = 1;
// Init the first frame to the input
extractNormalizeFrame(input, 0, _prev_frame_d);
// Apply pre-emphasis
pre_emphasis(_prev_frame_d, last_frame_elem);
// Apply the Hamming window
hammingWindow(_prev_frame_d);
// Take the power spectrum of the first part of the FFT
powerSpectrumFFT(_prev_frame_d);
}
blitz::Range r1(0,m_n_ceps-1);
for (int i=0; i<n_frames; ++i)
{
// Set padded frame to zero
extractNormalizeFrame(input, i, m_cache_frame_d);
// Init the current frame from the input, we process (i+1)th frame for SSFC features
extractNormalizeFrame(input, i+shift_frame, m_cache_frame_d);
// Update output with energy if required
if (m_with_energy)
ceps_matrix(i,(int)m_n_ceps) = logEnergy(m_cache_frame_d);
// Apply pre-emphasis
pre_emphasis(m_cache_frame_d);
pre_emphasis(m_cache_frame_d, last_frame_elem);
// Apply the Hamming window
hammingWindow(m_cache_frame_d);
// Take the power spectrum of the first part of the FFT
// Note that after this call, we only operate on the first half of m_cache_frame_d array. The second half is ignored.
// powerSpectrumFFT changes first half+1 elements of m_cache_frame_d array
powerSpectrumFFT(m_cache_frame_d);
// Filter with the triangular filter bank (either in linear or Mel domain)
if (m_ssfc_features)
{
// retrieve the previous frame into our temp
_temp_frame_d = _prev_frame_d;
// remember the current frame for the next round, before we change current frame
_prev_frame_d = m_cache_frame_d;
// Computation of SSFC features:
// We take the previous frame and find the difference between values of current and previous frames
m_cache_frame_d -= _temp_frame_d;
// We compute norm2 for the difference as per SSFC features
m_cache_frame_d = blitz::pow2(m_cache_frame_d);
// Then, we can apply the filter and DCT later on
}
// Filter with triangular or rectangular filter bank (either in linear or Mel domain)
filterBank(m_cache_frame_d);
// Apply DCT kernel and update the output
blitz::Array<double,1> ceps_matrix_row(ceps_matrix(i,r1));
applyDct(ceps_matrix_row);
if (m_scfc_features)
// do not apply DCT on SCFC features
ceps_matrix_row = m_cache_filters(r1);
else
applyDct(ceps_matrix_row);
}
//compute the center of the cut-off frequencies
......@@ -226,8 +290,9 @@ void bob::ap::Ceps::addDerivative(const blitz::Array<double,2>& input, blitz::Ar
}
}
// Sum of the integer squared from 1 to delta_win
const double sum = m_delta_win*(m_delta_win+1)*(2*m_delta_win+1)/3;
output /= sum;
// pavel - remove division for the sake of compitability with Matlab code of RFFC features comparison paper
//const double sum = m_delta_win*(m_delta_win+1)*(2*m_delta_win+1)/3;
//output /= sum;
}
/*
......
......@@ -10,8 +10,8 @@
#include <bob.core/assert.h>
bob::ap::Energy::Energy(const double sampling_frequency, const double win_length_ms,
const double win_shift_ms):
bob::ap::FrameExtractor(sampling_frequency, win_length_ms, win_shift_ms),
const double win_shift_ms, const bool normalize_mean):
bob::ap::FrameExtractor(sampling_frequency, win_length_ms, win_shift_ms, normalize_mean),
m_energy_floor(1.)
{
// Initializes logarithm of flooring values
......@@ -19,7 +19,8 @@ bob::ap::Energy::Energy(const double sampling_frequency, const double win_length
}
bob::ap::Energy::Energy(const bob::ap::Energy& other):
bob::ap::FrameExtractor(other), m_energy_floor(1.)
bob::ap::FrameExtractor(other),
m_energy_floor(other.m_energy_floor)
{
// Initializes logarithm of flooring values
m_log_energy_floor = log(m_energy_floor);
......
......@@ -2,7 +2,8 @@
* @date Wed Jan 11:09:30 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
*
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
* Copyright (C) Idiap Research Institute, Martigny, Switzerland
*/
......@@ -12,23 +13,27 @@
#include <bob.core/check.h>
bob::ap::FrameExtractor::FrameExtractor(const double sampling_frequency,
const double win_length_ms, const double win_shift_ms):
const double win_length_ms, const double win_shift_ms,
const bool normalize_mean):
m_sampling_frequency(sampling_frequency), m_win_length_ms(win_length_ms),
m_win_shift_ms(win_shift_ms)
m_win_shift_ms(win_shift_ms), m_normalize_mean(normalize_mean)
{
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
bob::ap::FrameExtractor::FrameExtractor(const FrameExtractor& other):
m_sampling_frequency(other.m_sampling_frequency),
m_win_length_ms(other.m_win_length_ms),
m_win_shift_ms(other.m_win_shift_ms)
m_win_shift_ms(other.m_win_shift_ms),
m_normalize_mean(other.m_normalize_mean)
{
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
bob::ap::FrameExtractor::~FrameExtractor()
......@@ -42,10 +47,12 @@ bob::ap::FrameExtractor& bob::ap::FrameExtractor::operator=(const bob::ap::Frame
m_sampling_frequency = other.m_sampling_frequency;
m_win_length_ms = other.m_win_length_ms;
m_win_shift_ms = other.m_win_shift_ms;
m_normalize_mean = other.m_normalize_mean;
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
return *this;
}
......@@ -54,7 +61,8 @@ bool bob::ap::FrameExtractor::operator==(const bob::ap::FrameExtractor& other) c
{
return (m_sampling_frequency == other.m_sampling_frequency &&
m_win_length_ms == other.m_win_length_ms &&
m_win_shift_ms == other.m_win_shift_ms);
m_win_shift_ms == other.m_win_shift_ms &&
m_normalize_mean == other.m_normalize_mean);
}
bool bob::ap::FrameExtractor::operator!=(const bob::ap::FrameExtractor& other) const
......@@ -67,6 +75,7 @@ void bob::ap::FrameExtractor::setSamplingFrequency(const double sampling_frequen
m_sampling_frequency = sampling_frequency;
initWinLength();
initWinShift();
initMaxRange();
}
void bob::ap::FrameExtractor::setWinLengthMs(const double win_length_ms)
......@@ -87,6 +96,7 @@ void bob::ap::FrameExtractor::initWinLength()
if (m_win_length == 0)
throw std::runtime_error("The length of the window is 0. You should use a larger sampling rate or window length in miliseconds");
initWinSize();
}
void bob::ap::FrameExtractor::initWinShift()
......@@ -100,6 +110,12 @@ void bob::ap::FrameExtractor::initWinSize()
m_cache_frame_d.resize(m_win_size);
}
void bob::ap::FrameExtractor::initMaxRange()
{
// update m_max_range, since m_sampling_frequency may have changed or set inside an Init()
m_max_range = pow(2.0, m_sampling_frequency/1000)/2.0 - 0.5;
}
void bob::ap::FrameExtractor::extractNormalizeFrame(const blitz::Array<double,1>& input,
const size_t i, blitz::Array<double,1>& frame_d) const
{
......@@ -109,8 +125,19 @@ void bob::ap::FrameExtractor::extractNormalizeFrame(const blitz::Array<double,1>
blitz::Range rf(0,(int)m_win_length-1);
blitz::Range ri(i*(int)m_win_shift,i*(int)m_win_shift+(int)m_win_length-1);
frame_d(rf) = input(ri);
// Subtract mean value
frame_d -= blitz::mean(frame_d);
if (m_normalize_mean) { // added by Pavel Korshunov
// We normalize by subtracting mean value
frame_d(rf) -= blitz::mean(frame_d);
}
else {
//Otherwise, we normalize by dividing by maximum possible range, which is set in initWinLength()
//This method of normalization is used in the following paper from Interspeech 2015:
//"A Comparison of Features for Synthetic Speech Detection" by Md Sahidullah, Tomi Kinnunen, Cemal Hanilci
if (m_max_range == 0)
throw std::runtime_error("FrameExtractor: the maximum range in frame is 0. Please make sure you provide non-zero sampling frequency.");
frame_d /= m_max_range;
}
}
......
......@@ -2,6 +2,7 @@
* @date Wed Jan 11:09:30 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
*
* Copyright (C) Idiap Research Institute, Martigny, Switzerland
......@@ -14,10 +15,16 @@
bob::ap::Spectrogram::Spectrogram(const double sampling_frequency,
const double win_length_ms, const double win_shift_ms,
const size_t n_filters, const double f_min, const double f_max,
const double pre_emphasis_coeff, const bool mel_scale):
bob::ap::Energy(sampling_frequency, win_length_ms, win_shift_ms),
const double pre_emphasis_coeff, const bool mel_scale,
const bool normalize_mean, const bool rect_filter,
const bool inverse_filter, const bool normalize_spectrum,
const bool ssfc_features, const bool scfc_features, const bool scmc_features):
bob::ap::Energy(sampling_frequency, win_length_ms, win_shift_ms, normalize_mean),
m_n_filters(n_filters), m_f_min(f_min), m_f_max(f_max),
m_pre_emphasis_coeff(pre_emphasis_coeff), m_mel_scale(mel_scale),
m_rect_filter(rect_filter), m_inverse_filter(inverse_filter),
m_normalize_spectrum(normalize_spectrum), m_ssfc_features(ssfc_features),
m_scfc_features(scfc_features), m_scmc_features(scmc_features),
m_fb_out_floor(1.), m_energy_filter(false), m_log_filter(true),
m_energy_bands(false), m_fft()
{
......@@ -33,6 +40,8 @@ bob::ap::Spectrogram::Spectrogram(const double sampling_frequency,
initWinShift();
// Initializes logarithm of flooring values
// pavel - allow computing log for a very small value
m_fb_out_floor = std::numeric_limits<double>::epsilon();
m_log_fb_out_floor = log(m_fb_out_floor);
m_cache_filters.resize(m_n_filters);
......@@ -42,9 +51,18 @@ bob::ap::Spectrogram::Spectrogram(const Spectrogram& other):
bob::ap::Energy(other), m_n_filters(other.m_n_filters),
m_f_min(other.m_f_min), m_f_max(other.m_f_max),
m_pre_emphasis_coeff(other.m_pre_emphasis_coeff),
m_mel_scale(other.m_mel_scale), m_fb_out_floor(other.m_fb_out_floor),
m_energy_filter(other.m_energy_filter), m_log_filter(other.m_log_filter),
m_energy_bands(other.m_energy_bands), m_fft(other.m_fft)
m_mel_scale(other.m_mel_scale),
m_rect_filter(other.m_rect_filter),
m_inverse_filter(other.m_inverse_filter),
m_normalize_spectrum(other.m_normalize_spectrum),
m_ssfc_features(other.m_ssfc_features),
m_scfc_features(other.m_scfc_features),
m_scmc_features(other.m_scmc_features),
m_fb_out_floor(other.m_fb_out_floor),
m_energy_filter(other.m_energy_filter),
m_log_filter(other.m_log_filter),
m_energy_bands(other.m_energy_bands),
m_fft(other.m_fft)
{
// Initialization
initWinLength();
......@@ -66,6 +84,12 @@ bob::ap::Spectrogram& bob::ap::Spectrogram::operator=(const bob::ap::Spectrogram
m_f_max = other.m_f_max;
m_pre_emphasis_coeff = other.m_pre_emphasis_coeff;
m_mel_scale = other.m_mel_scale;
m_rect_filter = other.m_rect_filter;
m_inverse_filter = other.m_inverse_filter;
m_normalize_spectrum = other.m_normalize_spectrum;
m_ssfc_features = other.m_ssfc_features;
m_scfc_features = other.m_scfc_features;
m_scmc_features = other.m_scmc_features;
m_fb_out_floor = other.m_fb_out_floor;
m_energy_filter = other.m_energy_filter;
m_log_filter = other.m_log_filter;
......@@ -91,6 +115,12 @@ bool bob::ap::Spectrogram::operator==(const bob::ap::Spectrogram& other) const
m_f_max == other.m_f_max &&
m_pre_emphasis_coeff == other.m_pre_emphasis_coeff &&
m_mel_scale == other.m_mel_scale &&
m_rect_filter == other.m_rect_filter &&
m_normalize_spectrum == other.m_normalize_spectrum &&
m_inverse_filter == other.m_inverse_filter &&
m_ssfc_features == other.m_ssfc_features &&
m_scfc_features == other.m_scfc_features &&
m_scmc_features == other.m_scmc_features &&
m_fb_out_floor == other.m_fb_out_floor &&
m_energy_filter == other.m_energy_filter &&
m_log_filter == other.m_log_filter &&
......@@ -171,6 +201,30 @@ void bob::ap::Spectrogram::setMelScale(bool mel_scale)
initCacheFilterBank();
}
void bob::ap::Spectrogram::setInverseFilter(bool inverse_filter)
{
m_inverse_filter = inverse_filter;
initCacheFilterBank();
}
void bob::ap::Spectrogram::setRectangularFilter(bool rect_filter)
{
m_rect_filter = rect_filter;
initCacheFilterBank();
}
void bob::ap::Spectrogram::setSCFCFeatures(bool scfc_features)
{
m_scfc_features = scfc_features;
initCacheFilters();
}
void bob::ap::Spectrogram::setSCMCFeatures(bool scmc_features)
{
m_scmc_features = scmc_features;
initCacheFilters();
}
double bob::ap::Spectrogram::herzToMel(double f)
{
return (2595.*log10(1+f/700.));
......@@ -208,18 +262,26 @@ void bob::ap::Spectrogram::initCachePIndex()
for (int i=0; i<(int)m_n_filters+2; ++i) {
double alpha = i/ (double)(m_n_filters+1);
double f = melToHerz(m_min * (1-alpha) + m_max * alpha);
double factor = f / m_sampling_frequency;
m_p_index(i)=(int)round(m_win_size * factor);
// if (m_inverse_filter)
// m_p_index(m_n_filters+1-i)=(int)round(m_win_size * (m_f_max - f) / m_sampling_frequency); // from max to min
// else {
double factor = f / m_sampling_frequency;
// pavel - use double values instead of integers for better precision
// m_p_index(i)=(int)round(m_win_size * factor); // normal Mel-filter from min to max
m_p_index(i)=m_win_size * factor; // normal Mel-filter from min to max
// }
}
}
else
// Linear frequency decomposition (for LFCC)
// Linear frequency decomposition (for LFCC or RFCC)
{
const double cst_a = (m_win_size/m_sampling_frequency) * (m_f_max-m_f_min)/(double)(m_n_filters+1);
const double cst_b = (m_win_size/m_sampling_frequency) * m_f_min;
for (int i=0; i<(int)m_n_filters+2; ++i) {
m_p_index(i) = (int)round(cst_a * i + cst_b);
// pavel - temporarily change round() to floor() for compatibility
// m_p_index(i) = (int)floor(cst_a * i + cst_b);
// use double values instead of integers for better precision
m_p_index(i) = cst_a * i + cst_b;
}
}
}
......@@ -228,27 +290,52 @@ void bob::ap::Spectrogram::initCacheFilters()
{
// Creates the Triangular filter bank
m_filter_bank.clear();
m_filter_weights.clear();
blitz::firstIndex ii;
for (int i=0; i<(int)m_n_filters; ++i)
{
// Integer indices of the boundary of the triangular filter in the
// Fourier domain
int li = m_p_index(i);
int mi = m_p_index(i+1);
int ri = m_p_index(i+2);
//make sure the left border of the interval is not-included, except for the first i=0
int li = (int)floor(m_p_index(i)+1);
int mi = (int)floor(m_p_index(i+1));
int ri = (int)floor(m_p_index(i+2));
if (i == 0 || (ri-li == 0)) //first elem or left and last elements are the same
li--;
blitz::Array<double,1> filt(ri-li+1);
// Fill in the left slice of the triangular filter
blitz::Array<double,1> filt_p1(filt(blitz::Range(0,mi-li-1)));
int len = mi-li+1;
double a = 1. / len;
filt_p1 = 1.-a*(len-1-ii);
// Fill in the right slice of the triangular filter
blitz::Array<double,1> filt_p2(filt(blitz::Range(mi-li,ri-li)));
len = ri-mi+1;
a = 1. / len;
filt_p2 = 1.-a*ii;
// if we have a rectangular filter, we do not need to care about slices
if (m_rect_filter) {
// Rectangular filter with value=1. for all indexes
filt = 1.;
}
else {
// Fill in the left slice of the triangular filter
blitz::Array<double,1> filt_p1(filt(blitz::Range(0,mi-li)));
double denom = m_p_index(i+1)-m_p_index(i);
filt_p1 = (ii+li-m_p_index(i)) / denom;
// Fill in the right slice of the triangular filter
blitz::Array<double,1> filt_p2(filt(blitz::Range(mi-li+1,ri-li)));
denom = m_p_index(i+2)-m_p_index(i+1);
filt_p2 = (m_p_index(i+2)-(ii+mi+1)) / denom;
}
// Append filter into the filterbank vector
m_filter_bank.push_back(filt);
// init the weights used for some features during filtering
// create an array of size equal to the current range
blitz::Array<double,1> weights(ri-li+1);
weights = 1.;
// compute normalized frequencies for SSFC or SCMC features computation
if (m_scfc_features || m_scmc_features) {
// li=m_p_index(i) is the first adjusted frequency of the current range
// as a normalizer, we use the maximum possible adjusted frequency value
// so, we divide by (2*m_win_size/m_sampling_frequency) * (m_f_max-m_f_min)
// in a case when m_f_min=0 and m_f_max is 8000 (maximum), the denominator will become 512 - the max possible range
weights = 1.0*(li + ii) / ((2*m_win_size/m_sampling_frequency) * (m_f_max-m_f_min)); //make sure it's double value
}
// Append current weights to the weights vector
m_filter_weights.push_back(weights);
}
}
......@@ -267,16 +354,26 @@ void bob::ap::Spectrogram::initWinSize()
m_cache_frame_c2.resize(m_win_size);
}
void bob::ap::Spectrogram::pre_emphasis(blitz::Array<double,1> &data) const
void bob::ap::Spectrogram::pre_emphasis(blitz::Array<double,1> &data, double& last_elem_prev_frame) const
{
if (m_pre_emphasis_coeff != 0.)
{
// remember the last element of the frame that will be needed
// to compute emphasis correctly in the next frame
// the element is at the end of the m_win_shift, since next frame starts at m_win_shift position
double last_element = data((int)m_win_shift-1);
// Pre-emphasise the signal by applying the first order equation
// \f$data_{n} := data_{n} − a*data_{n−1}\f$
blitz::Range r0((int)m_win_length-2,0,-1);
blitz::Range r1((int)m_win_length-1,1,-1);
data(r1) -= m_pre_emphasis_coeff * data(r0); // Apply first order equation
data(0) *= 1. - m_pre_emphasis_coeff; // Update first element
// pavel - remove first element update for consistency with Matlab
// data(0) *= 1. - m_pre_emphasis_coeff; // Update first element
// pavel - use the the last remembered element of the previous frame to update the fist element of this frame
data(0) -= m_pre_emphasis_coeff * last_elem_prev_frame; // Update first elementm_last_element
// remember the last element of this frame for the next frame
last_elem_prev_frame = last_element;
}
}
......@@ -288,44 +385,79 @@ void bob::ap::Spectrogram::hammingWindow(blitz::Array<double,1> &data) const
void bob::ap::Spectrogram::powerSpectrumFFT(blitz::Array<double,1>& x)
{
// Apply the FFT
// this function, effectivelly, modifies only the first half of the input array 'x'
// first, copy the input into a complex container
m_cache_frame_c1 = bob::core::array::cast<std::complex<double> >(x);
// Apply the FFT and store the results in complex m_cache_frame_c2
m_fft(m_cache_frame_c1, m_cache_frame_c2);
// Take the the power spectrum of the first part of the output of the FFT
// This range (half of the original array) is what we work on from now on
blitz::Range r(0,(int)m_win_size/2);
// point (do not copy) to the first half of the original input
blitz::Array<double,1> x_half(x(r));
// point to the first half of the result of FFT
blitz::Array<std::complex<double>,1> complex_half(m_cache_frame_c2(r));
// basically, abs(of complex array) gives us the magnitude of the power spectrum
x_half = blitz::abs(complex_half);
if (m_energy_filter) // Apply the filter bank to the energy
if (m_energy_filter) // Energy is basically magnitude in power of 2
x_half = blitz::pow2(x_half);
if (m_normalize_spectrum) {// Normalize power spectrum, if we need the normalized value
// x_half -= blitz::mean(x_half); // this is an older implementation but it may give similar results
// it should be maximum of the FFT over the whole signal but we have access only to one frame at a time
double max_fft = blitz::max(x_half);
if (max_fft > std::numeric_limits<double>::epsilon())
x_half /= max_fft;
}
}