Commit c8d6a529 by Pavel KORSHUNOV

Merge branch 'new-features' into 'master'

Added RFCC, IMFCC, SCFC, SCMC, SSFC features, revamped tests



See merge request !1
parents 750df966 e9ada31f
Pipeline #3915 passed with stages
in 47 minutes 42 seconds
......@@ -20,8 +20,14 @@
==========================
This package is part of the signal-processing and machine learning toolbox
Bob_. It contains basic audio processing utilities.
Bob_. It contains basic audio processing utilities. Currently, the following cepstral-based features are available:
features computed using rectangular (RFCC), mel-scaled triangular (MFCC) [Davis1980]_, inverted mel-scaled triangular (IMFCC),
and linear triangular (LFCC) filters [Furui1981]_; spectral flux-based features (SSFC) [Scheirer1997]_;
and subband centroid frequency features (SCFC) [Le2011]_. We are planning to update and add more features in the
near future.
*Please note that the implementation of MFCC and LFCC features has changed compared to an earlier version of the package,
as we corrected pre-emphasis and DCT computations. Delta and delta-delta computations were slightly changed too.*
Installation
------------
......@@ -39,8 +45,18 @@ Contact
For questions or reporting issues to this software package, contact our
development `mailing list`_.
.. [Davis1980] S. Davis and P. Mermelstein, "Comparison of parametric representations for monosyllabic
word recognition in continuously spoken sentences", in IEEE Transactions on Acoustics, Speech, and Signal Processing,
1980, num 4, vol. 28, pages 357-366.
.. [Furui1981] S. Furui, Cepstral analysis technique for automatic speaker verification, in
IEEE Transactions on Acoustics, Speech, and Signal Processing, 1981, num 2 vol 29, pages 254-272.
.. [Scheirer1997] E. Scheirer and M. Slaney, Construction and evaluation of a robust multifeature speech/music discriminator,
in IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP, 1997, vol 2, pages 1331-1334.
.. [Le2011] P. N. Le, E. Ambikairajah, J. Epps, V. Sethu, E. H. C. Choi, Investigation of Spectral Centroid Features for Cognitive Load Classification,
in Speech Commun., April, 2011, num 4, vol 53, pages 540--551.
.. Place your references here:
.. _bob: https://www.idiap.ch/software/bob
.. _installation: https://gitlab.idiap.ch/bob/bob/wikis/Installation
.. _mailing list: https://groups.google.com/forum/?fromgroups#!forum/bob-devel
/**
* @author Andre Anjos <andre.anjos@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
* @date Thu 6 Feb 09:00:05 2014
*
* @brief Bindings to the base class bob::ap::Ceps
......@@ -13,7 +14,7 @@
PyDoc_STRVAR(s_ceps_str, BOB_EXT_MODULE_PREFIX ".Ceps");
PyDoc_STRVAR(s_ceps_doc,
"Ceps(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [n_filters=24, [n_ceps=19, [f_min=0., [f_max=4000., [delta_win=2, [pre_emphasis_coeff=0.95, [mel_scale=True, [dct_norm=True]]]]]]]]]]) -> new Ceps\n\
"Ceps(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [n_filters=24, [n_ceps=19, [f_min=0., [f_max=8000., [delta_win=2, [pre_emphasis_coeff=0.95, [mel_scale=True, [dct_norm=False, [normalize_mean=True, [rect_filter=False, [inverse_filter=False, [normalize_spectrum=False, [ssfc_features=False, [scfc_features=False, [scmc_features=False]]]]]]]]]]]]]]]]]) -> new Ceps\n\
Ceps(other) -> new Ceps\n\
\n\
Objects of this class, after configuration, can extract the\n\
......@@ -57,6 +58,39 @@ mel_scale\n\
dct_norm\n\
[bool] A factor by which the cepstral coefficients are\n\
multiplied\n\
normalize_mean\n\
[bool] Tells whether frame should be normalized \n\
by subtracting mean (True) or dividing by max_range (False)\n\
``True`` is the default value.\n\
\n\
rect_filter\n\
[bool] tells whether cepstral features are extracted\n\
using a rectangular filter (set it to ``True``), i.e., RFCC features,\n\
instead of the default filter (the default value is ``False``)\n\
\n\
inverse_filter\n\
[bool] tells whether to apply the filter in the\n\
inversed order, i.e., from high frequencies to low\n\
(set it to ``True``). ``False`` is the default value.\n\
\n\
normalize_spectrum\n\
[bool] Tells whether to normalize the power spectrum of the signal.\n\
The default value is ``False``.\n\
\n\
ssfc_features\n\
[bool] Set to true if you want to compute\n\
Subband Spectral Flux Coefficients (SSFC), which measures\n\
the frame-by-frame change in the power spectrum\n\
\n\
scfc_features\n\
[bool] Set to true if you want to compute\n\
Spectral Centroid Frequency Coefficients (SCFC), which\n\
capture detailed information about subbands similar to formant frequencies\n\
\n\
scmc_features\n\
[bool] Set to true if you want to compute\n\
Spectral Centroid Magnitude Coefficients (SCMC), which\n\
capture detailed information about subbands similar to SCFC features\n\
\n\
other\n\
[Ceps] an object of which is or inherits from ``Ceps``\n\
......@@ -132,6 +166,13 @@ static int PyBobApCeps_InitParameters
"pre_emphasis_coeff",
"mel_scale",
"dct_norm",
"normalize_mean",
"rect_filter",
"inverse_filter",
"normalize_spectrum",
"ssfc_features",
"scfc_features",
"scmc_features",
0};
static char** kwlist = const_cast<char**>(const_kwlist);
......@@ -141,24 +182,42 @@ static int PyBobApCeps_InitParameters
Py_ssize_t n_filters = 24;
Py_ssize_t n_ceps = 19;
double f_min = 0.;
double f_max = 4000.;
double f_max = 8000.;
Py_ssize_t delta_win = 2;
double pre_emphasis_coeff = 0.95;
PyObject* mel_scale = Py_True;
PyObject* dct_norm = Py_True;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddnnddndOO", kwlist,
PyObject* dct_norm = Py_False;
PyObject* normalize_mean = Py_True;
PyObject* rect_filter = Py_False;
PyObject* inverse_filter = Py_False;
PyObject* normalize_spectrum = Py_False;
PyObject* ssfc_features = Py_False;
PyObject* scfc_features = Py_False;
PyObject* scmc_features = Py_False;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddnnddndOOOOOOOOO", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms, &n_filters,
&n_ceps, &f_min, &f_max, &delta_win, &pre_emphasis_coeff,
&mel_scale, &dct_norm))
&mel_scale, &dct_norm, &normalize_mean, &rect_filter,
&inverse_filter, &normalize_spectrum,
&ssfc_features, &scfc_features, &scmc_features))
return -1;
bool mel_scale_ = PyObject_IsTrue(mel_scale);
bool dct_norm_ = PyObject_IsTrue(dct_norm);
bool normalize_mean_ = PyObject_IsTrue(normalize_mean);
bool rect_filter_ = PyObject_IsTrue(rect_filter);
bool inverse_filter_ = PyObject_IsTrue(inverse_filter);
bool normalize_spectrum_ = PyObject_IsTrue(normalize_spectrum);
bool ssfc_features_ = PyObject_IsTrue(ssfc_features);
bool scfc_features_ = PyObject_IsTrue(scfc_features);
bool scmc_features_ = PyObject_IsTrue(scmc_features);
try {
self->cxx = new bob::ap::Ceps(sampling_frequency,
win_length_ms, win_shift_ms, n_filters, n_ceps, f_min, f_max,
delta_win, pre_emphasis_coeff, mel_scale_, dct_norm_);
delta_win, pre_emphasis_coeff, mel_scale_, dct_norm_, normalize_mean_,
rect_filter_, inverse_filter_, normalize_spectrum_,
ssfc_features_, scfc_features_, scmc_features_);
if (!self->cxx) {
PyErr_Format(PyExc_MemoryError, "cannot create new object of type `%s' - no more memory", Py_TYPE(self)->tp_name);
return -1;
......@@ -229,7 +288,7 @@ static PyObject* PyBobApCeps_Repr(PyBobApCepsObject* self) {
Py_ssize_t n_filters = self->cxx->getNFilters();
Py_ssize_t n_ceps = self->cxx->getNCeps();
Py_ssize_t delta_win = self->cxx->getDeltaWin();
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, n_filters=%" PY_FORMAT_SIZE_T "d, n_ceps=%" PY_FORMAT_SIZE_T "d, f_min=%f, f_max=%f, delta_win=%" PY_FORMAT_SIZE_T "d, pre_emphasis_coeff=%f, mel_scale=%s, dct_norm=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), n_filters, n_ceps, self->cxx->getFMin(), self->cxx->getFMax(), delta_win, self->cxx->getPreEmphasisCoeff(), self->cxx->getMelScale()?"True":"False", self->cxx->getDctNorm()?"True":"False");
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, n_filters=%" PY_FORMAT_SIZE_T "d, n_ceps=%" PY_FORMAT_SIZE_T "d, f_min=%f, f_max=%f, delta_win=%" PY_FORMAT_SIZE_T "d, pre_emphasis_coeff=%f, mel_scale=%s, dct_norm=%s, normalize_mean=%s, rect_filter=%s, inverse_filter=%s, normalize_spectrum=%s, ssfc_features=%s, scfc_features=%s, scmc_features=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), n_filters, n_ceps, self->cxx->getFMin(), self->cxx->getFMax(), delta_win, self->cxx->getPreEmphasisCoeff(), self->cxx->getMelScale()?"True":"False", self->cxx->getDctNorm()?"True":"False", self->cxx->getNormalizeMean()?"True":"False", self->cxx->getRectangularFilter()?"True":"False", self->cxx->getInverseFilter()?"True":"False", self->cxx->getNormalizeSpectrum()?"True":"False", self->cxx->getSSFCFeatures()?"True":"False", self->cxx->getSCFCFeatures()?"True":"False", self->cxx->getSCMCFeatures()?"True":"False");
return
# if PY_VERSION_HEX >= 0x03000000
PyUnicode_FromStringAndSize
......
......@@ -2,6 +2,7 @@
* @date Wed Jan 11:09:30 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
* @brief Implement Linear and Mel Frequency Cepstral Coefficients
* functions (MFCC and LFCC)
......@@ -16,9 +17,12 @@ bob::ap::Ceps::Ceps(const double sampling_frequency,
const double win_length_ms, const double win_shift_ms,
const size_t n_filters, const size_t n_ceps, const double f_min,
const double f_max, const size_t delta_win, const double pre_emphasis_coeff,
const bool mel_scale, const bool dct_norm):
const bool mel_scale, const bool dct_norm, const bool normalize_mean,
const bool rect_filter, const bool inverse_filter, const bool normalize_spectrum,
const bool ssfc_features, const bool scfc_features, const bool scmc_features):
bob::ap::Spectrogram(sampling_frequency, win_length_ms, win_shift_ms,
n_filters, f_min, f_max, pre_emphasis_coeff, mel_scale),
n_filters, f_min, f_max, pre_emphasis_coeff, mel_scale, normalize_mean, rect_filter, inverse_filter,
normalize_spectrum, ssfc_features, scfc_features, scmc_features),
m_n_ceps(n_ceps), m_delta_win(delta_win), m_dct_norm(dct_norm),
m_with_energy(false), m_with_delta(false), m_with_delta_delta(false)
{
......@@ -95,12 +99,21 @@ void bob::ap::Ceps::setDctNorm(bool dct_norm)
/**
 * @brief (Re)builds the cached DCT-II kernel used to turn filter-bank
 * outputs into cepstral coefficients.
 *
 * The kernel has m_n_ceps rows and m_n_filters columns. Entry (i,j) is
 * cos(pi * i * (j + 0.5) / m_n_filters), optionally normalized when
 * m_dct_norm is set.
 */
void bob::ap::Ceps::initCacheDctKernel()
{
  // Dct Kernel initialization, we implement DCT-II variant here
  m_dct_kernel.resize(m_n_ceps,m_n_filters);
  blitz::firstIndex i;
  blitz::secondIndex j;

  // If normalize, use the Matlab-based implementation: every entry is scaled
  // by sqrt(2/N), N being the number of filters; otherwise no scaling.
  double dct_coeff = m_dct_norm ? (double)sqrt(2./(double)(m_n_filters)) : 1.;

  // The row index starts at 0, so the first row is the constant (DC) basis.
  // The kernel is computed exactly once; the former (i+1)-based assignment
  // was immediately overwritten and has been dropped.
  m_dct_kernel = dct_coeff * blitz::cos(M_PI*(i)*(j+0.5)/(double)(m_n_filters));

  // Finish normalization: multiply first row by sqrt(0.5), as per Matlab implementation of DCT-II
  if (m_dct_norm) {
    blitz::Array<double,1> firstIndex_coeff (m_n_ceps);
    firstIndex_coeff = blitz::where(i == 0, sqrt(0.5), 1.); // first element is sqrt(0.5), the rest are 1.
    m_dct_kernel = firstIndex_coeff(i) * m_dct_kernel(i,j); // row-wise scaling, applied elementwise
  }
}
......@@ -112,6 +125,10 @@ blitz::TinyVector<int,2> bob::ap::Ceps::getShape(const size_t input_size) const
// 1. Number of frames
res(0) = 1+((input_size-m_win_length)/m_win_shift);
//reduce the number of frames by 1 for SSFC features, so the resulted matrix is of correct size
if (m_ssfc_features)
res(0) -= 1;
// 2. Dimension of the feature vector
int dim0=m_n_ceps;
if (m_with_energy) dim0 += 1;
......@@ -134,33 +151,80 @@ blitz::TinyVector<int,2> bob::ap::Ceps::getShape(const blitz::Array<double,1>& i
void bob::ap::Ceps::operator()(const blitz::Array<double,1>& input,
blitz::Array<double,2>& ceps_matrix)
{
// Get expected dimensionality of output array
blitz::TinyVector<int,2> feature_shape = bob::ap::Ceps::getShape(input);
// Check dimensionality of output array
bob::core::array::assertSameShape(ceps_matrix, feature_shape);
int n_frames=feature_shape(0);
int shift_frame=0;
double last_frame_elem=0;
// Create the holder for the previous frame and make sure it's the same as the current frame
// Used by SSFC features computation
blitz::Array<double,1> _prev_frame_d;
_prev_frame_d.resize(m_cache_frame_d.shape());
// Create the temporary holder for SSFC features computation
blitz::Array<double,1> _temp_frame_d;
_temp_frame_d.resize(m_cache_frame_d.shape());
if (m_ssfc_features) {
//we are going to always process the next frame within the loop
shift_frame = 1;
// Init the first frame to the input
extractNormalizeFrame(input, 0, _prev_frame_d);
// Apply pre-emphasis
pre_emphasis(_prev_frame_d, last_frame_elem);
// Apply the Hamming window
hammingWindow(_prev_frame_d);
// Take the power spectrum of the first part of the FFT
powerSpectrumFFT(_prev_frame_d);
}
blitz::Range r1(0,m_n_ceps-1);
for (int i=0; i<n_frames; ++i)
{
// Set padded frame to zero
extractNormalizeFrame(input, i, m_cache_frame_d);
// Init the current frame from the input, we process (i+1)th frame for SSFC features
extractNormalizeFrame(input, i+shift_frame, m_cache_frame_d);
// Update output with energy if required
if (m_with_energy)
ceps_matrix(i,(int)m_n_ceps) = logEnergy(m_cache_frame_d);
// Apply pre-emphasis
pre_emphasis(m_cache_frame_d);
pre_emphasis(m_cache_frame_d, last_frame_elem);
// Apply the Hamming window
hammingWindow(m_cache_frame_d);
// Take the power spectrum of the first part of the FFT
// Note that after this call, we only operate on the first half of m_cache_frame_d array. The second half is ignored.
// powerSpectrumFFT changes first half+1 elements of m_cache_frame_d array
powerSpectrumFFT(m_cache_frame_d);
// Filter with the triangular filter bank (either in linear or Mel domain)
if (m_ssfc_features)
{
// retrieve the previous frame into our temp
_temp_frame_d = _prev_frame_d;
// remember the current frame for the next round, before we change current frame
_prev_frame_d = m_cache_frame_d;
// Computation of SSFC features:
// We take the previous frame and find the difference between values of current and previous frames
m_cache_frame_d -= _temp_frame_d;
// We compute norm2 for the difference as per SSFC features
m_cache_frame_d = blitz::pow2(m_cache_frame_d);
// Then, we can apply the filter and DCT later on
}
// Filter with triangular or rectangular filter bank (either in linear or Mel domain)
filterBank(m_cache_frame_d);
// Apply DCT kernel and update the output
blitz::Array<double,1> ceps_matrix_row(ceps_matrix(i,r1));
applyDct(ceps_matrix_row);
if (m_scfc_features)
// do not apply DCT on SCFC features
ceps_matrix_row = m_cache_filters(r1);
else
applyDct(ceps_matrix_row);
}
//compute the center of the cut-off frequencies
......@@ -226,8 +290,9 @@ void bob::ap::Ceps::addDerivative(const blitz::Array<double,2>& input, blitz::Ar
}
}
// Sum of the integer squared from 1 to delta_win
const double sum = m_delta_win*(m_delta_win+1)*(2*m_delta_win+1)/3;
output /= sum;
// pavel - remove division for the sake of compatibility with the Matlab code of the RFCC features comparison paper
//const double sum = m_delta_win*(m_delta_win+1)*(2*m_delta_win+1)/3;
//output /= sum;
}
/*
......
......@@ -10,8 +10,8 @@
#include <bob.core/assert.h>
bob::ap::Energy::Energy(const double sampling_frequency, const double win_length_ms,
const double win_shift_ms):
bob::ap::FrameExtractor(sampling_frequency, win_length_ms, win_shift_ms),
const double win_shift_ms, const bool normalize_mean):
bob::ap::FrameExtractor(sampling_frequency, win_length_ms, win_shift_ms, normalize_mean),
m_energy_floor(1.)
{
// Initializes logarithm of flooring values
......@@ -19,7 +19,8 @@ bob::ap::Energy::Energy(const double sampling_frequency, const double win_length
}
bob::ap::Energy::Energy(const bob::ap::Energy& other):
bob::ap::FrameExtractor(other), m_energy_floor(1.)
bob::ap::FrameExtractor(other),
m_energy_floor(other.m_energy_floor)
{
// Initializes logarithm of flooring values
m_log_energy_floor = log(m_energy_floor);
......
......@@ -2,7 +2,8 @@
* @date Wed Jan 11:09:30 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
*
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
* Copyright (C) Idiap Research Institute, Martigny, Switzerland
*/
......@@ -12,23 +13,27 @@
#include <bob.core/check.h>
bob::ap::FrameExtractor::FrameExtractor(const double sampling_frequency,
const double win_length_ms, const double win_shift_ms):
const double win_length_ms, const double win_shift_ms,
const bool normalize_mean):
m_sampling_frequency(sampling_frequency), m_win_length_ms(win_length_ms),
m_win_shift_ms(win_shift_ms)
m_win_shift_ms(win_shift_ms), m_normalize_mean(normalize_mean)
{
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
bob::ap::FrameExtractor::FrameExtractor(const FrameExtractor& other):
m_sampling_frequency(other.m_sampling_frequency),
m_win_length_ms(other.m_win_length_ms),
m_win_shift_ms(other.m_win_shift_ms)
m_win_shift_ms(other.m_win_shift_ms),
m_normalize_mean(other.m_normalize_mean)
{
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
bob::ap::FrameExtractor::~FrameExtractor()
......@@ -42,10 +47,12 @@ bob::ap::FrameExtractor& bob::ap::FrameExtractor::operator=(const bob::ap::Frame
m_sampling_frequency = other.m_sampling_frequency;
m_win_length_ms = other.m_win_length_ms;
m_win_shift_ms = other.m_win_shift_ms;
m_normalize_mean = other.m_normalize_mean;
// Initialization
initWinLength();
initWinShift();
initMaxRange();
}
return *this;
}
......@@ -54,7 +61,8 @@ bool bob::ap::FrameExtractor::operator==(const bob::ap::FrameExtractor& other) c
{
return (m_sampling_frequency == other.m_sampling_frequency &&
m_win_length_ms == other.m_win_length_ms &&
m_win_shift_ms == other.m_win_shift_ms);
m_win_shift_ms == other.m_win_shift_ms &&
m_normalize_mean == other.m_normalize_mean);
}
bool bob::ap::FrameExtractor::operator!=(const bob::ap::FrameExtractor& other) const
......@@ -67,6 +75,7 @@ void bob::ap::FrameExtractor::setSamplingFrequency(const double sampling_frequen
m_sampling_frequency = sampling_frequency;
initWinLength();
initWinShift();
initMaxRange();
}
void bob::ap::FrameExtractor::setWinLengthMs(const double win_length_ms)
......@@ -87,6 +96,7 @@ void bob::ap::FrameExtractor::initWinLength()
if (m_win_length == 0)
throw std::runtime_error("The length of the window is 0. You should use a larger sampling rate or window length in miliseconds");
initWinSize();
}
void bob::ap::FrameExtractor::initWinShift()
......@@ -100,6 +110,12 @@ void bob::ap::FrameExtractor::initWinSize()
m_cache_frame_d.resize(m_win_size);
}
/**
 * @brief Recomputes m_max_range from the current sampling frequency.
 *
 * Must be re-run whenever m_sampling_frequency is (re)assigned, since the
 * cached value is derived from it.
 */
void bob::ap::FrameExtractor::initMaxRange()
{
  // NOTE(review): the exponent is the sampling frequency expressed in kHz,
  // e.g. 16000 Hz gives 2^16/2 - 0.5. Confirm that tying the range to the
  // sampling rate (rather than to the sample bit depth) is intended.
  const double exponent = m_sampling_frequency/1000;
  const double full_scale = pow(2.0, exponent);
  m_max_range = full_scale/2.0 - 0.5;
}
void bob::ap::FrameExtractor::extractNormalizeFrame(const blitz::Array<double,1>& input,
const size_t i, blitz::Array<double,1>& frame_d) const
{
......@@ -109,8 +125,19 @@ void bob::ap::FrameExtractor::extractNormalizeFrame(const blitz::Array<double,1>
blitz::Range rf(0,(int)m_win_length-1);
blitz::Range ri(i*(int)m_win_shift,i*(int)m_win_shift+(int)m_win_length-1);
frame_d(rf) = input(ri);
// Subtract mean value
frame_d -= blitz::mean(frame_d);
if (m_normalize_mean) { // added by Pavel Korshunov
// We normalize by subtracting mean value
frame_d(rf) -= blitz::mean(frame_d);
}
else {
//Otherwise, we normalize by dividing by maximum possible range, which is set in initMaxRange()
//This method of normalization is used in the following paper from Interspeech 2015:
//"A Comparison of Features for Synthetic Speech Detection" by Md Sahidullah, Tomi Kinnunen, Cemal Hanilci
if (m_max_range == 0)
throw std::runtime_error("FrameExtractor: the maximum range in frame is 0. Please make sure you provide non-zero sampling frequency.");
frame_d /= m_max_range;
}
}
......
......@@ -13,7 +13,7 @@
PyDoc_STRVAR(s_energy_str, BOB_EXT_MODULE_PREFIX ".Energy");
PyDoc_STRVAR(s_energy_doc,
"Energy(sampling_frequency, [win_length_ms=20., [win_shift_ms=10.]]) -> new Energy\n\
"Energy(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [normalize_mean=True]]]) -> new Energy\n\
Energy(other) -> new Energy\n\
\n\
Objects of this class, after configuration, can extract the energy\n\
......@@ -30,6 +30,11 @@ win_length_ms\n\
win_shift_ms\n\
[float] the window shift in miliseconds\n\
\n\
normalize_mean\n\
[bool] Tells whether frame should be normalized \n\
by subtracting mean (True) or dividing by max_range (False)\n\
``True`` is the default value.\n\
\n\
other\n\
[Energy] an object of which is or inherits from ``Energy``\n\
that will be deep-copied into a new instance.\n\
......@@ -92,17 +97,22 @@ static int PyBobApEnergy_InitParameters
"sampling_frequency",
"win_length_ms",
"win_shift_ms",
"normalize_mean",
0};
static char** kwlist = const_cast<char**>(const_kwlist);
double sampling_frequency = 0.;
double win_length_ms = 20.;
double win_shift_ms = 10.;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|dd", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms)) return -1;
PyObject* normalize_mean = Py_True;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddO", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms, &normalize_mean)) return -1;
bool normalize_mean_ = PyObject_IsTrue(normalize_mean);
try {
self->cxx = new bob::ap::Energy(sampling_frequency, win_length_ms, win_shift_ms);
self->cxx = new bob::ap::Energy(sampling_frequency,
win_length_ms, win_shift_ms, normalize_mean_);
if (!self->cxx) {
PyErr_Format(PyExc_MemoryError, "cannot create new object of type `%s' - no more memory", Py_TYPE(self)->tp_name);
return -1;
......@@ -168,7 +178,7 @@ static int PyBobApEnergy_Init(PyBobApEnergyObject* self,
static PyObject* PyBobApEnergy_Repr(PyBobApEnergyObject* self) {
static const int MAXSIZE = 256;
char buffer[MAXSIZE];
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs());
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, normalize_mean=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), self->cxx->getNormalizeMean()?"True":"False");
return
# if PY_VERSION_HEX >= 0x03000000
PyUnicode_FromStringAndSize
......
......@@ -13,7 +13,7 @@
PyDoc_STRVAR(s_frame_extractor_str, BOB_EXT_MODULE_PREFIX ".FrameExtractor");
PyDoc_STRVAR(s_frame_extractor_doc,
"FrameExtractor(sampling_frequency, [win_length_ms=20., [win_shift_ms=10.]]) -> new FrameExtractor\n\
"FrameExtractor(sampling_frequency, [win_length_ms=20., [win_shift_ms=10., [normalize_mean=True]]]) -> new FrameExtractor\n\
FrameExtractor(other) -> new FrameExtractor\n\
\n\
This class is a base type for classes that perform audio\n\
......@@ -35,6 +35,11 @@ win_length_ms\n\
win_shift_ms\n\
[float] the window shift in miliseconds\n\
\n\
normalize_mean\n\
[bool] Tells whether frame should be normalized \n\
by subtracting mean (True) or dividing by max_range (False)\n\
``True`` is the default value.\n\
\n\
other\n\
[FrameExtractor] an object of which is or inherits from a FrameExtractor\n\
that will be deep-copied into a new instance.\n\
......@@ -95,17 +100,22 @@ static int PyBobApFrameExtractor_InitParameters
"sampling_frequency",
"win_length_ms",
"win_shift_ms",
"normalize_mean",
0};
static char** kwlist = const_cast<char**>(const_kwlist);
double sampling_frequency = 0.;
double win_length_ms = 20.;
double win_shift_ms = 10.;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|dd", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms)) return -1;
PyObject* normalize_mean = Py_True;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "d|ddO", kwlist,
&sampling_frequency, &win_length_ms, &win_shift_ms, &normalize_mean)) return -1;
bool normalize_mean_ = PyObject_IsTrue(normalize_mean);
try {
self->cxx = new bob::ap::FrameExtractor(sampling_frequency, win_length_ms, win_shift_ms);
self->cxx = new bob::ap::FrameExtractor(sampling_frequency,
win_length_ms, win_shift_ms, normalize_mean_);
if (!self->cxx) {
PyErr_Format(PyExc_MemoryError, "cannot create new object of type `%s' - no more memory", Py_TYPE(self)->tp_name);
return -1;
......@@ -170,7 +180,7 @@ static int PyBobApFrameExtractor_Init(PyBobApFrameExtractorObject* self,
static PyObject* PyBobApFrameExtractor_Repr(PyBobApFrameExtractorObject* self) {
static const int MAXSIZE = 256;
char buffer[MAXSIZE];
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs());
auto count = std::snprintf(buffer, MAXSIZE, "%s(sampling_frequency=%f, win_length_ms=%f, win_shift_ms=%f, normalize_mean=%s)", Py_TYPE(self)->tp_name, self->cxx->getSamplingFrequency(), self->cxx->getWinLengthMs(), self->cxx->getWinShiftMs(), self->cxx->getNormalizeMean()?"True":"False");
return
# if PY_VERSION_HEX >= 0x03000000
PyUnicode_FromStringAndSize
......@@ -318,6 +328,38 @@ static int PyBobApFrameExtractor_SetWinShiftMs
}
// Attribute name and docstring of the `normalize_mean` property; both are
// referenced by the corresponding entry of the getter/setter table.
PyDoc_STRVAR(s_normalize_mean_str, "normalize_mean");
PyDoc_STRVAR(s_normalize_mean_doc,
"Tells whether frame should be normalized by subtracting mean (True) or dividing by max_range (False)\n\
");
/**
 * Getter for the `normalize_mean` attribute: returns Python `True` or `False`
 * according to the flag held by the wrapped C++ object.
 */
static PyObject* PyBobApFrameExtractor_GetNormalizeMean
(PyBobApFrameExtractorObject* self, void* /*closure*/) {
  const bool subtract_mean = self->cxx->getNormalizeMean();
  if (!subtract_mean) Py_RETURN_FALSE;
  Py_RETURN_TRUE;
}
/**
 * Setter for the `normalize_mean` attribute.
 *
 * @param self the Python wrapper whose C++ object is updated
 * @param o    the new value, evaluated for truthiness; NULL when the
 *             attribute is being deleted, which is rejected
 * @return 0 on success, -1 with a Python exception set on failure
 */
static int PyBobApFrameExtractor_SetNormalizeMean
(PyBobApFrameExtractorObject* self, PyObject* o, void* /*closure*/) {

  // Python calls the setter with a NULL value for `del obj.normalize_mean`.
  // Passing NULL on to PyObject_IsTrue() would dereference it, so refuse the
  // deletion explicitly instead.
  if (!o) {
    PyErr_Format(PyExc_RuntimeError, "cannot delete `normalize_mean' of %s", Py_TYPE(self)->tp_name);
    return -1;
  }

  // PyObject_IsTrue() yields -1 on error; keep the result as an int so the
  // failure is not folded into `true` before it is checked.
  const int truth = PyObject_IsTrue(o);
  if (truth < 0) return -1;

  try {
    self->cxx->setNormalizeMean(truth != 0);
  }
  catch (std::exception& ex) {
    PyErr_SetString(PyExc_RuntimeError, ex.what());
    return -1;
  }
  catch (...) {
    PyErr_Format(PyExc_RuntimeError, "cannot reset `normalize_mean' of %s: unknown exception caught", Py_TYPE(self)->tp_name);
    return -1;
  }

  return 0;
}
PyDoc_STRVAR(s_win_length_str, "win_length");
PyDoc_STRVAR(s_win_length_doc,
"The normalized window length w.r.t. the sample frequency"
......@@ -374,6 +416,13 @@ static PyGetSetDef PyBobApFrameExtractor_getseters[] = {
s_win_shift_doc,
0
},
{
s_normalize_mean_str,
(getter)PyBobApFrameExtractor_GetNormalizeMean,
(setter)PyBobApFrameExtractor_SetNormalizeMean,
s_normalize_mean_doc,
0
},
{0} /* Sentinel */
};
......
/**
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
* @date Wed Jan 11:10:20 2013 +0200
*
* @brief Implement Linear and Mel Frequency Cepstral Coefficients
......@@ -46,7 +47,11 @@ class Ceps: public Spectrogram
const size_t n_ceps=19, const double f_min=0.,
const double f_max=4000., const size_t delta_win=2,
const double pre_emphasis_coef=0.95, const bool mel_scale=true,
const bool dct_norm=false);
const bool dct_norm=false, const bool normalize_mean=true,
const bool rect_filter=false, const bool inverse_filter=false,
const bool normalize_spectrum=false,
const bool ssfc_features=false, const bool scfc_features=false,
const bool scmc_features=false);
/**
* @brief Copy constructor.
......
......@@ -31,7 +31,7 @@ class Energy: public FrameExtractor
* @brief Constructor. Initializes working arrays
*/
Energy(const double sampling_frequency, const double win_length_ms=20.,
const double win_shift_ms=10.);
const double win_shift_ms=10., const bool normalize_mean=true);
/**
* @brief Copy constructor
......
......@@ -32,7 +32,8 @@ class FrameExtractor
* @brief Constructor. Initializes working arrays
*/
FrameExtractor(const double sampling_frequency,
const double win_length_ms=20., const double win_shift_ms=10.);
const double win_length_ms=20., const double win_shift_ms=10.,
const bool normalize_mean=true);
/**
* @brief Copy Constructor
......@@ -90,6 +91,11 @@ class FrameExtractor
*/
size_t getWinShift() const
{ return m_win_shift; }
/**
 * @brief Returns the frame normalization mode.
 * @return true if frames are normalized by subtracting their mean, false if
 * they are divided by max_range
 */
bool getNormalizeMean() const
{
  return m_normalize_mean;
}
/**
* @brief Sets the sampling frequency/frequency rate
......@@ -103,6 +109,11 @@ class FrameExtractor
* @brief Sets the window shift in miliseconds
*/
virtual void setWinShiftMs(const double win_shift_ms);
/**
 * @brief Sets whether frame should be normalized by subtracting mean (True) or dividing by max_range (False)
 * @param normalize_mean the new value of the flag. It is a boolean; the
 * parameter was previously typed as double, which silently accepted any
 * number and treated every non-zero value as true.
 */
virtual void setNormalizeMean(const bool normalize_mean)
{ m_normalize_mean = normalize_mean; }
protected:
/**
......@@ -114,6 +125,7 @@ class FrameExtractor
virtual void initWinSize();
virtual void initWinLength();
virtual void initWinShift();
virtual void initMaxRange();
double m_sampling_frequency; ///< The sampling frequency
double m_win_length_ms; ///< The window length in miliseconds
......@@ -121,6 +133,8 @@ class FrameExtractor
double m_win_shift_ms;
size_t m_win_shift;
size_t m_win_size;
double m_max_range; //half of the maximum possible dynamic range of the original signal (for 16 bits, it is 32768)
bool m_normalize_mean; //normalize the frame by subtracting its mean
mutable blitz::Array<double,1> m_cache_frame_d;
};
......
......@@ -2,6 +2,7 @@
* @date Wed Jan 11:10:20 2013 +0200
* @author Elie Khoury <Elie.Khoury@idiap.ch>
* @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
* @author Pavel Korshunov <Pavel.Korshunov@idiap.ch>
*
* @brief Implement spectrogram
*
......@@ -41,8 +42,11 @@ class Spectrogram: public Energy
Spectrogram(const double sampling_frequency,
const double win_length_ms=20., const double win_shift_ms=10.,
const size_t n_filters=24, const double f_min=0.,
const double f_max=4000., const double pre_emphasis_coeff=0.95,
bool mel_scale=true);
const double f_max=8000., const double pre_emphasis_coeff=0.95,
const bool mel_scale=true, const bool normalize_mean=true,
const bool rect_filter=false, const bool inverse_filter=false,
const bool normalize_spectrum=false, const bool ssfc_features=false,
const bool scfc_features=false, const bool scmc_features=false);