Spectrogram.h 7.87 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/**
 * @date Wed Jan 11:10:20 2013 +0200
 * @author Elie Khoury <Elie.Khoury@idiap.ch>
 * @author Laurent El Shafey <Laurent.El-Shafey@idiap.ch>
 *
 * @brief Implement spectrogram
 *
 * Copyright (C) Idiap Research Institute, Martigny, Switzerland
 */


#ifndef BOB_AP_SPECTROGRAM_H
#define BOB_AP_SPECTROGRAM_H

#include <vector>
#include <stdexcept>
#include <blitz/array.h>
#include <boost/format.hpp>

20
#include <bob.sp/FFT1D.h>
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279

#include "Energy.h"

namespace bob {
/**
 * \ingroup libap_api
 * @{
 *
 */
namespace ap {

/**
 * @brief This class implements an audio spectrogram extractor
 */
class Spectrogram: public Energy
{
  public:
    /**
     * @brief Constructor. Initializes working arrays
     *
     * @param sampling_frequency Sampling frequency of the input signal
     * @param win_length_ms Window length in milliseconds (default: 20.)
     * @param win_shift_ms Window shift in milliseconds (default: 10.)
     * @param n_filters Number of filters used in the filter bank
     *   (default: 24)
     * @param f_min Frequency of the lowest triangular filter in the filter
     *   bank (default: 0.)
     * @param f_max Frequency of the highest triangular filter in the filter
     *   bank (default: 4000.)
     * @param pre_emphasis_coeff Pre-emphasis coefficient; see
     *   setPreEmphasisCoeff() for the accepted range (default: 0.95)
     * @param mel_scale Selects between the linear and the Mel scale for the
     *   filter frequencies (default: true; presumably true means Mel scale -
     *   confirm against the implementation)
     */
    Spectrogram(const double sampling_frequency,
      const double win_length_ms=20., const double win_shift_ms=10.,
      const size_t n_filters=24, const double f_min=0.,
      const double f_max=4000., const double pre_emphasis_coeff=0.95,
      bool mel_scale=true);

    /**
     * @brief Copy Constructor
     */
    Spectrogram(const Spectrogram& other);

    /**
     * @brief Assignment operator
     */
    Spectrogram& operator=(const Spectrogram& other);

    /**
     * @brief Equal to
     */
    bool operator==(const Spectrogram& other) const;

    /**
     * @brief Not equal to
     */
    bool operator!=(const Spectrogram& other) const;

    /**
     * @brief Destructor
     */
    virtual ~Spectrogram();

    /**
     * @brief Gets the output shape for a given input length
     */
    virtual blitz::TinyVector<int,2> getShape(const size_t input_length) const;
    /**
     * @brief Gets the output shape for a given input signal
     */
    virtual blitz::TinyVector<int,2> getShape(const blitz::Array<double,1>& input) const;

    /**
     * @brief Computes the spectrogram
     *
     * @param input The 1D input signal
     * @param output The 2D array receiving the result (presumably it must
     *   have the shape returned by getShape() - confirm against the
     *   implementation)
     */
    void operator()(const blitz::Array<double,1>& input, blitz::Array<double,2>& output);

    /**
     * @brief Returns the number of filters used in the filter bank.
     */
    size_t getNFilters() const
    { return m_n_filters; }
    /**
     * @brief Returns the frequency of the lowest triangular filter in the
     * filter bank
     */
    double getFMin() const
    { return m_f_min; }
    /**
     * @brief Returns the frequency of the highest triangular filter in the
     * filter bank
     */
    double getFMax() const
    { return m_f_max; }
    /**
     * @brief Tells whether the frequencies of the filters in the filter bank
     * are taken from the linear or the Mel scale
     */
    bool getMelScale() const
    { return m_mel_scale; }
    /**
     * @brief Returns the pre-emphasis coefficient.
     */
    double getPreEmphasisCoeff() const
    { return m_pre_emphasis_coeff; }
    /**
     * @brief Tells whether we use the energy or the square root of the energy
     */
    bool getEnergyFilter() const
    { return m_energy_filter; }
    /**
     * @brief Tells whether we use the log triangular filter or the triangular
     * filter
     */
    bool getLogFilter() const
    { return m_log_filter; }
    /**
     * @brief Tells whether we compute a spectrogram or energy bands
     */
    bool getEnergyBands() const
    { return m_energy_bands; }

    /**
     * @brief Sets the sampling frequency/frequency rate
     */
    virtual void setSamplingFrequency(const double sampling_frequency);
    /**
     * @brief Sets the window length in milliseconds
     */
    virtual void setWinLengthMs(const double win_length_ms);
    /**
     * @brief Sets the window shift in milliseconds
     */
    virtual void setWinShiftMs(const double win_shift_ms);

    /**
     * @brief Sets the number of filters used in the filter bank.
     */
    virtual void setNFilters(size_t n_filters);
    /**
     * @brief Sets the pre-emphasis coefficient. It should be a value in the
     * range [0,1].
     *
     * @param pre_emphasis_coeff The new coefficient
     * @throws std::runtime_error if the value lies outside [0,1] or is NaN;
     *   the stored coefficient is left unchanged in that case
     */
    virtual void setPreEmphasisCoeff(double pre_emphasis_coeff)
    {
      // The test is written as a negated "in range" check on purpose: every
      // ordered comparison involving NaN is false, so the more usual
      // `x < 0. || x > 1.` form would let NaN through unnoticed.
      const bool in_range =
        (pre_emphasis_coeff >= 0. && pre_emphasis_coeff <= 1.);
      if (!in_range) {
        boost::format m("the argument for `pre_emphasis_coeff' cannot take the value %f - the value must be in the interval [0.,1.]");
        m % pre_emphasis_coeff;
        throw std::runtime_error(m.str());
      }
      m_pre_emphasis_coeff = pre_emphasis_coeff;
    }
    /**
     * @brief Sets the frequency of the lowest triangular filter in the
     * filter bank
     */
    virtual void setFMin(double f_min);
    /**
     * @brief Sets the frequency of the highest triangular filter in the
     * filter bank
     */
    virtual void setFMax(double f_max);
    /**
     * @brief Sets whether the frequencies of the filters in the filter bank
     * are taken from the linear or the Mel scale
     */
    virtual void setMelScale(bool mel_scale);
    /**
     * @brief Sets whether we use the energy or the square root of the energy
     */
    virtual void setEnergyFilter(bool energy_filter)
    { m_energy_filter = energy_filter; }
    /**
     * @brief Sets whether we use the log triangular filter or the triangular
     * filter
     */
    virtual void setLogFilter(bool log_filter)
    { m_log_filter = log_filter; }
    /**
     * @brief Sets whether we compute a spectrogram or energy bands
     */
    virtual void setEnergyBands(bool energy_bands)
    { m_energy_bands = energy_bands; }


  protected:
    /**
     * @brief Converts a frequency in Hertz to the corresponding one in Mel
     */
    static double herzToMel(double f);
    /**
     * @brief Converts a frequency in Mel to the corresponding one in Hertz
     */
    static double melToHerz(double f);
    /**
     * @brief Pre-emphasises the signal by applying the first order equation
     * \f$data_{n} := data_{n} - a*data_{n-1}\f$
     *
     * @param data The signal, modified in place
     */
    void pre_emphasis(blitz::Array<double,1> &data) const;
    /**
     * @brief Applies the Hamming window to the signal
     *
     * @param data The signal, modified in place
     */
    void hammingWindow(blitz::Array<double,1> &data) const;

    /**
     * @brief Computes the power-spectrum of the FFT of the input frame
     */
    void powerSpectrumFFT(blitz::Array<double,1>& x);
    /**
     * @brief Applies the triangular filter bank
     */
    void filterBank(blitz::Array<double,1>& x);
    /**
     * @brief Applies the triangular filter bank to the input array and
     * returns the logarithm of the magnitude in each band.
     */
    void logTriangularFilterBank(blitz::Array<double,1>& data) const;
    /**
     * @brief Applies the triangular filter bank to the input array and
     * returns the magnitude in each band.
     */
    void triangularFilterBank(blitz::Array<double,1>& data) const;


    // Initialization hooks; both are virtual so that derived classes can
    // override them.
    virtual void initWinLength();
    virtual void initWinSize();

    // Cache (re)initialization helpers for the Hamming kernel and the filter
    // bank; their exact effect is defined in the implementation file.
    void initCacheHammingKernel();
    void initCacheFilterBank();

    /**
     * @brief Initialize the table m_p_index, which contains the indices of
     * the cut-off frequencies of the triangular filters. It looks like:
     *
     *                      filter 2
     *                   <------------->
     *                filter 1           filter 4
     *             <----------->       <------------->
     *        | | | | | | | | | | | | | | | | | | | | | ..........
     *         0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9  ..........
     *             ^     ^     ^       ^             ^
     *             |     |     |       |             |
     *            t[0]   |    t[2]     |           t[4]
     *                  t[1]          t[3]
     *
     */
    void initCachePIndex();
    void initCacheFilters();

    // Filter bank configuration (exposed through the getters/setters above)
    size_t m_n_filters;
    double m_f_min;
    double m_f_max;
    double m_pre_emphasis_coeff;
    bool m_mel_scale;
    // NOTE(review): presumably a lower bound applied to the filter bank
    // output - confirm against the implementation
    double m_fb_out_floor;
    bool m_energy_filter;
    bool m_log_filter;
    bool m_energy_bands;
    // NOTE(review): presumably the logarithm of m_fb_out_floor - confirm
    // against the implementation
    double m_log_fb_out_floor;

    // Working data: Hamming kernel, cut-off indices of the triangular
    // filters (see initCachePIndex()), the filters themselves and the FFT
    // operator
    blitz::Array<double,1> m_hamming_kernel;
    blitz::Array<int,1> m_p_index;
    std::vector<blitz::Array<double,1> > m_filter_bank;
    bob::sp::FFT1D m_fft;

    // Scratch buffers; declared mutable so that const member functions may
    // write to them
    mutable blitz::Array<std::complex<double>,1> m_cache_frame_c1;
    mutable blitz::Array<std::complex<double>,1> m_cache_frame_c2;
    mutable blitz::Array<double,1> m_cache_filters;
};

}
}

#endif /* BOB_AP_SPECTROGRAM_H */