Commit fc9a3ac2 authored by Pavel KORSHUNOV

added notebook for Voice PAD

parent a8fc014d
Pipeline #25232 passed with stage in 58 seconds
"""
This script computes MFCCs from a given audio signal
"""
import numpy
import matplotlib.pyplot as plt
## MFCC parameters
# change the number of mel filters (sometimes 40 filters are also used)
num_mel_filters = 24
# change the final number of MFCC features
# It must be smaller than the number of mel-filters from above (we have 24 mel-filters)
num_mfcc = 19
def windows_stats(audio, sampling_rate, window_size, window_shift):
    window_length = sampling_rate // 1000 * window_size  # window length in samples
    shift_length = sampling_rate // 1000 * window_shift  # window shift (hop) in samples
    overlap_size = window_length - shift_length  # number of samples shared by consecutive windows
    # get the number of overlapping windows that fit into the data
    number_of_windows = (audio.shape[0] - window_length) // shift_length + 1
    return window_length, overlap_size, number_of_windows
def fft_size(window_length):
    # for efficiency, the size of the FFT should be a power of two,
    # so round the window length up to the next power of two
    return int(2 ** numpy.ceil(numpy.log2(window_length)))
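# A quick sanity check of the framing helpers above (illustrative numbers,
# assuming 16 kHz audio with the default 20 ms windows and 10 ms shift):
# a window spans 16000 // 1000 * 20 = 320 samples, consecutive windows
# overlap by 160 samples, and a 320-sample window is padded to a 512-point FFT:
#   dummy = numpy.zeros(16000)             # one second of synthetic silence
#   windows_stats(dummy, 16000, 20, 10)    # -> (320, 160, 99)
#   fft_size(320)                          # -> 512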
def plot_spectrogram(audio, sampling_rate, window_size=20, window_shift=10, name=''):
    window_length, overlap_size, number_of_windows = windows_stats(audio, sampling_rate, window_size, window_shift)
    size_of_fft = fft_size(window_length)
    ## 1. use the standard function for Spectrogram from matplotlib first
    mplspec, freqs, time, im_axis = plt.specgram(audio, NFFT=window_length, Fs=sampling_rate,
                                                 pad_to=size_of_fft, window=numpy.hamming(window_length),
                                                 noverlap=overlap_size, mode='magnitude', cmap="jet")
    plt.xlabel('Time [s]')
    plt.ylabel('Frequency [Hz]')
    plt.title('Spectrogram {0}'.format(name))
def plot_cepstrumgram(audio, sampling_rate, window_size=20, window_shift=10, num_mel_filters=24, num_mfcc=19, name=''):
    mfcc = compute_mfcc(audio, sampling_rate, window_size, window_shift, num_mel_filters, num_mfcc)
    # normalize MFCCs (zero mean per coefficient) and plot them as a cepstrumgram
    mfcc -= numpy.mean(mfcc, axis=0) + 1e-8
    plt.imshow(mfcc.T, aspect='auto', origin='lower', cmap="jet")
    plt.ylabel('MFCC features')
    plt.xlabel('Windows of the signal')
    plt.title('Cepstrumgram {0}'.format(name))
def compute_spectrum(audio, sampling_rate, window_size=20, window_shift=10):
    window_length, overlap_size, number_of_windows = windows_stats(audio, sampling_rate, window_size, window_shift)
    # compute the extra values in the signal that do not fit exactly in the last window
    leftovers = audio.shape[0] - (number_of_windows * window_length - (number_of_windows - 1) * overlap_size)
    # for simplicity, we drop and ignore the last values that did not fit into windows
    if leftovers > 0:
        audio_trimmed = audio[:-leftovers]
    else:
        audio_trimmed = audio
    ## normalize signal (cast to float first so integer input does not break
    ## the in-place arithmetic, and so the caller's array is not modified)
    audio_trimmed = audio_trimmed.astype(float)
    audio_trimmed -= numpy.mean(audio_trimmed)
    audio_trimmed /= numpy.std(audio_trimmed)
    ## split the data into overlapping windows
    from numpy.lib.stride_tricks import as_strided
    # size of each element in the data
    sz = audio_trimmed.dtype.itemsize
    # reshape the signal data for the strides function
    data = audio_trimmed.reshape((-1, 1))
    # split into the overlapping windows
    windowed_data = as_strided(data,
                               shape=(number_of_windows, window_length * data.shape[1]),
                               strides=((window_length - overlap_size) * data.shape[1] * sz, sz))
    # reshape back to 1D windows
    audio_windows = windowed_data.reshape((number_of_windows, -1))
    ## apply a Hamming window to each frame
    hamming_window = numpy.hamming(window_length)
    audio_hamming_windows = [win * hamming_window for win in audio_windows]
    # for efficiency, the size of the FFT should be a power of two
    size_of_fft = fft_size(window_length)
    # now, compute the power spectrum for every window:
    # the power spectrum is the squared magnitude of the FFT, abs(FFT)**2
    # e.g. with a 512-point FFT, the spectrum of each window has 512/2 + 1 = 257 bins
    spectrum = [numpy.abs(numpy.fft.rfft(w, n=size_of_fft)) ** 2 for w in audio_hamming_windows]
    return spectrum
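# A small usage sketch (the one-second random signal is synthetic, only for
# illustration, and the numbers assume a 16 kHz sampling rate and the default
# framing): compute_spectrum returns one power spectrum per window, each of
# length size_of_fft // 2 + 1:
#   dummy = numpy.random.randn(16000)
#   spec = compute_spectrum(dummy, 16000)
#   len(spec), spec[0].shape   # -> (99, (257,))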
# converting Hertz (f) to Mel (m), see the formula above
def hertz_to_mels(f):
    return 2595 * numpy.log10(1 + f / 700.)

# converting Mel (m) to Hertz (f), see the formula above
def mels_to_hertz(m):
    return 700. * (10 ** (m / 2595.) - 1)
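# A worked example of the Mel conversions (values rounded): the Mel scale is
# roughly linear below 1 kHz and logarithmic above, and with these constants
# 1000 Hz maps almost exactly to 1000 Mel; the two functions are inverses:
#   hertz_to_mels(1000)                  # -> ~1000.0
#   mels_to_hertz(hertz_to_mels(4000))   # -> 4000.0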
# This function constructs the Mel triangular filters
# It is taken from here:
# https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
def compute_mel_filters(sample_sampling_rate, high_freq=8000.,
                        low_freq=0., NFFT=512, nfilt=24):
    low_freq_mel = hertz_to_mels(low_freq)    # Convert Hz to Mel
    high_freq_mel = hertz_to_mels(high_freq)  # Convert Hz to Mel
    mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
    hz_points = mels_to_hertz(mel_points)     # Convert Mel to Hz
    # FFT bin indices corresponding to the filter edge frequencies
    bins = numpy.floor((NFFT + 1) * hz_points / sample_sampling_rate)
    fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bins[m - 1])  # left edge
        f_m = int(bins[m])            # center
        f_m_plus = int(bins[m + 1])   # right edge
        # rising slope of the m-th triangular filter
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
        # falling slope of the m-th triangular filter
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
    return fbank
## Create DCT transform matrix
# We implement the standard formula for the DCT-II transform
def dct_matrix(num_mel_filters, num_mfcc):
    """
    Return the DCT-II matrix of size (num_mel_filters x num_mfcc).
    For computing MFCCs, N is the number of log-power-spectrum bins (num_mel_filters)
    while K is the number of cepstra (num_mfcc).
    """
    freqstep = numpy.pi / num_mel_filters
    cosmat = numpy.zeros((num_mel_filters, num_mfcc), 'double')
    for n in range(num_mel_filters):
        for k in range(num_mfcc):
            cosmat[n, k] = numpy.cos(freqstep * (n + 0.5) * k)
    return cosmat
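# A quick shape check (illustrative): with the defaults at the top of this
# script the transform matrix is (24 x 19), so multiplying a
# (number_of_windows x 24) matrix of log filter-bank energies by it yields
# (number_of_windows x 19) MFCCs:
#   dct_matrix(num_mel_filters, num_mfcc).shape   # -> (24, 19)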
def compute_mfcc(audio, sampling_rate, window_size=20, window_shift=10,
                 num_mel_filters=24, num_mfcc=19):
    spectrum = compute_spectrum(audio, sampling_rate, window_size=window_size, window_shift=window_shift)
    spectrum = numpy.asarray(spectrum)
    window_length, overlap_size, number_of_windows = windows_stats(audio, sampling_rate, window_size, window_shift)
    size_of_fft = fft_size(window_length)
    spectrum_length = int(numpy.floor(size_of_fft / 2 + 1))
    # call the function with our parameters and compute the mel-filters
    fbank = compute_mel_filters(sampling_rate, high_freq=sampling_rate / 2., low_freq=0.,
                                NFFT=size_of_fft, nfilt=num_mel_filters)
    # multiply each window of the spectrum by the filters
    filter_banks = numpy.dot(spectrum, fbank.T)
    # replace zeros with the smallest possible value to make sure we do not take the log of zero
    filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)  # numerical stability
    log_filter_banks = 20 * numpy.log10(filter_banks)
    dct_transform = dct_matrix(num_mel_filters, num_mfcc)
    mfcc = numpy.dot(log_filter_banks, dct_transform) * (2.0 / spectrum_length)
    return mfcc
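# Example usage: a minimal sketch, not part of the original script. The file
# name 'sample.wav' and loading via scipy.io.wavfile are assumptions; any
# reader that yields a mono signal and its sampling rate will do.
if __name__ == '__main__':
    from scipy.io import wavfile
    sampling_rate, audio = wavfile.read('sample.wav')  # hypothetical input file
    audio = audio.astype(float)  # work in floating point
    plt.figure()
    plot_spectrogram(audio, sampling_rate, name='of sample.wav')
    plt.figure()
    plot_cepstrumgram(audio, sampling_rate, name='of sample.wav')
    plt.show()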