@ -6,13 +6,15 @@ from __future__ import print_function
import numpy as np
import numpy as np
from data_utils import utils
from data_utils import utils
from data_utils . audio import AudioSegment
from data_utils . audio import AudioSegment
from python_speech_features import mfcc
from python_speech_features import delta
class AudioFeaturizer ( object ) :
class AudioFeaturizer ( object ) :
""" Audio featurizer, for extracting features from audio contents of
""" Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment .
AudioSegment or SpeechSegment .
Currently , it only supports feature type of linear spectrogram .
Currently , it supports feature types of linear spectrogram and mfcc .
: param specgram_type : Specgram feature type . Options : ' linear ' .
: param specgram_type : Specgram feature type . Options : ' linear ' .
: type specgram_type : str
: type specgram_type : str
@ -20,9 +22,10 @@ class AudioFeaturizer(object):
: type stride_ms : float
: type stride_ms : float
: param window_ms : Window size ( in milliseconds ) for generating frames .
: param window_ms : Window size ( in milliseconds ) for generating frames .
: type window_ms : float
: type window_ms : float
: param max_freq : Used w hen specgram_type is ' linear ' , only FFT bins
: param max_freq : W hen specgram_type is ' linear ' , only FFT bins
corresponding to frequencies between [ 0 , max_freq ] are
corresponding to frequencies between [ 0 , max_freq ] are
returned .
returned ; when specgram_type is ' mfcc ' , max_feq is the
highest band edge of mel filters .
: types max_freq : None | float
: types max_freq : None | float
: param target_sample_rate : Audio are resampled ( if upsampling or
: param target_sample_rate : Audio are resampled ( if upsampling or
downsampling is allowed ) to this before
downsampling is allowed ) to this before
@ -91,6 +94,9 @@ class AudioFeaturizer(object):
return self . _compute_linear_specgram (
return self . _compute_linear_specgram (
samples , sample_rate , self . _stride_ms , self . _window_ms ,
samples , sample_rate , self . _stride_ms , self . _window_ms ,
self . _max_freq )
self . _max_freq )
elif self . _specgram_type == ' mfcc ' :
return self . _compute_mfcc ( samples , sample_rate , self . _stride_ms ,
self . _window_ms , self . _max_freq )
else :
else :
raise ValueError ( " Unknown specgram_type %s . "
raise ValueError ( " Unknown specgram_type %s . "
" Supported values: linear. " % self . _specgram_type )
" Supported values: linear. " % self . _specgram_type )
@ -142,3 +148,39 @@ class AudioFeaturizer(object):
# prepare fft frequency list
# prepare fft frequency list
freqs = float ( sample_rate ) / window_size * np . arange ( fft . shape [ 0 ] )
freqs = float ( sample_rate ) / window_size * np . arange ( fft . shape [ 0 ] )
return fft , freqs
return fft , freqs
def _compute_mfcc(self,
                  samples,
                  sample_rate,
                  stride_ms=10.0,
                  window_ms=20.0,
                  max_freq=None):
    """Compute 39-dimensional MFCC features from audio samples: 13
    cepstral coefficients plus their deltas and delta-deltas.

    :param samples: Audio samples (1-D array of floats).
    :param sample_rate: Sample rate of the audio, in Hz.
    :param stride_ms: Stride (frame shift) in milliseconds.
    :param window_ms: Analysis window size in milliseconds.
    :param max_freq: Highest band edge of the mel filters, in Hz.
                     Defaults to half the sample rate (Nyquist).
    :type max_freq: None|float
    :return: MFCC feature matrix, transposed so that rows are feature
             dimensions and columns are frames, consistent with the
             linear-spectrogram layout.
    :raises ValueError: If max_freq exceeds half the sample rate, or if
                        stride_ms is greater than window_ms.
    """
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        # Mel filter band edges cannot exceed the Nyquist frequency.
        # (Original message was inverted: it said "must be greater".)
        raise ValueError("max_freq must not be greater than half of "
                         "sample rate.")
    if stride_ms > window_ms:
        raise ValueError("Stride size must not be greater than "
                         "window size.")
    # Compute 13 cepstral coefficients; the first one is replaced
    # by log(frame energy).
    mfcc_feat = mfcc(
        signal=samples,
        samplerate=sample_rate,
        winlen=0.001 * window_ms,
        winstep=0.001 * stride_ms,
        highfreq=max_freq)
    # First-order time derivatives (deltas).
    d_mfcc_feat = delta(mfcc_feat, 2)
    # Second-order time derivatives (delta-deltas).
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    # Stack static + delta + delta-delta along the feature axis in one
    # vectorized call (replaces the Python-2-only per-frame xrange loop,
    # which raised NameError on Python 3).
    concat_mfcc_feat = np.concatenate(
        (mfcc_feat, d_mfcc_feat, dd_mfcc_feat), axis=1)
    # Transpose to be consistent with the linear specgram situation.
    return np.transpose(concat_mfcc_feat)