parent
281d46dad2
commit
48f4bda3c5
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,4 @@
|
||||
|
||||
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
|
||||
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
|
||||
ref: https://zhuanlan.zhihu.com/p/55371926
|
@ -0,0 +1,20 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2013 James Lyons
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@ -0,0 +1,5 @@
|
||||
# file GENERATED by distutils, do NOT edit
|
||||
setup.py
|
||||
python_speech_features\__init__.py
|
||||
python_speech_features\base.py
|
||||
python_speech_features\sigproc.py
|
@ -0,0 +1 @@
|
||||
from .base import *
|
@ -0,0 +1,166 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
|
||||
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
|
||||
ceplifter=22,useEnergy=True,wintype='povey'):
|
||||
"""Compute MFCC features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param numcep: the number of cepstrum to return, default 13
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
|
||||
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
|
||||
feat = numpy.log(feat)
|
||||
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
|
||||
feat = lifter(feat,ceplifter)
|
||||
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
|
||||
return feat
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
|
||||
wintype='hamming'):
|
||||
"""Compute Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
winfunc=lambda x:numpy.ones((x,))
|
||||
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
|
||||
second return value is the energy in each frame (total energy, unwindowed)
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
|
||||
pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
|
||||
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
|
||||
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
|
||||
|
||||
return feat,energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
|
||||
"""Compute log Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
|
||||
return numpy.log(feat)
|
||||
|
||||
def hz2mel(hz):
|
||||
"""Convert a value in Hertz to Mels
|
||||
|
||||
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 1127 * numpy.log(1+hz/700.0)
|
||||
|
||||
|
||||
def mel2hz(mel):
|
||||
"""Convert a value in Mels to Hertz
|
||||
|
||||
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 700 * (numpy.exp(mel/1127.0)-1)
|
||||
|
||||
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
|
||||
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
|
||||
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
|
||||
|
||||
:param nfilt: the number of filters in the filterbank, default 20.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
|
||||
:param lowfreq: lowest band edge of mel filters, default 0 Hz
|
||||
:param highfreq: highest band edge of mel filters, default samplerate/2
|
||||
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
|
||||
|
||||
# compute points evenly spaced in mels
|
||||
lowmel = hz2mel(lowfreq)
|
||||
highmel = hz2mel(highfreq)
|
||||
|
||||
# check kaldi/src/feat/Mel-computations.h
|
||||
fbank = numpy.zeros([nfilt,nfft//2+1])
|
||||
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
|
||||
for j in range(0,nfilt):
|
||||
leftmel = lowmel+j*mel_freq_delta
|
||||
centermel = lowmel+(j+1)*mel_freq_delta
|
||||
rightmel = lowmel+(j+2)*mel_freq_delta
|
||||
for i in range(0,nfft//2):
|
||||
mel=hz2mel(i*samplerate/nfft)
|
||||
if mel>leftmel and mel<rightmel:
|
||||
if mel<centermel:
|
||||
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
|
||||
else:
|
||||
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
|
||||
return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
|
||||
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
|
||||
magnitude of the high frequency DCT coeffs.
|
||||
|
||||
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
|
||||
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
|
||||
"""
|
||||
if L > 0:
|
||||
nframes,ncoeff = numpy.shape(cepstra)
|
||||
n = numpy.arange(ncoeff)
|
||||
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
|
||||
return lift*cepstra
|
||||
else:
|
||||
# values of L <= 0, do nothing
|
||||
return cepstra
|
||||
|
||||
def delta(feat, N):
|
||||
"""Compute delta features from a feature vector sequence.
|
||||
|
||||
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
|
||||
:param N: For each frame, calculate delta features based on preceding and following N frames
|
||||
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
|
||||
"""
|
||||
if N < 1:
|
||||
raise ValueError('N must be an integer >= 1')
|
||||
NUMFRAMES = len(feat)
|
||||
denominator = 2 * sum([i**2 for i in range(1, N+1)])
|
||||
delta_feat = numpy.empty_like(feat)
|
||||
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
|
||||
for t in range(NUMFRAMES):
|
||||
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
|
||||
return delta_feat
|
@ -0,0 +1,190 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute MFCC features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param numcep: the number of cepstrum to return, default 13
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
|
||||
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
|
||||
feat = numpy.log(feat)
|
||||
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
|
||||
feat = lifter(feat,ceplifter)
|
||||
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
|
||||
return feat
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
|
||||
second return value is the energy in each frame (total energy, unwindowed)
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
signal = sigproc.preemphasis(signal,preemph)
|
||||
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
|
||||
pspec = sigproc.powspec(frames,nfft)
|
||||
energy = numpy.sum(pspec,1) # this stores the total energy in each frame
|
||||
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
|
||||
|
||||
return feat,energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
|
||||
"""Compute log Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
|
||||
return numpy.log(feat)
|
||||
|
||||
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute Spectral Subband Centroid features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
signal = sigproc.preemphasis(signal,preemph)
|
||||
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
|
||||
pspec = sigproc.powspec(frames,nfft)
|
||||
pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
|
||||
|
||||
return numpy.dot(pspec*R,fb.T) / feat
|
||||
|
||||
def hz2mel(hz):
|
||||
"""Convert a value in Hertz to Mels
|
||||
|
||||
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 2595 * numpy.log10(1+hz/700.)
|
||||
|
||||
def mel2hz(mel):
|
||||
"""Convert a value in Mels to Hertz
|
||||
|
||||
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 700*(10**(mel/2595.0)-1)
|
||||
|
||||
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
|
||||
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
|
||||
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
|
||||
|
||||
:param nfilt: the number of filters in the filterbank, default 20.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
|
||||
:param lowfreq: lowest band edge of mel filters, default 0 Hz
|
||||
:param highfreq: highest band edge of mel filters, default samplerate/2
|
||||
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
|
||||
|
||||
# compute points evenly spaced in mels
|
||||
lowmel = hz2mel(lowfreq)
|
||||
highmel = hz2mel(highfreq)
|
||||
melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
|
||||
# our points are in Hz, but we use fft bins, so we have to convert
|
||||
# from Hz to fft bin number
|
||||
bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
|
||||
|
||||
fbank = numpy.zeros([nfilt,nfft//2+1])
|
||||
for j in range(0,nfilt):
|
||||
for i in range(int(bin[j]), int(bin[j+1])):
|
||||
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
|
||||
for i in range(int(bin[j+1]), int(bin[j+2])):
|
||||
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
|
||||
return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
|
||||
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
|
||||
magnitude of the high frequency DCT coeffs.
|
||||
|
||||
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
|
||||
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
|
||||
"""
|
||||
if L > 0:
|
||||
nframes,ncoeff = numpy.shape(cepstra)
|
||||
n = numpy.arange(ncoeff)
|
||||
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
|
||||
return lift*cepstra
|
||||
else:
|
||||
# values of L <= 0, do nothing
|
||||
return cepstra
|
||||
|
||||
def delta(feat, N):
|
||||
"""Compute delta features from a feature vector sequence.
|
||||
|
||||
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
|
||||
:param N: For each frame, calculate delta features based on preceding and following N frames
|
||||
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
|
||||
"""
|
||||
if N < 1:
|
||||
raise ValueError('N must be an integer >= 1')
|
||||
NUMFRAMES = len(feat)
|
||||
denominator = 2 * sum([i**2 for i in range(1, N+1)])
|
||||
delta_feat = numpy.empty_like(feat)
|
||||
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
|
||||
for t in range(NUMFRAMES):
|
||||
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
|
||||
return delta_feat
|
@ -0,0 +1,158 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
|
||||
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
|
||||
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
strides = a.strides + (a.strides[-1],)
|
||||
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
|
||||
"""Frame a signal into overlapping frames.
|
||||
|
||||
:param sig: the audio signal to frame.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
|
||||
:returns: an array of frames. Size is NUMFRAMES by frame_len.
|
||||
"""
|
||||
slen = len(sig)
|
||||
frame_len = int(round_half_up(frame_len))
|
||||
frame_step = int(round_half_up(frame_step))
|
||||
if slen <= frame_len:
|
||||
numframes = 1
|
||||
else:
|
||||
numframes = 1 + (( slen - frame_len) // frame_step)
|
||||
|
||||
# check kaldi/src/feat/feature-window.h
|
||||
padsignal = sig[:(numframes-1)*frame_step+frame_len]
|
||||
if wintype is 'povey':
|
||||
win = numpy.empty(frame_len)
|
||||
for i in range(frame_len):
|
||||
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
|
||||
else: # the hamming window
|
||||
win = numpy.hamming(frame_len)
|
||||
|
||||
if stride_trick:
|
||||
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
|
||||
else:
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
frames = padsignal[indices]
|
||||
win = numpy.tile(win, (numframes, 1))
|
||||
|
||||
frames = frames.astype(numpy.float32)
|
||||
raw_frames = numpy.zeros(frames.shape)
|
||||
for frm in range(frames.shape[0]):
|
||||
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
|
||||
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
|
||||
raw_frames[frm,:] = frames[frm,:]
|
||||
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
|
||||
|
||||
return frames * win, raw_frames
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
|
||||
"""Does overlap-add procedure to undo the action of framesig.
|
||||
|
||||
:param frames: the array of frames.
|
||||
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:returns: a 1-D signal.
|
||||
"""
|
||||
frame_len = round_half_up(frame_len)
|
||||
frame_step = round_half_up(frame_step)
|
||||
numframes = numpy.shape(frames)[0]
|
||||
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
|
||||
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
padlen = (numframes - 1) * frame_step + frame_len
|
||||
|
||||
if siglen <= 0: siglen = padlen
|
||||
|
||||
rec_signal = numpy.zeros((padlen,))
|
||||
window_correction = numpy.zeros((padlen,))
|
||||
win = winfunc(frame_len)
|
||||
|
||||
for i in range(0, numframes):
|
||||
window_correction[indices[i, :]] = window_correction[
|
||||
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
|
||||
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
|
||||
|
||||
rec_signal = rec_signal / window_correction
|
||||
return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
|
||||
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
|
||||
"""
|
||||
if numpy.shape(frames)[1] > NFFT:
|
||||
logging.warn(
|
||||
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
|
||||
numpy.shape(frames)[1], NFFT)
|
||||
complex_spec = numpy.fft.rfft(frames, NFFT)
|
||||
return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
|
||||
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
|
||||
"""
|
||||
return numpy.square(magspec(frames, NFFT))
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
|
||||
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
|
||||
"""
|
||||
ps = powspec(frames, NFFT);
|
||||
ps[ps <= 1e-30] = 1e-30
|
||||
lps = 10 * numpy.log10(ps)
|
||||
if norm:
|
||||
return lps - numpy.max(lps)
|
||||
else:
|
||||
return lps
|
||||
|
||||
def do_dither(signal, dither_value=1.0):
|
||||
signal += numpy.random.normal(size=signal.shape) * dither_value
|
||||
return signal
|
||||
|
||||
def do_remove_dc_offset(signal):
|
||||
signal -= numpy.mean(signal)
|
||||
return signal
|
||||
|
||||
def do_preemphasis(signal, coeff=0.97):
|
||||
"""perform preemphasis on the input signal.
|
||||
|
||||
:param signal: The signal to filter.
|
||||
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
|
||||
:returns: the filtered signal.
|
||||
"""
|
||||
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
|
@ -0,0 +1,140 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
|
||||
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
|
||||
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
strides = a.strides + (a.strides[-1],)
|
||||
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
|
||||
"""Frame a signal into overlapping frames.
|
||||
|
||||
:param sig: the audio signal to frame.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
|
||||
:returns: an array of frames. Size is NUMFRAMES by frame_len.
|
||||
"""
|
||||
slen = len(sig)
|
||||
frame_len = int(round_half_up(frame_len))
|
||||
frame_step = int(round_half_up(frame_step))
|
||||
if slen <= frame_len:
|
||||
numframes = 1
|
||||
else:
|
||||
numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
|
||||
|
||||
padlen = int((numframes - 1) * frame_step + frame_len)
|
||||
|
||||
zeros = numpy.zeros((padlen - slen,))
|
||||
padsignal = numpy.concatenate((sig, zeros))
|
||||
if stride_trick:
|
||||
win = winfunc(frame_len)
|
||||
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
|
||||
else:
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
frames = padsignal[indices]
|
||||
win = numpy.tile(winfunc(frame_len), (numframes, 1))
|
||||
|
||||
return frames * win
|
||||
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
|
||||
"""Does overlap-add procedure to undo the action of framesig.
|
||||
|
||||
:param frames: the array of frames.
|
||||
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:returns: a 1-D signal.
|
||||
"""
|
||||
frame_len = round_half_up(frame_len)
|
||||
frame_step = round_half_up(frame_step)
|
||||
numframes = numpy.shape(frames)[0]
|
||||
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
|
||||
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
padlen = (numframes - 1) * frame_step + frame_len
|
||||
|
||||
if siglen <= 0: siglen = padlen
|
||||
|
||||
rec_signal = numpy.zeros((padlen,))
|
||||
window_correction = numpy.zeros((padlen,))
|
||||
win = winfunc(frame_len)
|
||||
|
||||
for i in range(0, numframes):
|
||||
window_correction[indices[i, :]] = window_correction[
|
||||
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
|
||||
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
|
||||
|
||||
rec_signal = rec_signal / window_correction
|
||||
return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
|
||||
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
|
||||
"""
|
||||
if numpy.shape(frames)[1] > NFFT:
|
||||
logging.warn(
|
||||
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
|
||||
numpy.shape(frames)[1], NFFT)
|
||||
complex_spec = numpy.fft.rfft(frames, NFFT)
|
||||
return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
|
||||
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
|
||||
"""
|
||||
return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
|
||||
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
|
||||
"""
|
||||
ps = powspec(frames, NFFT);
|
||||
ps[ps <= 1e-30] = 1e-30
|
||||
lps = 10 * numpy.log10(ps)
|
||||
if norm:
|
||||
return lps - numpy.max(lps)
|
||||
else:
|
||||
return lps
|
||||
|
||||
|
||||
def preemphasis(signal, coeff=0.95):
|
||||
"""perform preemphasis on the input signal.
|
||||
|
||||
:param signal: The signal to filter.
|
||||
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
|
||||
:returns: the filtered signal.
|
||||
"""
|
||||
return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
|
Binary file not shown.
@ -0,0 +1,89 @@
|
||||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXOPTS =
|
||||
SPHINXBUILD = sphinx-build
|
||||
PAPER =
|
||||
BUILDDIR = build
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||
|
||||
.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
|
||||
|
||||
help:
|
||||
@echo "Please use \`make <target>' where <target> is one of"
|
||||
@echo " html to make standalone HTML files"
|
||||
@echo " dirhtml to make HTML files named index.html in directories"
|
||||
@echo " pickle to make pickle files"
|
||||
@echo " json to make JSON files"
|
||||
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||
@echo " qthelp to make HTML files and a qthelp project"
|
||||
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||
@echo " linkcheck to check all external links for integrity"
|
||||
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||
|
||||
clean:
|
||||
-rm -rf $(BUILDDIR)/*
|
||||
|
||||
html:
|
||||
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||
|
||||
dirhtml:
|
||||
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
||||
|
||||
pickle:
|
||||
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
||||
@echo
|
||||
@echo "Build finished; now you can process the pickle files."
|
||||
|
||||
json:
|
||||
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
||||
@echo
|
||||
@echo "Build finished; now you can process the JSON files."
|
||||
|
||||
htmlhelp:
|
||||
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
||||
".hhp project file in $(BUILDDIR)/htmlhelp."
|
||||
|
||||
qthelp:
|
||||
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
||||
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
||||
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp"
|
||||
@echo "To view the help file:"
|
||||
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc"
|
||||
|
||||
latex:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo
|
||||
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
||||
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
|
||||
"run these through (pdf)latex."
|
||||
|
||||
changes:
|
||||
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
||||
@echo
|
||||
@echo "The overview file is in $(BUILDDIR)/changes."
|
||||
|
||||
linkcheck:
|
||||
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
||||
@echo
|
||||
@echo "Link check complete; look for any errors in the above output " \
|
||||
"or in $(BUILDDIR)/linkcheck/output.txt."
|
||||
|
||||
doctest:
|
||||
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
||||
@echo "Testing of doctests in the sources finished, look at the " \
|
||||
"results in $(BUILDDIR)/doctest/output.txt."
|
@ -0,0 +1,113 @@
|
||||
@ECHO OFF
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
set SPHINXBUILD=sphinx-build
|
||||
set BUILDDIR=build
|
||||
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
|
||||
if NOT "%PAPER%" == "" (
|
||||
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
if "%1" == "help" (
|
||||
:help
|
||||
echo.Please use `make ^<target^>` where ^<target^> is one of
|
||||
echo. html to make standalone HTML files
|
||||
echo. dirhtml to make HTML files named index.html in directories
|
||||
echo. pickle to make pickle files
|
||||
echo. json to make JSON files
|
||||
echo. htmlhelp to make HTML files and a HTML help project
|
||||
echo. qthelp to make HTML files and a qthelp project
|
||||
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
|
||||
echo. changes to make an overview over all changed/added/deprecated items
|
||||
echo. linkcheck to check all external links for integrity
|
||||
echo. doctest to run all doctests embedded in the documentation if enabled
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "clean" (
|
||||
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
|
||||
del /q /s %BUILDDIR%\*
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "html" (
|
||||
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
|
||||
echo.
|
||||
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "dirhtml" (
|
||||
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
|
||||
echo.
|
||||
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "pickle" (
|
||||
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
|
||||
echo.
|
||||
echo.Build finished; now you can process the pickle files.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "json" (
|
||||
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
|
||||
echo.
|
||||
echo.Build finished; now you can process the JSON files.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "htmlhelp" (
|
||||
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
|
||||
echo.
|
||||
echo.Build finished; now you can run HTML Help Workshop with the ^
|
||||
.hhp project file in %BUILDDIR%/htmlhelp.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "qthelp" (
|
||||
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
|
||||
echo.
|
||||
echo.Build finished; now you can run "qcollectiongenerator" with the ^
|
||||
.qhcp project file in %BUILDDIR%/qthelp, like this:
|
||||
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp
|
||||
echo.To view the help file:
|
||||
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.ghc
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "latex" (
|
||||
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
|
||||
echo.
|
||||
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "changes" (
|
||||
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
|
||||
echo.
|
||||
echo.The overview file is in %BUILDDIR%/changes.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "linkcheck" (
|
||||
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
|
||||
echo.
|
||||
echo.Link check complete; look for any errors in the above output ^
|
||||
or in %BUILDDIR%/linkcheck/output.txt.
|
||||
goto end
|
||||
)
|
||||
|
||||
if "%1" == "doctest" (
|
||||
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
|
||||
echo.
|
||||
echo.Testing of doctests in the sources finished, look at the ^
|
||||
results in %BUILDDIR%/doctest/output.txt.
|
||||
goto end
|
||||
)
|
||||
|
||||
:end
|
@ -0,0 +1,202 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# python_speech_features documentation build configuration file, created by
|
||||
# sphinx-quickstart on Thu Oct 31 16:49:58 2013.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
import sys, os
|
||||
|
||||
import mock
|
||||
|
||||
MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack']
|
||||
for mod_name in MOCK_MODULES:
|
||||
sys.modules[mod_name] = mock.Mock()
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
sys.path.insert(0,os.path.abspath('../..'))
|
||||
|
||||
# -- General configuration -----------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be extensions
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = ['sphinx.ext.autodoc']
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
|
||||
# The encoding of source files.
|
||||
#source_encoding = 'utf-8'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'python_speech_features'
|
||||
copyright = u'2013, James Lyons'
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
# built documents.
|
||||
#
|
||||
# The short X.Y version.
|
||||
version = '0.1.0'
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
release = '0.1.0'
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#language = None
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to some
|
||||
# non-false value, then it is used:
|
||||
#today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
#today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of documents that shouldn't be included in the build.
|
||||
#unused_docs = []
|
||||
|
||||
# List of directories, relative to source directory, that shouldn't be searched
|
||||
# for source files.
|
||||
exclude_trees = []
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all documents.
|
||||
#default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
#add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
#add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
#show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
|
||||
# A list of ignored prefixes for module index sorting.
|
||||
#modindex_common_prefix = []
|
||||
|
||||
|
||||
# -- Options for HTML output ---------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. Major themes that come with
|
||||
# Sphinx are currently 'default' and 'sphinxdoc'.
|
||||
html_theme = 'default'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#html_theme_options = {}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
#html_theme_path = []
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
#html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
#html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
#html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
#html_sidebars = {}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
#html_use_modindex = True
|
||||
|
||||
# If false, no index is generated.
|
||||
#html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
#html_show_sourcelink = True
|
||||
|
||||
# If true, an OpenSearch description file will be output, and all pages will
|
||||
# contain a <link> tag referring to it. The value of this option must be the
|
||||
# base URL from which the finished HTML is served.
|
||||
#html_use_opensearch = ''
|
||||
|
||||
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = ''
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'python_speech_featuresdoc'
|
||||
|
||||
|
||||
# -- Options for LaTeX output --------------------------------------------------
|
||||
|
||||
# The paper size ('letter' or 'a4').
|
||||
#latex_paper_size = 'letter'
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#latex_font_size = '10pt'
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title, author, documentclass [howto/manual]).
|
||||
latex_documents = [
|
||||
('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation',
|
||||
u'James Lyons', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
# the title page.
|
||||
#latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||
# not chapters.
|
||||
#latex_use_parts = False
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#latex_preamble = ''
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#latex_use_modindex = True
|
||||
|
||||
autodoc_member_order = 'bysource'
|
@ -0,0 +1,54 @@
|
||||
.. python_speech_features documentation master file, created by
|
||||
sphinx-quickstart on Thu Oct 31 16:49:58 2013.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Welcome to python_speech_features's documentation!
|
||||
==================================================
|
||||
|
||||
This library provides common speech features for ASR including MFCCs and filterbank energies.
|
||||
If you are not sure what MFCCs are, and would like to know more have a look at this MFCC tutorial:
|
||||
http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/.
|
||||
|
||||
You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features .
|
||||
|
||||
Supported features:
|
||||
|
||||
- :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients
|
||||
- :py:meth:`python_speech_features.fbank` - Filterbank Energies
|
||||
- :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies
|
||||
- :py:meth:`python_speech_features.ssc` - Spectral Subband Centroids
|
||||
|
||||
To use MFCC features::
|
||||
|
||||
from python_speech_features import mfcc
|
||||
from python_speech_features import logfbank
|
||||
import scipy.io.wavfile as wav
|
||||
|
||||
(rate,sig) = wav.read("file.wav")
|
||||
mfcc_feat = mfcc(sig,rate)
|
||||
fbank_feat = logfbank(sig,rate)
|
||||
|
||||
print(fbank_feat[1:3,:])
|
||||
|
||||
From here you can write the features to a file etc.
|
||||
|
||||
Functions provided in python_speech_features module
|
||||
-------------------------------------
|
||||
|
||||
.. automodule:: python_speech_features.base
|
||||
:members:
|
||||
|
||||
|
||||
Functions provided in sigproc module
|
||||
------------------------------------
|
||||
.. automodule:: python_speech_features.sigproc
|
||||
:members:
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`search`
|
||||
|
Binary file not shown.
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from python_speech_features import mfcc
|
||||
from python_speech_features import delta
|
||||
from python_speech_features import logfbank
|
||||
import scipy.io.wavfile as wav
|
||||
|
||||
(rate,sig) = wav.read("english.wav")
|
||||
|
||||
# note that generally nfilt=40 is used for speech recognition
|
||||
fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
|
||||
|
||||
# the computed fbank coefficents of english.wav with dimension [110,23]
|
||||
# [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
|
||||
# 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
|
||||
# ...
|
||||
# ...
|
||||
# the same with that using kaldi commands: compute-fbank-feats --dither=0.0
|
||||
|
||||
|
||||
mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
|
||||
|
||||
# the computed mfcc coefficents of english.wav with dimension [110,13]
|
||||
# [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
|
||||
# 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
|
||||
# ...
|
||||
# ...
|
||||
# the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
|
||||
|
@ -0,0 +1,10 @@
|
||||
Metadata-Version: 1.0
|
||||
Name: python-speech-features
|
||||
Version: 0.6
|
||||
Summary: Python Speech Feature extraction
|
||||
Home-page: https://github.com/jameslyons/python_speech_features
|
||||
Author: James Lyons
|
||||
Author-email: james.lyons0@gmail.com
|
||||
License: MIT
|
||||
Description: UNKNOWN
|
||||
Platform: UNKNOWN
|
@ -0,0 +1,12 @@
|
||||
README.rst
|
||||
setup.py
|
||||
python_speech_features/__init__.py
|
||||
python_speech_features/base.py
|
||||
python_speech_features/base_orig.py
|
||||
python_speech_features/sigproc.py
|
||||
python_speech_features/sigproc_orig.py
|
||||
python_speech_features.egg-info/PKG-INFO
|
||||
python_speech_features.egg-info/SOURCES.txt
|
||||
python_speech_features.egg-info/dependency_links.txt
|
||||
python_speech_features.egg-info/top_level.txt
|
||||
test/test_sigproc.py
|
@ -0,0 +1 @@
|
||||
|
@ -0,0 +1 @@
|
||||
python_speech_features
|
@ -0,0 +1 @@
|
||||
from .base import *
|
@ -0,0 +1,166 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
|
||||
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
|
||||
ceplifter=22,useEnergy=True,wintype='povey'):
|
||||
"""Compute MFCC features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param numcep: the number of cepstrum to return, default 13
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
|
||||
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
|
||||
feat = numpy.log(feat)
|
||||
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
|
||||
feat = lifter(feat,ceplifter)
|
||||
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
|
||||
return feat
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
|
||||
wintype='hamming'):
|
||||
"""Compute Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
winfunc=lambda x:numpy.ones((x,))
|
||||
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
|
||||
second return value is the energy in each frame (total energy, unwindowed)
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
|
||||
pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
|
||||
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
|
||||
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
|
||||
|
||||
return feat,energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
|
||||
"""Compute log Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
|
||||
return numpy.log(feat)
|
||||
|
||||
def hz2mel(hz):
|
||||
"""Convert a value in Hertz to Mels
|
||||
|
||||
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 1127 * numpy.log(1+hz/700.0)
|
||||
|
||||
|
||||
def mel2hz(mel):
|
||||
"""Convert a value in Mels to Hertz
|
||||
|
||||
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 700 * (numpy.exp(mel/1127.0)-1)
|
||||
|
||||
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
|
||||
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
|
||||
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
|
||||
|
||||
:param nfilt: the number of filters in the filterbank, default 20.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
|
||||
:param lowfreq: lowest band edge of mel filters, default 0 Hz
|
||||
:param highfreq: highest band edge of mel filters, default samplerate/2
|
||||
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
|
||||
|
||||
# compute points evenly spaced in mels
|
||||
lowmel = hz2mel(lowfreq)
|
||||
highmel = hz2mel(highfreq)
|
||||
|
||||
# check kaldi/src/feat/Mel-computations.h
|
||||
fbank = numpy.zeros([nfilt,nfft//2+1])
|
||||
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
|
||||
for j in range(0,nfilt):
|
||||
leftmel = lowmel+j*mel_freq_delta
|
||||
centermel = lowmel+(j+1)*mel_freq_delta
|
||||
rightmel = lowmel+(j+2)*mel_freq_delta
|
||||
for i in range(0,nfft//2):
|
||||
mel=hz2mel(i*samplerate/nfft)
|
||||
if mel>leftmel and mel<rightmel:
|
||||
if mel<centermel:
|
||||
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
|
||||
else:
|
||||
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
|
||||
return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
|
||||
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
|
||||
magnitude of the high frequency DCT coeffs.
|
||||
|
||||
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
|
||||
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
|
||||
"""
|
||||
if L > 0:
|
||||
nframes,ncoeff = numpy.shape(cepstra)
|
||||
n = numpy.arange(ncoeff)
|
||||
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
|
||||
return lift*cepstra
|
||||
else:
|
||||
# values of L <= 0, do nothing
|
||||
return cepstra
|
||||
|
||||
def delta(feat, N):
|
||||
"""Compute delta features from a feature vector sequence.
|
||||
|
||||
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
|
||||
:param N: For each frame, calculate delta features based on preceding and following N frames
|
||||
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
|
||||
"""
|
||||
if N < 1:
|
||||
raise ValueError('N must be an integer >= 1')
|
||||
NUMFRAMES = len(feat)
|
||||
denominator = 2 * sum([i**2 for i in range(1, N+1)])
|
||||
delta_feat = numpy.empty_like(feat)
|
||||
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
|
||||
for t in range(NUMFRAMES):
|
||||
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
|
||||
return delta_feat
|
@ -0,0 +1,190 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute MFCC features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param numcep: the number of cepstrum to return, default 13
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
|
||||
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
|
||||
feat = numpy.log(feat)
|
||||
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
|
||||
feat = lifter(feat,ceplifter)
|
||||
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
|
||||
return feat
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
|
||||
second return value is the energy in each frame (total energy, unwindowed)
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
signal = sigproc.preemphasis(signal,preemph)
|
||||
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
|
||||
pspec = sigproc.powspec(frames,nfft)
|
||||
energy = numpy.sum(pspec,1) # this stores the total energy in each frame
|
||||
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
|
||||
|
||||
return feat,energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
|
||||
"""Compute log Mel-filterbank energy features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
|
||||
return numpy.log(feat)
|
||||
|
||||
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
|
||||
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
|
||||
winfunc=lambda x:numpy.ones((x,))):
|
||||
"""Compute Spectral Subband Centroid features from an audio signal.
|
||||
|
||||
:param signal: the audio signal from which to compute features. Should be an N*1 array
|
||||
:param samplerate: the samplerate of the signal we are working with.
|
||||
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
|
||||
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
|
||||
:param nfilt: the number of filters in the filterbank, default 26.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
|
||||
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
|
||||
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
|
||||
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
signal = sigproc.preemphasis(signal,preemph)
|
||||
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
|
||||
pspec = sigproc.powspec(frames,nfft)
|
||||
pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
|
||||
|
||||
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
|
||||
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
|
||||
R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
|
||||
|
||||
return numpy.dot(pspec*R,fb.T) / feat
|
||||
|
||||
def hz2mel(hz):
|
||||
"""Convert a value in Hertz to Mels
|
||||
|
||||
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 2595 * numpy.log10(1+hz/700.)
|
||||
|
||||
def mel2hz(mel):
|
||||
"""Convert a value in Mels to Hertz
|
||||
|
||||
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
|
||||
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
|
||||
"""
|
||||
return 700*(10**(mel/2595.0)-1)
|
||||
|
||||
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
|
||||
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
|
||||
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
|
||||
|
||||
:param nfilt: the number of filters in the filterbank, default 20.
|
||||
:param nfft: the FFT size. Default is 512.
|
||||
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
|
||||
:param lowfreq: lowest band edge of mel filters, default 0 Hz
|
||||
:param highfreq: highest band edge of mel filters, default samplerate/2
|
||||
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
|
||||
"""
|
||||
highfreq= highfreq or samplerate/2
|
||||
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
|
||||
|
||||
# compute points evenly spaced in mels
|
||||
lowmel = hz2mel(lowfreq)
|
||||
highmel = hz2mel(highfreq)
|
||||
melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
|
||||
# our points are in Hz, but we use fft bins, so we have to convert
|
||||
# from Hz to fft bin number
|
||||
bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
|
||||
|
||||
fbank = numpy.zeros([nfilt,nfft//2+1])
|
||||
for j in range(0,nfilt):
|
||||
for i in range(int(bin[j]), int(bin[j+1])):
|
||||
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
|
||||
for i in range(int(bin[j+1]), int(bin[j+2])):
|
||||
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
|
||||
return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
|
||||
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
|
||||
magnitude of the high frequency DCT coeffs.
|
||||
|
||||
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
|
||||
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
|
||||
"""
|
||||
if L > 0:
|
||||
nframes,ncoeff = numpy.shape(cepstra)
|
||||
n = numpy.arange(ncoeff)
|
||||
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
|
||||
return lift*cepstra
|
||||
else:
|
||||
# values of L <= 0, do nothing
|
||||
return cepstra
|
||||
|
||||
def delta(feat, N):
|
||||
"""Compute delta features from a feature vector sequence.
|
||||
|
||||
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
|
||||
:param N: For each frame, calculate delta features based on preceding and following N frames
|
||||
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
|
||||
"""
|
||||
if N < 1:
|
||||
raise ValueError('N must be an integer >= 1')
|
||||
NUMFRAMES = len(feat)
|
||||
denominator = 2 * sum([i**2 for i in range(1, N+1)])
|
||||
delta_feat = numpy.empty_like(feat)
|
||||
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
|
||||
for t in range(NUMFRAMES):
|
||||
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
|
||||
return delta_feat
|
@ -0,0 +1,158 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
|
||||
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
|
||||
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
strides = a.strides + (a.strides[-1],)
|
||||
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
|
||||
"""Frame a signal into overlapping frames.
|
||||
|
||||
:param sig: the audio signal to frame.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
|
||||
:returns: an array of frames. Size is NUMFRAMES by frame_len.
|
||||
"""
|
||||
slen = len(sig)
|
||||
frame_len = int(round_half_up(frame_len))
|
||||
frame_step = int(round_half_up(frame_step))
|
||||
if slen <= frame_len:
|
||||
numframes = 1
|
||||
else:
|
||||
numframes = 1 + (( slen - frame_len) // frame_step)
|
||||
|
||||
# check kaldi/src/feat/feature-window.h
|
||||
padsignal = sig[:(numframes-1)*frame_step+frame_len]
|
||||
if wintype is 'povey':
|
||||
win = numpy.empty(frame_len)
|
||||
for i in range(frame_len):
|
||||
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
|
||||
else: # the hamming window
|
||||
win = numpy.hamming(frame_len)
|
||||
|
||||
if stride_trick:
|
||||
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
|
||||
else:
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
frames = padsignal[indices]
|
||||
win = numpy.tile(win, (numframes, 1))
|
||||
|
||||
frames = frames.astype(numpy.float32)
|
||||
raw_frames = numpy.zeros(frames.shape)
|
||||
for frm in range(frames.shape[0]):
|
||||
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
|
||||
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
|
||||
raw_frames[frm,:] = frames[frm,:]
|
||||
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
|
||||
|
||||
return frames * win, raw_frames
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
|
||||
"""Does overlap-add procedure to undo the action of framesig.
|
||||
|
||||
:param frames: the array of frames.
|
||||
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:returns: a 1-D signal.
|
||||
"""
|
||||
frame_len = round_half_up(frame_len)
|
||||
frame_step = round_half_up(frame_step)
|
||||
numframes = numpy.shape(frames)[0]
|
||||
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
|
||||
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
padlen = (numframes - 1) * frame_step + frame_len
|
||||
|
||||
if siglen <= 0: siglen = padlen
|
||||
|
||||
rec_signal = numpy.zeros((padlen,))
|
||||
window_correction = numpy.zeros((padlen,))
|
||||
win = winfunc(frame_len)
|
||||
|
||||
for i in range(0, numframes):
|
||||
window_correction[indices[i, :]] = window_correction[
|
||||
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
|
||||
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
|
||||
|
||||
rec_signal = rec_signal / window_correction
|
||||
return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
|
||||
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
|
||||
"""
|
||||
if numpy.shape(frames)[1] > NFFT:
|
||||
logging.warn(
|
||||
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
|
||||
numpy.shape(frames)[1], NFFT)
|
||||
complex_spec = numpy.fft.rfft(frames, NFFT)
|
||||
return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
|
||||
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
|
||||
"""
|
||||
return numpy.square(magspec(frames, NFFT))
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
|
||||
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
|
||||
"""
|
||||
ps = powspec(frames, NFFT);
|
||||
ps[ps <= 1e-30] = 1e-30
|
||||
lps = 10 * numpy.log10(ps)
|
||||
if norm:
|
||||
return lps - numpy.max(lps)
|
||||
else:
|
||||
return lps
|
||||
|
||||
def do_dither(signal, dither_value=1.0):
|
||||
signal += numpy.random.normal(size=signal.shape) * dither_value
|
||||
return signal
|
||||
|
||||
def do_remove_dc_offset(signal):
|
||||
signal -= numpy.mean(signal)
|
||||
return signal
|
||||
|
||||
def do_preemphasis(signal, coeff=0.97):
|
||||
"""perform preemphasis on the input signal.
|
||||
|
||||
:param signal: The signal to filter.
|
||||
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
|
||||
:returns: the filtered signal.
|
||||
"""
|
||||
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
|
@ -0,0 +1,140 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
|
||||
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
|
||||
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
|
||||
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
|
||||
strides = a.strides + (a.strides[-1],)
|
||||
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
|
||||
"""Frame a signal into overlapping frames.
|
||||
|
||||
:param sig: the audio signal to frame.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
|
||||
:returns: an array of frames. Size is NUMFRAMES by frame_len.
|
||||
"""
|
||||
slen = len(sig)
|
||||
frame_len = int(round_half_up(frame_len))
|
||||
frame_step = int(round_half_up(frame_step))
|
||||
if slen <= frame_len:
|
||||
numframes = 1
|
||||
else:
|
||||
numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
|
||||
|
||||
padlen = int((numframes - 1) * frame_step + frame_len)
|
||||
|
||||
zeros = numpy.zeros((padlen - slen,))
|
||||
padsignal = numpy.concatenate((sig, zeros))
|
||||
if stride_trick:
|
||||
win = winfunc(frame_len)
|
||||
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
|
||||
else:
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
frames = padsignal[indices]
|
||||
win = numpy.tile(winfunc(frame_len), (numframes, 1))
|
||||
|
||||
return frames * win
|
||||
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
|
||||
"""Does overlap-add procedure to undo the action of framesig.
|
||||
|
||||
:param frames: the array of frames.
|
||||
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
|
||||
:param frame_len: length of each frame measured in samples.
|
||||
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
|
||||
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
|
||||
:returns: a 1-D signal.
|
||||
"""
|
||||
frame_len = round_half_up(frame_len)
|
||||
frame_step = round_half_up(frame_step)
|
||||
numframes = numpy.shape(frames)[0]
|
||||
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
|
||||
|
||||
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
|
||||
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
|
||||
indices = numpy.array(indices, dtype=numpy.int32)
|
||||
padlen = (numframes - 1) * frame_step + frame_len
|
||||
|
||||
if siglen <= 0: siglen = padlen
|
||||
|
||||
rec_signal = numpy.zeros((padlen,))
|
||||
window_correction = numpy.zeros((padlen,))
|
||||
win = winfunc(frame_len)
|
||||
|
||||
for i in range(0, numframes):
|
||||
window_correction[indices[i, :]] = window_correction[
|
||||
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
|
||||
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
|
||||
|
||||
rec_signal = rec_signal / window_correction
|
||||
return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
|
||||
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
|
||||
"""
|
||||
if numpy.shape(frames)[1] > NFFT:
|
||||
logging.warn(
|
||||
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
|
||||
numpy.shape(frames)[1], NFFT)
|
||||
complex_spec = numpy.fft.rfft(frames, NFFT)
|
||||
return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
|
||||
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
|
||||
"""
|
||||
return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
|
||||
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
|
||||
|
||||
:param frames: the array of frames. Each row is a frame.
|
||||
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
|
||||
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
|
||||
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
|
||||
"""
|
||||
ps = powspec(frames, NFFT);
|
||||
ps[ps <= 1e-30] = 1e-30
|
||||
lps = 10 * numpy.log10(ps)
|
||||
if norm:
|
||||
return lps - numpy.max(lps)
|
||||
else:
|
||||
return lps
|
||||
|
||||
|
||||
def preemphasis(signal, coeff=0.95):
|
||||
"""perform preemphasis on the input signal.
|
||||
|
||||
:param signal: The signal to filter.
|
||||
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
|
||||
:returns: the filtered signal.
|
||||
"""
|
||||
return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
|
@ -0,0 +1,3 @@
|
||||
mock
|
||||
scipy
|
||||
numpy
|
@ -0,0 +1,14 @@
|
||||
try:
|
||||
from setuptools import setup #enables develop
|
||||
except ImportError:
|
||||
from distutils.core import setup
|
||||
|
||||
setup(name='python_speech_features',
|
||||
version='0.6',
|
||||
description='Python Speech Feature extraction',
|
||||
author='James Lyons',
|
||||
author_email='james.lyons0@gmail.com',
|
||||
license='MIT',
|
||||
url='https://github.com/jameslyons/python_speech_features',
|
||||
packages=['python_speech_features'],
|
||||
)
|
@ -0,0 +1,31 @@
|
||||
from python_speech_features import sigproc
|
||||
import unittest
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
|
||||
class test_case(unittest.TestCase):
|
||||
def test_frame_sig(self):
|
||||
n = 10000124
|
||||
frame_len = 37
|
||||
frame_step = 13
|
||||
x = np.random.rand(n)
|
||||
t0 = time.time()
|
||||
y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False)
|
||||
t1 = time.time()
|
||||
y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True)
|
||||
t_new = time.time() - t1
|
||||
t_old = t1 - t0
|
||||
self.assertTupleEqual(y_old.shape, y_new.shape)
|
||||
np.testing.assert_array_equal(y_old, y_new)
|
||||
self.assertLess(t_new, t_old)
|
||||
print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
|
||||
|
||||
def test_rolling(self):
|
||||
x = np.arange(10)
|
||||
y = sigproc.rolling_window(x, window=4, step=3)
|
||||
y_expected = np.array([[0, 1, 2, 3],
|
||||
[3, 4, 5, 6],
|
||||
[6, 7, 8, 9]]
|
||||
)
|
||||
y = np.testing.assert_array_equal(y, y_expected)
|
Loading…
Reference in new issue