diff --git a/paddleaudio/paddleaudio/__init__.py b/paddleaudio/paddleaudio/__init__.py index 2dab610c..6184c1dd 100644 --- a/paddleaudio/paddleaudio/__init__.py +++ b/paddleaudio/paddleaudio/__init__.py @@ -11,5 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from . import compliance +from . import datasets +from . import features +from . import functional +from . import io +from . import metric +from . import sox_effects from .backends import load from .backends import save diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index e4192e81..35d7072c 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -105,7 +105,7 @@ def _get_log_energy(strided_input: Tensor, epsilon: Tensor, def _get_waveform_and_window_properties( waveform: Tensor, channel: int, - sample_frequency: float, + sr: int, frame_shift: float, frame_length: float, round_to_power_of_two: bool, @@ -115,9 +115,9 @@ def _get_waveform_and_window_properties( 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) waveform = waveform[channel, :] # size (n) window_shift = int( - sample_frequency * frame_shift * + sr * frame_shift * 0.001) # pass frame_shift and frame_length in milliseconds - window_size = int(sample_frequency * frame_length * 0.001) + window_size = int(sr * frame_length * 0.001) padded_window_size = _next_power_of_2( window_size) if round_to_power_of_two else window_size @@ -128,7 +128,7 @@ def _get_waveform_and_window_properties( assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \ ' use `round_to_power_of_two` or change `frame_length`' assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' - assert sample_frequency > 0, '`sample_frequency` must be greater than zero' + assert sr > 0, '`sr` must be greater than zero' return waveform, window_shift, window_size, padded_window_size @@ -147,45 +147,38 @@ def _get_window(waveform: Tensor, dtype = waveform.dtype epsilon = _get_epsilon(dtype) - # size (m, window_size) + # (m, window_size) strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) if dither != 0.0: - # Returns a random number strictly between 0 and 1 x = paddle.maximum(epsilon, paddle.rand(strided_input.shape, dtype=dtype)) rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) strided_input = strided_input + rand_gauss * dither if remove_dc_offset: - # Subtract each row/frame by its mean - row_means = paddle.mean( - strided_input, axis=1).unsqueeze(1) # size (m, 1) + row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) strided_input = strided_input - row_means if raw_energy: - # Compute the log energy of each row/frame before applying preemphasis and - # window function signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # size (m) + energy_floor) # (m) if preemphasis_coefficient != 0.0: - # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j offset_strided_input = paddle.nn.functional.pad( strided_input.unsqueeze(0), (1, 0), data_format='NCL', - mode='replicate').squeeze(0) # size (m, window_size + 1) + mode='replicate').squeeze(0) # (m, window_size + 1) strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : -1] - # Apply window_function to each row/frame window_function = _feature_window_function( window_type, window_size, blackman_coeff, - dtype).unsqueeze(0) # size (1, window_size) - strided_input = strided_input * window_function # size (m, window_size) + dtype).unsqueeze(0) # (1, window_size) + strided_input = strided_input * window_function # (m, window_size) - # Pad columns with zero until we reach size (m, padded_window_size) + # (m, padded_window_size) if padded_window_size != window_size: padding_right = padded_window_size - window_size strided_input = paddle.nn.functional.pad( @@ -194,7 +187,6 @@ def _get_window(waveform: Tensor, mode='constant', value=0).squeeze(0) - # Compute energy after window function (not the raw one) if not raw_energy: signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) @@ -203,8 +195,6 @@ def _get_window(waveform: Tensor, def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: - # subtracts the column mean of the tensor size (m, n) if subtract_mean=True - # it returns size (m, n) if subtract_mean: col_means = paddle.mean(tensor, axis=0).unsqueeze(0) tensor = tensor - col_means @@ -218,61 +208,56 @@ def spectrogram(waveform: Tensor, energy_floor: float=1.0, frame_length: float=25.0, frame_shift: float=10.0, - min_duration: float=0.0, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. - frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - min_duration (float, optional): [description]. Defaults to 0.0. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. - raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: [description] + Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + depends on frame_length and frame_shift. """ dtype = waveform.dtype epsilon = _get_epsilon(dtype) waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sample_frequency, frame_shift, frame_length, - round_to_power_of_two, preemphasis_coefficient) - - if len(waveform) < min_duration * sample_frequency: - # signal is too short - return paddle.empty([0]) + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) strided_input, signal_log_energy = _get_window( waveform, padded_window_size, window_size, window_shift, window_type, blackman_coeff, snip_edges, raw_energy, energy_floor, dither, remove_dc_offset, preemphasis_coefficient) - # size (m, padded_window_size // 2 + 1, 2) + # (m, padded_window_size // 2 + 1, 2) fft = paddle.fft.rfft(strided_input) - # Convert the FFT into a power spectrum power_spectrum = paddle.maximum( - fft.abs().pow(2.), - epsilon).log() # size (m, padded_window_size // 2 + 1) + fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) power_spectrum[:, 0] = signal_log_energy power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) @@ -306,25 +291,19 @@ def _vtln_warp_freq(vtln_low_cutoff: float, l = vtln_low_cutoff * max(1.0, vtln_warp_factor) h = vtln_high_cutoff * min(1.0, vtln_warp_factor) scale = 1.0 / vtln_warp_factor - Fl = scale * l # F(l) - Fh = scale * h # F(h) + Fl = scale * l + Fh = scale * h assert l > low_freq and h < high_freq - # slope of left part of the 3-piece linear function scale_left = (Fl - low_freq) / (l - low_freq) - # [slope of center part is just "scale"] - - # slope of right part of the 3-piece linear function scale_right = (high_freq - Fh) / (high_freq - h) - res = paddle.empty_like(freq) outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ - | paddle.greater_than(freq, paddle.to_tensor(high_freq)) # freq < low_freq || freq > high_freq - before_l = paddle.less_than(freq, paddle.to_tensor(l)) # freq < l - before_h = paddle.less_than(freq, paddle.to_tensor(h)) # freq < h - after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) # freq >= h + | paddle.greater_than(freq, paddle.to_tensor(high_freq)) + before_l = paddle.less_than(freq, paddle.to_tensor(l)) + before_h = paddle.less_than(freq, paddle.to_tensor(h)) + after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) - # order of operations matter here (since there is overlapping frequency regions) res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) res[before_h] = scale * freq[before_h] res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) @@ -363,13 +342,10 @@ def _get_mel_banks(num_bins: int, assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) - # fft-bin width [think of it as Nyquist-freq / half-window-length] fft_bin_width = sample_freq / window_length_padded mel_low_freq = _mel_scale_scalar(low_freq) mel_high_freq = _mel_scale_scalar(high_freq) - # divide by num_bins+1 in next line because of end-effects where the bins - # spread out to the sides. mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) if vtln_high < 0.0: @@ -381,10 +357,9 @@ def _get_mel_banks(num_bins: int, 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) bin = paddle.arange(num_bins).unsqueeze(1) - left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1) - center_mel = mel_low_freq + (bin + 1.0 - ) * mel_freq_delta # size(num_bins, 1) - right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1) + left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) if vtln_warp_factor != 1.0: left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, @@ -395,25 +370,23 @@ def _get_mel_banks(num_bins: int, right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) - center_freqs = _inverse_mel_scale(center_mel) # size (num_bins) - # size(1, num_fft_bins) + center_freqs = _inverse_mel_scale(center_mel) # (num_bins) + # (1, num_fft_bins) mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) - # size (num_bins, num_fft_bins) + # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) down_slope = (right_mel - mel) / (right_mel - center_mel) if vtln_warp_factor == 1.0: - # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values bins = paddle.maximum( paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) else: - # warping can move the order of left_mel, center_mel, right_mel anywhere bins = paddle.zeros_like(up_slope) up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( - mel, center_mel) # left_mel < mel <= center_mel + mel, center_mel) down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( - mel, right_mel) # center_mel < mel < right_mel + mel, right_mel) bins[up_idx] = up_slope[up_idx] bins[down_idx] = down_slope[down_idx] @@ -430,13 +403,12 @@ def fbank(waveform: Tensor, high_freq: float=0.0, htk_compat: bool=False, low_freq: float=20.0, - min_duration: float=0.0, - num_mel_bins: int=23, + n_mels: int=23, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, use_energy: bool=False, @@ -446,83 +418,75 @@ def fbank(waveform: Tensor, vtln_low: float=100.0, vtln_warp: float=1.0, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. - frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - high_freq (float, optional): [description]. Defaults to 0.0. - htk_compat (bool, optional): [description]. Defaults to False. - low_freq (float, optional): [description]. Defaults to 20.0. - min_duration (float, optional): [description]. Defaults to 0.0. - num_mel_bins (int, optional): [description]. Defaults to 23. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. - raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. - use_energy (bool, optional): [description]. Defaults to False. - use_log_fbank (bool, optional): [description]. Defaults to True. - use_power (bool, optional): [description]. Defaults to True. - vtln_high (float, optional): [description]. Defaults to -500.0. - vtln_low (float, optional): [description]. Defaults to 100.0. - vtln_warp (float, optional): [description]. Defaults to 1.0. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True. + use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: [description] + Tensor: A filter banks tensor with shape (m, n_mels). """ dtype = waveform.dtype waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sample_frequency, frame_shift, frame_length, - round_to_power_of_two, preemphasis_coefficient) - - if len(waveform) < min_duration * sample_frequency: - # signal is too short - return paddle.empty([0], dtype=dtype) + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) - # strided_input, size (m, padded_window_size) and signal_log_energy, size (m) strided_input, signal_log_energy = _get_window( waveform, padded_window_size, window_size, window_shift, window_type, blackman_coeff, snip_edges, raw_energy, energy_floor, dither, remove_dc_offset, preemphasis_coefficient) - # size (m, padded_window_size // 2 + 1) + # (m, padded_window_size // 2 + 1) spectrum = paddle.fft.rfft(strided_input).abs() if use_power: spectrum = spectrum.pow(2.) - # size (num_mel_bins, padded_window_size // 2) - mel_energies, _ = _get_mel_banks(num_mel_bins, padded_window_size, - sample_frequency, low_freq, high_freq, - vtln_low, vtln_high, vtln_warp) + # (n_mels, padded_window_size // 2) + mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, + high_freq, vtln_low, vtln_high, vtln_warp) mel_energies = mel_energies.astype(dtype) - # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1) + # (n_mels, padded_window_size // 2 + 1) mel_energies = paddle.nn.functional.pad( mel_energies.unsqueeze(0), (0, 1), data_format='NCL', mode='constant', value=0).squeeze(0) - # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins) + # (m, n_mels) mel_energies = paddle.mm(spectrum, mel_energies.T) if use_log_fbank: - # avoid log of zero (which should be prevented anyway by dithering) mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() - # if use_energy then add it as the last column for htk_compat == true else first column if use_energy: - signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1) - # returns size (m, num_mel_bins + 1) + signal_log_energy = signal_log_energy.unsqueeze(1) if htk_compat: mel_energies = paddle.concat( (mel_energies, signal_log_energy), axis=1) @@ -530,28 +494,20 @@ def fbank(waveform: Tensor, mel_energies = paddle.concat( (signal_log_energy, mel_energies), axis=1) + # (m, n_mels + 1) mel_energies = _subtract_column_mean(mel_energies, subtract_mean) return mel_energies -def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor: - # returns a dct matrix of size (num_mel_bins, num_ceps) - # size (num_mel_bins, num_mel_bins) - dct_matrix = create_dct(num_mel_bins, num_mel_bins, 'ortho') - # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) - # this would be the first column in the dct_matrix for torchaudio as it expects a - # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi - # expects a left multiply e.g. dct_matrix * vector). - dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) - dct_matrix = dct_matrix[:, :num_ceps] +def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: + dct_matrix = create_dct(n_mels, n_mels, 'ortho') + dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) + dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) return dct_matrix -def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor: - # returns size (num_ceps) - # Compute liftering coefficients (scaling on cepstral coeffs) - # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. - i = paddle.arange(num_ceps) +def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: + i = paddle.arange(n_mfcc) return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / cepstral_lifter) @@ -567,14 +523,13 @@ def mfcc(waveform: Tensor, high_freq: float=0.0, htk_compat: bool=False, low_freq: float=20.0, - num_ceps: int=13, - min_duration: float=0.0, - num_mel_bins: int=23, + n_mfcc: int=13, + n_mels: int=23, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, use_energy: bool=False, @@ -582,47 +537,47 @@ def mfcc(waveform: Tensor, vtln_low: float=100.0, vtln_warp: float=1.0, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return mel frequency cepstral coefficients from a waveform. The output is + identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - cepstral_lifter (float, optional): [description]. Defaults to 22.0. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. - frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - high_freq (float, optional): [description]. Defaults to 0.0. - htk_compat (bool, optional): [description]. Defaults to False. - low_freq (float, optional): [description]. Defaults to 20.0. - num_ceps (int, optional): [description]. Defaults to 13. - min_duration (float, optional): [description]. Defaults to 0.0. - num_mel_bins (int, optional): [description]. Defaults to 23. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. - raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. - use_energy (bool, optional): [description]. Defaults to False. - vtln_high (float, optional): [description]. Defaults to -500.0. - vtln_low (float, optional): [description]. Defaults to 100.0. - vtln_warp (float, optional): [description]. Defaults to 1.0. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: [description] + Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). """ - assert num_ceps <= num_mel_bins, 'num_ceps cannot be larger than num_mel_bins: %d vs %d' % ( - num_ceps, num_mel_bins) + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) dtype = waveform.dtype - # The mel_energies should not be squared (use_power=True), not have mean subtracted - # (subtract_mean=False), and use log (use_log_fbank=True). - # size (m, num_mel_bins + use_energy) + # (m, n_mels + use_energy) feature = fbank( waveform=waveform, blackman_coeff=blackman_coeff, @@ -634,13 +589,12 @@ def mfcc(waveform: Tensor, high_freq=high_freq, htk_compat=htk_compat, low_freq=low_freq, - min_duration=min_duration, - num_mel_bins=num_mel_bins, + n_mels=n_mels, preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy, remove_dc_offset=remove_dc_offset, round_to_power_of_two=round_to_power_of_two, - sample_frequency=sample_frequency, + sr=sr, snip_edges=snip_edges, subtract_mean=False, use_energy=use_energy, @@ -652,34 +606,29 @@ def mfcc(waveform: Tensor, window_type=window_type) if use_energy: - # size (m) - signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] - # offset is 0 if htk_compat==True else 1 + # (m) + signal_log_energy = feature[:, n_mels if htk_compat else 0] mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:(num_mel_bins + mel_offset)] + feature = feature[:, mel_offset:(n_mels + mel_offset)] - # size (num_mel_bins, num_ceps) - dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).astype(dtype=dtype) + # (n_mels, n_mfcc) + dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) - # size (m, num_ceps) + # (m, n_mfcc) feature = feature.matmul(dct_matrix) if cepstral_lifter != 0.0: - # size (1, num_ceps) - lifter_coeffs = _get_lifter_coeffs(num_ceps, - cepstral_lifter).unsqueeze(0) + # (1, n_mfcc) + lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) feature *= lifter_coeffs.astype(dtype=dtype) - # if use_energy then replace the last column for htk_compat == true else first column if use_energy: feature[:, 0] = signal_log_energy if htk_compat: - energy = feature[:, 0].unsqueeze(1) # size (m, 1) - feature = feature[:, 1:] # size (m, num_ceps - 1) + energy = feature[:, 0].unsqueeze(1) # (m, 1) + feature = feature[:, 1:] # (m, n_mfcc - 1) if not use_energy: - # scale on C0 (actually removing a scale we previously added that's - # part of one common definition of the cosine transform.) energy *= math.sqrt(2) feature = paddle.concat((feature, energy), axis=1) diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 69f814d6..16fa0081 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -71,15 +71,17 @@ class Spectrogram(nn.Layer): if win_length is None: win_length = n_fft - fft_window = get_window(window, win_length, fftbins=True, dtype=dtype) + self.fft_window = get_window( + window, win_length, fftbins=True, dtype=dtype) self._stft = partial( paddle.signal.stft, n_fft=n_fft, hop_length=hop_length, win_length=win_length, - window=fft_window, + window=self.fft_window, center=center, pad_mode=pad_mode) + self.register_buffer('fft_window', self.fft_window) def forward(self, x): stft = self._stft(x) @@ -259,12 +261,18 @@ class MFCC(nn.Layer): sr: int=22050, n_mfcc: int=40, norm: str='ortho', + dtype: str=paddle.float32, **kwargs): - """[summary] + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + Parameters: - sr (int, optional): [description]. Defaults to 22050. - n_mfcc (int, optional): [description]. Defaults to 40. - norm (str, optional): [description]. Defaults to 'ortho'. + sr(int): the audio sample rate. + The default value is 22050. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. """ super(MFCC, self).__init__() self._log_melspectrogram = LogMelSpectrogram(sr=sr, **kwargs)