diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index 538be019..beb2d86b 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -74,16 +74,16 @@ def _feature_window_function( window_size: int, blackman_coeff: float, dtype: int, ) -> Tensor: - if window_type == HANNING: + if window_type == "hann": return get_window('hann', window_size, fftbins=False, dtype=dtype) - elif window_type == HAMMING: + elif window_type == "hamming": return get_window('hamming', window_size, fftbins=False, dtype=dtype) - elif window_type == POVEY: + elif window_type == "povey": return get_window( 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) - elif window_type == RECTANGULAR: + elif window_type == "rect": return paddle.ones([window_size], dtype=dtype) - elif window_type == BLACKMAN: + elif window_type == "blackman": a = 2 * math.pi / (window_size - 1) window_function = paddle.arange(window_size, dtype=dtype) return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + @@ -216,7 +216,7 @@ def spectrogram(waveform: Tensor, sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: @@ -236,7 +236,7 @@ def spectrogram(waveform: Tensor, snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames @@ -418,11 +418,11 @@ def fbank(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. + waveform (Tensor): A waveform tensor with shape `(C, T)`. `C` is in the range [0,1]. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -448,7 +448,7 @@ def fbank(waveform: Tensor, vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey". Returns: Tensor: A filter banks tensor with shape `(m, n_mels)`. @@ -537,7 +537,7 @@ def mfcc(waveform: Tensor, vtln_high: float=-500.0, vtln_low: float=100.0, vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: + window_type: str="povey") -> Tensor: """Compute and return mel frequency cepstral coefficients from a waveform. The output is identical to Kaldi's. diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 887ec7a6..c04e3ae4 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +import numpy as np from yacs.config import CfgNode from paddlespeech.audio.transform.transformation import Transformation @@ -77,6 +78,8 @@ class U2Infer(): feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + np.savetxt("feat.transform.txt", feat) + ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) decode_config = self.config.decode diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index dae618db..ee4df9cb 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -474,13 +474,20 @@ class U2Tester(U2Trainer): def export(self): infer_model, input_spec = self.load_inferspec() infer_model.eval() + paddle.set_device('cpu') - assert isinstance(input_spec, list), type(input_spec) + assert isinstance(input_spec, (list, tuple)), type(input_spec) batch_size, feat_dim, model_size, num_left_chunks = input_spec - ######################### infer_model.forward_encoder_chunk zero tensor online ############ - # TODO: 80(feature dim) be configable + ######################## infer_model.forward_encoder_chunk ############ + input_spec = [ + # (T,), int16 + paddle.static.InputSpec(shape=[None], dtype='int16'), + ] + infer_model.forward_feature = paddle.jit.to_static(infer_model.forward_feature, input_spec=input_spec) + + ######################### infer_model.forward_encoder_chunk ############ input_spec = [ # xs, (B, T, D) paddle.static.InputSpec(shape=[batch_size, None, feat_dim], dtype='float32'), @@ -499,8 +506,16 @@ class U2Tester(U2Trainer): infer_model.forward_encoder_chunk = paddle.jit.to_static( infer_model.forward_encoder_chunk, input_spec=input_spec) + ######################### infer_model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') + ] + infer_model.ctc_activation = paddle.jit.to_static( + infer_model.ctc_activation, input_spec=input_spec) + + ######################### infer_model.forward_attention_decoder ######################## - # TODO: 512(encoder_output) be configable. 1 for BatchSize input_spec = [ # hyps, (B, U) paddle.static.InputSpec(shape=[None, None], dtype='int64'), @@ -512,17 +527,11 @@ class U2Tester(U2Trainer): infer_model.forward_attention_decoder = paddle.jit.to_static( infer_model.forward_attention_decoder, input_spec=input_spec) - ######################### infer_model.ctc_activation ######################## - input_spec = [ - # encoder_out, (B,T,D) - paddle.static.InputSpec(shape=[batch_size, None, model_size], dtype='float32') - ] - infer_model.ctc_activation = paddle.jit.to_static( - infer_model.ctc_activation, input_spec=input_spec) - # jit save + logger.info(f"export save: {self.args.export_path}") paddle.jit.save(infer_model, self.args.export_path, combine_params=True, skip_forward=True) + # test dy2static def flatten(out): if isinstance(out, paddle.Tensor): @@ -536,26 +545,44 @@ class U2Tester(U2Trainer): flatten_out.append(var) return flatten_out - xs1 = paddle.rand(shape=[1, 67, 80], dtype='float32') + # forward_encoder_chunk dygraph + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') required_cache_size = num_left_chunks att_cache = paddle.zeros([0, 0, 0, 0]) cnn_cache = paddle.zeros([0, 0, 0, 0]) - - xs, att_cache, cnn_cache = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) - xs2 = paddle.rand(shape=[1, 67, 80], dtype='float32') - offset = paddle.to_tensor([16], dtype='int32') - out1 = infer_model.forward_encoder_chunk(xs2, offset, required_cache_size, att_cache, cnn_cache) - print('py encoder', out1) - + xs_d, att_cache_d, cnn_cache_d = infer_model.forward_encoder_chunk(xs1, offset, required_cache_size, att_cache, cnn_cache) + + import soundfile + audio, sample_rate = soundfile.read( + './zh.wav', dtype="int16", always_2d=True) + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + audio = paddle.to_tensor(audio, paddle.int16) + feat_d = infer_model.forward_feature(audio) + logger.info(f"{feat_d}") + np.savetxt("feat.tostatic.txt", feat_d) + + + # load static model from paddle.jit.layer import Layer layer = Layer() layer.load(self.args.export_path, paddle.CPUPlace()) - xs1 = paddle.full([1, 7, 80], 0.1, dtype='float32') + # forward_encoder_chunk static + xs1 = paddle.full([1, 67, 80], 0.1, dtype='float32') offset = paddle.to_tensor([0], dtype='int32') att_cache = paddle.zeros([0, 0, 0, 0]) - cnn_cache=paddle.zeros([0, 0, 0, 0]) + cnn_cache = paddle.zeros([0, 0, 0, 0]) func = getattr(layer, 'forward_encoder_chunk') - xs, att_cache, cnn_cache = func(xs1, offset, att_cache, cnn_cache) - print('py static encoder', xs) + xs_s, att_cache_s, cnn_cache_s = func(xs1, offset, att_cache, cnn_cache) + np.testing.assert_allclose(xs_d, xs_s, atol=1e-5) + np.testing.assert_allclose(att_cache_d, att_cache_s, atol=1e-4) + np.testing.assert_allclose(cnn_cache_d, cnn_cache_s, atol=1e-4) + # logger.info(f"forward_encoder_chunk output: {xs_s}") + + # forward_feature static + func = getattr(layer, 'forward_feature') + feat_s = func(audio)[0] + logger.info(f"{feat_s}") + np.testing.assert_allclose(feat_d, feat_s, atol=1e-5) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 149170ed..d7b8630a 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -916,6 +916,50 @@ class U2InferModel(U2Model): def __init__(self, configs: dict): super().__init__(configs) + from paddlespeech.s2t.modules.fbank import KaldiFbank + import yaml + import json + import numpy as np + + input_dim = configs['input_dim'] + process = configs['preprocess_config'] + with open(process, encoding="utf-8") as f: + conf = yaml.safe_load(f) + assert isinstance(conf, dict), type(self.conf) + + for idx, process in enumerate(conf['process']): + assert isinstance(process, dict), type(process) + opts = dict(process) + process_type = opts.pop("type") + + if process_type == 'fbank_kaldi': + opts.update({'n_mels': input_dim}) + opts['dither'] = 0.0 + self.fbank = KaldiFbank( + **opts + ) + logger.info(f"{self.__class__.__name__} export: {self.fbank}") + if process_type == 'cmvn_json': + # align with paddlespeech.audio.transform.cmvn:GlobalCMVN + std_floor = 1.0e-20 + + cmvn = opts['cmvn_path'] + if isinstance(cmvn, dict): + cmvn_stats = cmvn + else: + with open(cmvn) as f: + cmvn_stats = json.load(f) + count = cmvn_stats['frame_num'] + mean = np.array(cmvn_stats['mean_stat']) / count + square_sums = np.array(cmvn_stats['var_stat']) + var = square_sums / count - mean**2 + std = np.maximum(np.sqrt(var), std_floor) + istd = 1.0 / std + self.global_cmvn = GlobalCMVN( + paddle.to_tensor(mean, dtype=paddle.float), + paddle.to_tensor(istd, dtype=paddle.float)) + logger.info(f"{self.__class__.__name__} export: {self.global_cmvn}") + def forward(self, feats, feats_lengths, @@ -939,3 +983,17 @@ class U2InferModel(U2Model): # num_decoding_left_chunks=num_decoding_left_chunks, # simulate_streaming=simulate_streaming) return feats, feats_lengths + + def forward_feature(self, x): + """feature pipeline. + + Args: + x (paddle.Tensor): waveform (T,). + + Return: + feat (paddle.Tensor): feature (T, D) + """ + x = paddle.cast(x, paddle.float32) + feat = self.fbank(x) + feat = self.global_cmvn(feat) + return feat \ No newline at end of file diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 67f71b66..53c508f1 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -40,6 +40,14 @@ class GlobalCMVN(nn.Layer): self.register_buffer("mean", mean) self.register_buffer("istd", istd) + def __repr__(self): + return ( + "{name}(mean={mean}, istd={istd}, norm_var={norm_var})".format( + name=self.__class__.__name__, + mean=self.mean, + istd=self.istd, + norm_var=self.norm_var)) + def forward(self, x: paddle.Tensor): """ Args: @@ -50,4 +58,4 @@ class GlobalCMVN(nn.Layer): x = x - self.mean if self.norm_var: x = x * self.istd - return x + return x \ No newline at end of file diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py new file mode 100644 index 00000000..4ec620a7 --- /dev/null +++ b/paddlespeech/s2t/modules/fbank.py @@ -0,0 +1,74 @@ + + + +import paddle +from paddle import nn + +from paddlespeech.audio.compliance import kaldi + +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['KaldiFbank'] + +class KaldiFbank(nn.Layer): + def __init__(self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.0): + """ + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant. Default 0.0 + """ + super().__init__() + self.fs = fs + self.n_mels = n_mels + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def forward(self, x: paddle.Tensor): + """ + Args: + x (paddle.Tensor): shape (Ti). + Not support: [Time, Channel] and Batch mode. + + Returns: + paddle.Tensor: (T, D) + """ + assert x.ndim == 1 + + feat = kaldi.fbank( + x.unsqueeze(0), # append channel dim, (C, Ti) + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=self.dither, + energy_floor=self.energy_floor, + sr=self.fs) + + assert feat.ndim == 2 # (T,D) + return feat