Merge branch 'PaddlePaddle:develop' into cluster

pull/1681/head
qingen 3 years ago committed by GitHub
commit 240520c0ca

@ -37,7 +37,7 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (stati
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)|157MB|
FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||

@ -5,7 +5,7 @@ source path.sh
gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
model_type=offline # offline or online

@ -223,22 +223,28 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
## Pretrained Model
Pretrained SpeedySpeech model with no silence at the edges of the audio:
- [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)
- [speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)
The static model can be downloaded here:
- [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)
- [speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)
The ONNX model can be downloaded here:
- [speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)
Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss
:-------------:| :------------:| :-----: | :-----: | :--------:|:--------:
default| 1(gpu) x 11400|0.83655|0.42324|0.03211| 0.38119
default| 1(gpu) x 11400|0.79532|0.400246|0.030259| 0.36482
The SpeedySpeech checkpoint contains the files listed below.
```text
speedyspeech_nosil_baker_ckpt_0.5
speedyspeech_csmsc_ckpt_0.2.0
├── default.yaml # default config used to train speedyspeech
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
├── snapshot_iter_11400.pdz # model parameters and optimizer states
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
You can use the following scripts to synthesize the sentences in `${BIN_DIR}/../sentences.txt` using the pretrained SpeedySpeech and Parallel WaveGAN models.
@ -249,9 +255,9 @@ FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=speedyspeech_csmsc \
--am_config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
--am_ckpt=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
--am_stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
--am_config=speedyspeech_csmsc_ckpt_0.2.0/default.yaml \
--am_ckpt=speedyspeech_csmsc_ckpt_0.2.0/snapshot_iter_30600.pdz \
--am_stat=speedyspeech_csmsc_ckpt_0.2.0/feats_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
@ -260,6 +266,6 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
--tones_dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
--phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
--tones_dict=speedyspeech_csmsc_ckpt_0.2.0/tone_id_map.txt
```

@ -0,0 +1,32 @@
train_output_path=$1
stage=0
stop_stage=0
# only default_fastspeech2/speedyspeech + hifigan/mb_melgan are supported now!
# synthesize from metadata
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/onnx_infer_out \
--device=cpu \
--cpu_threads=2
fi
# e2e, synthesize from text
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
--cpu_threads=2
fi
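
For reference, the two exported models can also be driven directly with onnxruntime; below is a minimal sketch of the flow this script runs. The file paths and dummy inputs are illustrative; the input names `phones`, `tones`, and `logmel` follow what `ort_predict.py` feeds the speedyspeech/hifigan sessions.

```python
import numpy as np
import onnxruntime as ort

# paths are illustrative; they correspond to ${train_output_path}/inference_onnx
am = ort.InferenceSession("inference_onnx/speedyspeech_csmsc.onnx",
                          providers=["CPUExecutionProvider"])
voc = ort.InferenceSession("inference_onnx/hifigan_csmsc.onnx",
                           providers=["CPUExecutionProvider"])

# speedyspeech takes aligned phone ids and tone ids (ranges as in the warmup code)
T = 24
phones = np.random.randint(1, 92, size=(T, ))
tones = np.random.randint(1, 5, size=(T, ))
mel = am.run(None, {"phones": phones, "tones": tones})[0]  # (T_frames, 80)

# the vocoder turns the log-mel spectrogram into a waveform
wav = voc.run(None, {"logmel": mel})[0]
print(wav.shape)
```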

@ -0,0 +1 @@
../../tts3/local/paddle2onnx.sh

@ -40,3 +40,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.4' ]]; then
pip install paddle2onnx==0.9.4
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime, uses speedyspeech + hifigan by default
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# install onnxruntime
version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '1.10.0' ]]; then
pip install onnxruntime==1.10.0
fi
./local/ort_predict.sh ${train_output_path}
fi

@ -3,7 +3,7 @@ train_output_path=$1
stage=0
stop_stage=0
# only support default_fastspeech2 + hifigan/mb_melgan now!
# only default_fastspeech2/speedyspeech + hifigan/mb_melgan are supported now!
# synthesize from metadata
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

@ -19,4 +19,5 @@ paddle2onnx \
--model_filename ${model}.pdmodel \
--params_filename ${model}.pdiparams \
--save_file ${train_output_path}/${output_dir}/${model}.onnx \
--opset_version 11 \
--enable_dev_version ${enable_dev_version}
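
As a quick sanity check after conversion, the exported file can be validated with the `onnx` Python package (assumed to be available; it is installed alongside paddle2onnx). A minimal sketch, with an illustrative path matching `${train_output_path}/${output_dir}/${model}.onnx` above:

```python
import onnx

# load the exported graph and validate it
m = onnx.load("exp/default/inference_onnx/speedyspeech_csmsc.onnx")
onnx.checker.check_model(m)  # raises if the graph is malformed
print(m.opset_import)        # should report opset 11, matching --opset_version
```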

@ -133,6 +133,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)
The ONNX model can be downloaded here:
- [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:
default| 1(gpu) x 400000|1.948763|0.670098|0.248882

@ -26,10 +26,10 @@ from paddlespeech.s2t.utils.log import Log
#TODO(Hui Zhang): remove fluid import
logger = Log(__name__).getlog()
########### hcak logging #############
########### hack logging #############
logger.warn = logger.warning
########### hcak paddle #############
########### hack paddle #############
paddle.half = 'float16'
paddle.float = 'float32'
paddle.double = 'float64'
@ -110,7 +110,7 @@ if not hasattr(paddle, 'cat'):
paddle.cat = cat
########### hcak paddle.Tensor #############
########### hack paddle.Tensor #############
def item(x: paddle.Tensor):
return x.numpy().item()
@ -353,7 +353,7 @@ if not hasattr(paddle.Tensor, 'tolist'):
setattr(paddle.Tensor, 'tolist', tolist)
########### hcak paddle.nn #############
########### hack paddle.nn #############
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""

@ -4,7 +4,7 @@
augment: True
batch_size: 32
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
@ -42,8 +42,16 @@ epochs: 10
save_interval: 10
log_interval: 10
learning_rate: 1e-8
max_lr: 1e-3
step_size: 140000
###########################################
# loss #
###########################################
margin: 0.2
scale: 30
###########################################
# Testing #
###########################################
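
The new `margin` and `scale` fields parameterize an additive-angular-margin (AAM) softmax loss, while `max_lr` and `step_size` suggest a cyclic learning-rate schedule oscillating between `learning_rate` and `max_lr` over `step_size` steps (the same fields are added in the second config below). A minimal numpy sketch of how such a margin and scale are typically applied; the function and array names are illustrative, not this repo's actual implementation:

```python
import numpy as np

def aam_softmax_logits(emb, weight, labels, margin=0.2, scale=30.0):
    """emb: (N, D) embeddings; weight: (C, D) speaker weights; labels: (N,) ids."""
    # cosine similarity between L2-normalized embeddings and class weights
    e = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    w = weight / np.linalg.norm(weight, axis=1, keepdims=True)
    cos = e @ w.T                                   # (N, C), values in [-1, 1]
    theta = np.arccos(np.clip(cos, -1.0, 1.0))
    # add the angular margin only on each sample's target class
    onehot = np.zeros_like(cos, dtype=bool)
    onehot[np.arange(len(labels)), labels] = True
    cos_m = np.where(onehot, np.cos(theta + margin), cos)
    return scale * cos_m                            # fed to cross-entropy
```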

@ -2,7 +2,7 @@
# Data #
###########################################
augment: True
batch_size: 16
batch_size: 32
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
@ -42,7 +42,14 @@ epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8
max_lr: 1e-3
step_size: 140000
###########################################
# loss #
###########################################
margin: 0.2
scale: 30
###########################################
# Testing #

@ -341,7 +341,7 @@ def stft(x: np.ndarray,
hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
win_length (Optional[int], optional): The size of window. Defaults to None.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
@ -509,7 +509,7 @@ def melspectrogram(x: np.ndarray,
fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
to_db (bool, optional): Enable db scale. Defaults to True.
@ -564,7 +564,7 @@ def spectrogram(x: np.ndarray,
window_size (int, optional): Size of FFT and window length. Defaults to 512.
hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.

@ -42,7 +42,7 @@ class Spectrogram(nn.Layer):
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
@ -99,7 +99,7 @@ class MelSpectrogram(nn.Layer):
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@ -176,7 +176,7 @@ class LogMelSpectrogram(nn.Layer):
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@ -257,7 +257,7 @@ class MFCC(nn.Layer):
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\\_length`. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.

@ -43,13 +43,13 @@ pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip',
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
'md5':
'9edce23b1a87f31b814d9477bf52afbc',
'6f6fa967b408454b6662c8c00c0027cb',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_11400.pdz',
'snapshot_iter_30600.pdz',
'speech_stats':
'feats_stats.npy',
'phones_dict':

@ -26,10 +26,10 @@ from paddlespeech.s2t.utils.log import Log
#TODO(Hui Zhang): remove fluid import
logger = Log(__name__).getlog()
########### hcak logging #############
########### hack logging #############
logger.warn = logger.warning
########### hcak paddle #############
########### hack paddle #############
paddle.half = 'float16'
paddle.float = 'float32'
paddle.double = 'float64'
@ -110,7 +110,7 @@ if not hasattr(paddle, 'cat'):
paddle.cat = cat
########### hcak paddle.Tensor #############
########### hack paddle.Tensor #############
def item(x: paddle.Tensor):
return x.numpy().item()

@ -37,7 +37,7 @@ pretrained_models = {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz',
'md5':
'23e16c69730a1cb5d735c98c83c21e16',
'd5e076217cf60486519f72c217d21b9b',
'cfg_path':
'model.yaml',
'ckpt_path':

@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Author: zhendong.peng@mobvoi.com (Zhendong Peng)
import argparse
from flask import Flask, render_template
parser = argparse.ArgumentParser(description='training your network')
parser.add_argument('--port', default=19999, type=int, help='port id')
args = parser.parse_args()
app = Flask(__name__)
@app.route('/')
def index():
return render_template('index.html')
if __name__ == '__main__':
app.run(host='0.0.0.0', port=args.port, debug=True)

@ -0,0 +1,453 @@
/*
* @Author: baipengxia
* @Date: 2021-03-12 11:44:28
* @Last Modified by: baipengxia
* @Last Modified time: 2021-03-12 15:14:24
*/
/** COMMON RESET **/
* {
-webkit-tap-highlight-color: rgba(0, 0, 0, 0);
}
body,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
p,
dl,
dt,
dd,
ul,
ol,
li,
fieldset,
legend,
button,
input,
textarea,
th,
td {
margin: 0;
padding: 0;
color: #000;
}
body {
font-size: 14px;
}
html, body {
min-width: 1200px;
}
button,
input,
select,
textarea {
font-size: 14px;
}
h1 {
font-size: 18px;
}
h2 {
font-size: 14px;
}
h3 {
font-size: 14px;
}
ul,
ol,
li {
list-style: none;
}
a {
text-decoration: none;
}
a:hover {
text-decoration: none;
}
fieldset,
img {
border: none;
}
table {
border-collapse: collapse;
border-spacing: 0;
}
i {
font-style: normal;
}
label {
position: inherit;
}
.clearfix:after {
content: ".";
display: block;
height: 0;
clear: both;
visibility: hidden;
}
.clearfix {
zoom: 1;
display: block;
}
html,
body {
font-family: Tahoma, Arial, 'microsoft yahei', 'Roboto', 'Droid Sans', 'Helvetica Neue', 'Droid Sans Fallback', 'Heiti SC', 'Hiragino Sans GB', 'Simsun', 'sans-serif';
}
.audio-banner {
width: 100%;
overflow: auto;
padding: 0;
background: url('../image/voice-dictation.svg');
background-size: cover;
}
.weaper {
width: 1200px;
height: 155px;
margin: 72px auto;
}
.text-content {
width: 670px;
height: 100%;
float: left;
}
.text-content .title {
font-size: 34px;
font-family: 'PingFangSC-Medium';
font-weight: 500;
color: rgba(255, 255, 255, 1);
line-height: 48px;
}
.text-content .con {
font-size: 16px;
font-family: PingFangSC-Light;
font-weight: 300;
color: rgba(255, 255, 255, 1);
line-height: 30px;
}
.img-con {
width: 416px;
height: 100%;
float: right;
}
.img-con img {
width: 100%;
height: 100%;
}
.con-container {
margin-top: 34px;
}
.audio-advantage {
background: #f8f9fa;
}
.asr-advantage {
width: 1200px;
margin: 0 auto;
}
.asr-advantage h2 {
text-align: center;
font-size: 22px;
padding: 30px 0 0 0;
}
.asr-advantage > ul > li {
box-sizing: border-box;
padding: 0 16px;
width: 33%;
text-align: center;
margin-bottom: 35px;
}
.asr-advantage > ul > li .icons{
margin-top: 10px;
margin-bottom: 20px;
width: 42px;
height: 42px;
}
.service-item-content {
margin-top: 35px;
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.service-item-content img {
width: 160px;
vertical-align: bottom;
}
.service-item-content > li {
box-sizing: border-box;
padding: 0 16px;
width: 33%;
text-align: center;
margin-bottom: 35px;
}
.service-item-content > li .service-item-content-title {
line-height: 1.5;
font-weight: 700;
margin-top: 10px;
}
.service-item-content > li .service-item-content-desc {
margin-top: 5px;
line-height: 1.8;
color: #657384;
}
.audio-scene-con {
width: 100%;
padding-bottom: 84px;
background: #fff;
}
.audio-scene {
overflow: auto;
width: 1200px;
background: #fff;
text-align: center;
padding: 0;
margin: 0 auto;
}
.audio-scene h2 {
padding: 30px 0 0 0;
font-size: 22px;
text-align: center;
}
.audio-experience {
width: 100%;
height: 538px;
background: #fff;
padding: 0;
margin: 0;
overflow: auto;
}
.asr-box {
width: 1200px;
height: 394px;
margin: 64px auto;
}
.asr-box h2 {
font-size: 22px;
text-align: center;
margin-bottom: 64px;
}
.voice-container {
position: relative;
width: 1200px;
height: 308px;
background: rgba(255, 255, 255, 1);
border-radius: 8px;
border: 1px solid rgba(225, 225, 225, 1);
}
.voice-container .voice {
height: 236px;
width: 100%;
border-radius: 8px;
}
.voice-container .voice textarea {
height: 100%;
width: 100%;
border: none;
outline: none;
border-radius: 8px;
padding: 25px;
font-size: 14px;
box-sizing: border-box;
resize: none;
}
.voice-input {
width: 100%;
height: 72px;
box-sizing: border-box;
padding-left: 35px;
background: rgba(242, 244, 245, 1);
border-radius: 8px;
line-height: 72px;
}
.voice-input .el-select {
width: 492px;
}
.start-voice {
display: inline-block;
margin-left: 10px;
}
.start-voice .time {
margin-right: 25px;
}
.asr-advantage > ul > li {
margin-bottom: 77px;
}
#msg {
width: 100%;
line-height: 40px;
font-size: 14px;
margin-left: 330px;
}
#captcha {
margin-left: 350px !important;
display: inline-block;
position: relative;
}
.black {
position: fixed;
width: 100%;
height: 100%;
z-index: 5;
background: rgba(0, 0, 0, 0.5);
top: 0;
left: 0;
}
.container {
position: fixed;
z-index: 6;
top: 25%;
left: 10%;
}
.audio-scene-con {
width: 100%;
padding-bottom: 84px;
background: #fff;
}
#sound {
color: #fff;
cursor: pointer;
background: #147ede;
padding: 10px;
margin-top: 30px;
margin-left: 135px;
width: 176px;
height: 30px !important;
text-align: center;
line-height: 30px !important;
border-radius: 10px;
}
.con-ten {
position: absolute;
width: 100%;
height: 100%;
z-index: 5;
background: #fff;
opacity: 0.5;
top: 0;
left: 0;
}
.websocket-url {
width: 320px;
height: 20px;
border: 1px solid #dcdfe6;
line-height: 20px;
padding: 10px;
border-radius: 4px;
}
.voice-btn {
color: #fff;
background-color: #409eff;
font-weight: 500;
padding: 12px 20px;
font-size: 14px;
border-radius: 4px;
border: 0;
cursor: pointer;
}
.voice-btn.end {
display: none;
}
.result-text {
background: #fff;
padding: 20px;
}
.voice-footer {
border-top: 1px solid #dddede;
background: #f7f9fa;
text-align: center;
margin-bottom: 8px;
color: #333;
font-size: 12px;
padding: 20px 0;
}
/** line animate **/
.time-box {
display: none;
margin-left: 10px;
width: 300px;
}
.total-time {
font-size: 14px;
color: #545454;
}
.voice-btn.end.show,
.time-box.show {
display: inline;
}
.start-taste-line {
margin-right: 20px;
display: inline-block;
}
.start-taste-line hr {
background-color: #187cff;
width: 3px;
height: 8px;
margin: 0 3px;
display: inline-block;
border: none;
}
.hr {
animation: note 0.2s ease-in-out;
animation-iteration-count: infinite;
animation-direction: alternate;
}
.hr-one {
animation-delay: -0.9s;
}
.hr-two {
animation-delay: -0.8s;
}
.hr-three {
animation-delay: -0.7s;
}
.hr-four {
animation-delay: -0.6s;
}
.hr-five {
animation-delay: -0.5s;
}
.hr-six {
animation-delay: -0.4s;
}
.hr-seven {
animation-delay: -0.3s;
}
.hr-eight {
animation-delay: -0.2s;
}
.hr-nine {
animation-delay: -0.1s;
}
@keyframes note {
from {
transform: scaleY(1);
}
to {
transform: scaleY(4);
}
}

@ -0,0 +1,133 @@
SoundRecognizer = {
rec: null,
wave: null,
SampleRate: 16000,
testBitRate: 16,
isCloseRecorder: false,
SendInterval: 300,
realTimeSendTryType: 'pcm',
realTimeSendTryEncBusy: 0,
realTimeSendTryTime: 0,
realTimeSendTryNumber: 0,
transferUploadNumberMax: 0,
realTimeSendTryChunk: null,
soundType: "pcm",
init: function (config) {
this.soundType = config.soundType || 'pcm';
this.SampleRate = config.sampleRate || 16000;
this.recwaveElm = config.recwaveElm || '';
this.TransferUpload = config.translerCallBack || this.TransferProcess;
this.initRecorder();
},
RealTimeSendTryReset: function (type) {
this.realTimeSendTryType = type;
this.realTimeSendTryTime = 0;
},
RealTimeSendTry: function (rec, isClose) {
var that = this;
var t1 = Date.now(), endT = 0, recImpl = Recorder.prototype;
if (this.realTimeSendTryTime == 0) {
this.realTimeSendTryTime = t1;
this.realTimeSendTryEncBusy = 0;
this.realTimeSendTryNumber = 0;
this.transferUploadNumberMax = 0;
this.realTimeSendTryChunk = null;
}
if (!isClose && t1 - this.realTimeSendTryTime < this.SendInterval) {
return; // wait until the buffered audio reaches SendInterval before transferring
}
this.realTimeSendTryTime = t1;
var number = ++this.realTimeSendTryNumber;
// reuse the SampleData function for continuous processing; the sample-rate conversion happens along the way
var chunk = Recorder.SampleData(rec.buffers, rec.srcSampleRate, this.SampleRate, this.realTimeSendTryChunk, { frameType: isClose ? "" : this.realTimeSendTryType });
// free buffers that have already been processed to support long recordings; stop() must not be called at the end because the data has been cleared
for (var i = this.realTimeSendTryChunk ? this.realTimeSendTryChunk.index : 0; i < chunk.index; i++) {
rec.buffers[i] = null;
}
this.realTimeSendTryChunk = chunk;
// no new data, or the final chunk is too small for mock transcoding
if (chunk.data.length == 0 || isClose && chunk.data.length < 2000) {
this.TransferUpload(number, null, 0, null, isClose);
return;
}
// back-pressure handling for the real-time encoding queue
if (!isClose) {
if (this.realTimeSendTryEncBusy >= 2) {
console.log("编码队列阻塞,已丢弃一帧", 1);
return;
}
}
this.realTimeSendTryEncBusy++;
// mock-transcode to mp3/wav in real time
var encStartTime = Date.now();
var recMock = Recorder({
type: this.realTimeSendTryType
, sampleRate: this.SampleRate // sample rate
, bitRate: this.testBitRate // bit rate
});
recMock.mock(chunk.data, chunk.sampleRate);
recMock.stop(function (blob, duration) {
that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--);
blob.encTime = Date.now() - encStartTime;
// push the encoded blob for transfer
that.TransferUpload(number, blob, duration, recMock, isClose);
}, function (msg) {
that.realTimeSendTryEncBusy && (that.realTimeSendTryEncBusy--);
// transcoding error? not expected to ever happen!
console.log("unexpected error: " + msg, 1);
});
},
recordClose: function () {
try {
this.rec.close(function () {
this.isCloseRecorder = true;
});
this.RealTimeSendTry(this.rec, true); // final send
} catch (ex) {
// recordClose();
}
},
recordEnd: function () {
try {
this.rec.stop(function (blob, time) {
this.recordClose();
}, function (s) {
this.recordClose();
});
} catch (ex) {
}
},
initRecorder: function () {
var that = this;
var rec = Recorder({
type: that.soundType
, bitRate: that.testBitRate
, sampleRate: that.SampleRate
, onProcess: function (buffers, level, time, sampleRate) {
that.wave.input(buffers[buffers.length - 1], level, sampleRate);
that.RealTimeSendTry(rec, false); // push into real-time processing; since the format is unknown the call is simplified: buffers and bufferSampleRate are not used because they are identical to rec.buffers
}
});
rec.open(function () {
that.wave = Recorder.FrequencyHistogramView({
elem: that.recwaveElm, lineCount: 90
, position: 0
, minHeight: 1
, stripeEnable: false
});
rec.start();
that.isCloseRecorder = false;
that.RealTimeSendTryReset(that.soundType); // reset
});
this.rec = rec;
},
TransferProcess: function (number, blobOrNull, duration, blobRec, isClose) {
}
}

@ -0,0 +1,6 @@
/*
Recording
https://github.com/xiangyuecn/Recorder
src: engine/pcm.js
*/
!function(){"use strict";Recorder.prototype.enc_pcm={stable:!0,testmsg:"pcm为未封装的原始音频数据pcm数据文件无法直接播放支持位数8位、16位填在比特率里面采样率取值无限制"},Recorder.prototype.pcm=function(e,t,r){var a=this.set,n=e.length,o=8==a.bitRate?8:16,c=new ArrayBuffer(n*(o/8)),s=new DataView(c),l=0;if(8==o)for(var p=0;p<n;p++,l++){var i=128+(e[p]>>8);s.setInt8(l,i,!0)}else for(p=0;p<n;p++,l+=2)s.setInt16(l,e[p],!0);t(new Blob([s.buffer],{type:"audio/pcm"}))},Recorder.pcm2wav=function(e,a,n){e.slice&&null!=e.type&&(e={blob:e});var o=e.sampleRate||16e3,c=e.bitRate||16;if(e.sampleRate&&e.bitRate||console.warn("pcm2wav必须提供sampleRate和bitRate"),Recorder.prototype.wav){var s=new FileReader;s.onloadend=function(){var e;if(8==c){var t=new Uint8Array(s.result);e=new Int16Array(t.length);for(var r=0;r<t.length;r++)e[r]=t[r]-128<<8}else e=new Int16Array(s.result);Recorder({type:"wav",sampleRate:o,bitRate:c}).mock(e,o).stop(function(e,t){a(e,t)},n)},s.readAsArrayBuffer(e.blob)}else n("pcm2wav必须先加载wav编码器wav.js")}}();

@ -0,0 +1,6 @@
/*
Recording
https://github.com/xiangyuecn/Recorder
src: engine/wav.js
*/
!function(){"use strict";Recorder.prototype.enc_wav={stable:!0,testmsg:"支持位数8位、16位填在比特率里面采样率取值无限制"},Recorder.prototype.wav=function(t,e,n){var r=this.set,a=t.length,o=r.sampleRate,f=8==r.bitRate?8:16,i=a*(f/8),s=new ArrayBuffer(44+i),c=new DataView(s),u=0,v=function(t){for(var e=0;e<t.length;e++,u++)c.setUint8(u,t.charCodeAt(e))},w=function(t){c.setUint16(u,t,!0),u+=2},l=function(t){c.setUint32(u,t,!0),u+=4};if(v("RIFF"),l(36+i),v("WAVE"),v("fmt "),l(16),w(1),w(1),l(o),l(o*(f/8)),w(f/8),w(f),v("data"),l(i),8==f)for(var p=0;p<a;p++,u++){var d=128+(t[p]>>8);c.setInt8(u,d,!0)}else for(p=0;p<a;p++,u+=2)c.setInt16(u,t[p],!0);e(new Blob([c.buffer],{type:"audio/wav"}))}}();

@ -0,0 +1,6 @@
/*
Recording
https://github.com/xiangyuecn/Recorder
src: extensions/frequency.histogram.view.js
*/
!function(){"use strict";var t=function(t){return new e(t)},e=function(t){var e=this,r={scale:2,fps:20,lineCount:30,widthRatio:.6,spaceWidth:0,minHeight:0,position:-1,mirrorEnable:!1,stripeEnable:!0,stripeHeight:3,stripeMargin:6,fallDuration:1e3,stripeFallDuration:3500,linear:[0,"rgba(0,187,17,1)",.5,"rgba(255,215,0,1)",1,"rgba(255,102,0,1)"],stripeLinear:null,shadowBlur:0,shadowColor:"#bbb",stripeShadowBlur:-1,stripeShadowColor:"",onDraw:function(t,e){}};for(var a in t)r[a]=t[a];e.set=t=r;var i=t.elem;i&&("string"==typeof i?i=document.querySelector(i):i.length&&(i=i[0])),i&&(t.width=i.offsetWidth,t.height=i.offsetHeight);var o=t.scale,l=t.width*o,n=t.height*o,h=e.elem=document.createElement("div"),s=["","transform-origin:0 0;","transform:scale("+1/o+");"];h.innerHTML='<div style="width:'+t.width+"px;height:"+t.height+'px;overflow:hidden"><div style="width:'+l+"px;height:"+n+"px;"+s.join("-webkit-")+s.join("-ms-")+s.join("-moz-")+s.join("")+'"><canvas/></div></div>';var f=e.canvas=h.querySelector("canvas");e.ctx=f.getContext("2d");if(f.width=l,f.height=n,i&&(i.innerHTML="",i.appendChild(h)),!Recorder.LibFFT)throw new Error("需要lib.fft.js支持");e.fft=Recorder.LibFFT(1024),e.lastH=[],e.stripesH=[]};e.prototype=t.prototype={genLinear:function(t,e,r,a){for(var i=t.createLinearGradient(0,r,0,a),o=0;o<e.length;)i.addColorStop(e[o++],e[o++]);return i},input:function(t,e,r){var a=this;a.sampleRate=r,a.pcmData=t,a.pcmPos=0,a.inputTime=Date.now(),a.schedule()},schedule:function(){var t=this,e=t.set,r=Math.floor(1e3/e.fps);t.timer||(t.timer=setInterval(function(){t.schedule()},r));var a=Date.now(),i=t.drawTime||0;if(a-t.inputTime>1.3*e.stripeFallDuration)return clearInterval(t.timer),void(t.timer=0);if(!(a-i<r)){t.drawTime=a;for(var o=t.fft.bufferSize,l=t.pcmData,n=t.pcmPos,h=new Int16Array(o),s=0;s<o&&n<l.length;s++,n++)h[s]=l[n];t.pcmPos=n;var f=t.fft.transform(h);t.draw(f,t.sampleRate)}},draw:function(t,e){var r=this,a=r.set,i=r.ctx,o=a.scale,l=a.width*o,n=a.height*o,h=a.lineCount,s=r.fft.bufferSize,f=a.position,d=Math.abs(a.position),c=1==f?0:n,p=n;d<1&&(c=p/=2,p=Math.floor(p*(1+d)),c=Math.floor(0<f?c*(1-d):c*(1+d)));for(var u=r.lastH,v=r.stripesH,w=Math.ceil(p/(a.fallDuration/(1e3/a.fps))),g=Math.ceil(p/(a.stripeFallDuration/(1e3/a.fps))),m=a.stripeMargin*o,M=1<<(Math.round(Math.log(s)/Math.log(2)+3)<<1),b=Math.log(M)/Math.log(10),L=20*Math.log(32767)/Math.log(10),y=s/2,S=Math.min(y,Math.floor(5e3*y/(e/2))),C=S==y,H=C?h:Math.round(.8*h),R=S/H,D=C?0:(y-S)/(h-H),x=0,F=0;F<h;F++){var T=Math.ceil(x);x+=F<H?R:D;for(var B=Math.min(Math.ceil(x),y),E=0,j=T;j<B;j++)E=Math.max(E,Math.abs(t[j]));var I=M<E?Math.floor(17*(Math.log(E)/Math.log(10)-b)):0,q=p*Math.min(I/L,1);u[F]=(u[F]||0)-w,q<u[F]&&(q=u[F]),q<0&&(q=0),u[F]=q;var z=v[F]||0;if(q&&z<q+m)v[F]=q+m;else{var P=z-g;P<0&&(P=0),v[F]=P}}i.clearRect(0,0,l,n);var W=r.genLinear(i,a.linear,c,c-p),k=a.stripeLinear&&r.genLinear(i,a.stripeLinear,c,c-p)||W,A=r.genLinear(i,a.linear,c,c+p),G=a.stripeLinear&&r.genLinear(i,a.stripeLinear,c,c+p)||A;i.shadowBlur=a.shadowBlur*o,i.shadowColor=a.shadowColor;var V=a.mirrorEnable,J=V?2*h-1:h,K=a.widthRatio,N=a.spaceWidth*o;0!=N&&(K=(l-N*(J+1))/l);for(var O=Math.max(1*o,Math.floor(l*K/J)),Q=(l-J*O)/(J+1),U=a.minHeight*o,X=V?l/2-(Q+O/2):0,Y=(F=0,X);F<h;F++)Y+=Q,$=Math.floor(Y),q=Math.max(u[F],U),0!=c&&(_=c-q,i.fillStyle=W,i.fillRect($,_,O,q)),c!=n&&(i.fillStyle=A,i.fillRect($,c,O,q)),Y+=O;if(a.stripeEnable){var Z=a.stripeShadowBlur;i.shadowBlur=(-1==Z?a.shadowBlur:Z)*o,i.shadowColor=a.stripeShadowColor||a.shadowColor;var 
$,_,tt=a.stripeHeight*o;for(F=0,Y=X;F<h;F++)Y+=Q,$=Math.floor(Y),q=v[F],0!=c&&((_=c-q-tt)<0&&(_=0),i.fillStyle=k,i.fillRect($,_,O,tt)),c!=n&&(n<(_=c+q)+tt&&(_=n-tt),i.fillStyle=G,i.fillRect($,_,O,tt)),Y+=O}if(V){var et=Math.floor(l/2);i.save(),i.scale(-1,1),i.drawImage(r.canvas,Math.ceil(l/2),0,et,n,-et,0,et,n),i.restore()}a.onDraw(t,e)}},Recorder.FrequencyHistogramView=t}();

@ -0,0 +1,6 @@
/*
Recording
https://github.com/xiangyuecn/Recorder
src: extensions/lib.fft.js
*/
Recorder.LibFFT=function(r){"use strict";var s,v,d,l,F,b,g,m;return function(r){var o,t,a,f;for(s=Math.round(Math.log(r)/Math.log(2)),d=((v=1<<s)<<2)*Math.sqrt(2),l=[],F=[],b=[0],g=[0],m=[],o=0;o<v;o++){for(a=o,f=t=0;t!=s;t++)f<<=1,f|=1&a,a>>>=1;m[o]=f}var n,u=2*Math.PI/v;for(o=(v>>1)-1;0<o;o--)n=o*u,g[o]=Math.cos(n),b[o]=Math.sin(n)}(r),{transform:function(r){var o,t,a,f,n,u,e,h,M=1,i=s-1;for(o=0;o!=v;o++)l[o]=r[m[o]],F[o]=0;for(o=s;0!=o;o--){for(t=0;t!=M;t++)for(n=g[t<<i],u=b[t<<i],a=t;a<v;a+=M<<1)e=n*l[f=a+M]-u*F[f],h=n*F[f]+u*l[f],l[f]=l[a]-e,F[f]=F[a]-h,l[a]+=e,F[a]+=h;M<<=1,i--}t=v>>1;var c=new Float64Array(t);for(n=-(u=d),o=t;0!=o;o--)e=l[o],h=F[o],c[o-1]=n<e&&e<u&&n<h&&h<u?0:Math.round(e*e+h*h);return c},bufferSize:v}};

@ -0,0 +1,155 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>PaddleSpeech Serving - Real-time Speech Transcription</title>
<link rel="shortcut icon" href="./static/paddle.ico">
<script src="../static/js/jquery-3.2.1.min.js"></script>
<script src="../static/js/recorder/recorder-core.js"></script>
<script src="../static/js/recorder/extensions/lib.fft.js"></script>
<script src="../static/js/recorder/extensions/frequency.histogram.view.js"></script>
<script src="../static/js/recorder/engine/pcm.js"></script>
<script src="../static/js/SoundRecognizer.js"></script>
<link rel="stylesheet" href="../static/css/style.css">
<link rel="stylesheet" href="../static/css/font-awesome.min.css">
</head>
<body>
<div class="asr-content">
<div class="audio-banner">
<div class="weaper">
<div class="text-content">
<p><span class="title">PaddleSpeech Serving简介</span></p>
<p class="con-container">
<span class="con">PaddleSpeech 是基于飞桨 PaddlePaddle 的语音方向的开源模型库用于语音和音频中的各种关键任务的开发。PaddleSpeech Serving是基于python + fastapi 的语音算法模型的C/S类型后端服务旨在统一paddle speech下的各语音算子来对外提供后端服务。</span>
</p>
</div>
<div class="img-con">
<img src="../static/image/PaddleSpeech_logo.png" alt="" />
</div>
</div>
</div>
<div class="audio-experience">
<div class="asr-box">
<h2>Demo</h2>
<div id="client-word-recorder" style="position: relative;">
<div class="pd">
<div style="text-align:center;height:20px;width:100%;
border:0px solid #bcbcbc;color:#000;box-sizing: border-box;display:inline-block"
class="recwave">
</div>
</div>
</div>
<div class="voice-container">
<div class="voice-input">
<span>WebSocket URL</span>
<input type="text" id="socketUrl" class="websocket-url" value="ws://127.0.0.1:8091/ws/asr"
placeholder="Enter the server address, e.g. ws://127.0.0.1:8091/ws/asr">
<div class="start-voice">
<button type="primary" id="beginBtn" class="voice-btn">
<span class="fa fa-microphone"> 开始识别</span>
</button>
<button type="primary" id="endBtn" class="voice-btn end">
<span class="fa fa-microphone-slash"> 结束识别</span>
</button>
<div id="timeBox" class="time-box flex-display-1">
<span class="total-time">识别中,<i id="timeCount"></i> 秒后自动停止识别</span>
</div>
</div>
</div>
<div class="voice">
<div class="result-text" id="resultPanel">此处显示识别结果</div>
</div>
</div>
</div>
</div>
</div>
<script>
var wenetWs = null
var timeLoop = null
var result = ""
$(document).ready(function () {
$('#beginBtn').on('click', startRecording)
$('#endBtn').on('click', stopRecording)
})
function openWebSocket(url) {
if ("WebSocket" in window) {
wenetWs = new WebSocket(url)
wenetWs.onopen = function () {
console.log("Websocket 连接成功,开始识别")
wenetWs.send(JSON.stringify({
"signal": "start"
}))
}
wenetWs.onmessage = function (_msg) { parseResult(_msg.data) }
wenetWs.onclose = function () {
console.log("WebSocket 连接断开")
}
wenetWs.onerror = function () { console.log("WebSocket connection failed") }
}
}
function parseResult(data) {
var data = JSON.parse(data)
var result = data.asr_results
console.log(result)
$("#resultPanel").html(result)
}
function TransferUpload(number, blobOrNull, duration, blobRec, isClose) {
if (blobOrNull) {
var blob = blobOrNull
var encTime = blob.encTime
var reader = new FileReader()
reader.onloadend = function () { wenetWs.send(reader.result) }
reader.readAsArrayBuffer(blob)
}
}
function startRecording() {
// Check socket url
var socketUrl = $('#socketUrl').val()
if (!socketUrl.trim()) {
alert('Please enter the WebSocket server address, e.g. ws://127.0.0.1:8091/ws/asr')
$('#socketUrl').focus()
return
}
// init recorder
SoundRecognizer.init({
soundType: 'pcm',
sampleRate: 16000,
recwaveElm: '.recwave',
translerCallBack: TransferUpload
})
openWebSocket(socketUrl)
// Change button state
$('#beginBtn').hide()
$('#endBtn, #timeBox').addClass('show')
// Start countdown
var seconds = 180
$('#timeCount').text(seconds)
timeLoop = setInterval(function () {
seconds--
$('#timeCount').text(seconds)
if (seconds === 0) {
stopRecording()
}
}, 1000)
}
function stopRecording() {
wenetWs.send(JSON.stringify({ "signal": "end" }))
SoundRecognizer.recordClose()
$('#endBtn').add($('#timeBox')).removeClass('show')
$('#beginBtn').show()
$('#timeCount').text('')
clearInterval(timeLoop)
}
</script>
</body>
</html>
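
The page above illustrates the streaming protocol: send a JSON `{"signal": "start"}` frame, stream raw 16 kHz / 16-bit mono PCM as binary frames, then send `{"signal": "end"}`; the server pushes back JSON frames carrying an `asr_results` field. A minimal Python client following the same protocol might look like the sketch below; the `websockets` package, the wav file path, and the one-result-per-chunk assumption (suggested by the page's onmessage handler) are all assumptions, not part of this PR.

```python
import asyncio
import json
import wave

import websockets  # assumption: pip install websockets


async def transcribe(url="ws://127.0.0.1:8091/ws/asr", wav_path="zh_16k.wav"):
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"signal": "start"}))
        with wave.open(wav_path, "rb") as f:
            # expect 16 kHz, 16-bit mono PCM, as the demo page records
            assert f.getframerate() == 16000 and f.getsampwidth() == 2
            while True:
                # ~300 ms of audio per binary frame, mirroring SendInterval
                data = f.readframes(4800)
                if not data:
                    break
                await ws.send(data)
                # assumption: the server pushes one partial result per chunk
                print(json.loads(await ws.recv()).get("asr_results"))
        await ws.send(json.dumps({"signal": "end"}))


asyncio.run(transcribe())
```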

@ -38,9 +38,7 @@ def get_predictor(args, filed='am'):
config.enable_use_gpu(100, 0)
elif args.device == "cpu":
config.disable_gpu()
# This line must be commented for fastspeech2, if not, it will OOM
if model_name != 'fastspeech2':
config.enable_memory_optim()
config.enable_memory_optim()
predictor = inference.create_predictor(config)
return predictor

@ -70,8 +70,15 @@ def ort_predict(args):
# am warmup
for T in [27, 38, 54]:
data = np.random.randint(1, 266, size=(T, ))
am_sess.run(None, {"text": data})
am_input_feed = {}
if am_name == 'fastspeech2':
phone_ids = np.random.randint(1, 266, size=(T, ))
am_input_feed.update({'text': phone_ids})
elif am_name == 'speedyspeech':
phone_ids = np.random.randint(1, 92, size=(T, ))
tone_ids = np.random.randint(1, 5, size=(T, ))
am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
am_sess.run(None, input_feed=am_input_feed)
# voc warmup
for T in [227, 308, 544]:
@ -81,14 +88,20 @@ def ort_predict(args):
N = 0
T = 0
am_input_feed = {}
for example in test_dataset:
utt_id = example['utt_id']
phone_ids = example["text"]
if am_name == 'fastspeech2':
phone_ids = example["text"]
am_input_feed.update({'text': phone_ids})
elif am_name == 'speedyspeech':
phone_ids = example["phones"]
tone_ids = example["tones"]
am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
with timer() as t:
mel = am_sess.run(output_names=None, input_feed={'text': phone_ids})
mel = am_sess.run(output_names=None, input_feed=am_input_feed)
mel = mel[0]
wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
N += len(wav[0])
T += t.elapse
speed = len(wav[0]) / t.elapse
@ -110,9 +123,7 @@ def parse_args():
'--am',
type=str,
default='fastspeech2_csmsc',
choices=[
'fastspeech2_csmsc',
],
choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
help='Choose acoustic model type of tts task.')
# voc

@ -68,39 +68,58 @@ def ort_predict(args):
# vocoder
voc_sess = get_sess(args, filed='voc')
# frontend warmup
# Loading the model costs 0.5+ seconds
if args.lang == 'zh':
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
else:
print("lang should in be 'zh' here!")
# am warmup
for T in [27, 38, 54]:
data = np.random.randint(1, 266, size=(T, ))
am_sess.run(None, {"text": data})
am_input_feed = {}
if am_name == 'fastspeech2':
phone_ids = np.random.randint(1, 266, size=(T, ))
am_input_feed.update({'text': phone_ids})
elif am_name == 'speedyspeech':
phone_ids = np.random.randint(1, 92, size=(T, ))
tone_ids = np.random.randint(1, 5, size=(T, ))
am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
am_sess.run(None, input_feed=am_input_feed)
# voc warmup
for T in [227, 308, 544]:
data = np.random.rand(T, 80).astype("float32")
voc_sess.run(None, {"logmel": data})
voc_sess.run(None, input_feed={"logmel": data})
print("warm up done!")
# frontend warmup
# Loading the model costs 0.5+ seconds
if args.lang == 'zh':
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
else:
print("lang should in be 'zh' here!")
N = 0
T = 0
merge_sentences = True
get_tone_ids = False
am_input_feed = {}
if am_name == 'speedyspeech':
get_tone_ids = True
for utt_id, sentence in sentences:
with timer() as t:
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence, merge_sentences=merge_sentences)
sentence,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
else:
print("lang should in be 'zh' here!")
# merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0].numpy()
mel = am_sess.run(output_names=None, input_feed={'text': phone_ids})
if am_name == 'fastspeech2':
am_input_feed.update({'text': phone_ids})
elif am_name == 'speedyspeech':
tone_ids = tone_ids[0].numpy()
am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
mel = am_sess.run(output_names=None, input_feed=am_input_feed)
mel = mel[0]
wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
@ -125,9 +144,7 @@ def parse_args():
'--am',
type=str,
default='fastspeech2_csmsc',
choices=[
'fastspeech2_csmsc',
],
choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")

@ -68,13 +68,15 @@ def evaluate(args):
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
if am_name == 'tacotron2':
merge_sentences = True
get_tone_ids = False
if am_name == 'speedyspeech':
get_tone_ids = True
N = 0
T = 0
for utt_id, sentence in sentences:
with timer() as t:
get_tone_ids = False
if am_name == 'speedyspeech':
get_tone_ids = True
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence,

@ -667,8 +667,8 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing(bool, optional): Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spk_emb(Tensor, optional): speaker embedding vector (spk_embed_dim,). (Default value = None)
spk_id(Tensor, optional(int64), optional): Batch of padded spk ids (1,). (Default value = None)
tone_id(Tensor, optional(int64), optional): Batch of padded tone ids (T,). (Default value = None)
spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
tone_id(Tensor, optional(int64), optional): tone ids (T,). (Default value = None)
Returns:
@ -751,7 +751,6 @@ class FastSpeech2(nn.Layer):
Returns:
"""
if self.tone_embed_integration_type == "add":
# apply projection and then add to hidden states

@ -11,17 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import paddle
from paddle import nn
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
class ResidualBlock(nn.Layer):
def __init__(self, channels, kernel_size, dilation, n=2):
def __init__(self,
channels: int=128,
kernel_size: int=3,
dilation: int=3,
n: int=2):
"""SpeedySpeech encoder module.
Args:
channels (int, optional): Feature size of the residual output(and also the input).
kernel_size (int, optional): Kernel size of the 1D convolution.
dilation (int, optional): Dilation of the 1D convolution.
n (int): Number of blocks.
"""
super().__init__()
total_pad = (dilation * (kernel_size - 1))
begin = total_pad // 2
end = total_pad - begin
# remove padding='same' here, because ONNX doesn't support dilation together with 'same' padding
blocks = [
nn.Sequential(
nn.Conv1D(
@ -29,14 +47,20 @@ class ResidualBlock(nn.Layer):
channels,
kernel_size,
dilation=dilation,
padding="same",
data_format="NLC"),
# make sure output T == input T
padding=((0, 0), (0, 0), (begin, end))),
nn.ReLU(),
nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
nn.BatchNorm1D(channels), ) for _ in range(n)
]
self.blocks = nn.Sequential(*blocks)
def forward(self, x):
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, hidden_size, Tmax).
Returns:
Tensor: The residual output (B, hidden_size, Tmax).
"""
return x + self.blocks(x)
@ -62,7 +86,15 @@ class TextEmbedding(nn.Layer):
tone_vocab_size, tone_embedding_size, tone_padding_idx)
self.concat = concat
def forward(self, text, tone=None):
def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
Returns:
Tensor: The residual output (B, Tmax, embedding_size).
"""
text_embed = self.text_embedding(text)
if tone is None:
return text_embed
@ -75,13 +107,24 @@ class TextEmbedding(nn.Layer):
class SpeedySpeechEncoder(nn.Layer):
"""SpeedySpeech encoder module.
Args:
vocab_size (int): Dimension of the inputs.
tone_size (Optional[int]): Number of tones.
hidden_size (int): Number of encoder hidden units.
kernel_size (int): Kernel size of encoder.
dilations (List[int]): Dilations of encoder.
spk_num (Optional[int]): Number of speakers.
"""
def __init__(self,
vocab_size,
tone_size,
hidden_size,
kernel_size,
dilations,
vocab_size: int,
tone_size: int,
hidden_size: int=128,
kernel_size: int=3,
dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
spk_num=None):
super().__init__()
self.embedding = TextEmbedding(
vocab_size,
@ -109,34 +152,71 @@ class SpeedySpeechEncoder(nn.Layer):
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
nn.ReLU(),
nn.BatchNorm1D(hidden_size, data_format="NLC"),
nn.Linear(hidden_size, hidden_size), )
def forward(self, text, tones, spk_id=None):
nn.BatchNorm1D(hidden_size), )
self.linear = nn.Linear(hidden_size, hidden_size)
def forward(self,
text: paddle.Tensor,
tones: paddle.Tensor,
spk_id: paddle.Tensor=None):
"""Encoder input sequence.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tensor, optional(int64)): Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, Tmax, hidden_size).
"""
embedding = self.embedding(text, tones)
if self.spk_emb:
embedding += self.spk_emb(spk_id).unsqueeze(1)
embedding = self.prenet(embedding)
x = self.res_blocks(embedding)
x = self.res_blocks(embedding.transpose([0, 2, 1])).transpose([0, 2, 1])
# (B, T, dim)
x = embedding + self.postnet1(x)
x = self.postnet2(x)
x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return x
class DurationPredictor(nn.Layer):
def __init__(self, hidden_size):
def __init__(self, hidden_size: int=128):
super().__init__()
self.layers = nn.Sequential(
ResidualBlock(hidden_size, 4, 1, n=1),
ResidualBlock(hidden_size, 3, 1, n=1),
ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))
ResidualBlock(hidden_size, 1, 1, n=1), )
self.linear = nn.Linear(hidden_size, 1)
def forward(self, x):
return paddle.squeeze(self.layers(x), -1)
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, Tmax, hidden_size).
Returns:
Tensor: Batch of predicted durations in log domain (B, Tmax).
"""
x = self.layers(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return paddle.squeeze(x, -1)
class SpeedySpeechDecoder(nn.Layer):
def __init__(self, hidden_size, output_size, kernel_size, dilations):
def __init__(self,
hidden_size: int=128,
output_size: int=80,
kernel_size: int=3,
dilations: List[int]=[
1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1
]):
"""SpeedySpeech decoder module.
Args:
hidden_size (int): Number of decoder hidden units.
kernel_size (int): Kernel size of decoder.
output_size (int): Dimension of the outputs.
dilations (List[int]): Dilations of decoder.
"""
super().__init__()
res_blocks = [
ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
@ -144,14 +224,21 @@ class SpeedySpeechDecoder(nn.Layer):
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
ResidualBlock(hidden_size, kernel_size, 1, n=2),
nn.Linear(hidden_size, output_size))
self.postnet2 = ResidualBlock(hidden_size, kernel_size, 1, n=2)
self.linear = nn.Linear(hidden_size, output_size)
def forward(self, x):
xx = self.res_blocks(x)
"""Decoder input sequence.
Args:
x(Tensor): Input tensor (B, time, hidden_size).
Returns:
Tensor: Output tensor (B, time, output_size).
"""
xx = self.res_blocks(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = x + self.postnet1(xx)
x = self.postnet2(x)
x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return x
@ -159,17 +246,35 @@ class SpeedySpeech(nn.Layer):
def __init__(
self,
vocab_size,
encoder_hidden_size,
encoder_kernel_size,
encoder_dilations,
duration_predictor_hidden_size,
decoder_hidden_size,
decoder_output_size,
decoder_kernel_size,
decoder_dilations,
tone_size=None,
spk_num=None,
init_type: str="xavier_uniform", ):
encoder_hidden_size: int=128,
encoder_kernel_size: int=3,
encoder_dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
duration_predictor_hidden_size: int=128,
decoder_hidden_size: int=128,
decoder_output_size: int=80,
decoder_kernel_size: int=3,
decoder_dilations: List[
int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
tone_size: int=None,
spk_num: int=None,
init_type: str="xavier_uniform",
positional_dropout_rate: int=0.1):
"""Initialize SpeedySpeech module.
Args:
vocab_size (int): Dimension of the inputs.
encoder_hidden_size (int): Number of encoder hidden units.
encoder_kernel_size (int): Kernel size of encoder.
encoder_dilations (List[int]): Dilations of encoder.
duration_predictor_hidden_size (int): Number of duration predictor hidden units.
decoder_hidden_size (int): Number of decoder hidden units.
decoder_kernel_size (int): Kernel size of decoder.
decoder_dilations (List[int]): Dilations of decoder.
decoder_output_size (int): Dimension of the outputs.
tone_size (Optional[int]): Number of tones.
spk_num (Optional[int]): Number of speakers.
init_type (str): How to initialize transformer parameters.
"""
super().__init__()
# initialize parameters
@ -181,6 +286,8 @@ class SpeedySpeech(nn.Layer):
duration_predictor = DurationPredictor(duration_predictor_hidden_size)
decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
decoder_kernel_size, decoder_dilations)
self.position_enc = ScaledPositionalEncoding(encoder_hidden_size,
positional_dropout_rate)
self.encoder = encoder
self.duration_predictor = duration_predictor
@ -190,7 +297,22 @@ class SpeedySpeech(nn.Layer):
nn.initializer.set_global_initializer(None)
def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
def forward(self,
text: paddle.Tensor,
tones: paddle.Tensor,
durations: paddle.Tensor,
spk_id: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
durations(Tensor(int64)): Batch of padded durations (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tensor, optional(int64)): Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, T_frames, decoder_output_size).
Tensor: Predicted durations (B, Tmax).
"""
# input of embedding must be int64
text = paddle.cast(text, 'int64')
tones = paddle.cast(tones, 'int64')
@ -198,23 +320,30 @@ class SpeedySpeech(nn.Layer):
spk_id = paddle.cast(spk_id, 'int64')
durations = paddle.cast(durations, 'int64')
encodings = self.encoder(text, tones, spk_id)
pred_durations = self.duration_predictor(encodings.detach())
# expand encodings
durations_to_expand = durations
encodings = self.length_regulator(encodings, durations_to_expand)
encodings = self.position_enc(encodings)
# decode
# remove positional encoding here
_, t_dec, feature_size = encodings.shape
encodings += sinusoid_position_encoding(t_dec, feature_size)
decoded = self.decoder(encodings)
return decoded, pred_durations
def inference(self, text, tones=None, durations=None, spk_id=None):
# text: [T]
# tones: [T]
def inference(self,
text: paddle.Tensor,
tones: paddle.Tensor=None,
durations: paddle.Tensor=None,
spk_id: paddle.Tensor=None):
"""Generate the sequence of features given the sequences of characters.
Args:
text(Tensor(int64)): Input sequence of characters (T,).
tones(Tensor, optional(int64)): Batch of padded tone ids (T, ).
durations(Tensor, optional (int64)): Groundtruth of duration (T,).
spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
Returns:
Tensor: logmel (T, decoder_output_size).
"""
# input of embedding must be int64
text = paddle.cast(text, 'int64')
text = text.unsqueeze(0)
@ -233,10 +362,7 @@ class SpeedySpeech(nn.Layer):
durations_to_expand = durations
encodings = self.length_regulator(
encodings, durations_to_expand, is_inference=True)
shape = paddle.shape(encodings)
t_dec, feature_size = shape[1], shape[2]
encodings += sinusoid_position_encoding(t_dec, feature_size)
encodings = self.position_enc(encodings)
decoded = self.decoder(encodings)
return decoded[0]

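In both forward and inference, this diff swaps the manual sinusoid_position_encoding call for a ScaledPositionalEncoding module applied right after length regulation. A self-contained sketch of the idea — a learnable scale on the classic sinusoid table — assuming nothing beyond standard Paddle; this is not the exact PaddleSpeech class:

import math

import paddle
import paddle.nn as nn

class ScaledSinusoidPE(nn.Layer):
    """Sinusoidal positional encoding with a learnable scale alpha;
    a sketch of the idea behind ScaledPositionalEncoding."""

    def __init__(self, d_model: int, dropout_rate: float = 0.1):
        super().__init__()
        self.d_model = d_model
        # alpha lets training rescale the encoding relative to the features
        self.alpha = self.create_parameter(
            shape=[1], default_initializer=nn.initializer.Constant(1.0))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        # x: (B, T, d_model); build the sin/cos table for the current length T
        t = x.shape[1]
        pos = paddle.arange(t, dtype="float32").unsqueeze(1)          # (T, 1)
        div = paddle.exp(
            paddle.arange(0, self.d_model, 2, dtype="float32")
            * (-math.log(10000.0) / self.d_model))                    # (d/2,)
        pe = paddle.concat(
            [paddle.sin(pos * div).unsqueeze(2),
             paddle.cos(pos * div).unsqueeze(2)], axis=2)
        pe = pe.reshape([t, self.d_model])          # interleave sin and cos
        return self.dropout(x + self.alpha * pe.unsqueeze(0))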
@ -86,7 +86,7 @@ class LengthRegulator(nn.Layer):
M[:, i] = m - init
init = m
M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc])
M = M[1:, :, :]
M = M[1:t_dec_1, :, :]
M = paddle.transpose(M, (1, 0, 2))
encodings = paddle.matmul(M, encodings)
return encodings

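The slice change above (M[1:t_dec_1] instead of M[1:]) makes the expansion matrix's shape explicit, which likely helps static-graph export; functionally, the matmul with M simply repeats each encoder frame by its duration. A readable loop-based sketch of the same expansion (illustration only; batch padding is omitted):

import paddle

def expand_by_durations(encodings: paddle.Tensor,
                        durations: paddle.Tensor) -> paddle.Tensor:
    """encodings: (B, T_enc, C); durations: (B, T_enc) int64.
    Repeats frame t of each sample durations[b, t] times, which is what
    the one-hot matmul with M computes in a single shot."""
    out = []
    for b in range(encodings.shape[0]):
        frames = []
        for t in range(encodings.shape[1]):
            n = int(durations[b, t])
            if n > 0:
                # tile one (1, C) frame n times along the time axis
                frames.append(paddle.tile(encodings[b, t:t + 1], [n, 1]))
        out.append(paddle.concat(frames, axis=0))
    # real batches must be padded to a common decoder length before stacking
    return paddle.stack(out, axis=0)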
@ -30,7 +30,7 @@ class WaveNetResidualBlock(nn.Layer):
Args:
kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
residual_channels (int, optional): Feature size of the resiaudl output(and also the input), by default 64
residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64
gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
skip_channels (int, optional): Feature size of the skip output, by default 64
aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80

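Reading the corrected docstring, constructing the block would look roughly as follows; both the import path and the keyword names are assumptions read off the documented defaults, not verified API:

# Sketch only: import path and signature assumed from the docstring above.
from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock

block = WaveNetResidualBlock(
    kernel_size=3,          # 1D convolution kernel size
    residual_channels=64,   # residual input/output feature size
    gate_channels=128,      # gated convolution output feature size
    skip_channels=64,       # skip connection output feature size
    aux_channels=80)        # auxiliary conditioning, e.g. an 80-band mel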
@ -347,7 +347,7 @@ class TransformerEncoder(BaseEncoder):
encoder_type="transformer")
def forward(self, xs, masks):
"""Encode input sequence.
"""Encoder input sequence.
Args:
xs(Tensor): Input tensor (#batch, time, idim).
@ -355,7 +355,7 @@ class TransformerEncoder(BaseEncoder):
Returns:
Tensor: Output tensor (#batch, time, attention_dim).
Tensor:Mask tensor (#batch, 1, time).
Tensor: Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)

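The masks argument expected by forward has shape (#batch, 1, time), True on real frames. A minimal, generic helper for building it from utterance lengths (a sketch, not PaddleSpeech's own utility):

import paddle

def make_non_pad_mask(lengths: paddle.Tensor, max_len: int) -> paddle.Tensor:
    """Return a (#batch, 1, time) boolean mask: True on real frames,
    False on padding, matching the masks shape in the docstring above."""
    idx = paddle.arange(max_len).unsqueeze(0)      # (1, time)
    mask = idx < lengths.unsqueeze(1)              # (#batch, time), broadcast compare
    return mask.unsqueeze(1)                       # (#batch, 1, time)

masks = make_non_pad_mask(paddle.to_tensor([100, 73, 58]), max_len=100)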
@ -38,10 +38,10 @@ def compute_dataset_embedding(data_loader, model, mean_var_norm_emb, config,
"""compute the dataset embeddings
Args:
data_loader (_type_): _description_
model (_type_): _description_
mean_var_norm_emb (_type_): _description_
config (_type_): _description_
data_loader (paddle.io.DataLoader): the data loader over which the embeddings are computed
model (paddle.nn.Layer): the speaker verification model
mean_var_norm_emb (InputNormalization): applies the embedding mean and std normalization
config (yacs.config.CfgNode): the yaml config
"""
logger.info(
f'Computing embeddings on {data_loader.dataset.csv_path} dataset')
@ -65,6 +65,17 @@ def compute_dataset_embedding(data_loader, model, mean_var_norm_emb, config,
def compute_verification_scores(id2embedding, train_cohort, config):
"""Compute the verification trial scores
Args:
id2embedding (dict): mapping from utterance id to its embedding
train_cohort (paddle.Tensor): the cohort dataset embeddings
config (yacs.config.CfgNode): the yaml config
Returns:
the verification scores and the trial labels,
where 1 marks a target trial and 0 a nontarget trial
"""
labels = []
enroll_ids = []
test_ids = []
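The scoring itself is cosine similarity per trial, optionally followed by adaptive s-norm against the cohort. A sketch of one trial's score; the topk value and the exact s-norm variant are assumptions:

import paddle
import paddle.nn.functional as F

def score_trial(enroll_emb, test_emb, train_cohort=None, topk=200):
    """Cosine score for one (enroll, test) pair with optional adaptive
    s-norm; a sketch of the idea, not the exact implementation."""
    score = F.cosine_similarity(enroll_emb, test_emb, axis=-1)
    if train_cohort is not None:
        # similarity of each side against the imposter cohort
        e_c = F.cosine_similarity(enroll_emb, train_cohort, axis=-1)
        t_c = F.cosine_similarity(test_emb, train_cohort, axis=-1)
        # adaptive s-norm keeps only the closest imposters
        k = min(topk, train_cohort.shape[0])
        e_c = paddle.topk(e_c, k=k)[0]
        t_c = paddle.topk(t_c, k=k)[0]
        score = 0.5 * ((score - e_c.mean()) / e_c.std()
                       + (score - t_c.mean()) / t_c.std())
    return score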
@ -119,20 +130,32 @@ def compute_verification_scores(id2embedding, train_cohort, config):
def main(args, config):
"""The main process for test the speaker verification model
Args:
args (argparse.Namespace): the command line args namespace
config (yacs.config.CfgNode): the yaml config
"""
# stage0: set the training device, cpu or gpu
# if the gpu is set, paddlespeech will select a gpu according to the env CUDA_VISIBLE_DEVICES
paddle.set_device(args.device)
# set the random seed, it is a must for multiprocess training
# set the random seed, a necessary measure for multiprocess training
seed_everything(config.seed)
# stage1: build the dnn backbone model network
# we will extract the audio embedding from the backbone model
ecapa_tdnn = EcapaTdnn(**config.model)
# stage2: build the speaker verification eval instance with backbone model
# because the checkpoint dict keys carry the SpeakerIdetification prefix,
# we need to create the SpeakerIdetification instance,
# but we actually use only the backbone model to extract the audio embedding
model = SpeakerIdetification(
backbone=ecapa_tdnn, num_class=config.num_speakers)
# stage3: load the pre-trained model
# we get the last model from the epoch and save_interval
# generally, we get the last model from the epoch
args.load_checkpoint = os.path.abspath(
os.path.expanduser(args.load_checkpoint))
@ -143,7 +166,8 @@ def main(args, config):
logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
# stage4: construct the enroll and test dataloader
# We assume the enroll dataset is in {args.data_dir}/vox/csv/enroll.csv,
# and the test dataset is in {args.data_dir}/vox/csv/test.csv
enroll_dataset = CSVDataset(
os.path.join(args.data_dir, "vox/csv/enroll.csv"),
feat_type='melspectrogram',
@ -152,14 +176,14 @@ def main(args, config):
window_size=config.window_size,
hop_length=config.hop_size)
enroll_sampler = BatchSampler(
enroll_dataset, batch_size=config.batch_size,
shuffle=False) # Shuffle to make embedding normalization more robust.
enroll_dataset, batch_size=config.batch_size, shuffle=False)
enroll_loader = DataLoader(enroll_dataset,
batch_sampler=enroll_sampler,
collate_fn=lambda x: batch_feature_normalize(
x, mean_norm=True, std_norm=False),
num_workers=config.num_workers,
return_list=True,)
test_dataset = CSVDataset(
os.path.join(args.data_dir, "vox/csv/test.csv"),
feat_type='melspectrogram',
@ -167,7 +191,6 @@ def main(args, config):
n_mels=config.n_mels,
window_size=config.window_size,
hop_length=config.hop_size)
test_sampler = BatchSampler(
test_dataset, batch_size=config.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset,
@ -180,16 +203,17 @@ def main(args, config):
model.eval()
# stage6: global embedding norm to improve the performance
# we create an InputNormalization instance to apply the embedding mean and std norm
logger.info(f"global embedding norm: {config.global_embedding_norm}")
# stage7: Compute embeddings of audios in enrol and test dataset from model.
if config.global_embedding_norm:
mean_var_norm_emb = InputNormalization(
norm_type="global",
mean_norm=config.embedding_mean_norm,
std_norm=config.embedding_std_norm)
# stage 7: score norm needs an imposters dataset
# we take the train dataset as the ideal imposters dataset
# and select config.n_train_snts utterances from it as the final imposters dataset
if "score_norm" in config:
logger.info(f"we will do score norm: {config.score_norm}")
train_dataset = CSVDataset(
@ -209,6 +233,7 @@ def main(args, config):
num_workers=config.num_workers,
return_list=True,)
# stage 8: Compute embeddings of audios in enroll and test dataset from model.
id2embedding = {}
# Run multiple times to make embedding normalization more stable.
logger.info("First loop for enroll and test dataset")
@ -225,7 +250,7 @@ def main(args, config):
mean_var_norm_emb.save(
os.path.join(args.load_checkpoint, "mean_var_norm_emb"))
# stage 8: Compute cosine scores.
# stage 9: Compute cosine scores.
train_cohort = None
if "score_norm" in config:
train_embeddings = {}
@ -234,11 +259,11 @@ def main(args, config):
train_embeddings)
train_cohort = paddle.stack(list(train_embeddings.values()))
# compute the scores
# stage 10: compute the scores
scores, labels = compute_verification_scores(id2embedding, train_cohort,
config)
# compute the EER and threshold
# stage 11: compute the EER and threshold
scores = paddle.to_tensor(scores)
EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
logger.info(

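For reference, the EER returned by compute_eer is the operating point where the false-accept and false-reject rates meet. A generic numpy sketch of that computation (not the PaddleSpeech implementation):

import numpy as np

def eer_sketch(labels: np.ndarray, scores: np.ndarray):
    """labels: 0/1 per trial (1 = target); scores: higher means more similar.
    Sweeps every score as a threshold and returns (EER, threshold)."""
    order = np.argsort(scores)[::-1]        # best-scoring trials first
    labels, scores = labels[order], scores[order]
    n_target = labels.sum()
    n_nontarget = len(labels) - n_target
    # accepting the top i+1 trials: cumulative false accepts / false rejects
    far = np.cumsum(1 - labels) / n_nontarget
    frr = 1 - np.cumsum(labels) / n_target
    i = np.argmin(np.abs(far - frr))
    return (far[i] + frr[i]) / 2, scores[i]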
@ -42,6 +42,12 @@ logger = Log(__name__).getlog()
def main(args, config):
"""The main process for test the speaker verification model
Args:
args (argparse.Namespace): the command line args namespace
config (yacs.config.CfgNode): the yaml config
"""
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
@ -49,11 +55,11 @@ def main(args, config):
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
local_rank = paddle.distributed.get_rank()
# set the random seed, it is a must for multiprocess training
# set the random seed, a necessary measure for multiprocess training
seed_everything(config.seed)
# stage2: data preparation, such as vox1 and vox2 data, augmentation noise data, and the pipeline
# note: some cmd must do in rank==0, so wo will refactor the data prepare code
# note: some operations must be done in rank==0
train_dataset = CSVDataset(
csv_path=os.path.join(args.data_dir, "vox/csv/train.csv"),
label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))
@ -61,12 +67,14 @@ def main(args, config):
csv_path=os.path.join(args.data_dir, "vox/csv/dev.csv"),
label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))
# we will build the augmentation pipeline process list
if config.augment:
augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
else:
augment_pipeline = []
# stage3: build the dnn backbone model network
# during speaker verification, we use the backbone model to extract the audio embedding
ecapa_tdnn = EcapaTdnn(**config.model)
# stage4: build the speaker verification train instance with backbone model
@ -77,13 +85,15 @@ def main(args, config):
# 140000 is the single-gpu step count
# so, in multi-gpu mode, we reduce the step_size to 140000//nranks to enable CyclicLRScheduler
lr_schedule = CyclicLRScheduler(
base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
base_lr=config.learning_rate,
max_lr=config.max_lr,
step_size=config.step_size // nranks)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_schedule, parameters=model.parameters())
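The division by nranks follows from data parallelism: each rank performs roughly 1/nranks of the optimizer steps per epoch, so the half-cycle of the cyclic schedule is shortened to keep the LR cycle aligned with the data actually seen. With hypothetical numbers:

# Hypothetical numbers for illustration: a step_size tuned on one GPU must
# shrink with the rank count, or the LR would never complete a full cycle.
single_gpu_step_size = 140000
nranks = 4
per_rank_step_size = single_gpu_step_size // nranks   # 35000 half-cycle steps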
# stage6: build the loss function, we now only support LogSoftmaxWrapper
criterion = LogSoftmaxWrapper(
loss_fn=AdditiveAngularMargin(margin=0.2, scale=30))
loss_fn=AdditiveAngularMargin(margin=config.margin, scale=config.scale))
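Replacing the hard-coded margin/scale with config values makes the AAM-softmax head tunable. For readers unfamiliar with the loss, a sketch of the additive-angular-margin idea (not the exact AdditiveAngularMargin code):

import paddle
import paddle.nn.functional as F

def aam_logits(cosine: paddle.Tensor, targets: paddle.Tensor,
               margin: float = 0.2, scale: float = 30.0) -> paddle.Tensor:
    """cosine: (B, num_class) cosine similarities to class centers;
    targets: (B,) int64 labels. Adds margin m to the target class angle,
    then scales, before the usual softmax cross entropy."""
    theta = paddle.acos(cosine.clip(-1.0 + 1e-7, 1.0 - 1e-7))
    onehot = F.one_hot(targets, num_classes=cosine.shape[1])
    # cos(theta + m) for the ground-truth class, cos(theta) elsewhere
    return scale * paddle.cos(theta + margin * onehot)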
# stage7: confirm training start epoch
# if a pre-trained model exists, the start epoch is determined by it
@ -225,7 +235,7 @@ def main(args, config):
print_msg += ' avg_train_cost: {:.5f} sec,'.format(
train_run_cost / config.log_interval)
print_msg += ' lr={:.4E} step/sec={:.2f} ips:{:.5f}| ETA {}'.format(
print_msg += ' lr={:.4E} step/sec={:.2f} ips={:.5f}| ETA {}'.format(
lr, timer.timing, timer.ips, timer.eta)
logger.info(print_msg)

@ -57,14 +57,14 @@ class InputNormalization:
lengths (paddle.Tensor): A batch of tensors containing the relative length of each
sentence (e.g, [0.7, 0.9, 1.0]). It is used to avoid
computing stats on zero-padded steps.
spk_ids (_type_, optional): tensor containing the ids of each speaker (e.g, [0 10 6]).
spk_ids (paddle.Tensor, optional): tensor containing the ids of each speaker (e.g, [0 10 6]).
It is used to perform per-speaker normalization when
norm_type='speaker'. Defaults to paddle.to_tensor([], dtype="float32").
Returns:
paddle.Tensor: The normalized feature or embedding
"""
N_batches = x.shape[0]
# print(f"x shape: {x.shape[1]}")
current_means = []
current_stds = []
@ -75,6 +75,9 @@ class InputNormalization:
actual_size = paddle.round(lengths[snt_id] *
x.shape[1]).astype("int32")
# computing actual time data statistics
# we extract the snt_id utterance from x;
# indexing drops the 0-axis of the resulting paddle.Tensor,
# so we need an unsqueeze operation to restore the full set of axes
current_mean, current_std = self._compute_current_stats(
x[snt_id, 0:actual_size, ...].unsqueeze(0))
current_means.append(current_mean)

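Putting this hunk together: per-utterance statistics are computed over real frames only, with the relative length recovering the true frame count. A stand-alone sketch of that computation (not the class method itself):

import paddle

def masked_mean_std(x: paddle.Tensor, rel_lengths: paddle.Tensor):
    """x: (B, T, C) zero-padded features; rel_lengths: relative lengths
    in (0, 1]. Returns per-utterance mean and std over real frames only."""
    means, stds = [], []
    for i in range(x.shape[0]):
        # recover the true frame count from the relative length
        actual = int(paddle.round(rel_lengths[i] * x.shape[1]))
        frames = x[i, :actual]        # indexing drops the batch axis...
        frames = frames.unsqueeze(0)  # ...so restore it, as the diff notes
        means.append(frames.mean(axis=1))
        stds.append(frames.std(axis=1))
    return paddle.concat(means), paddle.concat(stds)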