Merge branch 'develop' into develop

pull/1515/head
commit 6bf0d3bf57 by TianYuan, committed by GitHub

.gitignore (vendored)

@@ -2,6 +2,7 @@
*.pyc
.vscode
*log
*.wav
*.pdmodel
*.pdiparams*
*.zip
@@ -30,5 +31,8 @@ tools/OpenBLAS/
tools/Miniconda3-latest-Linux-x86_64.sh
tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
speechx/fc_patch/
*output/

@@ -196,16 +196,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl
```shell
paddlespeech cls --input input.wav
```
**Automatic Speech Recognition**
```shell
paddlespeech asr --lang zh --input input_16k.wav
```
**Speech Translation** (English to Chinese)
(not supported on Mac and Windows yet)
```shell
paddlespeech st --input input_16k.wav
```
**Text-to-Speech**
```shell
paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav
@@ -218,7 +220,16 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
**Batch Process**
```
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
```
**Shell Pipeline**
- ASR + Punctuation Restoration
```
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
```
For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
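For developers embedding PaddleSpeech instead of shelling out, the same tasks are exposed through executor classes in `paddlespeech.cli` (a rough Python sketch; the keyword names below mirror the CLI flags and may differ slightly between versions):

```python
# Rough Python-API equivalents of the one-liners above (assumed keyword
# names that mirror the CLI flags; check your installed version's signatures).
from paddlespeech.cli import ASRExecutor, TextExecutor, TTSExecutor

asr = ASRExecutor()
transcript = asr(audio_file="input_16k.wav", lang="zh")   # speech -> text

punc = TextExecutor()
print(punc(text=transcript, task="punc"))                 # restore punctuation

tts = TTSExecutor()
tts(text="你好,欢迎使用飞桨深度学习框架!", output="output.wav")  # text -> speech
```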

@@ -216,6 +216,17 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
**批处理**
```
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
```
**Shell管道**
ASR + Punc:
```
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
```
更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
> Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md) [语音合成](./docs/source/tts/quick_start.md)。
@@ -558,6 +569,7 @@ year={2021}
- 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。
- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。

@@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
paddlespeech asr --input ./zh.wav
# English
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
# Chinese ASR + Punctuation Restoration
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
```
(It doesn't matter if the package `paddlespeech-ctcdecoders` is not found; it is optional.)

@@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
paddlespeech asr --input ./zh.wav
# 英文
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
# 中文 + 标点恢复
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
```
(如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。)

@@ -1,4 +1,10 @@
#!/bin/bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
# asr
paddlespeech asr --input ./zh.wav
# asr + punc
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc

@@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas
### 3. Usage
- Command Line (Recommended)
- Chinese
The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`.
```bash
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
```
- Batch Process
```bash
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
```
- Chinese, use `SpeedySpeech` as the acoustic model
```bash
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"

@@ -24,6 +24,10 @@
```bash
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
```
- 批处理
```bash
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
```
- 中文,使用 `SpeedySpeech` 作为声学模型
```bash
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"

@@ -1,3 +1,7 @@
#!/bin/bash
# single process
paddlespeech tts --input 今天的天气不错啊
# Batch process
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts

@@ -0,0 +1,369 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a1e738e0",
"metadata": {},
"source": [
"## 获取测试的 logit 数据"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "29d3368b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hlens.npy\n",
"logits.npy\n",
"ys_lens.npy\n",
"ys_pad.npy\n"
]
}
],
"source": [
"!mkdir -p ./test_data\n",
"!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n",
"!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "240caf1d",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import time\n",
"\n",
"data_dir=\"./test_data\"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "91bad949",
"metadata": {},
"outputs": [],
"source": [
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
]
},
{
"cell_type": "markdown",
"id": "4cef2f15",
"metadata": {},
"source": [
"## 使用 torch 的 ctc loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "90612004",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.10.1+cu102'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"torch.__version__"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "00799f97",
"metadata": {},
"outputs": [],
"source": [
"def torch_ctc_loss(use_cpu):\n",
" if use_cpu:\n",
" device = torch.device(\"cpu\")\n",
" else:\n",
" device = torch.device(\"cuda\")\n",
"\n",
" reduction_type = \"sum\" \n",
"\n",
" ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n",
"\n",
" ys_hat = torch.tensor(logits_np, device = device)\n",
" ys_pad = torch.tensor(ys_pad_np, device = device)\n",
" hlens = torch.tensor(hlens_np, device = device)\n",
" ys_lens = torch.tensor(ys_lens_np, device = device)\n",
"\n",
" ys_hat = ys_hat.transpose(0, 1)\n",
" \n",
" # 开始计算时间\n",
" start_time = time.time()\n",
" ys_hat = ys_hat.log_softmax(2)\n",
" loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n",
" end_time = time.time()\n",
" \n",
" loss = loss / ys_hat.size(1)\n",
" return end_time - start_time, loss.item()"
]
},
{
"cell_type": "markdown",
"id": "ba47b5a4",
"metadata": {},
"source": [
"## 使用 paddle 的 ctc loss"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6882a06e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.2.2'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import paddle\n",
"paddle.__version__"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3cfa3b7c",
"metadata": {},
"outputs": [],
"source": [
"def paddle_ctc_loss(use_cpu): \n",
" import paddle.nn as pn\n",
" if use_cpu:\n",
" device = \"cpu\"\n",
" else:\n",
" device = \"gpu\"\n",
"\n",
" paddle.set_device(device)\n",
"\n",
" logits = paddle.to_tensor(logits_np)\n",
" ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
" hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
" ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
"\n",
" logits = logits.transpose([1,0,2])\n",
"\n",
" ctc_loss = pn.CTCLoss(reduction='sum')\n",
" # 开始计算时间\n",
" start_time = time.time()\n",
" pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n",
" end_time = time.time()\n",
" \n",
" pn_loss = pn_loss / logits.shape[1]\n",
" return end_time - start_time, pn_loss.item()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "40413ef9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU, iteration 10\n",
"torch_ctc_loss 159.17137145996094\n",
"paddle_ctc_loss 159.16574096679688\n",
"paddle average time 1.718252992630005\n",
"torch average time 0.17536230087280275\n",
"paddle time / torch time (cpu) 9.798303193320452\n",
"\n",
"GPU, iteration 10\n",
"torch_ctc_loss 159.172119140625\n",
"paddle_ctc_loss 159.17205810546875\n",
"paddle average time 0.018606925010681154\n",
"torch average time 0.0026710033416748047\n",
"paddle time / torch time (gpu) 6.966267963938231\n"
]
}
],
"source": [
"# 使用 CPU\n",
"\n",
"iteration = 10\n",
"use_cpu = True\n",
"torch_total_time = 0\n",
"paddle_total_time = 0\n",
"for _ in range(iteration):\n",
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
" torch_total_time += cost_time\n",
"for _ in range(iteration):\n",
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
" paddle_total_time += cost_time\n",
"print (\"CPU, iteration\", iteration)\n",
"print (\"torch_ctc_loss\", torch_loss)\n",
"print (\"paddle_ctc_loss\", paddle_loss)\n",
"print (\"paddle average time\", paddle_total_time / iteration)\n",
"print (\"torch average time\", torch_total_time / iteration)\n",
"print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n",
"\n",
"print (\"\")\n",
"\n",
"# 使用 GPU\n",
"\n",
"use_cpu = False\n",
"torch_total_time = 0\n",
"paddle_total_time = 0\n",
"for _ in range(iteration):\n",
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
" torch_total_time += cost_time\n",
"for _ in range(iteration):\n",
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
" paddle_total_time += cost_time\n",
"print (\"GPU, iteration\", iteration)\n",
"print (\"torch_ctc_loss\", torch_loss)\n",
"print (\"paddle_ctc_loss\", paddle_loss)\n",
"print (\"paddle average time\", paddle_total_time / iteration)\n",
"print (\"torch average time\", torch_total_time / iteration)\n",
"print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)"
]
},
{
"cell_type": "markdown",
"id": "7cdf8697",
"metadata": {},
"source": [
"## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "73fad81d",
"metadata": {},
"outputs": [],
"source": [
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2b41e45d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n",
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n",
"2022-02-25 11:34:34.144 | INFO | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n",
"loss 159.17205810546875\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n",
" format(lhs_dtype, rhs_dtype, lhs_dtype))\n"
]
}
],
"source": [
"use_cpu = False\n",
"\n",
"from paddlespeech.s2t.modules.loss import CTCLoss\n",
"\n",
"if use_cpu:\n",
" device = \"cpu\"\n",
"else:\n",
" device = \"gpu\"\n",
"\n",
"paddle.set_device(device)\n",
"\n",
"blank_id=0\n",
"reduction_type='sum'\n",
"batch_average= True\n",
"grad_norm_type='instance'\n",
"\n",
"criterion = CTCLoss(\n",
" blank=blank_id,\n",
" reduction=reduction_type,\n",
" batch_average=batch_average,\n",
" grad_norm_type=grad_norm_type)\n",
"\n",
"logits = paddle.to_tensor(logits_np)\n",
"ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
"hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
"ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
"\n",
"pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n",
"print(\"loss\", pn_ctc_loss.item())\n",
" "
]
},
{
"cell_type": "markdown",
"id": "de525d38",
"metadata": {},
"source": [
"## 结论\n",
"在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n",
"在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.87 倍\n",
"\n",
"## 其他结论\n",
"torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
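To reproduce the comparison without downloading the test arrays, the same measurement can be run on synthetic data (a minimal sketch; the shapes are made up, so only the relative timings are meaningful):

```python
import time

import numpy as np
import paddle
import torch

# synthetic batch: 16 utterances, 100 frames, 50-class vocab, label length 20
B, T, V, L = 16, 100, 50, 20
logits_np = np.random.randn(B, T, V).astype("float32")
labels_np = np.random.randint(1, V, size=(B, L))        # 0 is the blank id
hlens_np = np.full((B, ), T, dtype="int64")
ys_lens_np = np.full((B, ), L, dtype="int64")

# torch expects (T, B, V) log-probabilities
ys_hat = torch.tensor(logits_np).transpose(0, 1).log_softmax(2)
t0 = time.time()
torch_loss = torch.nn.CTCLoss(reduction="sum")(
    ys_hat, torch.tensor(labels_np), torch.tensor(hlens_np),
    torch.tensor(ys_lens_np))
print("torch", time.time() - t0, (torch_loss / B).item())

# paddle expects (T, B, V) raw logits; normalization is handled internally
logits = paddle.to_tensor(logits_np).transpose([1, 0, 2])
t0 = time.time()
paddle_loss = paddle.nn.CTCLoss(reduction="sum")(
    logits, paddle.to_tensor(labels_np, dtype="int32"),
    paddle.to_tensor(hlens_np), paddle.to_tensor(ys_lens_np))
print("paddle", time.time() - t0, (paddle_loss / B).item())
```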

@@ -225,7 +225,9 @@ optional arguments:
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
Pretrained FastSpeech2 models with no silence at the edges of audios:
- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks to [@awmmmm](https://github.com/awmmmm) for the contribution)
FastSpeech2 checkpoint contains files listed below.

@@ -0,0 +1,110 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sr
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw
fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80          # Minimum f0 for pitch extraction.
f0max: 400         # Maximum f0 for pitch extraction.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 32
num_workers: 4

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    adim: 384         # attention dimension
    aheads: 2         # number of attention heads
    elayers: 4        # number of encoder layers
    eunits: 1536      # number of encoder ff units
    dlayers: 4        # number of decoder layers
    dunits: 1536      # number of decoder ff units
    positionwise_layer_type: conv1d   # type of position-wise layer
    positionwise_conv_kernel_size: 3  # kernel size of position-wise conv layer
    duration_predictor_layers: 2      # number of layers of duration predictor
    duration_predictor_chans: 256     # number of channels of duration predictor
    duration_predictor_kernel_size: 3 # filter size of duration predictor
    postnet_layers: 5                 # number of layers of postnet
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    encoder_normalize_before: True    # whether to perform layer normalization before the input
    decoder_normalize_before: True    # whether to perform layer normalization before the input
    reduction_factor: 1               # reduction factor
    encoder_type: conformer           # encoder type
    decoder_type: conformer           # decoder type
    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
    conformer_activation_type: swish  # conformer activation type
    use_macaron_style_in_conformer: true  # whether to use macaron style in conformer
    use_cnn_in_conformer: true        # whether to use CNN in conformer
    conformer_enc_kernel_size: 7      # kernel size in CNN module of conformer-based encoder
    conformer_dec_kernel_size: 31     # kernel size in CNN module of conformer-based decoder
    init_type: xavier_uniform         # initialization type
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5         # number of conv layers in pitch predictor
    pitch_predictor_chans: 256        # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5      # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1        # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0          # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true   # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2        # number of conv layers in energy predictor
    energy_predictor_chans: 256       # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3   # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5     # dropout rate in energy predictor
    energy_embed_kernel_size: 1       # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0         # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
    spk_embed_dim: 256                # speaker embedding dimension
    spk_embed_integration_type: concat # speaker embedding integration type

###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True # whether to apply masking for padded part in loss calculation

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam           # optimizer type
    learning_rate: 0.001  # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1000
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
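Since this is plain YAML, it can be inspected or tweaked programmatically before training (a generic sketch; the filename is a placeholder, and PaddleSpeech itself typically reads such files through a yacs `CfgNode`):

```python
import yaml

# placeholder path; point this at wherever the conformer config lives
with open("conf/conformer.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["fs"], cfg["n_mels"])           # 24000 80
print(cfg["model"]["encoder_type"])       # conformer
print(cfg["optimizer"]["learning_rate"])  # 0.001
```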

@@ -10,7 +10,7 @@ Run the command below to get the results of the test.
```bash
./run.sh
```
The `avg WER` of g2p is: 0.026014352515701198
```text
,--------------------------------------------------------------------.
| | # Snt # Wrd | Corr Sub Del Ins Err S.Err |

@@ -20,5 +20,6 @@ from .cls import CLSExecutor
from .st import STExecutor
from .text import TextExecutor
from .tts import TTSExecutor
from .stats import StatsExecutor
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .infer import StatsExecutor

@@ -0,0 +1,193 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from typing import List

from prettytable import PrettyTable

from ..log import logger
from ..utils import cli_register
from ..utils import stats_wrapper

__all__ = ['StatsExecutor']

model_name_format = {
    'asr': 'Model-Language-Sample Rate',
    'cls': 'Model-Sample Rate',
    'st': 'Model-Source language-Target language',
    'text': 'Model-Task-Language',
    'tts': 'Model-Language'
}
@cli_register(
    name='paddlespeech.stats',
    description='Get speech tasks support models list.')
class StatsExecutor():
    def __init__(self):
        super(StatsExecutor, self).__init__()

        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
            choices=['asr', 'cls', 'st', 'text', 'tts'],
            help='Choose speech task.',
            required=True)
        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']

    def show_support_models(self, pretrained_models: dict):
        fields = model_name_format[self.task].split("-")
        table = PrettyTable(fields)
        for key in pretrained_models:
            table.add_row(key.split("-"))
        print(table)

    def execute(self, argv: List[str]) -> bool:
        """
        Command line entry.
        """
        parser_args = self.parser.parse_args(argv)
        self.task = parser_args.task
        if self.task not in self.task_choices:
            logger.error(
                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
            )
            return False

        elif self.task == 'asr':
            try:
                from ..asr.infer import pretrained_models
                logger.info(
                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
                return True
            except BaseException:
                logger.error("Failed to get the list of ASR pretrained models.")
                return False

        elif self.task == 'cls':
            try:
                from ..cls.infer import pretrained_models
                logger.info(
                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
                return True
            except BaseException:
                logger.error("Failed to get the list of CLS pretrained models.")
                return False

        elif self.task == 'st':
            try:
                from ..st.infer import pretrained_models
                logger.info(
                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
                return True
            except BaseException:
                logger.error("Failed to get the list of ST pretrained models.")
                return False

        elif self.task == 'text':
            try:
                from ..text.infer import pretrained_models
                logger.info(
                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
                return True
            except BaseException:
                logger.error(
                    "Failed to get the list of TEXT pretrained models.")
                return False

        elif self.task == 'tts':
            try:
                from ..tts.infer import pretrained_models
                logger.info(
                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
                return True
            except BaseException:
                logger.error("Failed to get the list of TTS pretrained models.")
                return False

    @stats_wrapper
    def __call__(
            self,
            task: str=None, ):
        """
        Python API to call an executor.
        """
        self.task = task
        if self.task not in self.task_choices:
            print(
                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
            )

        elif self.task == 'asr':
            try:
                from ..asr.infer import pretrained_models
                print(
                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of ASR pretrained models.")

        elif self.task == 'cls':
            try:
                from ..cls.infer import pretrained_models
                print(
                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of CLS pretrained models.")

        elif self.task == 'st':
            try:
                from ..st.infer import pretrained_models
                print(
                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of ST pretrained models.")

        elif self.task == 'text':
            try:
                from ..text.infer import pretrained_models
                print(
                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of TEXT pretrained models.")

        elif self.task == 'tts':
            try:
                from ..tts.infer import pretrained_models
                print(
                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of TTS pretrained models.")

@@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
    """
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batch_indices)
    batch_indices = [item for batch in batch_indices for item in batch]
    assert clipped is False

@@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
from paddlespeech.s2t.modules.loss import LabelSmoothingLoss
from paddlespeech.s2t.modules.mask import mask_finished_preds
from paddlespeech.s2t.modules.mask import mask_finished_scores
from paddlespeech.s2t.modules.mask import subsequent_mask
from paddlespeech.s2t.utils import checkpoint
from paddlespeech.s2t.utils import layer_tools
@@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer):
        device = speech.place
        # Let's assume B = batch_size and N = beam_size
        # 1. Encoder and init hypothesis
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,

@@ -14,3 +14,4 @@
from .paddlespeech_client import ASRClientExecutor
from .paddlespeech_client import TTSClientExecutor
from .paddlespeech_server import ServerExecutor
from .paddlespeech_server import ServerStatsExecutor

@@ -16,15 +16,17 @@ from typing import List

import uvicorn
from fastapi import FastAPI
from prettytable import PrettyTable
from ..executor import BaseExecutor
from ..util import cli_server_register
from ..util import stats_wrapper
from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import init_engine_pool
from paddlespeech.server.restful.api import setup_router
from paddlespeech.server.utils.config import get_config

__all__ = ['ServerExecutor', 'ServerStatsExecutor']

app = FastAPI(
    title="PaddleSpeech Serving API", description="Api", version="0.0.1")
@@ -86,3 +88,139 @@ class ServerExecutor(BaseExecutor):
        config = get_config(config_file)
        if self.init(config):
            uvicorn.run(app, host=config.host, port=config.port, debug=True)
@cli_server_register(
    name='paddlespeech_server.stats',
    description='Get the models supported by each speech task in the service.')
class ServerStatsExecutor():
    def __init__(self):
        super(ServerStatsExecutor, self).__init__()

        self.parser = argparse.ArgumentParser(
            prog='paddlespeech_server.stats', add_help=True)
        self.parser.add_argument(
            '--task',
            type=str,
            default=None,
            choices=['asr', 'tts'],
            help='Choose speech task.',
            required=True)
        self.task_choices = ['asr', 'tts']
        self.model_name_format = {
            'asr': 'Model-Language-Sample Rate',
            'tts': 'Model-Language'
        }

    def show_support_models(self, pretrained_models: dict):
        fields = self.model_name_format[self.task].split("-")
        table = PrettyTable(fields)
        for key in pretrained_models:
            table.add_row(key.split("-"))
        print(table)

    def execute(self, argv: List[str]) -> bool:
        """
        Command line entry.
        """
        parser_args = self.parser.parse_args(argv)
        self.task = parser_args.task
        if self.task not in self.task_choices:
            logger.error(
                "Please input correct speech task, choices = ['asr', 'tts']")
            return False

        elif self.task == 'asr':
            try:
                from paddlespeech.cli.asr.infer import pretrained_models
                logger.info(
                    "Here is the table of ASR pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                # show ASR static pretrained model
                from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
                logger.info(
                    "Here is the table of ASR static pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                return True
            except BaseException:
                logger.error(
                    "Failed to get the table of ASR pretrained models supported in the service."
                )
                return False

        elif self.task == 'tts':
            try:
                from paddlespeech.cli.tts.infer import pretrained_models
                logger.info(
                    "Here is the table of TTS pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                # show TTS static pretrained model
                from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
                logger.info(
                    "Here is the table of TTS static pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                return True
            except BaseException:
                logger.error(
                    "Failed to get the table of TTS pretrained models supported in the service."
                )
                return False

    @stats_wrapper
    def __call__(
            self,
            task: str=None, ):
        """
        Python API to call an executor.
        """
        self.task = task
        if self.task not in self.task_choices:
            print("Please input correct speech task, choices = ['asr', 'tts']")

        elif self.task == 'asr':
            try:
                from paddlespeech.cli.asr.infer import pretrained_models
                print(
                    "Here is the table of ASR pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                # show ASR static pretrained model
                from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
                print(
                    "Here is the table of ASR static pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print(
                    "Failed to get the table of ASR pretrained models supported in the service."
                )

        elif self.task == 'tts':
            try:
                from paddlespeech.cli.tts.infer import pretrained_models
                print(
                    "Here is the table of TTS pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)

                # show TTS static pretrained model
                from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
                print(
                    "Here is the table of TTS static pretrained models supported in the service."
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print(
                    "Failed to get the table of TTS pretrained models supported in the service."
                )
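Analogously to the client-side stats command, the new executor is exported from `paddlespeech.server.bin` (see the `__init__.py` change above), so a quick check from Python looks roughly like this; the shell form is presumably `paddlespeech_server stats --task asr`, following the `cli_server_register` name:

```python
from paddlespeech.server.bin import ServerStatsExecutor

server_stats = ServerStatsExecutor()
server_stats(task='asr')  # prints both the dynamic and static model tables
```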

@@ -63,7 +63,7 @@ class ToneSandhi():
            '扫把', '惦记'
        }
        self.must_not_neural_tone_words = {
            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
        }
        self.punc = ":,;。?!“”‘’':,;.?!"
@@ -77,7 +77,9 @@ class ToneSandhi():
        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        for j, item in enumerate(word):
            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {
                    "n", "v", "a"
            } and word not in self.must_not_neural_tone_words:
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
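The intent of the new guard, distilled into a standalone snippet (a simplified sketch, not the class method itself):

```python
# Reduplicated n./v./a. words usually take the neutral tone (tone 5) on the
# second syllable; the new set lists words that must keep their full tone.
must_not_neural_tone_words = {"人人", "虎虎"}

def neutralize_reduplication(word, finals, pos):
    for j, item in enumerate(word):
        if (j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}
                and word not in must_not_neural_tone_words):
            finals[j] = finals[j][:-1] + "5"
    return finals

print(neutralize_reduplication("奶奶", ["nai3", "nai3"], "n"))  # ['nai3', 'nai5']
print(neutralize_reduplication("人人", ["ren2", "ren2"], "n"))  # unchanged now
```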

@@ -20,7 +20,10 @@ import numpy as np
import paddle
from g2pM import G2pM
from pypinyin import lazy_pinyin
from pypinyin import load_phrases_dict
from pypinyin import load_single_dict
from pypinyin import Style
from pypinyin_dict.phrase_pinyin_data import large_pinyin

from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -41,6 +44,8 @@ class Frontend():
            self.g2pM_model = G2pM()
            self.pinyin2phone = generate_lexicon(
                with_tone=True, with_erhua=False)
        else:
            self.__init__pypinyin()
        self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
        self.not_erhua = {
            "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
@@ -62,6 +67,23 @@ class Frontend():
        for tone, id in tone_id:
            self.vocab_tones[tone] = int(id)

    def __init__pypinyin(self):
        large_pinyin.load()

        load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]})
        load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]})
        load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]})
        load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]})
        load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]})
        load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]})
        load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]})
        load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]})
        load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]})
        load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]})

        # 调整"地"字的拼音顺序
        load_single_dict({ord(u'地'): u'de,di4'})
    def _get_initials_finals(self, word: str) -> List[List[str]]:
        initials = []
        finals = []
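These overrides can be sanity-checked outside the frontend with nothing but the public pypinyin APIs imported above (a standalone sketch; the exact rendering of the custom tone strings depends on the pypinyin version):

```python
from pypinyin import lazy_pinyin, load_phrases_dict, Style
from pypinyin_dict.phrase_pinyin_data import large_pinyin

# same setup __init__pypinyin performs
large_pinyin.load()
load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]})

# 行 should now come out as hang2 in this phrase instead of xing2
print(lazy_pinyin('开户行', style=Style.TONE3, neutral_tone_with_five=True))
```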

@@ -63,7 +63,10 @@ def replace_time(match) -> str:
    result = f"{num2str(hour)}点"
    if minute.lstrip('0'):
        if int(minute) == 30:
            result += f"半"
        else:
            result += f"{_time_num2str(minute)}分"
    if second and second.lstrip('0'):
        result += f"{_time_num2str(second)}秒"
@@ -71,7 +74,10 @@ def replace_time(match) -> str:
        result += "至"
        result += f"{num2str(hour_2)}点"
        if minute_2.lstrip('0'):
            if int(minute_2) == 30:
                result += f"半"
            else:
                result += f"{_time_num2str(minute_2)}分"
        if second_2 and second_2.lstrip('0'):
            result += f"{_time_num2str(second_2)}秒"

@@ -28,7 +28,7 @@ UNITS = OrderedDict({
    8: '亿',
})
COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'

# 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
@@ -110,7 +110,7 @@ def replace_default_num(match):
# 纯小数
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
# 正整数 + 量词
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
@@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str:
    """
    number = match.group(1)
    match_2 = match.group(2)
    if match_2 == "+":
        match_2 = "多"
    match_2: str = match_2 if match_2 else ""
    quantifiers: str = match.group(3)
    number: str = num2str(number)
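A quick way to see what the widened pattern now matches (a trimmed sketch; the real `COM_QUANTIFIERS` alternation is far longer):

```python
import re

# trimmed stand-in for the module's COM_QUANTIFIERS
COM_QUANTIFIERS = '(个|所|小时)'
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)

for text in ["3个", "3+个", "24小时"]:
    m = RE_POSITIVE_QUANTIFIERS.match(text)
    # group(2) is the optional marker; the new branch maps "+" to 多
    print(text, "->", m.groups() if m else None)
```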
@@ -151,6 +153,7 @@ def replace_number(match) -> str:
# 范围表达式
# match.group(1) and match.group(8) are copy from RE_NUMBER
RE_RANGE = re.compile(
    r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')

@@ -63,11 +63,19 @@ class TextNormalizer():
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
            # 过滤掉特殊字符
            text = re.sub(r'[《》【】<=>{}()#&@“”^_|…\\]', '', text)
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
        return sentences
    def _post_replace(self, sentence: str) -> str:
        sentence = sentence.replace('/', '每')
        sentence = sentence.replace('~', '至')
        return sentence
    def normalize_sentence(self, sentence: str) -> str:
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
@@ -97,6 +105,7 @@ class TextNormalizer():
            sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
        sentence = self._post_replace(sentence)
        return sentence
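The two new cleanup steps are easy to exercise standalone (a minimal sketch mirroring the added lines; the 每/至 replacements follow the `_post_replace` reconstruction above):

```python
import re

def strip_special(text: str) -> str:
    # same character class the commit filters out before sentence splitting
    return re.sub(r'[《》【】<=>{}()#&@“”^_|…\\]', '', text)

def post_replace(sentence: str) -> str:
    # mirrors _post_replace: read "/" as 每 and "~" as 至
    return sentence.replace('/', '每').replace('~', '至')

print(strip_special('你好《世界》(测试)'))  # -> 你好世界测试
print(post_replace('3~5次/天'))            # -> 3至5次每天
```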

@@ -36,4 +36,4 @@ def repeat(N, fn):
    Returns:
        MultiSequential: Repeated model instance.
    """
    return MultiSequential(* [fn(n) for n in range(N)])

@@ -48,6 +48,7 @@ base = [
    "paddlespeech_feat",
    "praatio==5.0.0",
    "pypinyin",
    "pypinyin-dict",
    "python-dateutil",
    "pyworld",
    "resampy==0.2.2",
@@ -62,6 +63,7 @@ base = [
    "visualdl",
    "webrtcvad",
    "yacs~=0.1.8",
    "prettytable",
]

server = [

@@ -54,4 +54,4 @@ batch_size:16|30
fp_items:fp32
iteration:50
--profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile"
flags:null

@@ -54,4 +54,4 @@ batch_size:6|16
fp_items:fp32
iteration:50
--profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile"
flags:null

@@ -26,15 +26,19 @@ if [ ${MODE} = "benchmark_train" ];then
    curPath=$(readlink -f "$(dirname "$0")")
    echo "curPath:"${curPath}
    cd ${curPath}/../..
    apt-get install libsndfile1
    pip install pytest-runner kaldiio setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple
    pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
    cd -
    if [ ${model_name} == "conformer" ]; then
        # set the URL for aishell_tiny dataset
        URL=${conformer_data_URL:-"None"}
        echo "URL:"${URL}
        if [ ${URL} == 'None' ];then
            echo "please contact author to get the URL.\n"
            exit
        else
            wget -P ${curPath}/../../dataset/aishell/ ${URL}
        fi
        sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py
        cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/
@@ -42,6 +46,7 @@ if [ ${MODE} = "benchmark_train" ];then
        source path.sh
        # download audio data
        sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh
        sed -i "s#python3#python#g" ./local/data.sh
        bash ./local/data.sh || exit -1
        if [ $? -ne 0 ]; then
            exit 1
@@ -56,7 +61,6 @@ if [ ${MODE} = "benchmark_train" ];then
        sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml
        sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml
        sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml
    fi
    if [ ${model_name} == "pwgan" ]; then
@@ -73,4 +77,4 @@ if [ ${MODE} = "benchmark_train" ];then
        python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
    fi
fi

@@ -11,11 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pickle
import unittest

import numpy as np
import paddle
from paddle import inference

from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
@@ -182,5 +186,77 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
            paddle.allclose(final_state_c_box, final_state_c_box_chk), True)
class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
    def setUp(self):
        export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
        if not os.path.exists(os.path.dirname(export_prefix)):
            os.makedirs(os.path.dirname(export_prefix), mode=0o755)
        infer_model = DeepSpeech2InferModelOnline(
            feat_size=161,
            dict_size=4233,
            num_conv_layers=2,
            num_rnn_layers=5,
            rnn_size=1024,
            num_fc_layers=0,
            fc_layers_size_list=[-1],
            use_gru=False)
        static_model = infer_model.export()
        paddle.jit.save(static_model, export_prefix)

        with open("test_data/static_ds2online_inputs.pickle", "rb") as f:
            self.data_dict = pickle.load(f)

        self.setup_model(export_prefix)

    def setup_model(self, export_prefix):
        deepspeech_config = inference.Config(export_prefix + ".pdmodel",
                                             export_prefix + ".pdiparams")
        if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and
                os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
            deepspeech_config.enable_use_gpu(100, 0)
            deepspeech_config.enable_memory_optim()
        deepspeech_predictor = inference.create_predictor(deepspeech_config)
        self.predictor = deepspeech_predictor

    def test_unit(self):
        input_names = self.predictor.get_input_names()
        audio_handle = self.predictor.get_input_handle(input_names[0])
        audio_len_handle = self.predictor.get_input_handle(input_names[1])
        h_box_handle = self.predictor.get_input_handle(input_names[2])
        c_box_handle = self.predictor.get_input_handle(input_names[3])

        x_chunk = self.data_dict["audio_chunk"]
        x_chunk_lens = self.data_dict["audio_chunk_lens"]
        chunk_state_h_box = self.data_dict["chunk_state_h_box"]
        chunk_state_c_box = self.data_dict["chunk_state_c_bos"]

        audio_handle.reshape(x_chunk.shape)
        audio_handle.copy_from_cpu(x_chunk)

        audio_len_handle.reshape(x_chunk_lens.shape)
        audio_len_handle.copy_from_cpu(x_chunk_lens)

        h_box_handle.reshape(chunk_state_h_box.shape)
        h_box_handle.copy_from_cpu(chunk_state_h_box)

        c_box_handle.reshape(chunk_state_c_box.shape)
        c_box_handle.copy_from_cpu(chunk_state_c_box)

        output_names = self.predictor.get_output_names()
        output_handle = self.predictor.get_output_handle(output_names[0])
        output_lens_handle = self.predictor.get_output_handle(output_names[1])
        output_state_h_handle = self.predictor.get_output_handle(
            output_names[2])
        output_state_c_handle = self.predictor.get_output_handle(
            output_names[3])
        self.predictor.run()

        output_chunk_probs = output_handle.copy_to_cpu()
        output_chunk_lens = output_lens_handle.copy_to_cpu()
        chunk_state_h_box = output_state_h_handle.copy_to_cpu()
        chunk_state_c_box = output_state_c_handle.copy_to_cpu()
        return True
if __name__ == '__main__':
    unittest.main()

@@ -0,0 +1,3 @@
mkdir -p ./test_data
wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle
python deepspeech2_online_model_test.py