Merge branch 'develop' into develop

3 years ago · 6bf0d3bf57
parent 34b600c4a2 26d413ce8f
commit 6bf0d3bf57
33 changed files with 1025 additions and 23 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,7 @@
 *.pyc
 .vscode
 *log
+*.wav
 *.pdmodel
 *.pdiparams*
 *.zip
@ -30,5 +31,8 @@ tools/OpenBLAS/
 tools/Miniconda3-latest-Linux-x86_64.sh
 tools/activate_python.sh
 tools/miniconda.sh
+tools/CRF++-0.58/
+
+speechx/fc_patch/

 *output/
--- a/README.md
+++ b/README.md
@ -196,16 +196,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl
 ```shell
 paddlespeech cls --input input.wav
 ```
+
 **Automatic Speech Recognition**
 ```shell
 paddlespeech asr --lang zh --input input_16k.wav
 ```
-**Speech Translation** (English to Chinese)

+**Speech Translation** (English to Chinese)
 (not support for Mac and Windows now)
 ```shell
 paddlespeech st --input input_16k.wav
 ```
+
 **Text-to-Speech** 
 ```shell
 paddlespeech tts --input "你好，欢迎使用飞桨深度学习框架！" --output output.wav
@ -218,7 +220,16 @@ paddlespeech tts --input "你好，欢迎使用飞桨深度学习框架！" --ou
  paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
  ```

-  
+**Batch Process**
+```shell
+echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+```
+
+**Shell Pipeline**
+- ASR + Punctuation Restoration
+```shell
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+```

 For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)

--- a/README_cn.md
+++ b/README_cn.md
@ -216,6 +216,17 @@ paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架！
   paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
   ```

+**批处理**
+```shell
+echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+```
+
+**Shell 管道**
+- ASR + 标点恢复
+```shell
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+```
+
 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
 > Note: 如果需要训练或者微调，请查看[语音识别](./docs/source/asr/quick_start.md)， [语音合成](./docs/source/tts/quick_start.md)。

@ -558,6 +569,7 @@ year={2021}
 - 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。
 - 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
 - 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
+
  
 此外，PaddleSpeech 依赖于许多开源存储库。有关更多信息，请参阅 [references](./docs/source/reference.md)。

--- a/demos/speech_recognition/.gitignore
+++ b/demos/speech_recognition/.gitignore
@ -0,0 +1 @@
+*.wav
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
  paddlespeech asr --input ./zh.wav
  # English
  paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+  # Chinese ASR + Punctuation Restoration
+  paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
  ```
  (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.)
  
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
  paddlespeech asr --input ./zh.wav
  # 英文
  paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+  # 中文 + 标点恢复
+  paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
  ```
  (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error，没有关系，这个包是非必须的。)
  
--- a/demos/speech_recognition/run.sh
+++ b/demos/speech_recognition/run.sh
@ -1,4 +1,10 @@
 #!/bin/bash

 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+
+# asr
 paddlespeech asr --input ./zh.wav
+
+
+# asr + punc
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas
 ### 3. Usage
 - Command Line (Recommended)
    - Chinese
-    
        The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`.
        ```bash
        paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架！"
        ```
+    - Batch Process
+        ```bash
+        echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+        ```
    - Chinese, use `SpeedySpeech` as the acoustic model
        ```bash
        paddlespeech tts --am speedyspeech_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
--- a/demos/text_to_speech/README_cn.md
+++ b/demos/text_to_speech/README_cn.md
@ -24,6 +24,10 @@
        ```bash
        paddlespeech tts --input "你好，欢迎使用百度飞桨深度学习框架！"
        ```
+    - 批处理
+        ```bash
+        echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+        ```
    - 中文，使用 `SpeedySpeech` 作为声学模型
        ```bash
        paddlespeech tts --am speedyspeech_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
--- a/demos/text_to_speech/run.sh
+++ b/demos/text_to_speech/run.sh
@ -1,3 +1,7 @@
 #!/bin/bash

+# single process
 paddlespeech tts --input 今天的天气不错啊
+
+# Batch process
+echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
--- a/docs/topic/ctc/ctc_loss_speed_compare.ipynb
+++ b/docs/topic/ctc/ctc_loss_speed_compare.ipynb
@ -0,0 +1,369 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a1e738e0",
+   "metadata": {},
+   "source": [
+    "## 获取测试的 logit 数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "29d3368b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hlens.npy\n",
+      "logits.npy\n",
+      "ys_lens.npy\n",
+      "ys_pad.npy\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir -p ./test_data\n",
+    "!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n",
+    "!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "240caf1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "import time\n",
+    "\n",
+    "data_dir=\"./test_data\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "91bad949",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
+    "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
+    "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
+    "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4cef2f15",
+   "metadata": {},
+   "source": [
+    "## 使用 torch 的 ctc loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90612004",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'1.10.1+cu102'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "torch.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "00799f97",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def torch_ctc_loss(use_cpu):\n",
+    "    if use_cpu:\n",
+    "        device = torch.device(\"cpu\")\n",
+    "    else:\n",
+    "        device = torch.device(\"cuda\")\n",
+    "\n",
+    "    reduction_type = \"sum\" \n",
+    "\n",
+    "    ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n",
+    "\n",
+    "    ys_hat = torch.tensor(logits_np, device = device)\n",
+    "    ys_pad = torch.tensor(ys_pad_np, device = device)\n",
+    "    hlens = torch.tensor(hlens_np, device = device)\n",
+    "    ys_lens = torch.tensor(ys_lens_np, device = device)\n",
+    "\n",
+    "    ys_hat = ys_hat.transpose(0, 1)\n",
+    "    \n",
+    "    # 开始计算时间\n",
+    "    start_time = time.time()\n",
+    "    ys_hat = ys_hat.log_softmax(2)\n",
+    "    loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n",
+    "    end_time = time.time()\n",
+    "    \n",
+    "    loss = loss / ys_hat.size(1)\n",
+    "    return end_time - start_time, loss.item()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba47b5a4",
+   "metadata": {},
+   "source": [
+    "## 使用 paddle 的 ctc loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "6882a06e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2.2.2'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import paddle\n",
+    "paddle.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "3cfa3b7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def paddle_ctc_loss(use_cpu):    \n",
+    "    import paddle.nn as pn\n",
+    "    if use_cpu:\n",
+    "        device = \"cpu\"\n",
+    "    else:\n",
+    "        device = \"gpu\"\n",
+    "\n",
+    "    paddle.set_device(device)\n",
+    "\n",
+    "    logits = paddle.to_tensor(logits_np)\n",
+    "    ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
+    "    hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
+    "    ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
+    "\n",
+    "    logits = logits.transpose([1,0,2])\n",
+    "\n",
+    "    ctc_loss = pn.CTCLoss(reduction='sum')\n",
+    "    # 开始计算时间\n",
+    "    start_time = time.time()\n",
+    "    pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n",
+    "    end_time = time.time()\n",
+    "    \n",
+    "    pn_loss = pn_loss / logits.shape[1]\n",
+    "    return end_time - start_time, pn_loss.item()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "40413ef9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU, iteration 10\n",
+      "torch_ctc_loss 159.17137145996094\n",
+      "paddle_ctc_loss 159.16574096679688\n",
+      "paddle average time 1.718252992630005\n",
+      "torch average time 0.17536230087280275\n",
+      "paddle time / torch time (cpu) 9.798303193320452\n",
+      "\n",
+      "GPU, iteration 10\n",
+      "torch_ctc_loss 159.172119140625\n",
+      "paddle_ctc_loss 159.17205810546875\n",
+      "paddle average time 0.018606925010681154\n",
+      "torch average time 0.0026710033416748047\n",
+      "paddle time / torch time (gpu) 6.966267963938231\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 使用 CPU\n",
+    "\n",
+    "iteration = 10\n",
+    "use_cpu = True\n",
+    "torch_total_time = 0\n",
+    "paddle_total_time = 0\n",
+    "for _ in range(iteration):\n",
+    "    cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
+    "    torch_total_time += cost_time\n",
+    "for _ in range(iteration):\n",
+    "    cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
+    "    paddle_total_time += cost_time\n",
+    "print (\"CPU, iteration\", iteration)\n",
+    "print (\"torch_ctc_loss\", torch_loss)\n",
+    "print (\"paddle_ctc_loss\", paddle_loss)\n",
+    "print (\"paddle average time\", paddle_total_time / iteration)\n",
+    "print (\"torch average time\", torch_total_time / iteration)\n",
+    "print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n",
+    "\n",
+    "print (\"\")\n",
+    "\n",
+    "# 使用 GPU\n",
+    "\n",
+    "use_cpu = False\n",
+    "torch_total_time = 0\n",
+    "paddle_total_time = 0\n",
+    "for _ in range(iteration):\n",
+    "    cost_time, torch_loss  = torch_ctc_loss(use_cpu)\n",
+    "    torch_total_time += cost_time\n",
+    "for _ in range(iteration):\n",
+    "    cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
+    "    paddle_total_time += cost_time\n",
+    "print (\"GPU, iteration\", iteration)\n",
+    "print (\"torch_ctc_loss\", torch_loss)\n",
+    "print (\"paddle_ctc_loss\", paddle_loss)\n",
+    "print (\"paddle average time\", paddle_total_time / iteration)\n",
+    "print (\"torch average time\", torch_total_time / iteration)\n",
+    "print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cdf8697",
+   "metadata": {},
+   "source": [
+    "## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "73fad81d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
+    "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
+    "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
+    "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "2b41e45d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n",
+      "2022-02-25 11:34:34.143 | INFO     | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n",
+      "2022-02-25 11:34:34.144 | INFO     | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n",
+      "loss 159.17205810546875\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n",
+      "  format(lhs_dtype, rhs_dtype, lhs_dtype))\n"
+     ]
+    }
+   ],
+   "source": [
+    "use_cpu = False\n",
+    "\n",
+    "from paddlespeech.s2t.modules.loss import CTCLoss\n",
+    "\n",
+    "if use_cpu:\n",
+    "    device = \"cpu\"\n",
+    "else:\n",
+    "    device = \"gpu\"\n",
+    "\n",
+    "paddle.set_device(device)\n",
+    "\n",
+    "blank_id=0\n",
+    "reduction_type='sum'\n",
+    "batch_average= True\n",
+    "grad_norm_type='instance'\n",
+    "\n",
+    "criterion = CTCLoss(\n",
+    "        blank=blank_id,\n",
+    "        reduction=reduction_type,\n",
+    "        batch_average=batch_average,\n",
+    "        grad_norm_type=grad_norm_type)\n",
+    "\n",
+    "logits = paddle.to_tensor(logits_np)\n",
+    "ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
+    "hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
+    "ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
+    "\n",
+    "pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n",
+    "print(\"loss\", pn_ctc_loss.item())\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de525d38",
+   "metadata": {},
+   "source": [
+    "## 结论\n",
+    "在 CPU 环境下： torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍  \n",
+    "在 GPU 环境下： torch 的 CTC loss 的计算速度是 paddle 的 6.97 倍\n",
+    "\n",
+    "## 其他结论\n",
+    "paddle 与 torch 的 ctc loss 计算结果在 CPU 和 GPU 下都没有完全对齐。其中 CPU 的前向对齐精度大约为 1e-2，GPU 的前向对齐精度大约为 1e-4。"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@ -225,7 +225,9 @@ optional arguments:
 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.

 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+Pretrained FastSpeech2 model with no silence in the edge of audios:
+- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution)

 FastSpeech2 checkpoint contains files listed below.

--- a/examples/aishell3/tts3/conf/conformer.yaml
+++ b/examples/aishell3/tts3/conf/conformer.yaml
@ -0,0 +1,110 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # sr
+n_fft: 2048        # FFT size (samples).
+n_shift: 300       # Hop size (samples). 12.5ms
+win_length: 1200   # Window length (samples). 50ms
+                   # If set to null, it will be the same as n_fft.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 32
+num_workers: 4
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:
+    adim: 384         # attention dimension
+    aheads: 2         # number of attention heads
+    elayers: 4        # number of encoder layers
+    eunits: 1536      # number of encoder ff units
+    dlayers: 4        # number of decoder layers
+    dunits: 1536      # number of decoder ff units
+    positionwise_layer_type: conv1d   # type of position-wise layer
+    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
+    duration_predictor_layers: 2      # number of layers of duration predictor
+    duration_predictor_chans: 256     # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5                 # number of layers of postnet
+    postnet_filts: 5                  # filter size of conv layers in postnet
+    postnet_chans: 256                # number of channels of conv layers in postnet
+    encoder_normalize_before: True    # whether to perform layer normalization before the input
+    decoder_normalize_before: True    # whether to perform layer normalization before the input
+    reduction_factor: 1               # reduction factor
+    encoder_type: conformer           # encoder type
+    decoder_type: conformer           # decoder type
+    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
+    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+    conformer_activation_type: swish             # conformer activation type
+    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
+    use_cnn_in_conformer: true                   # whether to use CNN in conformer
+    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
+    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
+    init_type: xavier_uniform         # initialization type
+    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: true   # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2                 # number of conv layers in energy predictor
+    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+    spk_embed_dim: 256                         # speaker embedding dimension
+    spk_embed_integration_type: concat         # speaker embedding integration type
+
+
+###########################################################
+#                       UPDATER SETTING                   #
+###########################################################
+updater:
+    use_masking: True                 # whether to apply masking for padded part in loss calculation
+
+
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer:
+  optim: adam              # optimizer type
+  learning_rate: 0.001     # learning rate
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 1000
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 10086
--- a/examples/other/g2p/README.md
+++ b/examples/other/g2p/README.md
@ -10,7 +10,7 @@ Run the command below to get the results of the test.
 ```bash
 ./run.sh
 ```
-The `avg WER` of g2p is: 0.027124048652822204
+The `avg WER` of g2p is: 0.026014352515701198
 ```text
     ,--------------------------------------------------------------------.
     |        | # Snt    # Wrd  | Corr    Sub    Del    Ins    Err  S.Err |
--- a/paddlespeech/cli/init.py
+++ b/paddlespeech/cli/init.py
@ -20,5 +20,6 @@ from .cls import CLSExecutor
 from .st import STExecutor
 from .text import TextExecutor
 from .tts import TTSExecutor
+from .stats import StatsExecutor

 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
--- a/paddlespeech/cli/stats/init.py
+++ b/paddlespeech/cli/stats/init.py
@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import StatsExecutor
--- a/paddlespeech/cli/stats/infer.py
+++ b/paddlespeech/cli/stats/infer.py
@ -0,0 +1,193 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from typing import List
+
+from prettytable import PrettyTable
+
+from ..log import logger
+from ..utils import cli_register
+from ..utils import stats_wrapper
+
+__all__ = ['StatsExecutor']
+
+model_name_format = {
+    'asr': 'Model-Language-Sample Rate',
+    'cls': 'Model-Sample Rate',
+    'st': 'Model-Source language-Target language',
+    'text': 'Model-Task-Language',
+    'tts': 'Model-Language'
+}
+
+
+@cli_register(
+    name='paddlespeech.stats',
+    description='Get speech tasks support models list.')
+class StatsExecutor():
+    def __init__(self):
+        super(StatsExecutor, self).__init__()
+
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech.stats', add_help=True)
+        self.parser.add_argument(
+            '--task',
+            type=str,
+            default='asr',
+            choices=['asr', 'cls', 'st', 'text', 'tts'],
+            help='Choose speech task.',
+            required=True)
+        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']
+
+    def show_support_models(self, pretrained_models: dict):
+        fields = model_name_format[self.task].split("-")
+        table = PrettyTable(fields)
+        for key in pretrained_models:
+            table.add_row(key.split("-"))
+        print(table)
+
+    def execute(self, argv: List[str]) -> bool:
+        """
+            Command line entry.
+        """
+        parser_args = self.parser.parse_args(argv)
+        self.task = parser_args.task
+        if self.task not in self.task_choices:
+            logger.error(
+                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
+            )
+            return False
+
+        elif self.task == 'asr':
+            try:
+                from ..asr.infer import pretrained_models
+                logger.info(
+                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+                return True
+            except BaseException:
+                logger.error("Failed to get the list of ASR pretrained models.")
+                return False
+
+        elif self.task == 'cls':
+            try:
+                from ..cls.infer import pretrained_models
+                logger.info(
+                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+                return True
+            except BaseException:
+                logger.error("Failed to get the list of CLS pretrained models.")
+                return False
+
+        elif self.task == 'st':
+            try:
+                from ..st.infer import pretrained_models
+                logger.info(
+                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+                return True
+            except BaseException:
+                logger.error("Failed to get the list of ST pretrained models.")
+                return False
+
+        elif self.task == 'text':
+            try:
+                from ..text.infer import pretrained_models
+                logger.info(
+                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+                return True
+            except BaseException:
+                logger.error(
+                    "Failed to get the list of TEXT pretrained models.")
+                return False
+
+        elif self.task == 'tts':
+            try:
+                from ..tts.infer import pretrained_models
+                logger.info(
+                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+                return True
+            except BaseException:
+                logger.error("Failed to get the list of TTS pretrained models.")
+                return False
+
+    @stats_wrapper
+    def __call__(
+            self,
+            task: str=None, ):
+        """
+            Python API to call an executor.
+        """
+        self.task = task
+        if self.task not in self.task_choices:
+            print(
+                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
+            )
+
+        elif self.task == 'asr':
+            try:
+                from ..asr.infer import pretrained_models
+                print(
+                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print("Failed to get the list of ASR pretrained models.")
+
+        elif self.task == 'cls':
+            try:
+                from ..cls.infer import pretrained_models
+                print(
+                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print("Failed to get the list of CLS pretrained models.")
+
+        elif self.task == 'st':
+            try:
+                from ..st.infer import pretrained_models
+                print(
+                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print("Failed to get the list of ST pretrained models.")
+
+        elif self.task == 'text':
+            try:
+                from ..text.infer import pretrained_models
+                print(
+                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print("Failed to get the list of TEXT pretrained models.")
+
+        elif self.task == 'tts':
+            try:
+                from ..tts.infer import pretrained_models
+                print(
+                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print("Failed to get the list of TTS pretrained models.")
--- a/paddlespeech/s2t/io/sampler.py
+++ b/paddlespeech/s2t/io/sampler.py
@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
    """
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
-    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batch_indices)
    batch_indices = [item for batch in batch_indices for item in batch]
    assert clipped is False
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder
 from paddlespeech.s2t.modules.encoder import ConformerEncoder
 from paddlespeech.s2t.modules.encoder import TransformerEncoder
 from paddlespeech.s2t.modules.loss import LabelSmoothingLoss
-from paddlespeech.s2t.modules.mask import mask_finished_preds
-from paddlespeech.s2t.modules.mask import mask_finished_scores
 from paddlespeech.s2t.modules.mask import subsequent_mask
 from paddlespeech.s2t.utils import checkpoint
 from paddlespeech.s2t.utils import layer_tools
@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer):
        device = speech.place

        # Let's assume B = batch_size and N = beam_size
-        # 1. Encoder and init hypothesis 
+        # 1. Encoder and init hypothesis
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
--- a/paddlespeech/server/bin/init.py
+++ b/paddlespeech/server/bin/init.py
@ -14,3 +14,4 @@
 from .paddlespeech_client import ASRClientExecutor
 from .paddlespeech_client import TTSClientExecutor
 from .paddlespeech_server import ServerExecutor
+from .paddlespeech_server import ServerStatsExecutor
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@ -16,15 +16,17 @@ from typing import List

 import uvicorn
 from fastapi import FastAPI
+from prettytable import PrettyTable

 from ..executor import BaseExecutor
 from ..util import cli_server_register
 from ..util import stats_wrapper
+from paddlespeech.cli.log import logger
 from paddlespeech.server.engine.engine_pool import init_engine_pool
 from paddlespeech.server.restful.api import setup_router
 from paddlespeech.server.utils.config import get_config

-__all__ = ['ServerExecutor']
+__all__ = ['ServerExecutor', 'ServerStatsExecutor']

 app = FastAPI(
    title="PaddleSpeech Serving API", description="Api", version="0.0.1")
@ -86,3 +88,139 @@ class ServerExecutor(BaseExecutor):
        config = get_config(config_file)
        if self.init(config):
            uvicorn.run(app, host=config.host, port=config.port, debug=True)
+
+
+@cli_server_register(
+    name='paddlespeech_server.stats',
+    description='Get the models supported by each speech task in the service.')
+class ServerStatsExecutor():
+    def __init__(self):
+        super(ServerStatsExecutor, self).__init__()
+
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech_server.stats', add_help=True)
+        self.parser.add_argument(
+            '--task',
+            type=str,
+            default=None,
+            choices=['asr', 'tts'],
+            help='Choose speech task.',
+            required=True)
+        self.task_choices = ['asr', 'tts']
+        self.model_name_format = {
+            'asr': 'Model-Language-Sample Rate',
+            'tts': 'Model-Language'
+        }
+
+    def show_support_models(self, pretrained_models: dict):
+        # Column titles come from the per-task name format, split on "-".
+        fields = self.model_name_format[self.task].split("-")
+        table = PrettyTable(fields)
+        for key in pretrained_models:
+            table.add_row(key.split("-"))  # one row per model key
+        print(table)
+
+    def execute(self, argv: List[str]) -> bool:
+        """
+            Command line entry. Returns True when both tables were printed.
+        """
+        parser_args = self.parser.parse_args(argv)
+        self.task = parser_args.task
+        if self.task not in self.task_choices:
+            logger.error(
+                "Please input correct speech task, choices = ['asr', 'tts']")
+            return False
+
+        elif self.task == 'asr':
+            try:
+                from paddlespeech.cli.asr.infer import pretrained_models
+                logger.info(
+                    "Here is the table of ASR pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                # show ASR static pretrained model
+                from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
+                logger.info(
+                    "Here is the table of ASR static pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                return True
+            except Exception:  # let KeyboardInterrupt/SystemExit propagate
+                logger.error(
+                    "Failed to get the table of ASR pretrained models supported in the service."
+                )
+                return False
+
+        elif self.task == 'tts':
+            try:
+                from paddlespeech.cli.tts.infer import pretrained_models
+                logger.info(
+                    "Here is the table of TTS pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                # show TTS static pretrained model
+                from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
+                logger.info(
+                    "Here is the table of TTS static pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                return True
+            except Exception:  # let KeyboardInterrupt/SystemExit propagate
+                logger.error(
+                    "Failed to get the table of TTS pretrained models supported in the service."
+                )
+                return False
+
+    @stats_wrapper
+    def __call__(
+            self,
+            task: str=None, ):
+        """
+            Python API to call an executor: print the model tables for `task`.
+        """
+        self.task = task
+        if self.task not in self.task_choices:
+            print("Please input correct speech task, choices = ['asr', 'tts']")
+
+        elif self.task == 'asr':
+            try:
+                from paddlespeech.cli.asr.infer import pretrained_models
+                print(
+                    "Here is the table of ASR pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                # show ASR static pretrained model
+                from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
+                print(
+                    "Here is the table of ASR static pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+            except Exception:  # let KeyboardInterrupt/SystemExit propagate
+                print(
+                    "Failed to get the table of ASR pretrained models supported in the service."
+                )
+
+        elif self.task == 'tts':
+            try:
+                from paddlespeech.cli.tts.infer import pretrained_models
+                print(
+                    "Here is the table of TTS pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+                # show TTS static pretrained model
+                from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
+                print(
+                    "Here is the table of TTS static pretrained models supported in the service."
+                )
+                self.show_support_models(pretrained_models)
+
+            except Exception:  # let KeyboardInterrupt/SystemExit propagate
+                print(
+                    "Failed to get the table of TTS pretrained models supported in the service."
+                )
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@ -63,7 +63,7 @@ class ToneSandhi():
            '扫把', '惦记'
        }
        self.must_not_neural_tone_words = {
-            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"
+            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
        }
        self.punc = "：，；。？！“”‘’':,;.?!"

@ -77,7 +77,9 @@ class ToneSandhi():

        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        for j, item in enumerate(word):
-            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
+            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {
+                    "n", "v", "a"
+            } and word not in self.must_not_neural_tone_words:
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@ -20,7 +20,10 @@ import numpy as np
 import paddle
 from g2pM import G2pM
 from pypinyin import lazy_pinyin
+from pypinyin import load_phrases_dict
+from pypinyin import load_single_dict
 from pypinyin import Style
+from pypinyin_dict.phrase_pinyin_data import large_pinyin

 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@ -41,6 +44,8 @@ class Frontend():
            self.g2pM_model = G2pM()
            self.pinyin2phone = generate_lexicon(
                with_tone=True, with_erhua=False)
+        else:
+            self.__init__pypinyin()
        self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
        self.not_erhua = {
            "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
@ -62,6 +67,23 @@ class Frontend():
            for tone, id in tone_id:
                self.vocab_tones[tone] = int(id)

+    def __init__pypinyin(self):
+        large_pinyin.load()
+        # Phrase-level reading overrides, written in tone-number form.
+        load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]})
+        load_phrases_dict({u'发卡行': [[u'fa1'], [u'ka3'], [u'hang2']]})
+        load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]})
+        load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]})
+        load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]})
+        load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]})
+        load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]})
+        load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]})
+        load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]})
+        load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]})
+
+        # Reorder the candidate readings of a single character.
+        load_single_dict({ord(u'地'): u'de,di4'})
+
    def _get_initials_finals(self, word: str) -> List[List[str]]:
        initials = []
        finals = []
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@ -63,7 +63,10 @@ def replace_time(match) -> str:

    result = f"{num2str(hour)}点"
    if minute.lstrip('0'):
-        result += f"{_time_num2str(minute)}分"
+        if int(minute) == 30:
+            result += f"半"
+        else:
+            result += f"{_time_num2str(minute)}分"
    if second and second.lstrip('0'):
        result += f"{_time_num2str(second)}秒"

@ -71,7 +74,10 @@ def replace_time(match) -> str:
        result += "至"
        result += f"{num2str(hour_2)}点"
        if minute_2.lstrip('0'):
-            result += f"{_time_num2str(minute_2)}分"
+            if int(minute_2) == 30:  # check the END time's minutes
+                result += f"半"
+            else:
+                result += f"{_time_num2str(minute_2)}分"
        if second_2 and second_2.lstrip('0'):
            result += f"{_time_num2str(second_2)}秒"

--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@ -28,7 +28,7 @@ UNITS = OrderedDict({
    8: '亿',
 })

-COM_QUANTIFIERS = '(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
+COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'

 # 分数表达式
 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
@ -110,7 +110,7 @@ def replace_default_num(match):
 # 纯小数
 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
 # 正整数 + 量词
-RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几])?" + COM_QUANTIFIERS)
+RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
 RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')


@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str:
    """
    number = match.group(1)
    match_2 = match.group(2)
+    if match_2 == "+":
+        match_2 = "多"
    match_2: str = match_2 if match_2 else ""
    quantifiers: str = match.group(3)
    number: str = num2str(number)
@ -151,6 +153,7 @@ def replace_number(match) -> str:

 # 范围表达式
 # match.group(1) and match.group(8) are copy from RE_NUMBER
+
 RE_RANGE = re.compile(
    r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')

--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@ -63,11 +63,19 @@ class TextNormalizer():
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
+            # 过滤掉特殊字符
+            text = re.sub(r'[《》【】<=>{}()（）#&@“”^_|…\\]', '', text)
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
        return sentences

+    def _post_replace(self, sentence: str) -> str:
+        # Replace each leftover symbol with its spoken Chinese word.
+        for symbol, spoken in (('/', '每'), ('~', '至')):
+            sentence = sentence.replace(symbol, spoken)
+        return sentence
+
    def normalize_sentence(self, sentence: str) -> str:
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
@ -97,6 +105,7 @@ class TextNormalizer():
                                               sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
+        sentence = self._post_replace(sentence)

        return sentence

--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@ -36,4 +36,4 @@ def repeat(N, fn):
    Returns:
        MultiSequential: Repeated model instance.
    """
-    return MultiSequential(*[fn(n) for n in range(N)])
+    return MultiSequential(* [fn(n) for n in range(N)])
--- a/setup.py
+++ b/setup.py
@ -48,6 +48,7 @@ base = [
    "paddlespeech_feat",
    "praatio==5.0.0",
    "pypinyin",
+    "pypinyin-dict",
    "python-dateutil",
    "pyworld",
    "resampy==0.2.2",
@ -62,6 +63,7 @@ base = [
    "visualdl",
    "webrtcvad",
    "yacs~=0.1.8",
+    "prettytable",
 ]

 server = [
--- a/tests/test_tipc/configs/conformer/train_infer_python.txt
+++ b/tests/test_tipc/configs/conformer/train_infer_python.txt
@ -54,4 +54,4 @@ batch_size:16|30
 fp_items:fp32
 iteration:50
 --profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile"
-flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
+flags:null
--- a/tests/test_tipc/configs/pwgan/train_infer_python.txt
+++ b/tests/test_tipc/configs/pwgan/train_infer_python.txt
@ -54,4 +54,4 @@ batch_size:6|16
 fp_items:fp32
 iteration:50
 --profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile"
-flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
+flags:null
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@ -26,15 +26,19 @@ if [ ${MODE} = "benchmark_train" ];then
    curPath=$(readlink -f "$(dirname "$0")")
        echo "curPath:"${curPath}
    cd ${curPath}/../..
-    pip install .
+    apt-get install -y libsndfile1
+    pip install pytest-runner kaldiio setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple
+    pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
    cd -
    if [ ${model_name} == "conformer" ]; then
        # set the URL for aishell_tiny dataset
-        URL='None'
+        URL=${conformer_data_URL:-"None"}
        echo "URL:"${URL}
        if [ ${URL} == 'None' ];then
            echo "please contact author to get the URL.\n"
            exit
+        else
+            wget -P "${curPath}/../../dataset/aishell/" "${URL}"
        fi
        sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py
        cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/
@ -42,6 +46,7 @@ if [ ${MODE} = "benchmark_train" ];then
        source path.sh
        # download audio data
        sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh
+	sed -i "s#python3#python#g" ./local/data.sh
        bash ./local/data.sh || exit -1
        if [ $? -ne 0 ]; then
        exit 1
@ -56,7 +61,6 @@ if [ ${MODE} = "benchmark_train" ];then
        sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml
        sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml
        sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml
-
    fi

    if [ ${model_name} == "pwgan" ]; then
@ -73,4 +77,4 @@ if [ ${MODE} = "benchmark_train" ];then
        python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
    fi

-fi
+fi
--- a/tests/unit/asr/deepspeech2_online_model_test.py
+++ b/tests/unit/asr/deepspeech2_online_model_test.py
@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+import pickle
 import unittest

 import numpy as np
 import paddle
+from paddle import inference

+from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
 from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline


@ -182,5 +186,77 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase):
                paddle.allclose(final_state_c_box, final_state_c_box_chk), True)


+class TestDeepSpeech2StaticModelOnline(unittest.TestCase):
+    def setUp(self):
+        export_prefix = "exp/deepspeech2_online/checkpoints/test_export"
+        if not os.path.exists(os.path.dirname(export_prefix)):
+            os.makedirs(os.path.dirname(export_prefix), mode=0o755)
+        infer_model = DeepSpeech2InferModelOnline(
+            feat_size=161,
+            dict_size=4233,
+            num_conv_layers=2,
+            num_rnn_layers=5,
+            rnn_size=1024,
+            num_fc_layers=0,
+            fc_layers_size_list=[-1],
+            use_gru=False)
+        static_model = infer_model.export()
+        paddle.jit.save(static_model, export_prefix)
+
+        with open("test_data/static_ds2online_inputs.pickle", "rb") as f:
+            self.data_dict = pickle.load(f)
+
+        self.setup_model(export_prefix)
+
+    def setup_model(self, export_prefix):
+        deepspeech_config = inference.Config(export_prefix + ".pdmodel",
+                                             export_prefix + ".pdiparams")
+        if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and
+                os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+            deepspeech_config.enable_use_gpu(100, 0)
+            deepspeech_config.enable_memory_optim()
+        deepspeech_predictor = inference.create_predictor(deepspeech_config)
+        self.predictor = deepspeech_predictor
+
+    def test_unit(self):
+        input_names = self.predictor.get_input_names()
+        audio_handle = self.predictor.get_input_handle(input_names[0])
+        audio_len_handle = self.predictor.get_input_handle(input_names[1])
+        h_box_handle = self.predictor.get_input_handle(input_names[2])
+        c_box_handle = self.predictor.get_input_handle(input_names[3])
+
+        x_chunk = self.data_dict["audio_chunk"]
+        x_chunk_lens = self.data_dict["audio_chunk_lens"]
+        chunk_state_h_box = self.data_dict["chunk_state_h_box"]
+        chunk_state_c_box = self.data_dict["chunk_state_c_bos"]
+
+        audio_handle.reshape(x_chunk.shape)
+        audio_handle.copy_from_cpu(x_chunk)
+
+        audio_len_handle.reshape(x_chunk_lens.shape)
+        audio_len_handle.copy_from_cpu(x_chunk_lens)
+
+        h_box_handle.reshape(chunk_state_h_box.shape)
+        h_box_handle.copy_from_cpu(chunk_state_h_box)
+
+        c_box_handle.reshape(chunk_state_c_box.shape)
+        c_box_handle.copy_from_cpu(chunk_state_c_box)
+
+        output_names = self.predictor.get_output_names()
+        output_handle = self.predictor.get_output_handle(output_names[0])
+        output_lens_handle = self.predictor.get_output_handle(output_names[1])
+        output_state_h_handle = self.predictor.get_output_handle(
+            output_names[2])
+        output_state_c_handle = self.predictor.get_output_handle(
+            output_names[3])
+        self.predictor.run()
+
+        output_chunk_probs = output_handle.copy_to_cpu()
+        output_chunk_lens = output_lens_handle.copy_to_cpu()
+        chunk_state_h_box = output_state_h_handle.copy_to_cpu()
+        chunk_state_c_box = output_state_c_handle.copy_to_cpu()
+        return True
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/tests/unit/asr/deepspeech2_online_model_test.sh
+++ b/tests/unit/asr/deepspeech2_online_model_test.sh
@ -0,0 +1,3 @@
+mkdir -p ./test_data || exit 1
+wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle || exit 1
+python deepspeech2_online_model_test.py