resolve setup.py conflicts, test=doc

4 years ago · e5aa24fa5a
parent fe6be4a65e 8d474c2658
commit e5aa24fa5a
37 changed files with 246 additions and 282 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -50,12 +50,13 @@ repos:
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
        exclude: (?=speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$
    -   id: copyright_checker
        name: copyright_checker
        entry: python .pre-commit-hooks/copyright-check.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-        exclude: (?=third_party|pypinyin).*(\.cpp|\.h|\.py)$
+        exclude: (?=third_party|pypinyin|speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@ -80,6 +80,7 @@ parser.add_argument(
 args = parser.parse_args()
 def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
@ -128,6 +129,7 @@ def create_manifest(data_dir, manifest_path_prefix):
        print(f"{total_text / total_sec} text/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)
 def prepare_dataset(base_url, data_list, target_dir, manifest_path,
                    target_data):
    if not os.path.exists(target_dir):
@ -164,6 +166,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
    # create the manifest file
    create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
 def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)
@ -184,5 +187,6 @@ def main():
    print("Manifest prepare done!")
 if __name__ == '__main__':
    main()
--- a/demos/speech_server/conf/asr/asr.yaml
+++ b/demos/speech_server/conf/asr/asr.yaml
@ -5,4 +5,4 @@ cfg_path: # [optional]
 ckpt_path: # [optional]
 decode_method: 'attention_rescoring'
 force_yes: True
-device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+device: 'cpu'  # set 'gpu:id' or 'cpu'
--- a/demos/speech_server/conf/asr/asr_pd.yaml
+++ b/demos/speech_server/conf/asr/asr_pd.yaml
@ -15,7 +15,7 @@ decode_method:
 force_yes: True
 am_predictor_conf:
-  device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+  device: 'cpu'  # set 'gpu:id' or 'cpu'
  enable_mkldnn: True
  switch_ir_optim: True
--- a/demos/speech_server/conf/tts/tts.yaml
+++ b/demos/speech_server/conf/tts/tts.yaml
@ -29,4 +29,4 @@ voc_stat:
 #                            OTHERS                              #
 ##################################################################
 lang: 'zh'
-device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+device: 'cpu'  # set 'gpu:id' or 'cpu'
--- a/demos/speech_server/conf/tts/tts_pd.yaml
+++ b/demos/speech_server/conf/tts/tts_pd.yaml
@ -15,7 +15,7 @@ speaker_dict:
 spk_id: 0
 am_predictor_conf:
-  device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+  device: 'cpu'  # set 'gpu:id' or 'cpu'
  enable_mkldnn: False
  switch_ir_optim: False
@ -30,7 +30,7 @@ voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams)
 voc_sample_rate: 24000
 voc_predictor_conf:
-  device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+  device: 'cpu'  # set 'gpu:id' or 'cpu'
  enable_mkldnn: False  
  switch_ir_optim: False  
--- a/docs/topic/ctc/ctc_loss_compare.ipynb
+++ b/docs/topic/ctc/ctc_loss_compare.ipynb
@ -30,12 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Cloning into 'warp-ctc'...\n",
+      "fatal: destination path 'warp-ctc' already exists and is not an empty directory.\r\n"
      "remote: Enumerating objects: 829, done.\u001b[K\n",
      "remote: Total 829 (delta 0), reused 0 (delta 0), pack-reused 829\u001b[K\n",
      "Receiving objects: 100% (829/829), 388.85 KiB | 140.00 KiB/s, done.\n",
      "Resolving deltas: 100% (419/419), done.\n",
      "Checking connectivity... done.\n"
     ]
    }
   ],
@ -99,30 +94,6 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-- The C compiler identification is GNU 5.4.0\n",
      "-- The CXX compiler identification is GNU 5.4.0\n",
      "-- Check for working C compiler: /usr/bin/cc\n",
      "-- Check for working C compiler: /usr/bin/cc -- works\n",
      "-- Detecting C compiler ABI info\n",
      "-- Detecting C compiler ABI info - done\n",
      "-- Detecting C compile features\n",
      "-- Detecting C compile features - done\n",
      "-- Check for working CXX compiler: /usr/bin/c++\n",
      "-- Check for working CXX compiler: /usr/bin/c++ -- works\n",
      "-- Detecting CXX compiler ABI info\n",
      "-- Detecting CXX compiler ABI info - done\n",
      "-- Detecting CXX compile features\n",
      "-- Detecting CXX compile features - done\n",
      "-- Looking for pthread.h\n",
      "-- Looking for pthread.h - found\n",
      "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n",
      "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed\n",
      "-- Looking for pthread_create in pthreads\n",
      "-- Looking for pthread_create in pthreads - not found\n",
      "-- Looking for pthread_create in pthread\n",
      "-- Looking for pthread_create in pthread - found\n",
      "-- Found Threads: TRUE  \n",
      "-- Found CUDA: /usr/local/cuda (found suitable version \"10.2\", minimum required is \"6.5\") \n",
      "-- cuda found TRUE\n",
      "-- Building shared library with GPU support\n",
      "-- Configuring done\n",
@ -145,20 +116,11 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ 11%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_reduce.cu.o\u001b[0m\n",
+      "[ 11%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n",
      "[ 22%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_ctc_entrypoint.cu.o\u001b[0m\n",
      "\u001b[35m\u001b[1mScanning dependencies of target warpctc\u001b[0m\n",
      "[ 33%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n",
      "[ 33%] Built target warpctc\n",
-      "[ 44%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/test_gpu.dir/tests/test_gpu_generated_test_gpu.cu.o\u001b[0m\n",
+      "[ 44%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target test_cpu\u001b[0m\n",
+      "[ 55%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n",
      "[ 55%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/test_cpu.cpp.o\u001b[0m\n",
      "[ 66%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/random.cpp.o\u001b[0m\n",
      "[ 77%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n",
      "[ 77%] Built target test_cpu\n",
      "\u001b[35m\u001b[1mScanning dependencies of target test_gpu\u001b[0m\n",
      "[ 88%] \u001b[32mBuilding CXX object CMakeFiles/test_gpu.dir/tests/random.cpp.o\u001b[0m\n",
      "[100%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n",
      "[100%] Built target test_gpu\n"
     ]
    }
@ -169,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "id": "31761a31",
   "metadata": {},
   "outputs": [
@ -187,7 +149,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
   "id": "f53316f6",
   "metadata": {},
   "outputs": [
@ -205,7 +167,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
   "id": "084f1e49",
   "metadata": {},
   "outputs": [
@ -216,29 +178,20 @@
      "running install\n",
      "running bdist_egg\n",
      "running egg_info\n",
      "creating warpctc_pytorch.egg-info\n",
      "writing warpctc_pytorch.egg-info/PKG-INFO\n",
      "writing dependency_links to warpctc_pytorch.egg-info/dependency_links.txt\n",
      "writing top-level names to warpctc_pytorch.egg-info/top_level.txt\n",
      "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n",
      "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n",
      "installing library code to build/bdist.linux-x86_64/egg\n",
      "running install_lib\n",
      "running build_py\n",
      "creating build\n",
      "creating build/lib.linux-x86_64-3.9\n",
      "creating build/lib.linux-x86_64-3.9/warpctc_pytorch\n",
      "copying warpctc_pytorch/__init__.py -> build/lib.linux-x86_64-3.9/warpctc_pytorch\n",
      "running build_ext\n",
      "building 'warpctc_pytorch._warp_ctc' extension\n",
      "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9\n",
      "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src\n",
      "Emitting ninja build file /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/build.ninja...\n",
      "Compiling objects...\n",
      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
-      "[1/1] c++ -MMD -MF /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o.d -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -I/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/TH -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/include/python3.9 -c -c /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/src/binding.cpp -o /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -std=c++14 -fPIC -DWARPCTC_ENABLE_GPU -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE=\"_gcc\"' '-DPYBIND11_STDLIB=\"_libstdcpp\"' '-DPYBIND11_BUILD_ABI=\"_cxxabi1011\"' -DTORCH_EXTENSION_NAME=_warp_ctc -D_GLIBCXX_USE_CXX11_ABI=0\n",
+      "ninja: no work to do.\n",
      "g++ -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -shared -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -L/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/lib -L/usr/local/cuda/lib64 -lwarpctc -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n",
      "creating build/bdist.linux-x86_64\n",
      "creating build/bdist.linux-x86_64/egg\n",
      "creating build/bdist.linux-x86_64/egg/warpctc_pytorch\n",
      "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/__init__.py -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n",
@ -254,7 +207,6 @@
      "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n",
      "zip_safe flag not set; analyzing archive contents...\n",
      "warpctc_pytorch.__pycache__._warp_ctc.cpython-39: module references __file__\n",
      "creating dist\n",
      "creating 'dist/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n",
      "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n",
      "Processing warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n",
@ -275,7 +227,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
   "id": "ee4ca9e3",
   "metadata": {},
   "outputs": [
@ -293,7 +245,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
   "id": "59255ed8",
   "metadata": {},
   "outputs": [
@ -311,21 +263,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 22,
   "id": "1dae09b9",
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import warpctc_pytorch as wp\n",
    "import paddle.nn as pn\n",
    "import paddle"
@ -333,7 +278,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
   "id": "83d0762e",
   "metadata": {},
   "outputs": [
@ -343,7 +288,7 @@
       "'1.10.0+cu102'"
      ]
     },
-     "execution_count": 16,
+     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -354,17 +299,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
   "id": "62501e2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "'2.2.0'"
+       "'2.2.1'"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -375,7 +320,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
   "id": "9e8e0f40",
   "metadata": {},
   "outputs": [
@ -392,6 +337,7 @@
    }
   ],
   "source": [
    "# warpctc_pytorch CTCLoss\n",
    "probs = torch.FloatTensor([[\n",
    "        [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n",
    "    ]]).transpose(0, 1).contiguous()\n",
@ -412,7 +358,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
   "id": "2cd46569",
   "metadata": {},
   "outputs": [
@ -428,6 +374,7 @@
    }
   ],
   "source": [
    "# pytorch CTCLoss\n",
    "probs = torch.FloatTensor([[\n",
    "        [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n",
    "    ]]).transpose(0, 1).contiguous()\n",
@ -449,7 +396,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 27,
   "id": "85c3461a",
   "metadata": {},
   "outputs": [
@ -467,6 +414,7 @@
    }
   ],
   "source": [
    "# Paddle CTCLoss\n",
    "paddle.set_device('cpu')\n",
    "probs = paddle.to_tensor([[\n",
    "        [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1],\n",
@ -490,7 +438,55 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "d390cd91",
+   "id": "8cdf76c2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2c305eaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 1, 5])\n",
      "2.4628584384918213\n",
      "[[[ 0.17703117 -0.7081247   0.17703117  0.17703117  0.17703117]]\n",
      "\n",
      " [[ 0.17703117  0.17703117 -0.7081247   0.17703117  0.17703117]]]\n"
     ]
    }
   ],
   "source": [
    "# warpctc_pytorch CTCLoss, log_softmax idempotent\n",
    "probs = torch.FloatTensor([[\n",
    "        [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n",
    "    ]]).transpose(0, 1).contiguous()\n",
    "print(probs.size())\n",
    "labels = torch.IntTensor([1, 2])\n",
    "label_sizes = torch.IntTensor([2])\n",
    "probs_sizes = torch.IntTensor([2])\n",
    "probs.requires_grad_(True)\n",
    "bs = probs.size(1)\n",
    "\n",
    "ctc_loss = wp.CTCLoss(size_average=False, length_average=False)\n",
    "\n",
    "log_probs = torch.log_softmax(probs, axis=-1)\n",
    "cost = ctc_loss(log_probs, labels, probs_sizes, label_sizes)\n",
    "cost = cost.sum() / bs\n",
    "print(cost.item())\n",
    "cost.backward()\n",
    "print(probs.grad.numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "443336f0",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/examples/ami/sd0/local/ami_prepare.py
+++ b/examples/ami/sd0/local/ami_prepare.py
@ -22,19 +22,17 @@ Authors
 * qingenz123@126.com (Qingen ZHAO) 2022
 """
 import os
 import logging
 import argparse
 import xml.etree.ElementTree as et
 import glob
 import json
-from ami_splits import get_AMI_split
+import logging
 import os
 import xml.etree.ElementTree as et
 from distutils.util import strtobool
-from dataio import (
+from ami_splits import get_AMI_split
-    load_pkl,
+from dataio import load_pkl
-    save_pkl, )
+from dataio import save_pkl
 logger = logging.getLogger(__name__)
 SAMPLERATE = 16000
--- a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
+++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
@ -12,28 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Make VoxCeleb1 trial of kaldi format
 this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt 
 to kaldi trial format
 """
 import argparse
 import codecs
 import os
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument("--voxceleb_trial",
+parser.add_argument(
-                    default="voxceleb1_test_v2",
+    "--voxceleb_trial",
-                    type=str,
+    default="voxceleb1_test_v2",
-                    help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt")
+    type=str,
-parser.add_argument("--trial",
+    help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt"
-                    default="data/test/trial",
+)
-                    type=str,
+parser.add_argument(
-                    help="Kaldi format trial file")
+    "--trial",
    default="data/test/trial",
    type=str,
    help="Kaldi format trial file")
 args = parser.parse_args()
 def main(voxceleb_trial, trial):
    """
        VoxCeleb provide several trial file, which format is different with kaldi format.
@ -58,7 +60,9 @@ def main(voxceleb_trial, trial):
    """
    print("Start convert the voxceleb trial to kaldi format")
    if not os.path.exists(voxceleb_trial):
-        raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial))
+        raise RuntimeError(
            "{} does not exist. Pleas input the correct file path".format(
                voxceleb_trial))
    trial_dirname = os.path.dirname(trial)
    if not os.path.exists(trial_dirname):
@ -66,9 +70,9 @@ def main(voxceleb_trial, trial):
    with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
         codecs.open(trial, 'w', encoding='utf-8') as w:
-         for line in f:
+        for line in f:
            target_or_nontarget, path1, path2 = line.strip().split()
-             
+
            utt_id1 = "-".join(path1.split("/"))
            utt_id2 = "-".join(path2.split("/"))
            target = "nontarget"
@ -77,5 +81,6 @@ def main(voxceleb_trial, trial):
            w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
    print("Convert the voxceleb trial to kaldi format successfully")
 if __name__ == "__main__":
    main(args.voxceleb_trial, args.trial)
--- a/paddlespeech/init.py
+++ b/paddlespeech/init.py
@ -11,14 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@ -413,7 +413,8 @@ class ASRExecutor(BaseExecutor):
    def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
        self.sample_rate = sample_rate
        if self.sample_rate != 16000 and self.sample_rate != 8000:
-            logger.error("invalid sample rate, please input --sr 8000 or --sr 16000")
+            logger.error(
                "invalid sample rate, please input --sr 8000 or --sr 16000")
            return False
        if isinstance(audio_file, (str, os.PathLike)):
--- a/paddlespeech/s2t/io/utility.py
+++ b/paddlespeech/s2t/io/utility.py
@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 from io import BytesIO
 from typing import List
 import numpy as np
--- a/paddlespeech/server/bin/main.py
+++ b/paddlespeech/server/bin/main.py
@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import uvicorn
 import yaml
 from fastapi import FastAPI
 from paddlespeech.server.engine.engine_pool import init_engine_pool
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@ -48,8 +48,9 @@ class TTSClientExecutor(BaseExecutor):
        self.parser.add_argument(
            '--input',
            type=str,
-            default="你好，欢迎使用语音合成服务",
+            default=None,
-            help='A sentence to be synthesized.')
+            help='Text to be synthesized.',
            required=True)
        self.parser.add_argument(
            '--spk_id', type=int, default=0, help='Speaker id')
        self.parser.add_argument(
@ -123,7 +124,7 @@ class TTSClientExecutor(BaseExecutor):
            logger.info("RTF: %f " % (time_consume / duration))
            return True
-        except:
+        except BaseException:
            logger.error("Failed to synthesized audio.")
            return False
@ -163,7 +164,7 @@ class TTSClientExecutor(BaseExecutor):
            print("Audio duration: %f s." % (duration))
            print("Response time: %f s." % (time_consume))
            print("RTF: %f " % (time_consume / duration))
-        except:
+        except BaseException:
            print("Failed to synthesized audio.")
@ -181,8 +182,9 @@ class ASRClientExecutor(BaseExecutor):
        self.parser.add_argument(
            '--input',
            type=str,
-            default="./paddlespeech/server/tests/16_audio.wav",
+            default=None,
-            help='Audio file to be recognized')
+            help='Audio file to be recognized',
            required=True)
        self.parser.add_argument(
            '--sample_rate', type=int, default=16000, help='audio sample rate')
        self.parser.add_argument(
@ -209,7 +211,7 @@ class ASRClientExecutor(BaseExecutor):
            logger.info(r.json())
            logger.info("time cost %f s." % (time_end - time_start))
            return True
-        except:
+        except BaseException:
            logger.error("Failed to speech recognition.")
            return False
@ -240,5 +242,5 @@ class ASRClientExecutor(BaseExecutor):
            time_end = time.time()
            print(r.json())
            print("time cost %f s." % (time_end - time_start))
-        except:
+        except BaseException:
-            print("Failed to speech recognition.")
+            print("Failed to speech recognition.")
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@ -41,7 +41,8 @@ class ServerExecutor(BaseExecutor):
            "--config_file",
            action="store",
            help="yaml file of the app",
-            default="./conf/application.yaml")
+            default=None,
            required=True)
        self.parser.add_argument(
            "--log_file",
--- a/paddlespeech/server/conf/asr/asr.yaml
+++ b/paddlespeech/server/conf/asr/asr.yaml
@ -5,4 +5,4 @@ cfg_path: # [optional]
 ckpt_path: # [optional]
 decode_method: 'attention_rescoring'
 force_yes: True
-device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+device: 'cpu'  # set 'gpu:id' or 'cpu'
--- a/paddlespeech/server/conf/asr/asr_pd.yaml
+++ b/paddlespeech/server/conf/asr/asr_pd.yaml
@ -15,7 +15,7 @@ decode_method:
 force_yes: True
 am_predictor_conf:
-  device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+  device: 'cpu'  # set 'gpu:id' or 'cpu'
  enable_mkldnn: True
  switch_ir_optim: True
--- a/paddlespeech/server/conf/tts/tts.yaml
+++ b/paddlespeech/server/conf/tts/tts.yaml
@ -29,4 +29,4 @@ voc_stat:
 #                            OTHERS                              #
 ##################################################################
 lang: 'zh'
-device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+device: 'cpu'  # set 'gpu:id' or 'cpu'
--- a/paddlespeech/server/conf/tts/tts_pd.yaml
+++ b/paddlespeech/server/conf/tts/tts_pd.yaml
@ -15,7 +15,7 @@ speaker_dict:
 spk_id: 0
 am_predictor_conf:
-  device: 'gpu:3'  # set 'gpu:id' or 'cpu'
+  device: 'cpu'  # set 'gpu:id' or 'cpu'
  enable_mkldnn: False
  switch_ir_optim: False
@ -30,7 +30,7 @@ voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams)
 voc_sample_rate: 24000    #must match the model
 voc_predictor_conf:
-  device: 'gpu:3'   # set 'gpu:id' or 'cpu'
+  device: 'cpu'   # set 'gpu:id' or 'cpu'
  enable_mkldnn: False
  switch_ir_optim: False  
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@ -13,31 +13,24 @@
 # limitations under the License.
 import io
 import os
 from typing import List
 from typing import Optional
 from typing import Union
 import librosa
 import paddle
 import soundfile
 from yacs.config import CfgNode
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.utility import UpdateConfig
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.config import get_config
 from paddlespeech.server.utils.paddle_predictor import init_predictor
 from paddlespeech.server.utils.paddle_predictor import run_model
 from paddlespeech.server.engine.base_engine import BaseEngine
 __all__ = ['ASREngine']
 pretrained_models = {
    "deepspeech2offline_aishell-zh-16k": {
        'url':
@ -143,7 +136,6 @@ class ASRServerExecutor(ASRExecutor):
            batch_average=True,  # sum / batch_size
            grad_norm_type=self.config.get('ctc_grad_norm_type', None))
    @paddle.no_grad()
    def infer(self, model_type: str):
        """
@ -161,9 +153,8 @@ class ASRServerExecutor(ASRExecutor):
                cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
                cfg.num_proc_bsearch)
-            output_data = run_model(
+            output_data = run_model(self.am_predictor,
-                                self.am_predictor,
+                                    [audio.numpy(), audio_len.numpy()])
                                [audio.numpy(), audio_len.numpy()])
            probs = output_data[0]
            eouts_len = output_data[1]
@ -208,14 +199,14 @@ class ASREngine(BaseEngine):
        paddle.set_device(paddle.get_device())
        self.executor._init_from_path(
-                    model_type=self.config.model_type,
+            model_type=self.config.model_type,
-                    am_model=self.config.am_model,
+            am_model=self.config.am_model,
-                    am_params=self.config.am_params,
+            am_params=self.config.am_params,
-                    lang=self.config.lang,
+            lang=self.config.lang,
-                    sample_rate=self.config.sample_rate,
+            sample_rate=self.config.sample_rate,
-                    cfg_path=self.config.cfg_path,
+            cfg_path=self.config.cfg_path,
-                    decode_method=self.config.decode_method,
+            decode_method=self.config.decode_method,
-                    am_predictor_conf=self.config.am_predictor_conf)
+            am_predictor_conf=self.config.am_predictor_conf)
        logger.info("Initialize ASR server engine successfully.")
        return True
@ -230,7 +221,8 @@ class ASREngine(BaseEngine):
                io.BytesIO(audio_data), self.config.sample_rate,
                self.config.force_yes):
            logger.info("start running asr engine")
-            self.executor.preprocess(self.config.model_type, io.BytesIO(audio_data))
+            self.executor.preprocess(self.config.model_type,
                                     io.BytesIO(audio_data))
            self.executor.infer(self.config.model_type)
            self.output = self.executor.postprocess()  # Retrieve result of asr.
            logger.info("end inferring asr engine")
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@ -53,7 +53,10 @@ class ASREngine(BaseEngine):
        self.executor = ASRServerExecutor()
        self.config = get_config(config_file)
-        paddle.set_device(self.config.device)
+        if self.config.device is None:
            paddle.set_device(paddle.get_device())
        else:
            paddle.set_device(self.config.device)
        self.executor._init_from_path(
            self.config.model, self.config.lang, self.config.sample_rate,
            self.config.cfg_path, self.config.decode_method,
--- a/paddlespeech/server/engine/base_engine.py
+++ b/paddlespeech/server/engine/base_engine.py
@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 from typing import Any
 from typing import List
 from typing import Union
 from pattern_singleton import Singleton
--- a/paddlespeech/server/engine/engine_factory.py
+++ b/paddlespeech/server/engine/engine_factory.py
@ -13,7 +13,6 @@
 # limitations under the License.
 from typing import Text
 __all__ = ['EngineFactory']
--- a/paddlespeech/server/engine/engine_pool.py
+++ b/paddlespeech/server/engine/engine_pool.py
@ -29,8 +29,10 @@ def init_engine_pool(config) -> bool:
    """
    global ENGINE_POOL
    for engine in config.engine_backend:
-        ENGINE_POOL[engine] = EngineFactory.get_engine(engine_name=engine, engine_type=config.engine_type[engine])
+        ENGINE_POOL[engine] = EngineFactory.get_engine(
-        if not ENGINE_POOL[engine].init(config_file=config.engine_backend[engine]):
+            engine_name=engine, engine_type=config.engine_type[engine])
        if not ENGINE_POOL[engine].init(
                config_file=config.engine_backend[engine]):
            return False
    return True
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@ -360,8 +360,8 @@ class TTSEngine(BaseEngine):
                am_predictor_conf=self.config.am_predictor_conf,
                voc_predictor_conf=self.config.voc_predictor_conf, )
-        except:
+        except BaseException:
-            logger.info("Initialize TTS server engine Failed.")
+            logger.error("Initialize TTS server engine Failed.")
            return False
        logger.info("Initialize TTS server engine successfully.")
@ -405,11 +405,13 @@ class TTSEngine(BaseEngine):
        # transform speed
        try:  # windows not support soxbindings
            wav_speed = change_speed(wav_vol, speed, target_fs)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(
                ErrorCode.SERVER_INTERNAL_ERR,
                "Transform speed failed. Can not install soxbindings on your system. \
                 You need to set speed value 1.0.")
        except BaseException:
            logger.error("Transform speed failed.")
        # wav to base64
        buf = io.BytesIO()
@ -462,9 +464,11 @@ class TTSEngine(BaseEngine):
        try:
            self.executor.infer(
                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")
        except BaseException:
            logger.error("tts infer failed.")
        try:
            target_sample_rate, wav_base64 = self.postprocess(
@ -474,8 +478,10 @@ class TTSEngine(BaseEngine):
                volume=volume,
                speed=speed,
                audio_path=save_path)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts postprocess failed.")
        except BaseException:
            logger.error("tts postprocess failed.")
        return lang, target_sample_rate, wav_base64
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@ -54,7 +54,10 @@ class TTSEngine(BaseEngine):
        try:
            self.config = get_config(config_file)
-            paddle.set_device(self.config.device)
+            if self.config.device is None:
                paddle.set_device(paddle.get_device())
            else:
                paddle.set_device(self.config.device)
            self.executor._init_from_path(
                am=self.config.am,
@ -69,8 +72,8 @@ class TTSEngine(BaseEngine):
                voc_ckpt=self.config.voc_ckpt,
                voc_stat=self.config.voc_stat,
                lang=self.config.lang)
-        except:
+        except BaseException:
-            logger.info("Initialize TTS server engine Failed.")
+            logger.error("Initialize TTS server engine Failed.")
            return False
        logger.info("Initialize TTS server engine successfully.")
@ -114,10 +117,13 @@ class TTSEngine(BaseEngine):
        # transform speed
        try:  # windows not support soxbindings
            wav_speed = change_speed(wav_vol, speed, target_fs)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(
                ErrorCode.SERVER_INTERNAL_ERR,
-                "Can not install soxbindings on your system.")
+                "Transform speed failed. Can not install soxbindings on your system. \
                 You need to set speed value 1.0.")
        except BaseException:
            logger.error("Transform speed failed.")
        # wav to base64
        buf = io.BytesIO()
@ -170,9 +176,11 @@ class TTSEngine(BaseEngine):
        try:
            self.executor.infer(
                text=sentence, lang=lang, am=self.config.am, spk_id=spk_id)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts infer failed.")
        except BaseException:
            logger.error("tts infer failed.")
        try:
            target_sample_rate, wav_base64 = self.postprocess(
@ -182,8 +190,10 @@ class TTSEngine(BaseEngine):
                volume=volume,
                speed=speed,
                audio_path=save_path)
-        except:
+        except ServerBaseException:
            raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR,
                                      "tts postprocess failed.")
        except BaseException:
            logger.error("tts postprocess failed.")
        return lang, target_sample_rate, wav_base64
--- a/paddlespeech/server/restful/asr_api.py
+++ b/paddlespeech/server/restful/asr_api.py
@ -14,6 +14,7 @@
 import base64
 import traceback
 from typing import Union
 from fastapi import APIRouter
 from paddlespeech.server.engine.engine_pool import get_engine_pool
@ -83,7 +84,7 @@ def asr(request_body: ASRRequest):
    except ServerBaseException as e:
        response = failed_response(e.error_code, e.msg)
-    except:
+    except BaseException:
        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
        traceback.print_exc()
--- a/paddlespeech/server/restful/request.py
+++ b/paddlespeech/server/restful/request.py
@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 from typing import Optional
 from pydantic import BaseModel
--- a/paddlespeech/server/restful/response.py
+++ b/paddlespeech/server/restful/response.py
@ -11,9 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 from typing import Optional
 from pydantic import BaseModel
 __all__ = ['ASRResponse', 'TTSResponse']
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@ -114,7 +114,7 @@ def tts(request_body: TTSRequest):
        }
    except ServerBaseException as e:
        response = failed_response(e.error_code, e.msg)
-    except:
+    except BaseException:
        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
        traceback.print_exc()
--- a/paddlespeech/server/tests/asr/http_client.py
+++ b/paddlespeech/server/tests/asr/http_client.py
@ -10,11 +10,11 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the 
-import requests
+import base64
 import json
 import time
-import base64
+
-import io
+import requests
 def readwav2base64(wav_file):
@ -34,23 +34,23 @@ def main():
    url = "http://127.0.0.1:8090/paddlespeech/asr"
    # start Timestamp
-    time_start=time.time()
+    time_start = time.time()
    test_audio_dir = "./16_audio.wav"
    audio = readwav2base64(test_audio_dir)
    data = {
-            "audio": audio,
+        "audio": audio,
-            "audio_format": "wav",
+        "audio_format": "wav",
-            "sample_rate": 16000,
+        "sample_rate": 16000,
-            "lang": "zh_cn",
+        "lang": "zh_cn",
-            }
+    }
    r = requests.post(url=url, data=json.dumps(data))
    # ending Timestamp
-    time_end=time.time()
+    time_end = time.time()
-    print('time cost',time_end - time_start, 's')
+    print('time cost', time_end - time_start, 's')
    print(r.json())
--- a/paddlespeech/server/tests/tts/test_client.py
+++ b/paddlespeech/server/tests/tts/test_client.py
@ -25,6 +25,7 @@ import soundfile
 from paddlespeech.server.utils.audio_process import wav2pcm
 # Request and response
 def tts_client(args):
    """ Request and response
@ -99,5 +100,5 @@ if __name__ == "__main__":
        print("Inference time: %f" % (time_consume))
        print("The duration of synthesized audio: %f" % (duration))
        print("The RTF is: %f" % (rtf))
-    except:
+    except BaseException:
        print("Failed to synthesized audio.")
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
@ -219,7 +219,7 @@ class ConfigCache:
            try:
                cfg = yaml.load(file, Loader=yaml.FullLoader)
                self._data.update(cfg)
-            except:
+            except BaseException:
                self.flush()
    @property
--- a/paddlespeech/t2s/datasets/dataset.py
+++ b/paddlespeech/t2s/datasets/dataset.py
@ -258,4 +258,4 @@ class ChainDataset(Dataset):
                return dataset[i]
            i -= len(dataset)
-        raise IndexError("dataset index out of range")
+        raise IndexError("dataset index out of range")
--- a/requirements.txt
+++ b/requirements.txt
@ -1,48 +0,0 @@
 ConfigArgParse
 coverage
 editdistance
 g2p_en
 g2pM
 gpustat
 h5py
 inflect
 jieba
 jsonlines
 kaldiio
 librosa
 loguru
 matplotlib
 nara_wpe
 nltk
 paddleaudio
 paddlenlp
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
 phkit
 Pillow
 praatio==5.0.0
 pre-commit
 pybind11
 pypi-kenlm
 pypinyin
 python-dateutil
 pyworld
 resampy==0.2.2
 sacrebleu
 scipy
 sentencepiece~=0.1.96
 snakeviz
 soundfile~=0.10
 sox
 soxbindings
 textgrid
 timer
 tqdm
 typeguard
 unidecode
 visualdl
 webrtcvad
 yacs~=0.1.8
 yq
 zhon
--- a/setup.py
+++ b/setup.py
@ -27,47 +27,53 @@ from setuptools.command.install import install
 HERE = Path(os.path.abspath(os.path.dirname(__file__)))
-VERSION = '0.1.1'
+VERSION = '0.1.2'
 base = [
    "editdistance",
    "g2p_en",
    "g2pM",
    "h5py",
    "inflect",
    "jieba",
    "jsonlines",
    "kaldiio",
    "librosa==0.8.1",
    "loguru",
    "matplotlib",
    "nara_wpe",
    "pandas",
    "paddleaudio",
    "paddlenlp",
    "paddlespeech_feat",
    "praatio==5.0.0",
    "pypinyin",
    "python-dateutil",
    "pyworld",
    "resampy==0.2.2",
    "sacrebleu",
    "scipy",
    "sentencepiece~=0.1.96",
    "soundfile~=0.10",
    "textgrid",
    "timer",
    "tqdm",
    "typeguard",
    "visualdl",
    "webrtcvad",
    "yacs~=0.1.8",
 ]
 server = [
    "fastapi",
    "uvicorn",
    "pattern_singleton",
    "prettytable",
 ]
 requirements = {
-    "install": [
+    "install":
-        "editdistance",
+    base + server,
        "g2p_en",
        "g2pM",
        "h5py",
        "inflect",
        "jieba",
        "jsonlines",
        "kaldiio",
        "librosa",
        "loguru",
        "matplotlib",
        "nara_wpe",
        "pandas",
        "paddleaudio",
        "paddlenlp",
        "paddlespeech_feat",
        "praatio==5.0.0",
        "pypinyin",
        "python-dateutil",
        "pyworld",
        "resampy==0.2.2",
        "sacrebleu",
        "scipy",
        "sentencepiece~=0.1.96",
        "soundfile~=0.10",
        "textgrid",
        "timer",
        "tqdm",
        "typeguard",
        "visualdl",
        "webrtcvad",
        "yacs~=0.1.8",
        # fastapi server
        "fastapi",
        "uvicorn",
        "prettytable"
    ],
    "develop": [
        "ConfigArgParse",
        "coverage",
--- a/utils/DER.py
+++ b/utils/DER.py
@ -23,10 +23,11 @@ Credits
 This code is adapted from https://github.com/nryant/dscore
 """
 import argparse
 from distutils.util import strtobool
 import os
 import re
 import subprocess
 from distutils.util import strtobool
 import numpy as np
 FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")