commit
d70bcb8f7a
@ -0,0 +1 @@
|
|||||||
|
*.wav
|
@ -1,4 +1,10 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
|
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
|
||||||
|
|
||||||
|
# asr
|
||||||
paddlespeech asr --input ./zh.wav
|
paddlespeech asr --input ./zh.wav
|
||||||
|
|
||||||
|
|
||||||
|
# asr + punc
|
||||||
|
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
|
@ -1,7 +1,8 @@
|
|||||||
model: 'conformer_wenetspeech'
|
model: 'conformer_wenetspeech'
|
||||||
lang: 'zh'
|
lang: 'zh'
|
||||||
sample_rate: 16000
|
sample_rate: 16000
|
||||||
cfg_path:
|
cfg_path: # [optional]
|
||||||
ckpt_path:
|
ckpt_path: # [optional]
|
||||||
decode_method: 'attention_rescoring'
|
decode_method: 'attention_rescoring'
|
||||||
force_yes: False
|
force_yes: True
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
||||||
|
@ -0,0 +1,26 @@
|
|||||||
|
# This is the parameter configuration file for ASR server.
|
||||||
|
# These are the static models that support paddle inference.
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# ACOUSTIC MODEL SETTING #
|
||||||
|
# am choices=['deepspeech2offline_aishell'] TODO
|
||||||
|
##################################################################
|
||||||
|
model_type: 'deepspeech2offline_aishell'
|
||||||
|
am_model: # the pdmodel file of am static model [optional]
|
||||||
|
am_params: # the pdiparams file of am static model [optional]
|
||||||
|
lang: 'zh'
|
||||||
|
sample_rate: 16000
|
||||||
|
cfg_path:
|
||||||
|
decode_method:
|
||||||
|
force_yes: True
|
||||||
|
|
||||||
|
am_predictor_conf:
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
||||||
|
switch_ir_optim: True
|
||||||
|
glog_info: False # True -> print glog
|
||||||
|
summary: True # False -> do not show predictor config
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# OTHERS #
|
||||||
|
##################################################################
|
@ -1,3 +1,7 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
# single process
|
||||||
paddlespeech tts --input 今天的天气不错啊
|
paddlespeech tts --input 今天的天气不错啊
|
||||||
|
|
||||||
|
# Batch process
|
||||||
|
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
|
@ -0,0 +1,369 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a1e738e0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 获取测试的 logit 数据"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "29d3368b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"hlens.npy\n",
|
||||||
|
"logits.npy\n",
|
||||||
|
"ys_lens.npy\n",
|
||||||
|
"ys_pad.npy\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!mkdir -p ./test_data\n",
|
||||||
|
"!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n",
|
||||||
|
"!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "240caf1d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import time\n",
|
||||||
|
"\n",
|
||||||
|
"data_dir=\"./test_data\"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "91bad949",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
|
||||||
|
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
|
||||||
|
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
|
||||||
|
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4cef2f15",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 使用 torch 的 ctc loss"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "90612004",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'1.10.1+cu102'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"torch.__version__"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "00799f97",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def torch_ctc_loss(use_cpu):\n",
|
||||||
|
" if use_cpu:\n",
|
||||||
|
" device = torch.device(\"cpu\")\n",
|
||||||
|
" else:\n",
|
||||||
|
" device = torch.device(\"cuda\")\n",
|
||||||
|
"\n",
|
||||||
|
" reduction_type = \"sum\" \n",
|
||||||
|
"\n",
|
||||||
|
" ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n",
|
||||||
|
"\n",
|
||||||
|
" ys_hat = torch.tensor(logits_np, device = device)\n",
|
||||||
|
" ys_pad = torch.tensor(ys_pad_np, device = device)\n",
|
||||||
|
" hlens = torch.tensor(hlens_np, device = device)\n",
|
||||||
|
" ys_lens = torch.tensor(ys_lens_np, device = device)\n",
|
||||||
|
"\n",
|
||||||
|
" ys_hat = ys_hat.transpose(0, 1)\n",
|
||||||
|
" \n",
|
||||||
|
" # 开始计算时间\n",
|
||||||
|
" start_time = time.time()\n",
|
||||||
|
" ys_hat = ys_hat.log_softmax(2)\n",
|
||||||
|
" loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n",
|
||||||
|
" end_time = time.time()\n",
|
||||||
|
" \n",
|
||||||
|
" loss = loss / ys_hat.size(1)\n",
|
||||||
|
" return end_time - start_time, loss.item()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ba47b5a4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 使用 paddle 的 ctc loss"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "6882a06e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'2.2.2'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import paddle\n",
|
||||||
|
"paddle.__version__"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "3cfa3b7c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def paddle_ctc_loss(use_cpu): \n",
|
||||||
|
" import paddle.nn as pn\n",
|
||||||
|
" if use_cpu:\n",
|
||||||
|
" device = \"cpu\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" device = \"gpu\"\n",
|
||||||
|
"\n",
|
||||||
|
" paddle.set_device(device)\n",
|
||||||
|
"\n",
|
||||||
|
" logits = paddle.to_tensor(logits_np)\n",
|
||||||
|
" ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
|
||||||
|
" hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
|
||||||
|
" ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
|
||||||
|
"\n",
|
||||||
|
" logits = logits.transpose([1,0,2])\n",
|
||||||
|
"\n",
|
||||||
|
" ctc_loss = pn.CTCLoss(reduction='sum')\n",
|
||||||
|
" # 开始计算时间\n",
|
||||||
|
" start_time = time.time()\n",
|
||||||
|
" pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n",
|
||||||
|
" end_time = time.time()\n",
|
||||||
|
" \n",
|
||||||
|
" pn_loss = pn_loss / logits.shape[1]\n",
|
||||||
|
" return end_time - start_time, pn_loss.item()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "40413ef9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"CPU, iteration 10\n",
|
||||||
|
"torch_ctc_loss 159.17137145996094\n",
|
||||||
|
"paddle_ctc_loss 159.16574096679688\n",
|
||||||
|
"paddle average time 1.718252992630005\n",
|
||||||
|
"torch average time 0.17536230087280275\n",
|
||||||
|
"paddle time / torch time (cpu) 9.798303193320452\n",
|
||||||
|
"\n",
|
||||||
|
"GPU, iteration 10\n",
|
||||||
|
"torch_ctc_loss 159.172119140625\n",
|
||||||
|
"paddle_ctc_loss 159.17205810546875\n",
|
||||||
|
"paddle average time 0.018606925010681154\n",
|
||||||
|
"torch average time 0.0026710033416748047\n",
|
||||||
|
"paddle time / torch time (gpu) 6.966267963938231\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# 使用 CPU\n",
|
||||||
|
"\n",
|
||||||
|
"iteration = 10\n",
|
||||||
|
"use_cpu = True\n",
|
||||||
|
"torch_total_time = 0\n",
|
||||||
|
"paddle_total_time = 0\n",
|
||||||
|
"for _ in range(iteration):\n",
|
||||||
|
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
|
||||||
|
" torch_total_time += cost_time\n",
|
||||||
|
"for _ in range(iteration):\n",
|
||||||
|
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
|
||||||
|
" paddle_total_time += cost_time\n",
|
||||||
|
"print (\"CPU, iteration\", iteration)\n",
|
||||||
|
"print (\"torch_ctc_loss\", torch_loss)\n",
|
||||||
|
"print (\"paddle_ctc_loss\", paddle_loss)\n",
|
||||||
|
"print (\"paddle average time\", paddle_total_time / iteration)\n",
|
||||||
|
"print (\"torch average time\", torch_total_time / iteration)\n",
|
||||||
|
"print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n",
|
||||||
|
"\n",
|
||||||
|
"print (\"\")\n",
|
||||||
|
"\n",
|
||||||
|
"# 使用 GPU\n",
|
||||||
|
"\n",
|
||||||
|
"use_cpu = False\n",
|
||||||
|
"torch_total_time = 0\n",
|
||||||
|
"paddle_total_time = 0\n",
|
||||||
|
"for _ in range(iteration):\n",
|
||||||
|
" cost_time, torch_loss = torch_ctc_loss(use_cpu)\n",
|
||||||
|
" torch_total_time += cost_time\n",
|
||||||
|
"for _ in range(iteration):\n",
|
||||||
|
" cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n",
|
||||||
|
" paddle_total_time += cost_time\n",
|
||||||
|
"print (\"GPU, iteration\", iteration)\n",
|
||||||
|
"print (\"torch_ctc_loss\", torch_loss)\n",
|
||||||
|
"print (\"paddle_ctc_loss\", paddle_loss)\n",
|
||||||
|
"print (\"paddle average time\", paddle_total_time / iteration)\n",
|
||||||
|
"print (\"torch average time\", torch_total_time / iteration)\n",
|
||||||
|
"print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7cdf8697",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "73fad81d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n",
|
||||||
|
"ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n",
|
||||||
|
"hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n",
|
||||||
|
"ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "2b41e45d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n",
|
||||||
|
"2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n",
|
||||||
|
"2022-02-25 11:34:34.144 | INFO | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n",
|
||||||
|
"loss 159.17205810546875\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n",
|
||||||
|
" format(lhs_dtype, rhs_dtype, lhs_dtype))\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"use_cpu = False\n",
|
||||||
|
"\n",
|
||||||
|
"from paddlespeech.s2t.modules.loss import CTCLoss\n",
|
||||||
|
"\n",
|
||||||
|
"if use_cpu:\n",
|
||||||
|
" device = \"cpu\"\n",
|
||||||
|
"else:\n",
|
||||||
|
" device = \"gpu\"\n",
|
||||||
|
"\n",
|
||||||
|
"paddle.set_device(device)\n",
|
||||||
|
"\n",
|
||||||
|
"blank_id=0\n",
|
||||||
|
"reduction_type='sum'\n",
|
||||||
|
"batch_average= True\n",
|
||||||
|
"grad_norm_type='instance'\n",
|
||||||
|
"\n",
|
||||||
|
"criterion = CTCLoss(\n",
|
||||||
|
" blank=blank_id,\n",
|
||||||
|
" reduction=reduction_type,\n",
|
||||||
|
" batch_average=batch_average,\n",
|
||||||
|
" grad_norm_type=grad_norm_type)\n",
|
||||||
|
"\n",
|
||||||
|
"logits = paddle.to_tensor(logits_np)\n",
|
||||||
|
"ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n",
|
||||||
|
"hlens = paddle.to_tensor(hlens_np, dtype='int64')\n",
|
||||||
|
"ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n",
|
||||||
|
"\n",
|
||||||
|
"pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n",
|
||||||
|
"print(\"loss\", pn_ctc_loss.item())\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "de525d38",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 结论\n",
|
||||||
|
"在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n",
|
||||||
|
"在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.97 倍\n",
|
||||||
|
"\n",
|
||||||
|
"## 其他结论\n",
|
||||||
|
"torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.10"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,110 @@
|
|||||||
|
###########################################################
|
||||||
|
# FEATURE EXTRACTION SETTING #
|
||||||
|
###########################################################
|
||||||
|
|
||||||
|
fs: 24000 # sr
|
||||||
|
n_fft: 2048 # FFT size (samples).
|
||||||
|
n_shift: 300 # Hop size (samples). 12.5ms
|
||||||
|
win_length: 1200 # Window length (samples). 50ms
|
||||||
|
# If set to null, it will be the same as fft_size.
|
||||||
|
window: "hann" # Window function.
|
||||||
|
|
||||||
|
# Only used for feats_type != raw
|
||||||
|
|
||||||
|
fmin: 80 # Minimum frequency of Mel basis.
|
||||||
|
fmax: 7600 # Maximum frequency of Mel basis.
|
||||||
|
n_mels: 80 # The number of mel basis.
|
||||||
|
|
||||||
|
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||||
|
f0min: 80 # Minimum f0 for pitch extraction.
|
||||||
|
f0max: 400 # Maximum f0 for pitch extraction.
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 32
|
||||||
|
num_workers: 4
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# MODEL SETTING #
|
||||||
|
###########################################################
|
||||||
|
model:
|
||||||
|
adim: 384 # attention dimension
|
||||||
|
aheads: 2 # number of attention heads
|
||||||
|
elayers: 4 # number of encoder layers
|
||||||
|
eunits: 1536 # number of encoder ff units
|
||||||
|
dlayers: 4 # number of decoder layers
|
||||||
|
dunits: 1536 # number of decoder ff units
|
||||||
|
positionwise_layer_type: conv1d # type of position-wise layer
|
||||||
|
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||||
|
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||||
|
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||||
|
duration_predictor_kernel_size: 3 # filter size of duration predictor
|
||||||
|
postnet_layers: 5 # number of layers of postnet
|
||||||
|
postnet_filts: 5 # filter size of conv layers in postnet
|
||||||
|
postnet_chans: 256 # number of channels of conv layers in postnet
|
||||||
|
encoder_normalize_before: True # whether to perform layer normalization before the input
|
||||||
|
decoder_normalize_before: True # whether to perform layer normalization before the input
|
||||||
|
reduction_factor: 1 # reduction factor
|
||||||
|
encoder_type: conformer # encoder type
|
||||||
|
decoder_type: conformer # decoder type
|
||||||
|
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
|
||||||
|
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
|
||||||
|
conformer_activation_type: swish # conformer activation type
|
||||||
|
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
|
||||||
|
use_cnn_in_conformer: true # whether to use CNN in conformer
|
||||||
|
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
|
||||||
|
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
|
||||||
|
init_type: xavier_uniform # initialization type
|
||||||
|
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
|
||||||
|
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
|
||||||
|
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
|
||||||
|
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
|
||||||
|
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
|
||||||
|
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
|
||||||
|
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
|
||||||
|
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
|
||||||
|
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
|
||||||
|
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
|
||||||
|
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
|
||||||
|
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
|
||||||
|
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
|
||||||
|
energy_predictor_layers: 2 # number of conv layers in energy predictor
|
||||||
|
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
|
||||||
|
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
|
||||||
|
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
|
||||||
|
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
|
||||||
|
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
|
||||||
|
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
|
||||||
|
spk_embed_dim: 256 # speaker embedding dimension
|
||||||
|
spk_embed_integration_type: concat # speaker embedding integration type
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# UPDATER SETTING #
|
||||||
|
###########################################################
|
||||||
|
updater:
|
||||||
|
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OPTIMIZER SETTING #
|
||||||
|
###########################################################
|
||||||
|
optimizer:
|
||||||
|
optim: adam # optimizer type
|
||||||
|
learning_rate: 0.001 # learning rate
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# TRAINING SETTING #
|
||||||
|
###########################################################
|
||||||
|
max_epoch: 1000
|
||||||
|
num_snapshots: 5
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
seed: 10086
|
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .infer import StatsExecutor
|
@ -0,0 +1,193 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from prettytable import PrettyTable
|
||||||
|
|
||||||
|
from ..log import logger
|
||||||
|
from ..utils import cli_register
|
||||||
|
from ..utils import stats_wrapper
|
||||||
|
|
||||||
|
__all__ = ['StatsExecutor']
|
||||||
|
|
||||||
|
model_name_format = {
|
||||||
|
'asr': 'Model-Language-Sample Rate',
|
||||||
|
'cls': 'Model-Sample Rate',
|
||||||
|
'st': 'Model-Source language-Target language',
|
||||||
|
'text': 'Model-Task-Language',
|
||||||
|
'tts': 'Model-Language'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@cli_register(
    name='paddlespeech.stats',
    description='Get speech tasks support models list.')
class StatsExecutor():
    """Print the pretrained models available for one speech task.

    The task is one of ``task_choices``. For a valid task, the sibling
    ``..<task>.infer`` module is imported and the keys of its
    ``pretrained_models`` mapping are rendered as a table whose column titles
    come from ``model_name_format``.
    """

    def __init__(self):
        super(StatsExecutor, self).__init__()

        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
            choices=self.task_choices,
            help='Choose speech task.',
            required=True)

    def show_support_models(self, pretrained_models: dict):
        """Print one table row per key of ``pretrained_models``.

        Both the column titles (``model_name_format[self.task]``) and every
        key are split on "-" to produce the cells.

        Args:
            pretrained_models (dict): mapping whose keys are "-"-joined model
                descriptors; only the keys are read.
        """
        fields = model_name_format[self.task].split("-")
        table = PrettyTable(fields)
        for key in pretrained_models:
            table.add_row(key.split("-"))
        print(table)

    def _list_models(self, info, error) -> bool:
        """Shared body of ``execute`` and ``__call__``.

        Args:
            info: callable taking one string, used for the list header.
            error: callable taking one string, used for failure messages.

        Returns:
            bool: True when the table was printed, False otherwise.
        """
        import importlib

        if self.task not in self.task_choices:
            error(
                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
            )
            return False

        tag = self.task.upper()
        try:
            # Same target as `from ..<task>.infer import pretrained_models`.
            module = importlib.import_module('..%s.infer' % self.task,
                                             __package__)
            pretrained_models = module.pretrained_models
            info(
                "Here is the list of %s pretrained models released by PaddleSpeech that can be used by command line and python API"
                % tag)
            self.show_support_models(pretrained_models)
            return True
        except Exception:
            # Exception rather than BaseException, so that KeyboardInterrupt
            # and SystemExit are not swallowed and reported as a lookup
            # failure.
            error("Failed to get the list of %s pretrained models." % tag)
            return False

    def execute(self, argv: List[str]) -> bool:
        """
            Command line entry.

        Args:
            argv (List[str]): command line arguments; ``--task`` is required.

        Returns:
            bool: True when the model list was shown, False on failure.
        """
        parser_args = self.parser.parse_args(argv)
        self.task = parser_args.task
        return self._list_models(logger.info, logger.error)

    @stats_wrapper
    def __call__(
            self,
            task: str=None, ):
        """
            Python API to call an executor.

        Messages are written with ``print`` and nothing is returned.

        Args:
            task (str): one of ``task_choices``.
        """
        self.task = task
        self._list_models(print, print)
|
@ -0,0 +1,3 @@
|
|||||||
|
# Download the pickled inputs for the static DeepSpeech2 online model test and
# then run the test. Setup steps abort the script on failure so the test is
# never started without its input data.
mkdir -p ./test_data || exit 1
# -c continues a partial download instead of saving a second ".1" copy on re-runs.
wget -c -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle || exit 1
python deepspeech2_online_model_test.py
|
@ -0,0 +1,114 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
def change_speech_yaml(yaml_name: str, device: str):
    """Change the device setting in a speech task configuration file.

    The file ``./conf/asr/<yaml_name>.yaml`` or ``./conf/tts/<yaml_name>.yaml``
    is parsed, its device entries are updated, and the result is written back
    to the same path. The file is only opened for writing after it has been
    parsed successfully, so a malformed file is left untouched.

    Args:
        yaml_name (str): asr or asr_pd or tts or tts_pd
        device (str): "cpu" sets the device to "cpu"; "gpu" sets it to
            "gpu:0". Any other value prints a hint and leaves the file
            unchanged.

    Raises:
        ValueError: if ``yaml_name`` contains neither "asr" nor "tts", so no
            configuration directory can be chosen.
    """
    if "asr" in yaml_name:
        dirpath = "./conf/asr/"
    elif 'tts' in yaml_name:
        dirpath = "./conf/tts/"
    else:
        raise ValueError(
            "yaml_name must contain 'asr' or 'tts', got %r" % (yaml_name, ))
    yamlfile = dirpath + yaml_name + ".yaml"

    if device == 'cpu':
        target = 'cpu'
    elif device == 'gpu':
        target = 'gpu:0'
    else:
        print("Please set correct device: cpu or gpu.")
        return

    # Parse first; the file is written only after this succeeds.
    with open(yamlfile, encoding="utf-8") as f:
        y = yaml.safe_load(f)

    print("Set device: %s" % (device))
    if yaml_name in ('asr', 'tts'):
        y['device'] = target
    elif yaml_name in ('asr_pd', 'tts_pd'):
        y['am_predictor_conf']['device'] = target
        if yaml_name == 'tts_pd':
            y['voc_predictor_conf']['device'] = target

    print("The content of '%s': " % (yamlfile))
    print(yaml.dump(y, default_flow_style=False, sort_keys=False))
    with open(yamlfile, "w", encoding="utf-8") as fw:
        yaml.dump(y, fw, allow_unicode=True)
    print("Change %s successfully." % (yamlfile))
|
||||||
|
|
||||||
|
|
||||||
|
def change_app_yaml(task: str, engine_type: str):
    """Change the engine type and corresponding configuration file of the speech task in application.yaml

    The file is parsed and updated fully in memory before it is reopened for
    writing, so a parse error, a missing key or an invalid ``engine_type``
    leaves ./conf/application.yaml untouched instead of truncated.

    Args:
        task (str): asr or tts
        engine_type (str): 'python' (backend config "<task>.yaml") or
            'inference' (backend config "<task>_pd.yaml")

    Raises:
        ValueError: if engine_type is neither 'python' nor 'inference'.
    """
    # Suffix of the per-task config file that matches each engine type.
    suffix_by_engine = {'python': ".yaml", 'inference': "_pd.yaml"}
    if engine_type not in suffix_by_engine:
        raise ValueError(
            "Unsupported engine_type '%s', expected 'python' or 'inference'." %
            engine_type)

    yamlfile = "./conf/application.yaml"

    # Read first and close the file before writing anything back.
    with open(yamlfile, encoding="utf-8") as f:
        y = yaml.safe_load(f)

    y['engine_type'][task] = engine_type
    y['engine_backend'][task] = ''.join(
        ["./conf/", task, "/", task, suffix_by_engine[engine_type]])

    print("The content of './conf/application.yaml': ")
    print(yaml.dump(y, default_flow_style=False, sort_keys=False))

    # Only now overwrite the original file with the updated mapping.
    with open(yamlfile, "w", encoding="utf-8") as fw:
        yaml.dump(y, fw, allow_unicode=True)
    print("Change %s successfully." % (yamlfile))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Every accepted value has the form "<scope>-<target>-<option>":
    #   app-<task>-<engine_type>   -> change_app_yaml(task, engine_type)
    #   speech-<yaml_name>-<device> -> change_speech_yaml(yaml_name, device)
    app_choices = [
        'app-%s-%s' % (task, engine)
        for task in ('asr', 'tts') for engine in ('python', 'inference')
    ]
    speech_choices = [
        'speech-%s-%s' % (yaml_name, device)
        for yaml_name in ('asr', 'asr_pd', 'tts', 'tts_pd')
        for device in ('cpu', 'gpu')
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--change_task',
        type=str,
        default=None,
        help='Change task',
        choices=app_choices + speech_choices,
        required=True)
    args = parser.parse_args()

    # The names inside a choice use "_" (e.g. asr_pd), so "-" splits cleanly
    # into exactly three fields.
    scope, target, option = args.change_task.split("-")
    if scope == "app":
        change_app_yaml(target, option)
    elif scope == "speech":
        change_speech_yaml(target, option)
    else:
        print("Error change task, please check change_task.")
|
@ -0,0 +1,27 @@
|
|||||||
|
# This is the parameter configuration file for PaddleSpeech Serving.
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# SERVER SETTING #
|
||||||
|
##################################################################
|
||||||
|
host: 127.0.0.1
|
||||||
|
port: 8090
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# CONFIG FILE #
|
||||||
|
##################################################################
|
||||||
|
# add engine backend type (Options: asr, tts) and config file here.
|
||||||
|
# Adding a speech task to engine_backend means starting the service.
|
||||||
|
engine_backend:
|
||||||
|
asr: 'conf/asr/asr.yaml'
|
||||||
|
tts: 'conf/tts/tts.yaml'
|
||||||
|
|
||||||
|
# The engine_type of speech task needs to keep the same type as the config file of speech task.
|
||||||
|
# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml'
|
||||||
|
# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml'
|
||||||
|
#
|
||||||
|
# add engine type (Options: python, inference)
|
||||||
|
engine_type:
|
||||||
|
asr: 'python'
|
||||||
|
tts: 'python'
|
||||||
|
|
||||||
|
|
@ -0,0 +1,8 @@
|
|||||||
|
model: 'conformer_wenetspeech'
|
||||||
|
lang: 'zh'
|
||||||
|
sample_rate: 16000
|
||||||
|
cfg_path: # [optional]
|
||||||
|
ckpt_path: # [optional]
|
||||||
|
decode_method: 'attention_rescoring'
|
||||||
|
force_yes: True
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
@ -0,0 +1,26 @@
|
|||||||
|
# This is the parameter configuration file for ASR server.
|
||||||
|
# These are the static models that support paddle inference.
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# ACOUSTIC MODEL SETTING #
|
||||||
|
# am choices=['deepspeech2offline_aishell'] TODO
|
||||||
|
##################################################################
|
||||||
|
model_type: 'deepspeech2offline_aishell'
|
||||||
|
am_model: # the pdmodel file of am static model [optional]
|
||||||
|
am_params: # the pdiparams file of am static model [optional]
|
||||||
|
lang: 'zh'
|
||||||
|
sample_rate: 16000
|
||||||
|
cfg_path:
|
||||||
|
decode_method:
|
||||||
|
force_yes: True
|
||||||
|
|
||||||
|
am_predictor_conf:
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
||||||
|
switch_ir_optim: True
|
||||||
|
glog_info: False # True -> print glog
|
||||||
|
summary: True # False -> do not show predictor config
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# OTHERS #
|
||||||
|
##################################################################
|
@ -0,0 +1,32 @@
|
|||||||
|
# This is the parameter configuration file for TTS server.
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# ACOUSTIC MODEL SETTING #
|
||||||
|
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
|
||||||
|
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
|
||||||
|
# 'fastspeech2_vctk']
|
||||||
|
##################################################################
|
||||||
|
am: 'fastspeech2_csmsc'
|
||||||
|
am_config:
|
||||||
|
am_ckpt:
|
||||||
|
am_stat:
|
||||||
|
phones_dict:
|
||||||
|
tones_dict:
|
||||||
|
speaker_dict:
|
||||||
|
spk_id: 0
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# VOCODER SETTING #
|
||||||
|
# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
|
||||||
|
# 'pwgan_vctk', 'mb_melgan_csmsc']
|
||||||
|
##################################################################
|
||||||
|
voc: 'pwgan_csmsc'
|
||||||
|
voc_config:
|
||||||
|
voc_ckpt:
|
||||||
|
voc_stat:
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# OTHERS #
|
||||||
|
##################################################################
|
||||||
|
lang: 'zh'
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
@ -0,0 +1,42 @@
|
|||||||
|
# This is the parameter configuration file for TTS server.
|
||||||
|
# These are the static models that support paddle inference.
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# ACOUSTIC MODEL SETTING #
|
||||||
|
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
|
||||||
|
##################################################################
|
||||||
|
am: 'fastspeech2_csmsc'
|
||||||
|
am_model: # the pdmodel file of your am static model (XX.pdmodel)
|
||||||
|
am_params: # the pdiparams file of your am static model (XX.pdiparams)
|
||||||
|
am_sample_rate: 24000
|
||||||
|
phones_dict:
|
||||||
|
tones_dict:
|
||||||
|
speaker_dict:
|
||||||
|
spk_id: 0
|
||||||
|
|
||||||
|
am_predictor_conf:
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
||||||
|
switch_ir_optim: True
|
||||||
|
glog_info: False # True -> print glog
|
||||||
|
summary: True # False -> do not show predictor config
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# VOCODER SETTING #
|
||||||
|
# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
|
||||||
|
##################################################################
|
||||||
|
voc: 'pwgan_csmsc'
|
||||||
|
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
|
||||||
|
voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
|
||||||
|
voc_sample_rate: 24000
|
||||||
|
|
||||||
|
voc_predictor_conf:
|
||||||
|
device: # set 'gpu:id' or 'cpu'
|
||||||
|
switch_ir_optim: True
|
||||||
|
glog_info: False # True -> print glog
|
||||||
|
summary: True # False -> do not show predictor config
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# OTHERS #
|
||||||
|
##################################################################
|
||||||
|
lang: 'zh'
|
@ -0,0 +1,185 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bash test_server_client.sh
|
||||||
|
|
||||||
|
#######################################
# Start the server in the background and wait until it reports that it is
# running, logs an error, or exits.
# Globals:
#   config_file       (read)    passed to --config_file
#   target_start_num  (read)    number of startup lines expected in
#                               log/server.log.wf once this start succeeds
#   error_time        (updated) number of "error" lines already accounted for
#   start_num         (written) startup lines currently in log/server.log.wf
#   flag              (written) "normal" on success, "unnormal" on failure
# Outputs:
#   Appends server stdout/stderr to log/server.log and log/server.log.wf,
#   writes the server PID to ./pid, appends failures to log/test_result.log.
#######################################
StartService(){
    local server_pid error_num
    # Allow any amount of whitespace after "INFO:" so the match does not
    # depend on how the log prefix is padded.
    local started_pattern='INFO:[[:space:]]+Uvicorn running on http://'

    # Start service
    paddlespeech_server start --config_file "$config_file" 1>>log/server.log 2>>log/server.log.wf &
    server_pid=$!
    echo "$server_pid" > pid

    # The log file may not exist yet right after the fork; count that as 0.
    start_num=$(grep -c -E "$started_pattern" log/server.log.wf 2>/dev/null)
    start_num=${start_num:-0}
    flag="normal"
    while [[ $start_num -lt $target_start_num && $flag == "normal" ]]; do
        # Poll instead of spinning at full speed.
        sleep 0.2
        start_num=$(grep -c -E "$started_pattern" log/server.log.wf 2>/dev/null)
        start_num=${start_num:-0}
        error_num=$(grep -c -i "error" log/server.log.wf 2>/dev/null)
        error_num=${error_num:-0}

        # start service failed
        if [[ $error_num -gt $error_time ]]; then
            echo "Service started failed." | tee -a ./log/test_result.log
            error_time=$error_num
            flag="unnormal"
        elif [[ $start_num -lt $target_start_num ]] && ! kill -0 "$server_pid" 2>/dev/null; then
            # The process is gone without logging a startup line or an error;
            # stop waiting rather than looping forever.
            echo "Service started failed." | tee -a ./log/test_result.log
            flag="unnormal"
        fi
    done
}
|
||||||
|
|
||||||
|
#######################################
# Send two ASR requests and two TTS requests to the running server.
# Globals:
#   server_ip   (read)    value for --server_ip
#   port        (read)    value for --port
#   test_times  (updated) incremented once per request sent, whether or not
#                         the request succeeded
# Outputs:
#   TTS requests write the synthesized audio to ./output.wav.
#######################################
ClientTest(){
    local attempt
    local tts_text="您好,欢迎使用百度飞桨语音合成服务。"

    # test asr client
    for attempt in 1 2; do
        # Quoted so an empty or unusual value cannot shift the argument list.
        paddlespeech_client asr --server_ip "$server_ip" --port "$port" --input ./zh.wav
        ((test_times+=1))
    done

    # test tts client
    for attempt in 1 2; do
        paddlespeech_client tts --server_ip "$server_ip" --port "$port" --input "$tts_text" --output output.wav
        ((test_times+=1))
    done
}
|
||||||
|
|
||||||
|
#######################################
# Compare the number of "200 OK" lines in log/server.log with the number of
# requests sent so far and record the verdict.
# Globals:
#   test_times             (read, then reset to the observed success count)
#   response_success_time  (written) count of "200 OK" lines in log/server.log
# Arguments:
#   $1 - engine type label, reported for both asr and tts
#   $2 - device label
# Outputs:
#   Prints the verdict and appends it to ./log/test_result.log.
#######################################
GetTestResult() {
    local outcome

    # Determine if the test was successful
    response_success_time=$(cat log/server.log | grep "200 OK" -c)
    if (( $response_success_time == $test_times )); then
        outcome="Testing successfully."
    else
        outcome="Testing failed."
    fi

    printf '%s The service configuration is: asr engine type: %s; tts engine type: %s; device: %s.\n' \
        "$outcome" "$1" "$1" "$2" | tee -a ./log/test_result.log

    # Later rounds count requests starting from the successes seen so far.
    test_times=$response_success_time
}
|
||||||
|
|
||||||
|
|
||||||
|
# Start from a clean log directory.
mkdir -p log
rm -rf log/server.log.wf log/server.log log/test_result.log

config_file=./conf/application.yaml
# Read the `host:` and `port:` keys themselves, so a comment or another key
# that merely contains the word cannot be picked up.
server_ip=$(awk '/^[[:space:]]*host:/ {print $2}' "$config_file")
port=$(awk '/^[[:space:]]*port:/ {print $2}' "$config_file")

echo "Sevice ip: $server_ip" | tee ./log/test_result.log
echo "Sevice port: $port" | tee -a ./log/test_result.log

# whether a process is listening on $port
pid=$(lsof -i :"$port" | grep -v "PID" | awk '{print $2}')
if [[ -n "$pid" ]]; then
    echo "The port: $port is occupied, please change another port"
    # Nothing was tested, so report failure to the caller.
    exit 1
fi

# download test audios for ASR client
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

target_start_num=0  # the number of start service
test_times=0  # The number of client test
error_time=0  # The number of error occurrences in the startup failure server.log.wf file
|
||||||
|
|
||||||
|
#######################################
# Run one round: start the server, run the client tests if it came up,
# record the result, then stop the server.
# Globals:
#   target_start_num  (updated) expected cumulative number of successful starts
#   start_num, flag   (read)    set by StartService
# Arguments:
#   $1 - engine type label used in the log messages (python / inference)
#   $2 - device label used in the log messages (gpu / cpu)
# Outputs:
#   Progress and results are appended to ./log/test_result.log.
#######################################
RunTestRound() {
    local engine_type=$1
    local device=$2

    echo "Start the service: asr engine type: $engine_type; tts engine type: $engine_type; device: $device" | tee -a ./log/test_result.log
    ((target_start_num+=1))
    StartService

    if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
        echo "Service started successfully." | tee -a ./log/test_result.log
        ClientTest
        echo "This round of testing is over." | tee -a ./log/test_result.log

        GetTestResult "$engine_type" "$device"
    else
        echo "Service failed to start, no client test."
        # Re-sync the expectation with what the log actually contains.
        target_start_num=$start_num
    fi

    # Stop the server started by StartService (its PID is stored in ./pid).
    kill -9 "$(cat pid)"
    rm -rf pid
    sleep 2s
    echo "**************************************************************************************" | tee -a ./log/test_result.log
}

# start server: asr engine type: python; tts engine type: python; device: gpu
RunTestRound python gpu

# start server: asr engine type: python; tts engine type: python; device: cpu
python change_yaml.py --change_task speech-asr-cpu  # change asr.yaml device: cpu
python change_yaml.py --change_task speech-tts-cpu  # change tts.yaml device: cpu
RunTestRound python cpu

# start server: asr engine type: inference; tts engine type: inference; device: gpu
python change_yaml.py --change_task app-asr-inference  # change application.yaml, asr engine_type: inference; asr engine_backend: asr_pd.yaml
python change_yaml.py --change_task app-tts-inference  # change application.yaml, tts engine_type: inference; tts engine_backend: tts_pd.yaml
RunTestRound inference gpu

# start server: asr engine type: inference; tts engine type: inference; device: cpu
python change_yaml.py --change_task speech-asr_pd-cpu  # change asr_pd.yaml device: cpu
python change_yaml.py --change_task speech-tts_pd-cpu  # change tts_pd.yaml device: cpu
RunTestRound inference cpu
|
||||||
|
|
||||||
|
printf '%s\n' "All tests completed." | tee -a ./log/test_result.log

# show all the test results
printf '%s\n' "***************** Here are all the test results ********************"
cat ./log/test_result.log

# Restore ./conf from demos/speech_server, undoing the edits made by change_yaml.py
cp ../../../demos/speech_server/conf/ ./ -rf
|
Loading…
Reference in new issue