commit 7bbd9097a1
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-paddlespeech_server start --config_file ./conf/application.yaml
+paddlespeech_server start --config_file ./conf/application.yaml &> server.log &
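The server now launches in the background and writes its output to `server.log`. A rough sanity check after running the script (the sleep is arbitrary, and `pgrep` assumes a Linux-like environment):

```bash
# Give the service a moment to initialize, then inspect startup messages.
sleep 5
tail -n 50 server.log
# Confirm the background process is still alive.
pgrep -af paddlespeech_server || echo "server is not running"
```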
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav
+
+# sid extract
+paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task spk --input ./85236145389.wav
+
+# sid score
+paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task score --enroll ./85236145389.wav --test ./123456789.wav
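Here `--task spk` returns a speaker embedding for one utterance, while `--task score` compares an enrollment utterance against a test utterance. A simple sanity check, using only the commands above against the same server, is to score a recording against itself: that should yield a clearly higher score than scoring the two different recordings.

```bash
# Self-comparison: enroll and test on the same file; expect a high similarity score.
paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task score \
    --enroll ./85236145389.wav --test ./85236145389.wav

# Cross-comparison: two different recordings; expect a noticeably lower score.
paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task score \
    --enroll ./85236145389.wav --test ./123456789.wav
```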
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+
+paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input 今天的天气真好啊你下午有空吗我想约你一起去吃饭
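The `text` client sends an unpunctuated Chinese sentence (roughly: "The weather is really nice today. Are you free this afternoon? I'd like to invite you to dinner.") to the text service, which returns the punctuated result. Any unpunctuated Chinese string can be substituted; a shorter sketch with an illustrative input of my own:

```bash
# Same client invocation as above, with a different illustrative sentence.
paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input 我认为跑步最重要的就是给我带来了身体健康
```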
@@ -1,9 +1,8 @@
-export CUDA_VISIBLE_DEVICE=0,1,2,3
-export CUDA_VISIBLE_DEVICE=0,1,2,3
+#export CUDA_VISIBLE_DEVICE=0,1,2,3
 
-# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
+# nohup python3 local/punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
 paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log &
 
-# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
+# nohup python3 local/streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
 paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml &> streaming_asr.log &
 
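Both services now run in the background, each writing to its own log. A quick way to confirm they came up, and to exercise the punctuation service with the same `text` client shown above, is sketched below; the port is illustrative and should be replaced by whatever is set in `conf/punc_application.yaml`.

```bash
# Check that both background services started cleanly.
tail -n 20 punc.log streaming_asr.log

# Query the punctuation server (8190 is an assumed placeholder port,
# not taken from the config file).
paddlespeech_client text --server_ip 127.0.0.1 --port 8190 --input 今天的天气真好啊你下午有空吗我想约你一起去吃饭
```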
@@ -0,0 +1,103 @@
+# This is the parameter configuration file for streaming tts server.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 0.0.0.0
+port: 8192
+
+# The task format in the engin_list is: <speech task>_<engine type>
+# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
+# protocol choices = ['websocket', 'http']
+protocol: 'websocket'
+engine_list: ['tts_online-onnx']
+
+
+#################################################################################
+#                              ENGINE CONFIG                                    #
+#################################################################################
+
+################################### TTS #########################################
+################### speech task: tts; engine_type: online #######################
+tts_online:
+    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # fastspeech2_cnndecoder_csmsc support streaming am infer.
+    am: 'fastspeech2_csmsc'
+    am_config:
+    am_ckpt:
+    am_stat:
+    phones_dict:
+    tones_dict:
+    speaker_dict:
+    spk_id: 0
+
+    # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
+    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
+    voc: 'mb_melgan_csmsc'
+    voc_config:
+    voc_ckpt:
+    voc_stat:
+
+    # others
+    lang: 'zh'
+    device: 'cpu' # set 'gpu:id' or 'cpu'
+    # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
+    # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
+    am_block: 72
+    am_pad: 12
+    # voc_pad and voc_block voc model to streaming voc infer,
+    # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
+    # when voc model is hifigan_csmsc, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
+    voc_block: 36
+    voc_pad: 14
+
+
+
+#################################################################################
+#                              ENGINE CONFIG                                    #
+#################################################################################
+
+################################### TTS #########################################
+################### speech task: tts; engine_type: online-onnx #######################
+tts_online-onnx:
+    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer.
+    am: 'fastspeech2_cnndecoder_csmsc_onnx'
+    # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
+    # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
+    am_ckpt: # list
+    am_stat:
+    phones_dict:
+    tones_dict:
+    speaker_dict:
+    spk_id: 0
+    am_sample_rate: 24000
+    am_sess_conf:
+        device: "cpu" # set 'gpu:id' or 'cpu'
+        use_trt: False
+        cpu_threads: 4
+
+    # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']
+    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference
+    voc: 'hifigan_csmsc_onnx'
+    voc_ckpt:
+    voc_sample_rate: 24000
+    voc_sess_conf:
+        device: "cpu" # set 'gpu:id' or 'cpu'
+        use_trt: False
+        cpu_threads: 4
+
+    # others
+    lang: 'zh'
+    # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
+    # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
+    am_block: 72
+    am_pad: 12
+    # voc_pad and voc_block voc model to streaming voc infer,
+    # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
+    # when voc model is hifigan_csmsc_onnx, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
+    voc_block: 36
+    voc_pad: 14
+    # voc_upsample should be same as n_shift on voc config.
+    voc_upsample: 300
+
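The block/pad comments above define how the streaming engine chunks its work: the vocoder consumes `voc_block` mel frames at a time, with `voc_pad` frames of extra context, and `voc_upsample` (the vocoder's `n_shift`) says how many waveform samples each frame expands to. With the values in this config, each streamed block therefore corresponds to roughly 36 × 300 = 10800 samples, about 0.45 s of audio at 24 kHz. A small sanity computation, assuming one block per response chunk:

```bash
# Chunk-size sanity check for the tts_online-onnx settings above.
voc_block=36        # mel frames per streaming block
voc_upsample=300    # waveform samples per mel frame (n_shift)
voc_sample_rate=24000

samples=$((voc_block * voc_upsample))               # 10800 samples per block
echo "samples per block: ${samples}"
awk -v s="${samples}" -v sr="${voc_sample_rate}" \
    'BEGIN { printf "seconds per block: %.3f\n", s / sr }'   # ~0.450 s
```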
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# http server
+paddlespeech_server start --config_file ./conf/tts_online_application.yaml &> tts.http.log &
+
+
+# websocket server
+paddlespeech_server start --config_file ./conf/tts_online_ws_application.yaml &> tts.ws.log &
+
+
@@ -1,3 +0,0 @@
-#!/bin/bash
-# start server
-paddlespeech_server start --config_file ./conf/tts_online_application.yaml
@@ -1,15 +1,17 @@
 FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
 LABEL maintainer="paddlesl@baidu.com"
 
+RUN apt-get update \
+    && apt-get install libsndfile-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
 RUN pip3 uninstall mccabe -y ; exit 0;
 RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4
 
-RUN cd /home/PaddleSpeech/audio
-RUN python setup.py bdist_wheel
-
-RUN cd /home/PaddleSpeech
+WORKDIR /home/PaddleSpeech/
 RUN python setup.py bdist_wheel
-RUN pip install audio/dist/*.whl dist/*.whl
+RUN pip install dist/*.whl -i https://pypi.tuna.tsinghua.edu.cn/simple
 
-WORKDIR /home/PaddleSpeech/
+CMD ['bash']
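This Dockerfile clones PaddleSpeech, builds its wheel inside the Paddle 2.2.2 base image, and installs it from `dist/`. A typical way to build and enter the resulting image (the tag name below is arbitrary and only illustrative):

```bash
# Build the image from the directory containing the Dockerfile above,
# then start an interactive shell inside it.
docker build -t paddlespeech:server .
docker run -it --rm paddlespeech:server bash
```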
@@ -1,20 +1,3 @@
 # Callcenter 8k sample rate
 
-Data distribution:
-
-```
-676048 utts
-491.4004722221223 h
-4357792.0 text
-2.4633630739178654 text/sec
-2.6167397877068495 sec/utt
-```
-
-train/dev/test partition:
-
-```
-33802 manifest.dev
-67606 manifest.test
-574640 manifest.train
-676048 total
-```
+This recipe only has the model/data config for 8k ASR; users need to prepare the data and generate the manifest metafiles themselves. See the Aishell or Librispeech recipes for reference.
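The manifest metafiles referenced above are the JSON-lines index files the PaddleSpeech ASR recipes consume, one utterance per line; the exact field set is produced by the data-preparation scripts in the Aishell/Librispeech recipes, so the entry below is only an illustrative sketch of the shape (the path, duration, transcript, and key names are made up):

```bash
# Write one illustrative manifest entry; real entries come from the recipe's
# data-preparation stage and may use different field names.
cat > manifest.train.example << 'EOF'
{"utt": "call_00001_0001", "feat": "/data/callcenter/wav/call_00001_0001.wav", "feat_shape": [2.61], "text": "你好请问有什么可以帮您"}
EOF
```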
@@ -0,0 +1,26 @@
+# Test
+We trained a Chinese-English mixed FastSpeech2 model. The training code is still being sorted out, so for now this example only shows how to use the released model.
+The sample rate of the synthesized audio is 22050 Hz.
+
+## Download pretrained models
+Put the pretrained models in a directory named `models`.
+
+- [fastspeech2_csmscljspeech_add-zhen.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip)
+- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)
+
+```bash
+mkdir models
+cd models
+wget https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip
+unzip fastspeech2_csmscljspeech_add-zhen.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip
+unzip hifigan_ljspeech_ckpt_0.2.0.zip
+cd ../
+```
+
+## Test
+You can choose `--spk_id` {0, 1} in `local/synthesize_e2e.sh`.
+
+```bash
+bash test.sh
+```
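`local/synthesize_e2e.sh` below hard-codes `--spk_id 0`; the model was trained on two speakers (presumably the CSMSC and LJSpeech voices, though the id-to-voice mapping is not documented here), so switching voices means editing that flag before rerunning. One way to do that, as a sketch that simply rewrites the flag in place:

```bash
# Switch the synthesis voice from speaker 0 to speaker 1, then rerun the test.
sed -i 's/--spk_id 0/--spk_id 1/' local/synthesize_e2e.sh
bash test.sh
```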
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+model_dir=$1
+output=$2
+am_name=fastspeech2_csmscljspeech_add-zhen
+am_model_dir=${model_dir}/${am_name}/
+
+stage=1
+stop_stage=1
+
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_mix \
+        --am_config=${am_model_dir}/default.yaml \
+        --am_ckpt=${am_model_dir}/snapshot_iter_94000.pdz \
+        --am_stat=${am_model_dir}/speech_stats.npy \
+        --voc=hifigan_ljspeech \
+        --voc_config=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
+        --lang=mix \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --output_dir=${output}/test_e2e \
+        --phones_dict=${am_model_dir}/phone_id_map.txt \
+        --speaker_dict=${am_model_dir}/speaker_id_map.txt \
+        --spk_id 0
+fi
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
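`path.sh` is sourced by the run script below: it resolves `MAIN_ROOT` to the repository root three levels up and points `BIN_DIR` at the fastspeech2 experiment entry points. A quick check that the paths resolve as intended, run from the example directory:

```bash
# Source the environment and print the resolved locations.
source path.sh
echo "MAIN_ROOT = ${MAIN_ROOT}"
echo "BIN_DIR   = ${BIN_DIR}"
ls "${BIN_DIR}/../synthesize_e2e.py"   # the script local/synthesize_e2e.sh invokes
```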
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=3
+stop_stage=100
+
+model_dir=models
+output_dir=output
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is hifigan by default
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${model_dir} ${output_dir} || exit -1
+fi
+
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -1,27 +1,26 @@
-* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
+# python_kaldi_features
+
+[python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
 commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
 ref: https://zhuanlan.zhihu.com/p/55371926
 license: MIT
 
-* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
-commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
-license: MIT
-
-* [zhon](https://github.com/tsroten/zhon)
-commit: 09bf543696277f71de502506984661a60d24494c
-license: MIT
-
-* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
-commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
-license: MIT
-
-* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
-commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
-license: MIT
-
-* [phkit](https://github.com/KuangDD/phkit.git)
-commit: b2100293c1e36da531d7f30bd52c9b955a649522
-license: None
-
-* [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
-license: MIT
+# Install ctc_decoder for Windows
+
+`install_win_ctc.bat` is a batch script that installs paddlespeech_ctc_decoders on Windows.
+
+## Prepare your environment
+
+Make sure your environment provides:
+
+* gcc: version >= 12.1.0
+* cmake: version >= 3.24.0
+* make: version >= 3.82.90
+* visual studio: version >= 2019
+
+## Start your bat script
+
+```shell
+start install_win_ctc.bat
+
+```
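Before running the bat script, it is worth confirming that the toolchain versions listed above are the ones actually on `PATH` (for example from a Git Bash or MSYS shell on the same machine); a simple check:

```bash
# Print the toolchain versions so they can be compared against the
# requirements listed above (gcc, cmake, make, plus Visual Studio).
gcc --version | head -n 1
cmake --version | head -n 1
make --version | head -n 1
python --version
```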
@@ -0,0 +1,21 @@
+@echo off
+
+cd ctc_decoders
+if not exist kenlm (
+    git clone https://github.com/Doubledongli/kenlm.git
+    @echo.
+)
+
+if not exist openfst-1.6.3 (
+    echo "Download and extract openfst ..."
+    git clone https://gitee.com/koala999/openfst.git
+    ren openfst openfst-1.6.3
+    @echo.
+)
+
+if not exist ThreadPool (
+    git clone https://github.com/progschj/ThreadPool.git
+    @echo.
+)
+echo "Install decoders ..."
+python setup.py install --num_processes 4