commit 7bbd9097a1
@@ -1,3 +1,3 @@
#!/bin/bash

paddlespeech_server start --config_file ./conf/application.yaml
paddlespeech_server start --config_file ./conf/application.yaml &> server.log &
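With this change the server runs as a background job and its output goes to `server.log`. A minimal sketch for checking that it actually came up (the 5-second wait is an arbitrary choice):

```bash
# give the server a few seconds to start, then inspect the process and its log
sleep 5
pgrep -af paddlespeech_server || echo "server is not running, check server.log"
tail -n 20 server.log
```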
@@ -0,0 +1,10 @@
#!/bin/bash

wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav

# sid extract
paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task spk --input ./85236145389.wav

# sid score
paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task score --enroll ./85236145389.wav --test ./123456789.wav
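`--task spk` extracts a speaker embedding from one utterance, and `--task score` compares an enrollment wav against a test wav. A hedged sketch for scoring one test utterance against several enrolled speakers; the `./enroll/` directory is an assumption used only for illustration:

```bash
# hypothetical layout: one enrollment wav per speaker under ./enroll/
for spk_wav in ./enroll/*.wav; do
    echo "scoring ./123456789.wav against ${spk_wav}"
    paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 \
        --task score --enroll "${spk_wav}" --test ./123456789.wav
done
```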
@@ -0,0 +1,4 @@
#!/bin/bash

paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input 今天的天气真好啊你下午有空吗我想约你一起去吃饭
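The `text` task adds punctuation to raw Chinese text. A small sketch for pushing several sentences through the same server; the `sentences.txt` file (one unpunctuated sentence per line) is an assumption for illustration:

```bash
# punctuate every line of a plain-text file via the running text server
while IFS= read -r line; do
    paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input "${line}"
done < sentences.txt
```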
@@ -1,9 +1,8 @@
export CUDA_VISIBLE_DEVICE=0,1,2,3
export CUDA_VISIBLE_DEVICE=0,1,2,3
#export CUDA_VISIBLE_DEVICE=0,1,2,3

# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
# nohup python3 local/punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log &

# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
# nohup python3 local/streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml &> streaming_asr.log &
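Both servers are detached background jobs, so they keep running after this script exits. A hedged sketch for shutting them down again when finished:

```bash
# stop the punctuation and streaming ASR servers started above
pkill -f "paddlespeech_server start --config_file conf/punc_application.yaml"
pkill -f "paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml"
```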
@@ -0,0 +1,103 @@
# This is the parameter configuration file for the streaming tts server.

#################################################################################
#                             SERVER SETTING                                    #
#################################################################################
host: 0.0.0.0
port: 8192

# The task format in the engine_list is: <speech task>_<engine type>
# engine_list choices = ['tts_online', 'tts_online-onnx']; the inference speed of tts_online-onnx is faster than tts_online.
# protocol choices = ['websocket', 'http']
protocol: 'websocket'
engine_list: ['tts_online-onnx']


#################################################################################
#                              ENGINE CONFIG                                    #
#################################################################################

################################### TTS #########################################
################### speech task: tts; engine_type: online #######################
tts_online:
    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
    am: 'fastspeech2_csmsc'
    am_config:
    am_ckpt:
    am_stat:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0

    # voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference.
    voc: 'mb_melgan_csmsc'
    voc_config:
    voc_ckpt:
    voc_stat:

    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc model for streaming am inference;
    # with am_pad set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio.
    am_block: 72
    am_pad: 12
    # voc_block and voc_pad control streaming voc inference;
    # with mb_melgan_csmsc, voc_pad=14 makes the streaming audio identical to the non-streaming audio; pad can be reduced to 7 and the streaming audio still sounds normal.
    # with hifigan_csmsc, voc_pad=19 makes the streaming audio identical to the non-streaming audio; with voc_pad=14 the streaming audio still sounds normal.
    voc_block: 36
    voc_pad: 14


#################################################################################
#                              ENGINE CONFIG                                    #
#################################################################################

################################### TTS #########################################
################ speech task: tts; engine_type: online-onnx #####################
tts_online-onnx:
    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
    am: 'fastspeech2_cnndecoder_csmsc_onnx'
    # am_ckpt is a list; if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
    # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model].
    am_ckpt: # list
    am_stat:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    am_sample_rate: 24000
    am_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'
        use_trt: False
        cpu_threads: 4

    # voc (vocoder) choices=['mb_melgan_csmsc_onnx', 'hifigan_csmsc_onnx']
    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference.
    voc: 'hifigan_csmsc_onnx'
    voc_ckpt:
    voc_sample_rate: 24000
    voc_sess_conf:
        device: "cpu" # set 'gpu:id' or 'cpu'
        use_trt: False
        cpu_threads: 4

    # others
    lang: 'zh'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx model for streaming am inference;
    # with am_pad set to 12, the streaming synthetic audio is identical to the non-streaming synthetic audio.
    am_block: 72
    am_pad: 12
    # voc_block and voc_pad control streaming voc inference;
    # with mb_melgan_csmsc_onnx, voc_pad=14 makes the streaming audio identical to the non-streaming audio; pad can be reduced to 7 and the streaming audio still sounds normal.
    # with hifigan_csmsc_onnx, voc_pad=19 makes the streaming audio identical to the non-streaming audio; with voc_pad=14 the streaming audio still sounds normal.
    voc_block: 36
    voc_pad: 14
    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
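The block and pad values are counted in acoustic-model frames, and `voc_upsample` (equal to `n_shift`) is the number of waveform samples produced per frame, so the rough duration of each streaming chunk follows directly from this config. A back-of-the-envelope sketch that ignores the extra pad frames used as context:

```bash
# with voc_block=36, voc_upsample=300 and voc_sample_rate=24000 from the config above:
# seconds of audio emitted per streaming vocoder chunk ~= voc_block * voc_upsample / voc_sample_rate
python3 -c "print(36 * 300 / 24000)"   # -> 0.45 s per chunk
```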
@@ -0,0 +1,10 @@
#!/bin/bash

# http server
paddlespeech_server start --config_file ./conf/tts_online_application.yaml &> tts.http.log &

# websocket server
paddlespeech_server start --config_file ./conf/tts_online_ws_application.yaml &> tts.ws.log &
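Both TTS servers start in the background. A minimal check that the websocket one is accepting connections, assuming `./conf/tts_online_ws_application.yaml` keeps `port: 8192` as in the config above (adjust the port if the two config files differ):

```bash
# check that something is listening on the expected websocket port, otherwise look at the log
ss -lnt | grep -q ':8192 ' && echo "tts websocket server is listening on 8192" || tail -n 20 tts.ws.log
```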
@@ -1,3 +0,0 @@
#!/bin/bash
# start server
paddlespeech_server start --config_file ./conf/tts_online_application.yaml
@@ -1,15 +1,17 @@
FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
LABEL maintainer="paddlesl@baidu.com"

RUN apt-get update \
    && apt-get install libsndfile-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
RUN pip3 uninstall mccabe -y ; exit 0;
RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4

RUN cd /home/PaddleSpeech/audio
RUN python setup.py bdist_wheel

RUN cd /home/PaddleSpeech
WORKDIR /home/PaddleSpeech/
RUN python setup.py bdist_wheel
RUN pip install audio/dist/*.whl dist/*.whl
RUN pip install dist/*.whl -i https://pypi.tuna.tsinghua.edu.cn/simple

WORKDIR /home/PaddleSpeech/
CMD ["bash"]
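A hedged sketch of building and entering the image; the `paddlespeech:dev` tag is an arbitrary choice, not part of the commit:

```bash
# build the image from the directory containing this Dockerfile, then open a shell in it
docker build -t paddlespeech:dev .
docker run -it --rm paddlespeech:dev bash
```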
@@ -1,20 +1,3 @@
# Callcenter 8k sample rate

Data distribution:

```
676048 utts
491.4004722221223 h
4357792.0 text
2.4633630739178654 text/sec
2.6167397877068495 sec/utt
```

train/dev/test partition:

```
33802 manifest.dev
67606 manifest.test
574640 manifest.train
676048 total
```

This recipe only provides the model/data configs for 8k ASR; users need to prepare the data and generate the manifest metafiles themselves. See the Aishell or Librispeech recipes for reference.
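Once the manifests have been generated by following the Aishell or Librispeech recipes, a quick sanity check against the partition listed above might look like this; the `data/` location is an assumption and may differ per recipe:

```bash
# line counts should roughly match the train/dev/test partition listed above
wc -l data/manifest.train data/manifest.dev data/manifest.test
```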
@@ -0,0 +1,26 @@
# Test
We trained a Chinese-English mixed fastspeech2 model. The training code is still being sorted out; for now, this shows how to use the released model.
The sample rate of the synthesized audio is 22050 Hz.

## Download pretrained models
Put the pretrained models in a directory named `models`.

- [fastspeech2_csmscljspeech_add-zhen.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip)
- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)

```bash
mkdir models
cd models
wget https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip
unzip fastspeech2_csmscljspeech_add-zhen.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip
unzip hifigan_ljspeech_ckpt_0.2.0.zip
cd ../
```

## Test
You can choose `--spk_id` from {0, 1} in `local/synthesize_e2e.sh`.

```bash
bash test.sh
```
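Per the scripts below, `output_dir=output` and the synthesizer writes to `${output}/test_e2e`, so after the run you can inspect the generated waveforms:

```bash
# list the synthesized wav files (path taken from the run and synthesize_e2e scripts below)
ls output/test_e2e
```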
@@ -0,0 +1,31 @@
#!/bin/bash

model_dir=$1
output=$2
am_name=fastspeech2_csmscljspeech_add-zhen
am_model_dir=${model_dir}/${am_name}/

stage=1
stop_stage=1

# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_mix \
        --am_config=${am_model_dir}/default.yaml \
        --am_ckpt=${am_model_dir}/snapshot_iter_94000.pdz \
        --am_stat=${am_model_dir}/speech_stats.npy \
        --voc=hifigan_ljspeech \
        --voc_config=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/default.yaml \
        --voc_ckpt=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=${model_dir}/hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
        --lang=mix \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=${output}/test_e2e \
        --phones_dict=${am_model_dir}/phone_id_map.txt \
        --speaker_dict=${am_model_dir}/speaker_id_map.txt \
        --spk_id 0
fi
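The script takes the model directory and output directory as positional arguments and relies on `${BIN_DIR}` exported by `path.sh`. A hedged sketch of calling it by hand from the example directory (normally the run script does this for you):

```bash
# make ${BIN_DIR} available, then call the script with model_dir and output_dir
source path.sh
./local/synthesize_e2e.sh models output
```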
@@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
@@ -0,0 +1,23 @@
#!/bin/bash

set -e
source path.sh

gpus=0,1
stage=3
stop_stage=100

model_dir=models
output_dir=output

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be combined with passing positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is hifigan by default
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${model_dir} ${output_dir} || exit -1
fi
@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -1,27 +1,26 @@
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
# python_kaldi_features

[python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
ref: https://zhuanlan.zhihu.com/p/55371926
license: MIT

* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
license: MIT
# Install ctc_decoder for Windows

* [zhon](https://github.com/tsroten/zhon)
commit: 09bf543696277f71de502506984661a60d24494c
license: MIT
`install_win_ctc.bat` is a bat script for installing paddlespeech_ctc_decoders on Windows.

* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
license: MIT
## Prepare your environment

* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
license: MIT
Make sure your environment meets the following requirements:

* [phkit](https://github.com/KuangDD/phkit.git)
commit: b2100293c1e36da531d7f30bd52c9b955a649522
license: None
* gcc: version >= 12.1.0
* cmake: version >= 3.24.0
* make: version >= 3.82.90
* visual studio: version >= 2019

* [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
license: MIT
## Start your bat script

```shell
start install_win_ctc.bat
```
@@ -0,0 +1,21 @@
@echo off

cd ctc_decoders
if not exist kenlm (
    git clone https://github.com/Doubledongli/kenlm.git
    @echo.
)

if not exist openfst-1.6.3 (
    echo "Download and extract openfst ..."
    git clone https://gitee.com/koala999/openfst.git
    ren openfst openfst-1.6.3
    @echo.
)

if not exist ThreadPool (
    git clone https://github.com/progschj/ThreadPool.git
    @echo.
)
echo "Install decoders ..."
python setup.py install --num_processes 4