commit
bc2613b772
@ -0,0 +1,26 @@
|
|||||||
|
# Test
|
||||||
|
We trained a Chinese-English mixed FastSpeech2 model. The training code is still being cleaned up, so for now we only show how to use the pretrained model.
|
||||||
|
The sample rate of the synthesized audio is 22050 Hz.
|
||||||
|
|
||||||
|
## Download pretrained models
|
||||||
|
Put pretrained models in a directory named `models`.
|
||||||
|
|
||||||
|
- [fastspeech2_csmscljspeech_add-zhen.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip)
|
||||||
|
- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir models
|
||||||
|
cd models
|
||||||
|
wget https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip
|
||||||
|
unzip fastspeech2_csmscljspeech_add-zhen.zip
|
||||||
|
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip
|
||||||
|
unzip hifigan_ljspeech_ckpt_0.2.0.zip
|
||||||
|
cd ../
|
||||||
|
```
|
||||||
|
|
||||||
|
## Test
|
||||||
|
You can select the speaker by setting `--spk_id` to `0` or `1` in `local/synthesize_e2e.sh`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash test.sh
|
||||||
|
```
|
@ -0,0 +1,31 @@
|
|||||||
|
#!/bin/bash
# End-to-end synthesis: runs synthesize_e2e.py with the `fastspeech2_mix`
# acoustic model and the `hifigan_ljspeech` vocoder on sentences_mix.txt.
#
# Usage: <this script> <model_dir> <output_dir>
#   model_dir   directory containing the unpacked pretrained model folders
#               (fastspeech2_csmscljspeech_add-zhen, hifigan_ljspeech_ckpt_0.2.0)
#   output_dir  base directory; `<output_dir>/test_e2e` is passed as --output_dir
#
# Environment:
#   BIN_DIR     must be set by the caller; synthesize_e2e.py and
#               sentences_mix.txt are looked up in its parent directory.

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <model_dir> <output_dir>" >&2
    exit 1
fi

if [ -z "${BIN_DIR}" ]; then
    echo "$0: BIN_DIR is not set" >&2
    exit 1
fi

model_dir=$1
output=$2

am_name=fastspeech2_csmscljspeech_add-zhen
# No trailing slash here: the paths below add their own separator.
am_model_dir="${model_dir}/${am_name}"
voc_model_dir="${model_dir}/hifigan_ljspeech_ckpt_0.2.0"

stage=1
stop_stage=1

# hifigan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # The two FLAGS_* variables are set only for this python3 invocation.
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_mix \
        --am_config="${am_model_dir}/default.yaml" \
        --am_ckpt="${am_model_dir}/snapshot_iter_94000.pdz" \
        --am_stat="${am_model_dir}/speech_stats.npy" \
        --voc=hifigan_ljspeech \
        --voc_config="${voc_model_dir}/default.yaml" \
        --voc_ckpt="${voc_model_dir}/snapshot_iter_2500000.pdz" \
        --voc_stat="${voc_model_dir}/feats_stats.npy" \
        --lang=mix \
        --text="${BIN_DIR}/../sentences_mix.txt" \
        --output_dir="${output}/test_e2e" \
        --phones_dict="${am_model_dir}/phone_id_map.txt" \
        --speaker_dict="${am_model_dir}/speaker_id_map.txt" \
        --spk_id 0
fi
|
@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Environment setup for the example scripts. It only sets and exports
# variables, so it is meant to be sourced from the example directory:
# MAIN_ROOT is resolved relative to the current working directory.

# Declare and export separately so a realpath failure is not masked by
# `export`; quoting keeps directories containing spaces intact.
MAIN_ROOT="$(realpath "${PWD}/../../../")"
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous PYTHONPATH only when it is non-empty, so no trailing
# ':' (an empty path entry) is left behind.
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

MODEL=fastspeech2
export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
|
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
# Driver script: sources path.sh from the current directory, then runs
# ./local/synthesize_e2e.sh with the configured model and output
# directories when stage 3 falls inside [stage, stop_stage].

set -e
source path.sh

gpus=0,1
stage=3
stop_stage=100

model_dir=models
output_dir=output

# With the options parsed below you can choose the stage range you want to
# run, e.g. `--stage 3 --stop-stage 3`.
# This cannot be mixed with positional arguments (`$1`, `$2`, ...).
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # synthesize_e2e, vocoder is hifigan by default
    # Exit with 1 (not -1, which is outside the valid 0-255 range) to match
    # the failure status used for parse_options.sh above.
    CUDA_VISIBLE_DEVICES="${gpus}" ./local/synthesize_e2e.sh "${model_dir}" "${output_dir}" || exit 1
fi
|
||||||
|
|
Loading…
Reference in new issue