Merge pull request #3305 from zh794390558/tts

[t2s] add assets and tts codeswitch scripts
pull/3323/head
Hui Zhang 1 year ago committed by GitHub
commit 8aa9790c75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```
-->

@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=0 \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
fi

@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
unzip pwg_aishell3_ckpt_0.5.zip
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
--speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \

@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_canton \
--voc=pwgan_aishell3 \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_canton \
--voc=mb_melgan_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--am=fastspeech2_canton \
--voc=wavernn_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc=pwgan_aishell3 \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc=mb_melgan_csmsc \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -9,7 +9,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=jets_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi

@ -17,6 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--inference_dir=${train_output_path}/inference
fi

@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
```bash
source path.sh
@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi

@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference

@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash
source path.sh
@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \

@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
```bash
@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt

@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # 模型参数和优化器状态
└── speech_stats.npy # 训练 fastspeech2 时用于规范化频谱图的统计数据
```
您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../../assets/sentences.txt` 合成句子
```bash
source path.sh
@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=wavernn_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi

@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi

@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--use_rhy=True
@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \

@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=vits_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/lite_predict.py \
--inference_dir=${train_output_path}/pdlite \
--am=vits_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}

@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank} #\
# --inference_dir=${train_output_path}/inference
fi

@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
```

@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
# --inference_dir=${train_output_path}/inference

@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=exp/default/test_e2e \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
```

@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--phones-dict=dump/phone_id_map.txt

@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech\
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt

@ -267,7 +267,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \

@ -271,7 +271,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
--voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \

@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \

@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \

@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \

@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \

@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -0,0 +1,2 @@
data
exp

@ -6,11 +6,11 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2
## Dataset
### Download and Extract
Download all datasets and extract it to `~/datasets`:
- The CSMSC dataset is in the directory `~/datasets/BZNSYP`
- The Ljspeech dataset is in the directory `~/datasets/LJSpeech-1.1`
- The aishell3 dataset is in the directory `~/datasets/data_aishell3`
- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92`
Download all datasets and extract it to `./data`:
- The CSMSC dataset is in the directory `./data/BZNSYP`
- The Ljspeech dataset is in the directory `./data/LJSpeech-1.1`
- The aishell3 dataset is in the directory `./data/data_aishell3`
- The vctk dataset is in the directory `./data/VCTK-Corpus-0.92`
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training.
@ -24,16 +24,16 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
## Get Started
Assume the paths to the datasets are:
- `~/datasets/BZNSYP`
- `~/datasets/LJSpeech-1.1`
- `~/datasets/data_aishell3`
- `~/datasets/VCTK-Corpus-0.92`
- `./data/BZNSYP`
- `./data/LJSpeech-1.1`
- `./data/data_aishell3`
- `./data/VCTK-Corpus-0.92`
Assume the path to the MFA results of the datasets are:
- `./mfa_results/baker_alignment_tone`
- `./mfa_results/ljspeech_alignment`
- `./mfa_results/aishell3_alignment_tone`
- `./mfa_results/vctk_alignment`
- `./data/mfa/baker_alignment_tone`
- `./data/mfa/ljspeech_alignment`
- `./data/mfa/aishell3_alignment_tone`
- `./data/mfa/vctk_alignment`
Run the command below to
1. **source path**.
@ -252,8 +252,10 @@ optional arguments:
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_mix \
--am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
--am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
--am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
--am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
--am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
--am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
--phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
--spk_id=174 \
--voc=pwgan_aishell3 \
--voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
--voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
--spk_id=174 \
--inference_dir=exp/default/inference
```

@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -0,0 +1,16 @@
#!/bin/bash
exp=exp
mfa=$exp/mfa
mkdir -p $mfa
pushd $mfa
wget -c https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz &
wget -c https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz &
wait
popd

@ -0,0 +1,14 @@
#!/bin/bash
exp=exp
pretrain=$exp/pretrain
mkdir -p $pretrain
pushd $pretrain
wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
wait
popd

@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \

@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \

@ -7,8 +7,8 @@ gpus=0,1
stage=0
stop_stage=100
datasets_root_dir=~/datasets
mfa_root_dir=./mfa_results/
datasets_root_dir=./data
mfa_root_dir=./data/mfa
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_99200.pdz

Loading…
Cancel
Save