diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index ead57429..1752d246 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -98,9 +98,16 @@ optional arguments:
 ### Synthesizing
-We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the default neural vocoder.
 Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+When the speaker is `174` (csmsc), csmsc's vocoder works better than aishell3's; we recommend [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip), see `stage 2` of `synthesize_e2e.sh`.
+
+If the speaker is `175` (ljspeech), we **don't** recommend ljspeech's vocoders: they are trained at a 22.05 kHz sample rate, while this acoustic model is trained at 24 kHz. You can use csmsc's vocoder instead, since ljspeech and csmsc are both female speakers.
+
+For speakers in aishell3 and vctk, we recommend aishell3's or vctk's vocoders: ljspeech and csmsc are both female speakers, so their vocoders may not perform well for the male speakers in aishell3 and vctk. You can find each speaker's name and `spk_id` in `dump/speaker_id_map.txt`, check the speakers' information (age / gender / accent / region, etc.) in [this issue](https://github.com/PaddlePaddle/PaddleSpeech/issues/1620), and choose the `spk_id` you want.
+
+
 ```bash
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
diff --git a/examples/zh_en_tts/tts3/local/inference.sh b/examples/zh_en_tts/tts3/local/inference.sh
index 5d3bd09e..16499ed0 100755
--- a/examples/zh_en_tts/tts3/local/inference.sh
+++ b/examples/zh_en_tts/tts3/local/inference.sh
@@ -37,3 +37,18 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=mix \
         --spk_id=174
 fi
+
+# voc: hifigan_csmsc
+# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_mix \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=mix \
+        --spk_id=174
+fi
diff --git a/examples/zh_en_tts/tts3/local/ort_predict.sh b/examples/zh_en_tts/tts3/local/ort_predict.sh
index 86dcd115..d80da9c9 100755
--- a/examples/zh_en_tts/tts3/local/ort_predict.sh
+++ b/examples/zh_en_tts/tts3/local/ort_predict.sh
@@ -18,9 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --device=cpu \
         --cpu_threads=4 \
         --lang=mix \
-        --spk_id=174
-
-
+        --spk_id=174
 fi
@@ -38,6 +36,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --device=cpu \
         --cpu_threads=4 \
         --lang=mix \
-        --spk_id=174
-
+        --spk_id=174
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=fastspeech2_mix \
+        --voc=hifigan_csmsc \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --device=cpu \
+        --cpu_threads=4 \
+        --lang=mix \
+        --spk_id=174
 fi
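As context for the README guidance above, a minimal sketch of fetching the csmsc HiFiGAN checkpoint used by the new `stage 2` and looking up a speaker's `spk_id`; the URL and file names come from the README hunk, while `wget` and the working directory are assumptions about your environment:

```bash
# Assumed to run from examples/zh_en_tts/tts3; URL and archive name
# are taken from the README text above.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip
unzip hifigan_csmsc_ckpt_0.1.1.zip
# Look up speaker names / spk_ids (requires dump/ produced by preprocessing):
grep -w 174 dump/speaker_id_map.txt
```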
diff --git a/examples/zh_en_tts/tts3/local/synthesize.sh b/examples/zh_en_tts/tts3/local/synthesize.sh
index f3a0bf15..5bb94746 100755
--- a/examples/zh_en_tts/tts3/local/synthesize.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize.sh
@@ -20,7 +20,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
         --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
         --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata2.jsonl \
+        --test_metadata=dump/test/norm/metadata.jsonl \
         --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt
@@ -45,6 +45,3 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt
 fi
-
-
-
diff --git a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
index ae14e3cc..f6ee04ae 100755
--- a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
@@ -54,4 +54,29 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --speaker_dict=dump/speaker_id_map.txt \
         --spk_id=174 \
         --inference_dir=${train_output_path}/inference
-  fi
+fi
+
+
+# voc: hifigan_csmsc
+# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "in csmsc's hifigan syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_mix \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=mix \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=174 \
+        --inference_dir=${train_output_path}/inference
+fi
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/run.sh b/examples/zh_en_tts/tts3/run.sh
index 221ed7ee..204042b1 100755
--- a/examples/zh_en_tts/tts3/run.sh
+++ b/examples/zh_en_tts/tts3/run.sh
@@ -7,7 +7,7 @@ gpus=0,1
 stage=0
 stop_stage=100
 
-datasets_root_dir=./datasets/
+datasets_root_dir=~/datasets
 mfa_root_dir=./mfa_results/
 conf_path=conf/default.yaml
 train_output_path=exp/default
@@ -53,11 +53,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_mix
     # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
     ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
-    #./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
-
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
 fi
 
-# inference with onnxruntime, use fastspeech2 + hifigan by default
+# inference with onnxruntime, use fastspeech2 + pwgan by default
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     ./local/ort_predict.sh ${train_output_path}
 fi
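To exercise the new ONNX `stage 2` added in `ort_predict.sh`, the vocoder has to be exported first. A hedged usage sketch, reusing the commented-out `paddle2onnx.sh` line from `run.sh` and its default `train_output_path=exp/default`; how `stage`/`stop_stage` are selected inside the local script is assumed to follow its own top-of-file defaults:

```bash
# Export the csmsc HiFiGAN vocoder to ONNX (the line shown commented out in run.sh):
./local/paddle2onnx.sh exp/default inference inference_onnx hifigan_csmsc
# Run ONNX inference; adjust stage/stop_stage in local/ort_predict.sh to reach stage 2.
./local/ort_predict.sh exp/default
```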
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index 6e926d6e..f4acdc60 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -180,7 +180,6 @@ def process_sentences(config,
             results.append(record)
     results.sort(key=itemgetter("utt_id"))
-        write_metadata_method)
     with jsonlines.open(output_dir / "metadata.jsonl",
                         write_metadata_method) as writer:
         for item in results:
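The removed stray `write_metadata_method)` fragment was a duplicated leftover that made the module fail to parse. After re-running preprocessing, a quick sanity check (a sketch, assuming the `dump/test/norm` layout referenced by `synthesize.sh` above) confirms the regenerated `metadata.jsonl` parses line by line:

```bash
# Count records in the regenerated metadata; fails loudly on malformed jsonl.
python3 -c "
import jsonlines
with jsonlines.open('dump/test/norm/metadata.jsonl') as reader:
    print(sum(1 for _ in reader), 'records parsed')
"
```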