fix preprocess bug, add hifigan_csmsc vocoder, update readme

pull/2234/head
TianYuan 2 years ago
parent 368e3e1b59
commit 8dbefc0165

@ -98,9 +98,16 @@ optional arguments:
### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the default neural vocoder.
Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
When the speaker is `174` (csmsc), csmsc's vocoder works better than aishell3's; we recommend [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip), see `stage 2` of `synthesize_e2e.sh`.
If the speaker is `175` (ljspeech), we do **not** recommend ljspeech's vocoders: they are trained at a 22.05 kHz sample rate, while this acoustic model is trained at 24 kHz. You can use csmsc's vocoder instead, since ljspeech and csmsc are both female speakers.
For speakers from aishell3 and vctk, we recommend the aishell3 or vctk vocoders: ljspeech and csmsc are both female speakers, so their vocoders may not perform well for the male speakers in aishell3 and vctk. You can look up speaker names and `spk_id`s in `dump/speaker_id_map.txt`, check speakers' information (age / gender / accent / region, etc.) in [this issue](https://github.com/PaddlePaddle/PaddleSpeech/issues/1620), and choose the `spk_id` you want.
```bash
unzip pwg_aishell3_ckpt_0.5.zip
```
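If you go with the `hifigan_csmsc` vocoder recommended above for speaker `174`, fetch and unpack it the same way (the URL is the one linked above); you can also list `dump/speaker_id_map.txt` to confirm the `spk_id` you want:
```bash
# Download and unpack the csmsc hifigan vocoder (recommended for spk_id 174).
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip
unzip hifigan_csmsc_ckpt_0.1.1.zip
# Show speaker name / spk_id pairs so you can pick the speaker you want.
cat dump/speaker_id_map.txt
```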

@ -37,3 +37,18 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--lang=mix \
--spk_id=174
fi
# voc: hifigan_csmsc
# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=mix \
--spk_id=174
fi
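This stage assumes a `hifigan_csmsc` static model has already been exported into `${train_output_path}/inference` (stage 2 of `synthesize_e2e.sh` does this via `--inference_dir`). A quick, hypothetical sanity check before running it (file names are assumed to follow the usual `<model>.pdmodel` / `<model>.pdiparams` pattern):
```bash
# Hypothetical check: the exported hifigan_csmsc static model should be
# present in the inference dir before stage 2 runs.
ls ${train_output_path}/inference | grep -i hifigan_csmsc \
    || echo "export hifigan_csmsc first via stage 2 of synthesize_e2e.sh"
```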

@ -18,9 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--device=cpu \
--cpu_threads=4 \
--lang=mix \
--spk_id=174
fi
@ -38,6 +36,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--device=cpu \
--cpu_threads=4 \
--lang=mix \
--spk_id=174
fi
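# voc: hifigan_csmsc
# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's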
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
--lang=mix \
--spk_id=174
fi
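Note that this ONNX stage needs the `hifigan_csmsc` ONNX model in `${train_output_path}/inference_onnx`. The export call ships commented out in `run.sh` (stage 5, shown further below); a sketch of running it by hand:
```bash
# Export the csmsc hifigan vocoder to ONNX before running ort_predict stage 2.
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
```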

@ -20,7 +20,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt
@ -45,6 +45,3 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt
fi

@ -54,4 +54,29 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=174 \
--inference_dir=${train_output_path}/inference
fi
# voc: hifigan_csmsc
# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "in csmsc's hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_mix \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=174 \
--inference_dir=${train_output_path}/inference
fi
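To run only this block, set the script's own `stage`/`stop_stage` variables to 2 (each block is gated on them) and invoke the script as usual. A sketch of the invocation, with the argument order assumed from the surrounding scripts (`conf_path`, `train_output_path`, `ckpt_name` come from `run.sh`):
```bash
# Sketch: run synthesize_e2e.sh with stage=2/stop_stage=2 set inside the
# script, so only the hifigan_csmsc block executes.
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```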

@ -7,7 +7,7 @@ gpus=0,1
stage=0
stop_stage=100
datasets_root_dir=~/datasets
mfa_root_dir=./mfa_results/
conf_path=conf/default.yaml
train_output_path=exp/default
@ -53,11 +53,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_mix
# considering the balance between speed and quality, we recommend using hifigan as the vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime, use fastspeech2 + pwgan by default
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi
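Putting the ONNX path together: stage 5 exports the models (uncomment the `hifigan_csmsc` line above to export that vocoder too) and stage 6 runs `ort_predict.sh`. Assuming `run.sh` sources the usual `parse_options.sh` helper so stages can be selected from the command line, the two stages can be run back to back:
```bash
# Assumes run.sh supports --stage/--stop-stage via parse_options.sh.
./run.sh --stage 5 --stop-stage 6
```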

@ -180,7 +180,6 @@ def process_sentences(config,
        results.append(record)
    results.sort(key=itemgetter("utt_id"))
    with jsonlines.open(output_dir / "metadata.jsonl",
                        write_metadata_method) as writer:
        for item in results:
