From d21e03c03e4fb29cbd6ce3b708de19a6d542a04a Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Fri, 26 Aug 2022 18:06:12 +0800
Subject: [PATCH] update tts3 readme, test=doc (#2315)

---
 docs/source/released_model.md                  |  6 ++++--
 examples/aishell3/tts3/README.md               | 15 ++++++++-------
 examples/aishell3/tts3/local/synthesize_e2e.sh |  6 +++---
 examples/other/g2p/README.md                   |  2 +-
 examples/vctk/tts3/README.md                   | 16 +++++++++-------
 examples/zh_en_tts/tts3/README.md              | 14 ++++++++------
 6 files changed, 33 insertions(+), 26 deletions(-)
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 8d0ff1d4..d6691812 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -42,9 +42,11 @@ SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/Paddl
 FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) </br> [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)|157MB|
 FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
 FastSpeech2-CNNDecoder| CSMSC| [fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)| [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) |  [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip) </br>[fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)  </br>[fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)  </br>[fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip) | 84MB|
-FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip) </br> [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)|147MB|
+FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip) </br> [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)|147MB|
 FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip) </br> [fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)|145MB|
-FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip) </br> [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) | 145MB|
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip) </br> [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) | 145MB|
+FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip) </br> [fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB|
+
 
 ### Vocoders
 Model Type | Dataset| Example Link | Pretrained Models| Static/ONNX Models|Size (static)
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 6ef2870c..3e1dee2f 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -229,9 +229,11 @@ The ONNX model can be downloaded here:
 FastSpeech2 checkpoint contains files listed below.
 
 ```text
-fastspeech2_nosil_aishell3_ckpt_0.4
+fastspeech2_aishell3_ckpt_1.1.0
 ├── default.yaml            # default config used to train fastspeech2
+├── energy_stats.npy        # statistics used to normalize energy when training fastspeech2
 ├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── pitch_stats.npy         # statistics used to normalize pitch when training fastspeech2
 ├── snapshot_iter_96400.pdz # model parameters and optimizer states
 ├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
@@ -244,9 +246,9 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am=fastspeech2_aishell3 \
-  --am_config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
-  --am_ckpt=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
-  --am_stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
+  --am_config=fastspeech2_aishell3_ckpt_1.1.0/default.yaml \
+  --am_ckpt=fastspeech2_aishell3_ckpt_1.1.0/snapshot_iter_96400.pdz \
+  --am_stat=fastspeech2_aishell3_ckpt_1.1.0/speech_stats.npy \
   --voc=pwgan_aishell3 \
   --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
   --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
@@ -254,9 +256,8 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --lang=zh \
   --text=${BIN_DIR}/../sentences.txt \
   --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
-  --speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \
+  --phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
+  --speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
   --spk_id=0 \
   --inference_dir=exp/default/inference
-
 ```
diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh
index ff3608be..158350ae 100755
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@@ -38,7 +38,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_aishell3 \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-        --am_stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
+        --am_stat=dump/train/speech_stats.npy \
         --voc=hifigan_aishell3 \
         --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
         --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
@@ -46,8 +46,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
-        --speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
         --spk_id=0 \
         --inference_dir=${train_output_path}/inference
     fi
diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md
index a8f8f734..88294350 100644
--- a/examples/other/g2p/README.md
+++ b/examples/other/g2p/README.md
@@ -12,7 +12,7 @@ Run the command below to get the results of the test.
 ./run.sh
 ```
 
-The `avg WER` of g2p is: 0.024219452438490413
+The `avg WER` of g2p is: 0.024169315564825305
 
 ```text
      ,--------------------------------------------------------------------.
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 9c0d7561..2a2f27fd 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -216,7 +216,7 @@ optional arguments:
 
 ## Pretrained Model
 Pretrained FastSpeech2 model with no silence in the edge of audios:
-- [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)
+- [fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)
 
 The static model can be downloaded here:
 - [fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
@@ -226,9 +226,11 @@ The ONNX model can be downloaded here:
 
 FastSpeech2 checkpoint contains files listed below.
 ```text
-fastspeech2_nosil_vctk_ckpt_0.5
+fastspeech2_vctk_ckpt_1.2.0
 ├── default.yaml            # default config used to train fastspeech2
+├── energy_stats.npy        # statistics used to normalize energy when training fastspeech2
 ├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── pitch_stats.npy         # statistics used to normalize pitch when training fastspeech2
 ├── snapshot_iter_66200.pdz # model parameters and optimizer states
 ├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
@@ -241,9 +243,9 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am=fastspeech2_vctk \
-  --am_config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \
-  --am_ckpt=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \
-  --am_stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \
+  --am_config=fastspeech2_vctk_ckpt_1.2.0/default.yaml \
+  --am_ckpt=fastspeech2_vctk_ckpt_1.2.0/snapshot_iter_66200.pdz \
+  --am_stat=fastspeech2_vctk_ckpt_1.2.0/speech_stats.npy \
   --voc=pwgan_vctk \
   --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml  \
   --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
@@ -251,8 +253,8 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --lang=en \
   --text=${BIN_DIR}/../sentences_en.txt \
   --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \
-  --speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \
+  --phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
   --spk_id=0 \
   --inference_dir=exp/default/inference
 ```
diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index e7365baa..b4b68308 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -262,9 +262,11 @@ The ONNX model can be downloaded here:
 FastSpeech2 checkpoint contains files listed below.
 
 ```text
-fastspeech2_mix_ckpt_0.2.0
+fastspeech2_mix_ckpt_1.2.0
 ├── default.yaml            # default config used to train fastspeech2
+├── energy_stats.npy        # statistics used to normalize energy when training fastspeech2
 ├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── pitch_stats.npy         # statistics used to normalize pitch when training fastspeech2
 ├── snapshot_iter_99200.pdz # model parameters and optimizer states
 ├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
@@ -281,9 +283,9 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am=fastspeech2_mix \
-  --am_config=fastspeech2_mix_ckpt_0.2.0/default.yaml \
-  --am_ckpt=fastspeech2_mix_ckpt_0.2.0/snapshot_iter_99200.pdz \
-  --am_stat=fastspeech2_mix_ckpt_0.2.0/speech_stats.npy \
+  --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
+  --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
+  --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
   --voc=pwgan_aishell3 \
   --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
   --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
@@ -291,8 +293,8 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --lang=mix \
   --text=${BIN_DIR}/../sentences_mix.txt \
   --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_mix_ckpt_0.2.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_mix_ckpt_0.2.0/speaker_id_map.txt \
+  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
   --spk_id=174 \
   --inference_dir=exp/default/inference
 ```