From 71bda2443798f3084002ca6609e47b54861f0f8b Mon Sep 17 00:00:00 2001
From: lance6716
Date: Wed, 15 Feb 2023 12:59:38 +0800
Subject: [PATCH] [TTS]Fix canton (#2924)

* Update run.sh

* Update README.md
---
 examples/canton/tts3/README.md | 42 +---------------------------------
 examples/canton/tts3/run.sh    | 33 --------------------------
 2 files changed, 1 insertion(+), 74 deletions(-)

diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
index f4acd131c..3bf4fd8ee 100644
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@@ -74,44 +74,4 @@ Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file tha
 ### Training
 details can refer to the script of [examples/aishell3/tts3](../../aishell3/tts3).
 
-## Pretrained Model(Waiting========)
-Pretrained FastSpeech2 model with no silence in the edge of audios:
-- [fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)
-- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution)
-
-
-FastSpeech2 checkpoint contains files listed below.
-
-```text
-fastspeech2_aishell3_ckpt_1.1.0
-├── default.yaml            # default config used to train fastspeech2
-├── energy_stats.npy        # statistics used to normalize energy when training fastspeech2
-├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
-├── pitch_stats.npy         # statistics used to normalize pitch when training fastspeech2
-├── snapshot_iter_96400.pdz # model parameters and optimizer states
-├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
-└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
-```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
-```bash
-source path.sh
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize_e2e.py \
-  --am=fastspeech2_aishell3 \
-  --am_config=fastspeech2_aishell3_ckpt_1.1.0/default.yaml \
-  --am_ckpt=fastspeech2_aishell3_ckpt_1.1.0/snapshot_iter_96400.pdz \
-  --am_stat=fastspeech2_aishell3_ckpt_1.1.0/speech_stats.npy \
-  --voc=pwgan_aishell3 \
-  --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
-  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
-  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
-  --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
-  --spk_id=0 \
-  --inference_dir=exp/default/inference
-```
+## Pretrained Model
diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh
index 9e5c27a16..7c9038d45 100755
--- a/examples/canton/tts3/run.sh
+++ b/examples/canton/tts3/run.sh
@@ -35,36 +35,3 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # synthesize_e2e, vocoder is pwgan by default
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    # inference with static model, vocoder is pwgan by default
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
-fi
-
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # install paddle2onnx
-    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
-    if [[ -z "$version" || ${version} != '1.0.0' ]]; then
-        pip install paddle2onnx==1.0.0
-    fi
-    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_aishell3
-    # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
-    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
-    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
-
-fi
-
-# inference with onnxruntime, use fastspeech2 + pwgan by default
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-    ./local/ort_predict.sh ${train_output_path}
-fi
-
-if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_aishell3 x86
-    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_aishell3 x86
-    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_aishell3 x86
-fi
-
-if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
-fi
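
A note on the structure this patch trims: run.sh guards every pipeline step with the same `stage`/`stop_stage` test, and the deletion removes the guards numbered 4-8 (static-model inference, paddle2onnx export, onnxruntime inference, Paddle-Lite export, and lite prediction), leaving synthesize_e2e (stage 3) as the recipe's last step. Below is a minimal sketch of that gating idiom on its own; the stage numbers, `echo` placeholders, and positional-argument defaults are illustrative and not part of the patch (the real script dispatches to `./local/*.sh` and obtains `stage`/`stop_stage` elsewhere):

```bash
#!/usr/bin/env bash
# Sketch of the stage/stop_stage gating idiom used by run.sh.
# A block numbered N runs iff stage <= N <= stop_stage.

stage=${1:-0}         # first stage to execute (illustrative default)
stop_stage=${2:-100}  # last stage to execute (illustrative default)

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: preprocess"       # placeholder for a ./local/*.sh call
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: synthesize_e2e"   # placeholder for a ./local/*.sh call
fi
```

With this pattern, `bash sketch.sh 3 3` re-runs stage 3 alone, which is why each step of the recipe can be retried in isolation and why dropping the unsupported stages is just a matter of deleting their guarded blocks.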