Merge pull request #3305 from zh794390558/tts

[t2s] add assets and tts codeswitch scripts
3 years ago · 8aa9790c75
parent 46de1b0379 6b4d1f80ac
commit 8aa9790c75
71 changed files with 189 additions and 155 deletions
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
 ├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh

@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
  --speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
--- a/examples/aishell3/tts3/local/inference.sh
+++ b/examples/aishell3/tts3/local/inference.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_aishell3 \
        --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_aishell3 \
        --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/aishell3/tts3/local/lite_predict.sh
+++ b/examples/aishell3/tts3/local/lite_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_aishell3 \
        --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/aishell3/tts3/local/ort_predict.sh
+++ b/examples/aishell3/tts3/local/ort_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_aishell3 \
        --voc=pwgan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_aishell3 \
        --voc=hifigan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/aishell3/vits/README.md
+++ b/examples/aishell3/vits/README.md
@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
    --phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
    --speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
    --output_dir=exp/default/test_e2e \
-    --text=${BIN_DIR}/../sentences.txt \
+    --text=${BIN_DIR}/../../assets/sentences.txt \
    --add-blank=${add_blank} 
 ```
 -->
--- a/examples/aishell3/vits/local/synthesize_e2e.sh
+++ b/examples/aishell3/vits/local/synthesize_e2e.sh
@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --add-blank=${add_blank}
 fi
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
 unzip pwg_aishell3_ckpt_0.5.zip
 ```

-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh

@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
  --lang=canton \
-  --text=${BIN_DIR}/../sentences_canton.txt \
+  --text=${BIN_DIR}/../../assets/sentences_canton.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
  --speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
--- a/examples/canton/tts3/local/inference.sh
+++ b/examples/canton/tts3/local/inference.sh
@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_canton \
        --voc=pwgan_aishell3 \
        --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_canton \
        --voc=mb_melgan_csmsc \
        --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_canton \
        --voc=hifigan_csmsc \
        --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --am=fastspeech2_canton \
        --voc=wavernn_csmsc \
        --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/canton/tts3/local/ort_predict.sh
+++ b/examples/canton/tts3/local/ort_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc=pwgan_aishell3 \
        --spk_id=10 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=canton \
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc=mb_melgan_csmsc \
        --spk_id=10 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=canton \
@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_canton \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --lang=canton \
--- a/examples/canton/tts3/local/synthesize_e2e.sh
+++ b/examples/canton/tts3/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --lang=canton \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=canton \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/csmsc/jets/local/inference.sh
+++ b/examples/csmsc/jets/local/inference.sh
@ -9,7 +9,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=jets_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/csmsc/jets/local/synthesize_e2e.sh
+++ b/examples/csmsc/jets/local/synthesize_e2e.sh
@ -17,6 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --phones_dict=dump/phone_id_map.txt \
        --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --inference_dir=${train_output_path}/inference
 fi
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── speech_stats.npy        # statistics used to normalize spectrogram when training Tacotron2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
 ```bash
 source path.sh

@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
  --phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
--- a/examples/csmsc/tts0/local/inference.sh
+++ b/examples/csmsc/tts0/local/inference.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt
        # --inference_dir=${train_output_path}/inference
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── tone_id_map.txt         # tone vocabulary file when training speedyspeech
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
 ```bash
 source path.sh

@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
  --phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
--- a/examples/csmsc/tts2/local/inference.sh
+++ b/examples/csmsc/tts2/local/inference.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
--- a/examples/csmsc/tts2/local/lite_predict.sh
+++ b/examples/csmsc/tts2/local/lite_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
--- a/examples/csmsc/tts2/local/ort_predict.sh
+++ b/examples/csmsc/tts2/local/ort_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=speedyspeech_csmsc \
        --voc=pwgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device=cpu \
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device=cpu \
@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device=cpu \
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
 ├── snapshot_iter_76000.pdz # model parameters and optimizer states
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.

 If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
 ```bash
@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
  --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
 ├── snapshot_iter_76000.pdz # 模型参数和优化器状态
 └── speech_stats.npy        # 训练 fastspeech2 时用于规范化频谱图的统计数据
 ```
-您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
+您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../../assets/sentences.txt` 合成句子
 ```bash
 source path.sh

@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
  --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=wavernn_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/csmsc/tts3/local/inference_streaming.sh
+++ b/examples/csmsc/tts3/local/inference_streaming.sh
@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
--- a/examples/csmsc/tts3/local/lite_predict.sh
+++ b/examples/csmsc/tts3/local/lite_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/csmsc/tts3/local/lite_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/lite_predict_streaming.sh
@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
--- a/examples/csmsc/tts3/local/ort_predict.sh
+++ b/examples/csmsc/tts3/local/ort_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_csmsc \
        --voc=pwgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2
@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_csmsc \
        --voc=mb_melgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2
@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_csmsc \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2
--- a/examples/csmsc/tts3/local/ort_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/ort_predict_streaming.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt
        # --inference_dir=${train_output_path}/inference
@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
--- a/examples/csmsc/tts3/local/synthesize_streaming.sh
+++ b/examples/csmsc/tts3/local/synthesize_streaming.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True \
--- a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --use_rhy=True
@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference \
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
    --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
    --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
    --output_dir=exp/default/test_e2e \
-    --text=${BIN_DIR}/../sentences.txt \
+    --text=${BIN_DIR}/../../assets/sentences.txt \
    --add-blank=${add_blank} 
 ```
--- a/examples/csmsc/vits/local/inference.sh
+++ b/examples/csmsc/vits/local/inference.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=vits_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --add-blank=${add_blank}
--- a/examples/csmsc/vits/local/lite_predict.sh
+++ b/examples/csmsc/vits/local/lite_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=vits_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --add-blank=${add_blank}
--- a/examples/csmsc/vits/local/synthesize_e2e.sh
+++ b/examples/csmsc/vits/local/synthesize_e2e.sh
@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --phones_dict=dump/phone_id_map.txt \
        --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --add-blank=${add_blank} #\
        # --inference_dir=${train_output_path}/inference
 fi
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
  --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
  --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
 ```
--- a/examples/ljspeech/tts0/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh
@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
    --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
    --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
    --lang=en \
-    --text=${BIN_DIR}/../sentences_en.txt \
+    --text=${BIN_DIR}/../../assets/sentences_en.txt \
    --output_dir=${train_output_path}/test_e2e \
    --phones_dict=dump/phone_id_map.txt \
    # --inference_dir=${train_output_path}/inference
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
  --transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
  --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
  --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
  --output-dir=exp/default/test_e2e \
  --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
 ```
--- a/examples/ljspeech/tts1/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh
@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
    --transformer-tts-stat=dump/train/speech_stats.npy \
    --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
    --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-    --text=${BIN_DIR}/../sentences_en.txt \
+    --text=${BIN_DIR}/../../assets/sentences_en.txt \
    --output-dir=${train_output_path}/test_e2e \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
  --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
  --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
  --phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
--- a/examples/ljspeech/tts3/local/inference.sh
+++ b/examples/ljspeech/tts3/local/inference.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_ljspeech \
        --voc=pwgan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_ljspeech \
        --voc=hifigan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
--- a/examples/ljspeech/tts3/local/lite_predict.sh
+++ b/examples/ljspeech/tts3/local/lite_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=pwgan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_ljspeech \
        --voc=hifigan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --lang=en
--- a/examples/ljspeech/tts3/local/ort_predict.sh
+++ b/examples/ljspeech/tts3/local/ort_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_ljspeech \
        --voc=pwgan_ljspeech\
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt  \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt  \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_ljspeech \
        --voc=hifigan_ljspeech \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt  \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt  \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
        --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
        --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/test_e2e \
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
        --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/test_e2e \
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
--- a/examples/opencpop/svs1/README.md
+++ b/examples/opencpop/svs1/README.md
@ -267,7 +267,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
  --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
  --lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
  --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
--- a/examples/opencpop/svs1/README_cn.md
+++ b/examples/opencpop/svs1/README_cn.md
@ -271,7 +271,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
  --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
  --lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
  --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
--- a/examples/opencpop/svs1/local/synthesize_e2e.sh
+++ b/examples/opencpop/svs1/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
        --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
        --lang=sing \
-        --text=${BIN_DIR}/../sentences_sing.txt \
+        --text=${BIN_DIR}/../../assets/sentences_sing.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speech_stretchs=dump/train/speech_stretchs.npy \
@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
        --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
        --lang=sing \
-        --text=${BIN_DIR}/../sentences_sing.txt \
+        --text=${BIN_DIR}/../../assets/sentences_sing.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speech_stretchs=dump/train/speech_stretchs.npy \
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
        --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
--- a/examples/other/tts_finetune/tts3/run_en.sh
+++ b/examples/other/tts_finetune/tts3/run_en.sh
@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
        --voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
        --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
--- a/examples/other/tts_finetune/tts3/run_mix.sh
+++ b/examples/other/tts_finetune/tts3/run_mix.sh
@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
        --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
  --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
  --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
  --output_dir=exp/default/test_e2e \
  --phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
  --speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
--- a/examples/vctk/tts3/local/inference.sh
+++ b/examples/vctk/tts3/local/inference.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_vctk \
        --voc=pwgan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_vctk \
        --voc=hifigan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/vctk/tts3/local/lite_predict.sh
+++ b/examples/vctk/tts3/local/lite_predict.sh
@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=pwgan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/pdlite \
        --am=fastspeech2_vctk \
        --voc=hifigan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/vctk/tts3/local/ort_predict.sh
+++ b/examples/vctk/tts3/local/ort_predict.sh
@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_vctk \
        --voc=pwgan_vctk \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_vctk \
        --voc=hifigan_vctk \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
        --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
        --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/zh_en_tts/tts3/.gitignore
+++ b/examples/zh_en_tts/tts3/.gitignore
@ -0,0 +1,2 @@
+data
+exp
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@ -6,11 +6,11 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2

 ## Dataset
 ### Download and Extract
-Download all datasets and extract it to `~/datasets`:
- The CSMSC dataset is in the directory `~/datasets/BZNSYP`
- The Ljspeech dataset is in the directory `~/datasets/LJSpeech-1.1`
- The aishell3 dataset is in the directory `~/datasets/data_aishell3`
- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92`
+Download all datasets and extract it to `./data`:
+- The CSMSC dataset is in the directory `./data/BZNSYP`
+- The Ljspeech dataset is in the directory `./data/LJSpeech-1.1`
+- The aishell3 dataset is in the directory `./data/data_aishell3`
+- The vctk dataset is in the directory `./data/VCTK-Corpus-0.92`
 
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training.
@ -24,16 +24,16 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd

 ## Get Started
 Assume the paths to the datasets are:
- `~/datasets/BZNSYP`
- `~/datasets/LJSpeech-1.1`
- `~/datasets/data_aishell3` 
- `~/datasets/VCTK-Corpus-0.92`
+- `./data/BZNSYP`
+- `./data/LJSpeech-1.1`
+- `./data/data_aishell3` 
+- `./data/VCTK-Corpus-0.92`

 Assume the path to the MFA results of the datasets are:
- `./mfa_results/baker_alignment_tone`
- `./mfa_results/ljspeech_alignment`
- `./mfa_results/aishell3_alignment_tone`
- `./mfa_results/vctk_alignment`
+- `./data/mfa/baker_alignment_tone`
+- `./data/mfa/ljspeech_alignment`
+- `./data/mfa/aishell3_alignment_tone`
+- `./data/mfa/vctk_alignment`

 Run the command below to
 1. **source path**.
@ -252,8 +252,10 @@ optional arguments:


 ## Pretrained Model
+
 Pretrained FastSpeech2 model with no silence in the edge of audios:
 - [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
+- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)

 The static model can be downloaded here:
 - [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
  --am=fastspeech2_mix \
-  --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
-  --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
-  --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
+  --am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
+  --am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
+  --spk_id=174 \
  --voc=pwgan_aishell3 \
-  --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
-  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+  --voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
+  --voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
  --lang=mix \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
  --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
-  --spk_id=174 \
  --inference_dir=exp/default/inference
 ```
--- a/examples/zh_en_tts/tts3/local/inference.sh
+++ b/examples/zh_en_tts/tts3/local/inference.sh
@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_mix \
        --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/zh_en_tts/tts3/local/mfa_download.sh
+++ b/examples/zh_en_tts/tts3/local/mfa_download.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+exp=exp
+mfa=$exp/mfa
+
+mkdir -p $mfa
+
+pushd $mfa
+
+wget -c https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz &
+wait
+
+popd
--- a/examples/zh_en_tts/tts3/local/model_download.sh
+++ b/examples/zh_en_tts/tts3/local/model_download.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+exp=exp
+pretrain=$exp/pretrain
+
+mkdir -p $pretrain
+
+pushd $pretrain
+
+wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
+wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
+wait
+
+popd
--- a/examples/zh_en_tts/tts3/local/ort_predict.sh
+++ b/examples/zh_en_tts/tts3/local/ort_predict.sh
@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --am=fastspeech2_mix \
        --voc=pwgan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --am=fastspeech2_mix \
        --voc=hifigan_aishell3 \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --am=fastspeech2_mix \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=4 \
--- a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
--- a/examples/zh_en_tts/tts3/run.sh
+++ b/examples/zh_en_tts/tts3/run.sh
@ -7,8 +7,8 @@ gpus=0,1
 stage=0
 stop_stage=100

-datasets_root_dir=~/datasets
-mfa_root_dir=./mfa_results/
+datasets_root_dir=./data
+mfa_root_dir=./data/mfa
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_99200.pdz
--- a/paddlespeech/t2s/assets/csmsc_test.txt
+++ b/paddlespeech/t2s/assets/csmsc_test.txt
--- a/paddlespeech/t2s/assets/sentences.txt
+++ b/paddlespeech/t2s/assets/sentences.txt
--- a/paddlespeech/t2s/assets/sentences_canton.txt
+++ b/paddlespeech/t2s/assets/sentences_canton.txt
--- a/paddlespeech/t2s/assets/sentences_en.txt
+++ b/paddlespeech/t2s/assets/sentences_en.txt
--- a/paddlespeech/t2s/assets/sentences_mix.txt
+++ b/paddlespeech/t2s/assets/sentences_mix.txt
--- a/paddlespeech/t2s/assets/sentences_sing.txt
+++ b/paddlespeech/t2s/assets/sentences_sing.txt
--- a/paddlespeech/t2s/assets/sentences_ssml.txt
+++ b/paddlespeech/t2s/assets/sentences_ssml.txt