From 83d93da8d023a7df319f2911af72f150b53f7807 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 2 Jun 2023 07:12:17 +0000
Subject: [PATCH 1/2] add scripts for tts code switch

---
 examples/zh_en_tts/tts3/.gitignore            |  2 ++
 examples/zh_en_tts/tts3/README.md             | 32 +++++++++----------
 examples/zh_en_tts/tts3/local/mfa_download.sh | 16 ++++++++++
 .../zh_en_tts/tts3/local/model_download.sh    | 13 ++++++++
 examples/zh_en_tts/tts3/run.sh                |  4 +--
 5 files changed, 49 insertions(+), 18 deletions(-)
 create mode 100644 examples/zh_en_tts/tts3/.gitignore
 create mode 100755 examples/zh_en_tts/tts3/local/mfa_download.sh
 create mode 100755 examples/zh_en_tts/tts3/local/model_download.sh

diff --git a/examples/zh_en_tts/tts3/.gitignore b/examples/zh_en_tts/tts3/.gitignore
new file mode 100644
index 00000000..bbd86a25
--- /dev/null
+++ b/examples/zh_en_tts/tts3/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index 01202800..1f04d41e 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -6,11 +6,11 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2
 
 ## Dataset
 ### Download and Extract
-Download all datasets and extract it to `~/datasets`:
-- The CSMSC dataset is in the directory `~/datasets/BZNSYP`
-- The Ljspeech dataset is in the directory `~/datasets/LJSpeech-1.1`
-- The aishell3 dataset is in the directory `~/datasets/data_aishell3`
-- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92`
+Download all datasets and extract them to `./data`:
+- The CSMSC dataset is in the directory `./data/BZNSYP`
+- The Ljspeech dataset is in the directory `./data/LJSpeech-1.1`
+- The aishell3 dataset is in the directory `./data/data_aishell3`
+- The vctk dataset is in the directory `./data/VCTK-Corpus-0.92`
  
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training.
@@ -24,16 +24,16 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
 
 ## Get Started
 Assume the paths to the datasets are:
-- `~/datasets/BZNSYP`
-- `~/datasets/LJSpeech-1.1`
-- `~/datasets/data_aishell3` 
-- `~/datasets/VCTK-Corpus-0.92`
+- `./data/BZNSYP`
+- `./data/LJSpeech-1.1`
+- `./data/data_aishell3` 
+- `./data/VCTK-Corpus-0.92`
 
 Assume the path to the MFA results of the datasets are:
-- `./mfa_results/baker_alignment_tone`
-- `./mfa_results/ljspeech_alignment`
-- `./mfa_results/aishell3_alignment_tone`
-- `./mfa_results/vctk_alignment`
+- `./data/mfa/baker_alignment_tone`
+- `./data/mfa/ljspeech_alignment`
+- `./data/mfa/aishell3_alignment_tone`
+- `./data/mfa/vctk_alignment`
 
 Run the command below to
 1. **source path**.
@@ -288,6 +288,9 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
   --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
   --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
+  --spk_id=174 \
   --voc=pwgan_aishell3 \
   --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
   --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
@@ -295,8 +298,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --lang=mix \
   --text=${BIN_DIR}/../sentences_mix.txt \
   --output_dir=exp/default/test_e2e \
-  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
-  --spk_id=174 \
   --inference_dir=exp/default/inference
 ```
diff --git a/examples/zh_en_tts/tts3/local/mfa_download.sh b/examples/zh_en_tts/tts3/local/mfa_download.sh
new file mode 100755
index 00000000..1863c896
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/mfa_download.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Download the MFA alignment archives (BZNSYP, LJSpeech-1.1, AISHELL-3, VCTK-Corpus-0.92) into exp/mfa.
+exp=exp
+mfa="${exp}/mfa"
+# NOTE(review): run.sh sets mfa_root_dir=./data/mfa; this script only downloads the archives to exp/mfa - confirm the extract step.
+mkdir -p "${mfa}" || exit 1
+# Stop if the directory cannot be entered; otherwise wget would write into the caller's working directory.
+pushd "${mfa}" || exit 1
+# wget -c resumes partially downloaded files; the four transfers run in parallel and are joined by wait.
+wget -c https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz &
+wait
+
+popd
diff --git a/examples/zh_en_tts/tts3/local/model_download.sh b/examples/zh_en_tts/tts3/local/model_download.sh
new file mode 100755
index 00000000..20a830b7
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/model_download.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Download the pretrained fastspeech2_mix_ckpt_1.2.0 archive into exp/pretrain (wget -c resumes partial files).
+exp=exp
+pretrain=$exp/pretrain
+
+mkdir -p $pretrain
+
+pushd $pretrain
+
+wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
+wait
+
+popd
diff --git a/examples/zh_en_tts/tts3/run.sh b/examples/zh_en_tts/tts3/run.sh
index a4d86480..a18421f5 100755
--- a/examples/zh_en_tts/tts3/run.sh
+++ b/examples/zh_en_tts/tts3/run.sh
@@ -7,8 +7,8 @@ gpus=0,1
 stage=0
 stop_stage=100
 
-datasets_root_dir=~/datasets
-mfa_root_dir=./mfa_results/
+datasets_root_dir=./data
+mfa_root_dir=./data/mfa
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_99200.pdz

From 6b4d1f80ac34fe855a458c803664ee924c704d1e Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 2 Jun 2023 07:50:40 +0000
Subject: [PATCH 2/2] add t2s assets

---
 examples/aishell3/tts3/README.md              |  4 ++--
 examples/aishell3/tts3/local/inference.sh     |  4 ++--
 examples/aishell3/tts3/local/lite_predict.sh  |  4 ++--
 examples/aishell3/tts3/local/ort_predict.sh   |  4 ++--
 .../aishell3/tts3/local/synthesize_e2e.sh     |  4 ++--
 examples/aishell3/vits/README.md              |  2 +-
 .../aishell3/vits/local/synthesize_e2e.sh     |  2 +-
 examples/canton/tts3/README.md                |  4 ++--
 examples/canton/tts3/local/inference.sh       |  8 ++++----
 examples/canton/tts3/local/ort_predict.sh     |  6 +++---
 examples/canton/tts3/local/synthesize_e2e.sh  |  4 ++--
 examples/csmsc/jets/local/inference.sh        |  2 +-
 examples/csmsc/jets/local/synthesize_e2e.sh   |  2 +-
 examples/csmsc/tts0/README.md                 |  4 ++--
 examples/csmsc/tts0/local/inference.sh        |  6 +++---
 examples/csmsc/tts0/local/synthesize_e2e.sh   | 10 +++++-----
 examples/csmsc/tts2/README.md                 |  4 ++--
 examples/csmsc/tts2/local/inference.sh        |  6 +++---
 examples/csmsc/tts2/local/lite_predict.sh     |  6 +++---
 examples/csmsc/tts2/local/ort_predict.sh      |  6 +++---
 examples/csmsc/tts2/local/synthesize_e2e.sh   | 10 +++++-----
 examples/csmsc/tts3/README.md                 |  4 ++--
 examples/csmsc/tts3/README_cn.md              |  4 ++--
 examples/csmsc/tts3/local/inference.sh        |  8 ++++----
 .../csmsc/tts3/local/inference_streaming.sh   |  6 +++---
 examples/csmsc/tts3/local/lite_predict.sh     |  6 +++---
 .../tts3/local/lite_predict_streaming.sh      |  6 +++---
 examples/csmsc/tts3/local/ort_predict.sh      |  6 +++---
 .../csmsc/tts3/local/ort_predict_streaming.sh |  6 +++---
 examples/csmsc/tts3/local/synthesize_e2e.sh   | 10 +++++-----
 .../csmsc/tts3/local/synthesize_streaming.sh  |  8 ++++----
 .../csmsc/tts3_rhy/local/synthesize_e2e.sh    | 10 +++++-----
 examples/csmsc/vits/README.md                 |  2 +-
 examples/csmsc/vits/local/inference.sh        |  2 +-
 examples/csmsc/vits/local/lite_predict.sh     |  2 +-
 examples/csmsc/vits/local/synthesize_e2e.sh   |  2 +-
 examples/ljspeech/tts0/README.md              |  2 +-
 .../ljspeech/tts0/local/synthesize_e2e.sh     |  2 +-
 examples/ljspeech/tts1/README.md              |  2 +-
 .../ljspeech/tts1/local/synthesize_e2e.sh     |  2 +-
 examples/ljspeech/tts3/README.md              |  2 +-
 examples/ljspeech/tts3/local/inference.sh     |  4 ++--
 examples/ljspeech/tts3/local/lite_predict.sh  |  4 ++--
 examples/ljspeech/tts3/local/ort_predict.sh   |  4 ++--
 .../ljspeech/tts3/local/synthesize_e2e.sh     |  4 ++--
 examples/opencpop/svs1/README.md              |  2 +-
 examples/opencpop/svs1/README_cn.md           |  2 +-
 .../opencpop/svs1/local/synthesize_e2e.sh     |  4 ++--
 examples/other/tts_finetune/tts3/run.sh       |  2 +-
 examples/other/tts_finetune/tts3/run_en.sh    |  2 +-
 examples/other/tts_finetune/tts3/run_mix.sh   |  2 +-
 examples/vctk/tts3/README.md                  |  2 +-
 examples/vctk/tts3/local/inference.sh         |  4 ++--
 examples/vctk/tts3/local/lite_predict.sh      |  4 ++--
 examples/vctk/tts3/local/ort_predict.sh       |  4 ++--
 examples/vctk/tts3/local/synthesize_e2e.sh    |  4 ++--
 examples/zh_en_tts/tts3/README.md             | 20 ++++++++++---------
 examples/zh_en_tts/tts3/local/inference.sh    |  6 +++---
 .../zh_en_tts/tts3/local/model_download.sh    |  1 +
 examples/zh_en_tts/tts3/local/ort_predict.sh  |  6 +++---
 .../zh_en_tts/tts3/local/synthesize_e2e.sh    |  6 +++---
 .../t2s/{exps => assets}/csmsc_test.txt       |  0
 .../t2s/{exps => assets}/sentences.txt        |  0
 .../t2s/{exps => assets}/sentences_canton.txt |  0
 .../t2s/{exps => assets}/sentences_en.txt     |  0
 .../t2s/{exps => assets}/sentences_mix.txt    |  0
 .../t2s/{exps => assets}/sentences_sing.txt   |  0
 .../t2s/{exps => assets}/sentences_ssml.txt   |  0
 68 files changed, 142 insertions(+), 139 deletions(-)
 rename paddlespeech/t2s/{exps => assets}/csmsc_test.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences_canton.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences_en.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences_mix.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences_sing.txt (100%)
 rename paddlespeech/t2s/{exps => assets}/sentences_ssml.txt (100%)

diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 49801c4c..c33d665c 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
 ├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 
@@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
   --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
   --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
   --speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/inference.sh b/examples/aishell3/tts3/local/inference.sh
index dc05ec59..2d096bdc 100755
--- a/examples/aishell3/tts3/local/inference.sh
+++ b/examples/aishell3/tts3/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_aishell3 \
         --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_aishell3 \
         --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/lite_predict.sh b/examples/aishell3/tts3/local/lite_predict.sh
index e77e8b6c..2534b460 100755
--- a/examples/aishell3/tts3/local/lite_predict.sh
+++ b/examples/aishell3/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_aishell3 \
         --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_aishell3 \
         --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/ort_predict.sh b/examples/aishell3/tts3/local/ort_predict.sh
index 24e66f68..9c41dee3 100755
--- a/examples/aishell3/tts3/local/ort_predict.sh
+++ b/examples/aishell3/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_aishell3 \
         --voc=pwgan_aishell3 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_aishell3 \
         --voc=hifigan_aishell3 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh
index 158350ae..2cc22ede 100755
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
         --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/vits/README.md b/examples/aishell3/vits/README.md
index dc80e18b..8c19e29f 100644
--- a/examples/aishell3/vits/README.md
+++ b/examples/aishell3/vits/README.md
@@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
     --phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
     --speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
     --output_dir=exp/default/test_e2e \
-    --text=${BIN_DIR}/../sentences.txt \
+    --text=${BIN_DIR}/../../assets/sentences.txt \
     --add-blank=${add_blank} 
 ```
 -->
diff --git a/examples/aishell3/vits/local/synthesize_e2e.sh b/examples/aishell3/vits/local/synthesize_e2e.sh
index 1bd58549..5369cbf9 100755
--- a/examples/aishell3/vits/local/synthesize_e2e.sh
+++ b/examples/aishell3/vits/local/synthesize_e2e.sh
@@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --speaker_dict=dump/speaker_id_map.txt \
         --spk_id=0 \
         --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --add-blank=${add_blank}
 fi
diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
index f46949d2..87ef4090 100644
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
 
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 
@@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
   --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
   --lang=canton \
-  --text=${BIN_DIR}/../sentences_canton.txt \
+  --text=${BIN_DIR}/../../assets/sentences_canton.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
   --speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh
index caf0b438..ad3af2d0 100755
--- a/examples/canton/tts3/local/inference.sh
+++ b/examples/canton/tts3/local/inference.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_canton \
         --voc=pwgan_aishell3 \
         --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_canton \
         --voc=mb_melgan_csmsc \
         --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_canton \
         --voc=hifigan_csmsc \
         --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --am=fastspeech2_canton \
         --voc=wavernn_csmsc \
         --spk_id=10 \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh
index d95e49f9..edbe0406 100755
--- a/examples/canton/tts3/local/ort_predict.sh
+++ b/examples/canton/tts3/local/ort_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc=pwgan_aishell3 \
         --spk_id=10 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
         --lang=canton \
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc=mb_melgan_csmsc \
         --spk_id=10 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
         --lang=canton \
@@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_canton \
         --voc=hifigan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
         --lang=canton \
diff --git a/examples/canton/tts3/local/synthesize_e2e.sh b/examples/canton/tts3/local/synthesize_e2e.sh
index 8cf7eb22..38b7e1af 100755
--- a/examples/canton/tts3/local/synthesize_e2e.sh
+++ b/examples/canton/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
         --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
         --lang=canton \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
         --lang=canton \
-        --text=${BIN_DIR}/../sentences_canton.txt \
+        --text=${BIN_DIR}/../../assets/sentences_canton.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/csmsc/jets/local/inference.sh b/examples/csmsc/jets/local/inference.sh
index 30941caa..987f4cea 100755
--- a/examples/csmsc/jets/local/inference.sh
+++ b/examples/csmsc/jets/local/inference.sh
@@ -9,7 +9,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/inference.py \
         --inference_dir=${train_output_path}/inference \
         --am=jets_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
diff --git a/examples/csmsc/jets/local/synthesize_e2e.sh b/examples/csmsc/jets/local/synthesize_e2e.sh
index 67ae14fa..c95354d8 100755
--- a/examples/csmsc/jets/local/synthesize_e2e.sh
+++ b/examples/csmsc/jets/local/synthesize_e2e.sh
@@ -17,6 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --phones_dict=dump/phone_id_map.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --inference_dir=${train_output_path}/inference
 fi
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index bc7769d1..ce682495 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── speech_stats.npy        # statistics used to normalize spectrogram when training Tacotron2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
 ```bash
 source path.sh
 
@@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
   --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
   --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
   --phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
diff --git a/examples/csmsc/tts0/local/inference.sh b/examples/csmsc/tts0/local/inference.sh
index d2960441..6ea2e4b6 100755
--- a/examples/csmsc/tts0/local/inference.sh
+++ b/examples/csmsc/tts0/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=tacotron2_csmsc \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=tacotron2_csmsc \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=tacotron2_csmsc \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh
index 4c3b08dc..40b49aa1 100755
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
         --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt
         # --inference_dir=${train_output_path}/inference
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
         --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index ec88959d..96956776 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
 ├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── tone_id_map.txt         # tone vocabulary file when training speedyspeech
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
 ```bash
 source path.sh
 
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
   --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
   --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
   --phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
diff --git a/examples/csmsc/tts2/local/inference.sh b/examples/csmsc/tts2/local/inference.sh
index ed92136c..9a677edc 100755
--- a/examples/csmsc/tts2/local/inference.sh
+++ b/examples/csmsc/tts2/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=speedyspeech_csmsc \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=speedyspeech_csmsc \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=speedyspeech_csmsc \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/tts2/local/lite_predict.sh b/examples/csmsc/tts2/local/lite_predict.sh
index d0c6c058..9bb33cdf 100755
--- a/examples/csmsc/tts2/local/lite_predict.sh
+++ b/examples/csmsc/tts2/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=speedyspeech_csmsc \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=speedyspeech_csmsc \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=speedyspeech_csmsc \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/tts2/local/ort_predict.sh b/examples/csmsc/tts2/local/ort_predict.sh
index 8ca4c0e9..36f88667 100755
--- a/examples/csmsc/tts2/local/ort_predict.sh
+++ b/examples/csmsc/tts2/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=speedyspeech_csmsc \
         --voc=pwgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
         --device=cpu \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=speedyspeech_csmsc \
         --voc=mb_melgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
         --device=cpu \
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=speedyspeech_csmsc \
         --voc=hifigan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
         --device=cpu \
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 553b4554..2b278729 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
         --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
@@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
         --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt \
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 39926259..5a097537 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
 ├── snapshot_iter_76000.pdz # model parameters and optimizer states
 └── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
 ```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize the sentences in `${BIN_DIR}/../../assets/sentences.txt` using the pretrained fastspeech2 and parallel wavegan models.
 
 If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
 ```bash
@@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
   --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
   --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
   --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
index 1829b770..3f2783a9 100644
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
 ├── snapshot_iter_76000.pdz # 模型参数和优化器状态
 └── speech_stats.npy        # 训练 fastspeech2 时用于规范化频谱图的统计数据
 ```
-您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
+您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../../assets/sentences.txt` 合成句子
 ```bash
 source path.sh
 
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
   --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
   --lang=zh \
-  --text=${BIN_DIR}/../sentences.txt \
+  --text=${BIN_DIR}/../../assets/sentences.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
   --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh
index b43fd286..5b143cdd 100755
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \
         --voc=wavernn_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/inference_streaming.sh b/examples/csmsc/tts3/local/inference_streaming.sh
index 719f46c6..5ad50aa5 100755
--- a/examples/csmsc/tts3/local/inference_streaming.sh
+++ b/examples/csmsc/tts3/local/inference_streaming.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
diff --git a/examples/csmsc/tts3/local/lite_predict.sh b/examples/csmsc/tts3/local/lite_predict.sh
index 1ed2f108..9af17899 100755
--- a/examples/csmsc/tts3/local/lite_predict.sh
+++ b/examples/csmsc/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_csmsc \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_csmsc \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_csmsc \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt
 fi
diff --git a/examples/csmsc/tts3/local/lite_predict_streaming.sh b/examples/csmsc/tts3/local/lite_predict_streaming.sh
index 4570cb4e..19fdde41 100755
--- a/examples/csmsc/tts3/local/lite_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/lite_predict_streaming.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=pwgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_csmsc \
         --am_stat=dump/train/speech_stats.npy \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh
index e16c7bd0..99955665 100755
--- a/examples/csmsc/tts3/local/ort_predict.sh
+++ b/examples/csmsc/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_csmsc \
         --voc=pwgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_csmsc \
         --voc=mb_melgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_csmsc \
         --voc=hifigan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2
diff --git a/examples/csmsc/tts3/local/ort_predict_streaming.sh b/examples/csmsc/tts3/local/ort_predict_streaming.sh
index 74393581..e2c7e852 100755
--- a/examples/csmsc/tts3/local/ort_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/ort_predict_streaming.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am_stat=dump/train/speech_stats.npy \
         --voc=pwgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am_stat=dump/train/speech_stats.npy \
         --voc=hifigan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_streaming \
-        --text=${BIN_DIR}/../csmsc_test.txt \
+        --text=${BIN_DIR}/../../assets/csmsc_test.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 512e062b..35a5598a 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
         --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt
         # --inference_dir=${train_output_path}/inference
@@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
@@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
         --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference
diff --git a/examples/csmsc/tts3/local/synthesize_streaming.sh b/examples/csmsc/tts3/local/synthesize_streaming.sh
index 366a88db..f4e783d4 100755
--- a/examples/csmsc/tts3/local/synthesize_streaming.sh
+++ b/examples/csmsc/tts3/local/synthesize_streaming.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
         --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e_streaming \
         --phones_dict=dump/phone_id_map.txt \
         --am_streaming=True \
diff --git a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
index 8f5d8010..bf7229e1 100755
--- a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
         --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --use_rhy=True
@@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference \
@@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
         --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --inference_dir=${train_output_path}/inference \
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index 50d703b2..83871277 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
     --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
     --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
     --output_dir=exp/default/test_e2e \
-    --text=${BIN_DIR}/../sentences.txt \
+    --text=${BIN_DIR}/../../assets/sentences.txt \
     --add-blank=${add_blank} 
 ```
diff --git a/examples/csmsc/vits/local/inference.sh b/examples/csmsc/vits/local/inference.sh
index 0a79c255..d26b7f71 100755
--- a/examples/csmsc/vits/local/inference.sh
+++ b/examples/csmsc/vits/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/inference.py \
         --inference_dir=${train_output_path}/inference \
         --am=vits_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --add-blank=${add_blank}
diff --git a/examples/csmsc/vits/local/lite_predict.sh b/examples/csmsc/vits/local/lite_predict.sh
index e12f5349..d20d7a57 100755
--- a/examples/csmsc/vits/local/lite_predict.sh
+++ b/examples/csmsc/vits/local/lite_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/lite_predict.py \
         --inference_dir=${train_output_path}/pdlite \
         --am=vits_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --add-blank=${add_blank}
diff --git a/examples/csmsc/vits/local/synthesize_e2e.sh b/examples/csmsc/vits/local/synthesize_e2e.sh
index 6a69b366..f3c067e4 100755
--- a/examples/csmsc/vits/local/synthesize_e2e.sh
+++ b/examples/csmsc/vits/local/synthesize_e2e.sh
@@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --phones_dict=dump/phone_id_map.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --add-blank=${add_blank} #\
         # --inference_dir=${train_output_path}/inference
 fi
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index 85d9e448..fa986c85 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
   --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
   --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
 ```
diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh
index 73dfff60..903ebb47 100755
--- a/examples/ljspeech/tts0/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh
@@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
     --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
     --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
     --lang=en \
-    --text=${BIN_DIR}/../sentences_en.txt \
+    --text=${BIN_DIR}/../../assets/sentences_en.txt \
     --output_dir=${train_output_path}/test_e2e \
     --phones_dict=dump/phone_id_map.txt \
     # --inference_dir=${train_output_path}/inference
\ No newline at end of file
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 85621653..7f0571a1 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
   --transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
   --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
   --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
   --output-dir=exp/default/test_e2e \
   --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
 ```
diff --git a/examples/ljspeech/tts1/local/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh
index 25a862f9..d6ff9cae 100755
--- a/examples/ljspeech/tts1/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh
@@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
     --transformer-tts-stat=dump/train/speech_stats.npy \
     --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
     --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-    --text=${BIN_DIR}/../sentences_en.txt \
+    --text=${BIN_DIR}/../../assets/sentences_en.txt \
     --output-dir=${train_output_path}/test_e2e \
     --phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 23b433d4..f1ed111a 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
   --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
   --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
   --phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/local/inference.sh b/examples/ljspeech/tts3/local/inference.sh
index ff192f3e..94d6b371 100755
--- a/examples/ljspeech/tts3/local/inference.sh
+++ b/examples/ljspeech/tts3/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_ljspeech \
         --voc=pwgan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_ljspeech \
         --voc=hifigan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --lang=en
diff --git a/examples/ljspeech/tts3/local/lite_predict.sh b/examples/ljspeech/tts3/local/lite_predict.sh
index 75db6a0e..9cf1d8d7 100755
--- a/examples/ljspeech/tts3/local/lite_predict.sh
+++ b/examples/ljspeech/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_ljspeech \
         --voc=pwgan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_ljspeech \
         --voc=hifigan_ljspeech \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --lang=en
diff --git a/examples/ljspeech/tts3/local/ort_predict.sh b/examples/ljspeech/tts3/local/ort_predict.sh
index b4716f70..b82ec15f 100755
--- a/examples/ljspeech/tts3/local/ort_predict.sh
+++ b/examples/ljspeech/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_ljspeech \
         --voc=pwgan_ljspeech\
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt  \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt  \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_ljspeech \
         --voc=hifigan_ljspeech \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt  \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt  \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh
index 36865f7f..3f234080 100755
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz  \
         --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
         --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/test_e2e \
         --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt
@@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
         --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/test_e2e \
         --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt
diff --git a/examples/opencpop/svs1/README.md b/examples/opencpop/svs1/README.md
index 1600d0c7..43cc6e86 100644
--- a/examples/opencpop/svs1/README.md
+++ b/examples/opencpop/svs1/README.md
@@ -267,7 +267,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
   --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
   --lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
   --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
diff --git a/examples/opencpop/svs1/README_cn.md b/examples/opencpop/svs1/README_cn.md
index 1435b42e..cf65c97f 100644
--- a/examples/opencpop/svs1/README_cn.md
+++ b/examples/opencpop/svs1/README_cn.md
@@ -271,7 +271,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
   --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
   --lang=sing \
-  --text=${BIN_DIR}/../sentences_sing.txt \
+  --text=${BIN_DIR}/../../assets/sentences_sing.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
   --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
diff --git a/examples/opencpop/svs1/local/synthesize_e2e.sh b/examples/opencpop/svs1/local/synthesize_e2e.sh
index b3dc29b1..e8d0cc45 100755
--- a/examples/opencpop/svs1/local/synthesize_e2e.sh
+++ b/examples/opencpop/svs1/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
         --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
         --lang=sing \
-        --text=${BIN_DIR}/../sentences_sing.txt \
+        --text=${BIN_DIR}/../../assets/sentences_sing.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speech_stretchs=dump/train/speech_stretchs.npy \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
         --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
         --lang=sing \
-        --text=${BIN_DIR}/../sentences_sing.txt \
+        --text=${BIN_DIR}/../../assets/sentences_sing.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speech_stretchs=dump/train/speech_stretchs.npy \
diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh
index cc25d8f6..f5a65e6b 100755
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
         --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
         --output_dir=./test_e2e/ \
         --phones_dict=${dump_dir}/phone_id_map.txt \
         --speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/other/tts_finetune/tts3/run_en.sh b/examples/other/tts_finetune/tts3/run_en.sh
index 53721486..86c58afa 100755
--- a/examples/other/tts_finetune/tts3/run_en.sh
+++ b/examples/other/tts_finetune/tts3/run_en.sh
@@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
         --voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
         --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=./test_e2e/ \
         --phones_dict=${dump_dir}/phone_id_map.txt \
         --speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/other/tts_finetune/tts3/run_mix.sh b/examples/other/tts_finetune/tts3/run_mix.sh
index 7630022b..210f0314 100755
--- a/examples/other/tts_finetune/tts3/run_mix.sh
+++ b/examples/other/tts_finetune/tts3/run_mix.sh
@@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
         --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
         --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=./test_e2e/ \
         --phones_dict=${dump_dir}/phone_id_map.txt \
         --speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 0bf2037f..3a6f3e1b 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
   --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
   --lang=en \
-  --text=${BIN_DIR}/../sentences_en.txt \
+  --text=${BIN_DIR}/../../assets/sentences_en.txt \
   --output_dir=exp/default/test_e2e \
   --phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
   --speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/inference.sh b/examples/vctk/tts3/local/inference.sh
index 9c442614..ef23d951 100755
--- a/examples/vctk/tts3/local/inference.sh
+++ b/examples/vctk/tts3/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_vctk \
         --voc=pwgan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_vctk \
         --voc=hifigan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/lite_predict.sh b/examples/vctk/tts3/local/lite_predict.sh
index eb608535..53141b5f 100755
--- a/examples/vctk/tts3/local/lite_predict.sh
+++ b/examples/vctk/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_vctk \
         --voc=pwgan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/pdlite \
         --am=fastspeech2_vctk \
         --voc=hifigan_vctk \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/lite_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/ort_predict.sh b/examples/vctk/tts3/local/ort_predict.sh
index 4019e17f..f376ee75 100755
--- a/examples/vctk/tts3/local/ort_predict.sh
+++ b/examples/vctk/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_vctk \
         --voc=pwgan_vctk \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_vctk \
         --voc=hifigan_vctk \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=2 \
diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh
index a89f42b5..971c8385 100755
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
         --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
         --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
         --lang=en \
-        --text=${BIN_DIR}/../sentences_en.txt \
+        --text=${BIN_DIR}/../../assets/sentences_en.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index 1f04d41e..15de3f48 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -252,8 +252,10 @@ optional arguments:
 
 
 ## Pretrained Model
+
 Pretrained FastSpeech2 model with no silence in the edge of audios:
 - [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
+- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
 
 The static model can be downloaded here:
 - [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
@@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am=fastspeech2_mix \
-  --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
-  --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
-  --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
-  --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
-  --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
+  --am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
+  --am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
+  --am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+  --phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+  --speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
   --spk_id=174 \
   --voc=pwgan_aishell3 \
-  --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
-  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+  --voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
+  --voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
   --lang=mix \
-  --text=${BIN_DIR}/../sentences_mix.txt \
+  --text=${BIN_DIR}/../../assets/sentences_mix.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference
 ```
diff --git a/examples/zh_en_tts/tts3/local/inference.sh b/examples/zh_en_tts/tts3/local/inference.sh
index 16499ed0..e4168fd0 100755
--- a/examples/zh_en_tts/tts3/local/inference.sh
+++ b/examples/zh_en_tts/tts3/local/inference.sh
@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_mix \
         --voc=pwgan_aishell3 \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_mix \
         --voc=hifigan_aishell3 \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_mix \
         --voc=hifigan_csmsc \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/pd_infer_out \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/zh_en_tts/tts3/local/model_download.sh b/examples/zh_en_tts/tts3/local/model_download.sh
index 20a830b7..21a218a8 100755
--- a/examples/zh_en_tts/tts3/local/model_download.sh
+++ b/examples/zh_en_tts/tts3/local/model_download.sh
@@ -8,6 +8,7 @@ mkdir -p $pretrain
 pushd $pretrain
 
 wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
+wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
 wait
 
 popd
diff --git a/examples/zh_en_tts/tts3/local/ort_predict.sh b/examples/zh_en_tts/tts3/local/ort_predict.sh
index d80da9c9..0d5ac675 100755
--- a/examples/zh_en_tts/tts3/local/ort_predict.sh
+++ b/examples/zh_en_tts/tts3/local/ort_predict.sh
@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --am=fastspeech2_mix \
         --voc=pwgan_aishell3 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=4 \
@@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am=fastspeech2_mix \
         --voc=hifigan_aishell3 \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=4 \
@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am=fastspeech2_mix \
         --voc=hifigan_csmsc \
         --output_dir=${train_output_path}/onnx_infer_out_e2e \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --phones_dict=dump/phone_id_map.txt \
         --device=cpu \
         --cpu_threads=4 \
diff --git a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
index f6ee04ae..daad7180 100755
--- a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
@@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
         --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
         --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
         --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
@@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
         --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=mix \
-        --text=${BIN_DIR}/../sentences_mix.txt \
+        --text=${BIN_DIR}/../../assets/sentences_mix.txt \
         --output_dir=${train_output_path}/test_e2e \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt \
diff --git a/paddlespeech/t2s/exps/csmsc_test.txt b/paddlespeech/t2s/assets/csmsc_test.txt
similarity index 100%
rename from paddlespeech/t2s/exps/csmsc_test.txt
rename to paddlespeech/t2s/assets/csmsc_test.txt
diff --git a/paddlespeech/t2s/exps/sentences.txt b/paddlespeech/t2s/assets/sentences.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences.txt
rename to paddlespeech/t2s/assets/sentences.txt
diff --git a/paddlespeech/t2s/exps/sentences_canton.txt b/paddlespeech/t2s/assets/sentences_canton.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_canton.txt
rename to paddlespeech/t2s/assets/sentences_canton.txt
diff --git a/paddlespeech/t2s/exps/sentences_en.txt b/paddlespeech/t2s/assets/sentences_en.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_en.txt
rename to paddlespeech/t2s/assets/sentences_en.txt
diff --git a/paddlespeech/t2s/exps/sentences_mix.txt b/paddlespeech/t2s/assets/sentences_mix.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_mix.txt
rename to paddlespeech/t2s/assets/sentences_mix.txt
diff --git a/paddlespeech/t2s/exps/sentences_sing.txt b/paddlespeech/t2s/assets/sentences_sing.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_sing.txt
rename to paddlespeech/t2s/assets/sentences_sing.txt
diff --git a/paddlespeech/t2s/exps/sentences_ssml.txt b/paddlespeech/t2s/assets/sentences_ssml.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_ssml.txt
rename to paddlespeech/t2s/assets/sentences_ssml.txt