diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f72b44ac6..44bbd5cad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,12 +26,12 @@ repos: - --no-sort-keys - --autofix - id: check-merge-conflict - - id: flake8 - aergs: - - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 - - --builtins=G,request - - --jobs=1 - exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ + # - id: flake8 + # aergs: + # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 + # - --builtins=G,request + # - --jobs=1 + # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks rev: v1.0.1 diff --git a/README.md b/README.md index c6e9fc209..19ec61cb0 100644 --- a/README.md +++ b/README.md @@ -227,13 +227,13 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ## Installation -We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4.1*. +We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8* and *paddlepaddle<=2.5.1*. Some new versions of Paddle do not have support for adaptation in PaddleSpeech, so currently only versions 2.5.1 and earlier can be supported. ### **Dependency Introduction** + gcc >= 4.8.5 -+ paddlepaddle >= 2.4.1 -+ python >= 3.7 ++ paddlepaddle <= 2.5.1 ++ python >= 3.8 + OS support: Linux(recommend), Windows, Mac OSX PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version. @@ -893,10 +893,6 @@ The Text-to-Speech module is originally called [Parakeet](https://github.com/Pad - **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** -
- -
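The README hunk above tightens the supported environment to Python >= 3.8 and paddlepaddle <= 2.5.1. As a rough illustration of what that constraint means in practice, the sketch below checks both versions at runtime; the helper name and the deliberately naive version parsing are assumptions for illustration, not part of this PR.

```python
# Illustrative only: enforce the environment constraints stated in the updated README
# (Python >= 3.8, paddlepaddle <= 2.5.1). Version parsing here is deliberately naive.
import sys


def check_paddlespeech_env():
    assert sys.version_info >= (3, 8), "PaddleSpeech requires Python >= 3.8"
    import paddle  # raises ImportError if paddlepaddle is not installed
    parts = tuple(int(p) for p in paddle.__version__.split(".")[:3] if p.isdigit())
    assert parts <= (2, 5, 1), (
        f"paddlepaddle {paddle.__version__} is newer than 2.5.1; "
        "only 2.5.1 and earlier are currently supported by PaddleSpeech")


if __name__ == "__main__":
    check_paddlespeech_env()
    print("environment looks compatible")
```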
- ## Citation diff --git a/README_cn.md b/README_cn.md index eabb2ead4..7aef30871 100644 --- a/README_cn.md +++ b/README_cn.md @@ -8,7 +8,7 @@ - + @@ -237,12 +237,12 @@ ## 安装 -我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。 +我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。同时,有一些Paddle新版本的内容没有在做适配的支持,因此目前只能使用2.5.1及之前的版本。 ### 相关依赖 + gcc >= 4.8.5 -+ paddlepaddle >= 2.4.1 -+ python >= 3.7 ++ paddlepaddle <= 2.5.1 ++ python >= 3.8 + linux(推荐), mac, windows PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。 diff --git a/audio/setup.py b/audio/setup.py index 0fe6e5995..f7d459446 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -38,8 +38,10 @@ VERSION = '1.2.0' COMMITID = 'none' base = [ + # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x + "librosa==0.8.1", + "numpy==1.23.5", "kaldiio", - "librosa>=0.10.0", "pathos", "pybind11", "parameterized", diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py index 83c2b3f35..0711a40af 100644 --- a/demos/speech_web/speech_server/src/ge2e_clone.py +++ b/demos/speech_web/speech_server/src/ge2e_clone.py @@ -38,23 +38,9 @@ class VoiceCloneGE2E(): output_dir = os.path.dirname(out_wav) ngpu = get_ngpu() - cmd = f""" - python3 {self.BIN_DIR}/voice_cloning.py \ - --am={self.am} \ - --am_config={self.am_config} \ - --am_ckpt={self.am_ckpt} \ - --am_stat={self.am_stat} \ - --voc={self.voc} \ - --voc_config={self.voc_config} \ - --voc_ckpt={self.voc_ckpt} \ - --voc_stat={self.voc_stat} \ - --ge2e_params_path={self.ge2e_params_path} \ - --text="{text}" \ - --input-dir={ref_audio_dir} \ - --output-dir={output_dir} \ - --phones-dict={self.phones_dict} \ - --ngpu={ngpu} - """ + cmd = f"""python {self.BIN_DIR}/voice_cloning.py --am={self.am} --am_config={self.am_config} --am_ckpt={self.am_ckpt} --am_stat={self.am_stat} --voc={self.voc} --voc_config={self.voc_config} --voc_ckpt={self.voc_ckpt} --voc_stat={self.voc_stat} --ge2e_params_path={self.ge2e_params_path} --text="{text}" --input-dir={ref_audio_dir} --output-dir={output_dir} --phones-dict={self.phones_dict} --ngpu={ngpu}""" + + print(cmd) output_name = os.path.join(output_dir, full_file_name) return run_cmd(cmd, output_name=output_name) diff --git a/docs/source/install.md b/docs/source/install.md index a4dae3640..3607d7185 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -95,7 +95,7 @@ bash ``` Then you can create a conda virtual environment using the following command: ```bash -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 ``` Activate the conda virtual environment: ```bash @@ -181,7 +181,7 @@ $HOME/miniconda3/bin/conda init # use the "bash" command to make the conda environment works bash # create a conda virtual environment -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 # Activate the conda virtual environment: conda activate tools/venv # Install the conda packages diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 7f05cdfe4..01ae21fe7 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -91,7 +91,7 @@ bash ``` 然后你可以创建一个 conda 的虚拟环境: ```bash -conda create -y -p tools/venv python=3.7 +conda create -y -p tools/venv python=3.8 ``` 激活 conda 虚拟环境: ```bash @@ -173,7 +173,7 @@ $HOME/miniconda3/bin/conda init # 激活 conda bash # 创建 Conda 虚拟环境 -conda create -y -p tools/venv python=3.7 +conda create 
-y -p tools/venv python=3.8 # 激活 Conda 虚拟环境: conda activate tools/venv # 安装 Conda 包 diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index cb1029e7b..c735e0bd8 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -165,8 +165,7 @@ docker run -it xxxxxx 设置python: ```bash -export PATH="/opt/python/cp37-cp37m/bin/:$PATH" -#export PATH="/opt/python/cp38-cp38/bin/:$PATH" +export PATH="/opt/python/cp38-cp38/bin/:$PATH" #export PATH="/opt/python/cp39-cp39/bin/:$PATH" ``` diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index 643d0e224..be771ba59 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -1,14 +1,31 @@ # Aishell -## Conformer -paddle version: 2.2.2 -paddlespeech version: 1.0.1 -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | +## RoFormer Streaming +paddle version: 2.5.0 +paddlespeech version: 1.5.0 + +Tesla V100-SXM2-32GB: 1 node, 4 card +Global BachSize: 32 * 4 +Training Done: 1 day, 12:56:39.639646 +### `decoding.decoding_chunk_size=16` + +> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | 16, -1 | - | 5.63 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 6.13 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 6.13 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 5.44 | + +### `decoding.decoding_chunk_size=-1` + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 | +| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 | ## Conformer Streaming @@ -24,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 | +## Conformer +paddle version: 2.2.2 +paddlespeech version: 1.0.1 +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 | + + ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | diff --git a/examples/aishell/asr1/conf/chunk_roformer.yaml b/examples/aishell/asr1/conf/chunk_roformer.yaml new file mode 100644 index 000000000..a4051a021 --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_roformer.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos + selfattention_layer_type: 'rel_selfattn' # unused + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer # transformer, bitransformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + r_num_blocks: 0 # only for bitransformer + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.0 # only for bitransformer + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 
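The latency note quoted in the RESULTS.md hunk above, ((16 - 1) * 4 + 7) * 10ms = 670ms, follows from the conv2d front end used by this config: subsampling by a factor of 4, roughly 7 frames of convolution context, and a 10 ms frame shift. A small sketch of that arithmetic (the helper name and default arguments are illustrative, taken directly from the formula in the note):

```python
# Sketch of the chunk-latency arithmetic quoted in RESULTS.md (not project code):
# latency = ((chunk_size - 1) * subsampling + context) * frame_shift_ms
def chunk_latency_ms(chunk_size, subsampling=4, context=7, frame_shift_ms=10):
    return ((chunk_size - 1) * subsampling + context) * frame_shift_ms


print(chunk_latency_ms(16))   # 670 ms, the decoding_chunk_size=16 case above
print(chunk_latency_ms(8))    # 350 ms for a hypothetical smaller chunk
```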
+ +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml new file mode 100644 index 000000000..aa3a0aca7 --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos + selfattention_layer_type: 'rel_selfattn' # unused + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: bitransformer # transformer, bitransformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 3 + r_num_blocks: 3 # only for bitransformer + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + reverse_weight: 0.3 # only for bitransformer + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git 
a/examples/csmsc/tts2/local/inference_xpu.sh b/examples/csmsc/tts2/local/inference_xpu.sh new file mode 100644 index 000000000..5d8d92054 --- /dev/null +++ b/examples/csmsc/tts2/local/inference_xpu.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=pwgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device xpu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh new file mode 100644 index 000000000..0285f42cd --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's 
Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_xpu.sh b/examples/csmsc/tts2/local/synthesize_xpu.sh new file mode 100644 index 000000000..801789c26 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_xpu.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + 
FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts2/local/train_xpu.sh b/examples/csmsc/tts2/local/train_xpu.sh new file mode 100644 index 000000000..0c07c27fc --- /dev/null +++ b/examples/csmsc/tts2/local/train_xpu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nxpu=1 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_xpu.sh b/examples/csmsc/tts2/run_xpu.sh new file mode 
100644 index 000000000..4b867961f --- /dev/null +++ b/examples/csmsc/tts2/run_xpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +xpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_76.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run_xpu.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1 +fi diff --git a/examples/csmsc/tts3/local/inference_xpu.sh b/examples/csmsc/tts3/local/inference_xpu.sh new file mode 100644 index 000000000..541dc6262 --- /dev/null +++ b/examples/csmsc/tts3/local/inference_xpu.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=pwgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_csmsc \ + --voc=wavernn_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --device xpu +fi \ No newline at end of file diff --git a/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh new file mode 100644 index 000000000..bb58a37c8 --- /dev/null +++ b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh @@ -0,0 
+1,119 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + 
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts3/local/synthesize_xpu.sh b/examples/csmsc/tts3/local/synthesize_xpu.sh new file mode 100644 index 000000000..fac8677a7 --- /dev/null +++ b/examples/csmsc/tts3/local/synthesize_xpu.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + 
--output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nxpu=1 +fi diff --git a/examples/csmsc/tts3/local/train_xpu.sh b/examples/csmsc/tts3/local/train_xpu.sh new file mode 100644 index 000000000..a7d889888 --- /dev/null +++ b/examples/csmsc/tts3/local/train_xpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nxpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts3/run_xpu.sh b/examples/csmsc/tts3/run_xpu.sh new file mode 100644 index 000000000..4922d6b4b --- /dev/null +++ b/examples/csmsc/tts3/run_xpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +xpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model, vocoder is pwgan by default + FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1 +fi diff --git a/paddlespeech/dataset/s2t/avg_model.py b/paddlespeech/dataset/s2t/avg_model.py index c5753b726..5bd5cb1f0 100755 --- a/paddlespeech/dataset/s2t/avg_model.py +++ b/paddlespeech/dataset/s2t/avg_model.py @@ -20,30 +20,6 @@ import numpy as np import paddle -def define_argparse(): - parser = argparse.ArgumentParser(description='average model') - parser.add_argument('--dst_model', required=True, help='averaged model') - parser.add_argument( - '--ckpt_dir', required=True, help='ckpt model dir for average') - parser.add_argument( - '--val_best', action="store_true", help='averaged model') - parser.add_argument( - '--num', default=5, type=int, help='nums for averaged model') - parser.add_argument( - '--min_epoch', - default=0, - type=int, - help='min epoch used for averaging model') - parser.add_argument( - '--max_epoch', - default=65536, # Big enough - type=int, - help='max epoch used for averaging model') - - args = parser.parse_args() - return args - - def average_checkpoints(dst_model="", ckpt_dir="", val_best=True, @@ -85,7 +61,7 @@ def average_checkpoints(dst_model="", print(path_list) avg = None - num = args.num + num = num assert num == len(path_list) for path in path_list: print(f'Processing {path}') @@ -100,14 +76,14 @@ def average_checkpoints(dst_model="", if avg[k] is not None: avg[k] /= num - paddle.save(avg, args.dst_model) - print(f'Saving to {args.dst_model}') + paddle.save(avg, dst_model) + print(f'Saving to {dst_model}') - meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json' + meta_path = os.path.splitext(dst_model)[0] + '.avg.json' with open(meta_path, 'w') as f: data = json.dumps({ - "mode": 'val_best' if args.val_best else 'latest', - "avg_ckpt": args.dst_model, + "mode": 'val_best' if val_best else 'latest', + "avg_ckpt": dst_model, "val_loss_mean": avg_val_score, "ckpts": path_list, "epochs": selected_epochs.tolist(), @@ -116,9 +92,40 @@ def average_checkpoints(dst_model="", f.write(data + "\n") +def define_argparse(): + parser = argparse.ArgumentParser(description='average model') + parser.add_argument('--dst_model', required=True, help='averaged model') + parser.add_argument( + '--ckpt_dir', required=True, help='ckpt model dir for average') + parser.add_argument( + '--val_best', action="store_true", help='averaged model') + parser.add_argument( + '--num', default=5, type=int, help='nums for averaged model') + parser.add_argument( + '--min_epoch', + default=0, + type=int, + help='min epoch used for 
averaging model') + parser.add_argument( + '--max_epoch', + default=65536, # Big enough + type=int, + help='max epoch used for averaging model') + + args = parser.parse_args() + print(args) + return args + + def main(): args = define_argparse() - average_checkpoints(args) + average_checkpoints( + dst_model=args.dst_model, + ckpt_dir=args.ckpt_dir, + val_best=args.val_best, + num=args.num, + min_epoch=args.min_epoch, + max_epoch=args.max_epoch) if __name__ == '__main__': diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 7ab8cf853..d007a9e39 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog from paddlespeech.s2t.training.reporter import report from paddlespeech.s2t.training.timer import Timer from paddlespeech.s2t.training.trainer import Trainer @@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) + grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index f716fa3b5..2e1c14ac1 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -145,7 +145,6 @@ class U2BaseModel(ASRInterface, nn.Layer): text_lengths) ctc_time = time.time() - start #logger.debug(f"ctc time: {ctc_time}") - if loss_ctc is None: loss = loss_att elif loss_att is None: @@ -916,6 +915,8 @@ class U2Model(U2DecodeModel): decoder_type = configs.get('decoder', 'transformer') logger.debug(f"U2 Decoder type: {decoder_type}") if decoder_type == 'transformer': + configs['model_conf'].pop('reverse_weight', None) + configs['decoder_conf'].pop('r_num_blocks', None) decoder = TransformerDecoder(vocab_size, encoder.output_size(), **configs['decoder_conf']) diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py index 59a67a1e5..a3744d340 100755 --- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer): x_lens = x.shape[1] ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) - topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen) hyps = [hyp.tolist() for hyp in topk_index] hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 14336c03d..10ab3eaea 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,6 +15,7 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math +from typing import List from typing import Tuple import paddle @@ -26,7 +27,10 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() 
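With the avg_model.py refactor above, average_checkpoints() no longer reaches into an argparse namespace and can be called directly with keyword arguments, exactly as the new main() does. A minimal usage sketch, assuming the module imports as its file path suggests; the paths are placeholders, not values from this PR:

```python
# Hypothetical direct call mirroring the refactored main(); paths are examples only.
from paddlespeech.dataset.s2t.avg_model import average_checkpoints

average_checkpoints(
    dst_model="exp/conformer/checkpoints/avg_5.pdparams",  # where the averaged model is written
    ckpt_dir="exp/conformer/checkpoints",                   # directory holding epoch checkpoints
    val_best=True,   # pick the `num` checkpoints with the best validation loss
    num=5,           # how many checkpoints to average
    min_epoch=0,
    max_epoch=65536)
```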
-__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"] +__all__ = [ + "MultiHeadedAttention", "RelPositionMultiHeadedAttention", + "RoPERelPositionMultiHeadedAttention" +] # Relative Positional Encodings # https://www.jianshu.com/p/c0608efcc26f @@ -165,6 +169,7 @@ class MultiHeadedAttention(nn.Layer): and `head * d_k == size` """ + # (B,T,D) -> (B,T,H,D/H) q, k, v = self.forward_qkv(query, key, value) # when export onnx model, for 1st chunk, we feed @@ -373,3 +378,139 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): self.d_k) # (batch, head, time1, time2) return self.forward_attention(v, scores, mask), new_cache + + +class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with RoPE relative position encoding.""" + + def __init__(self, + n_head, + n_feat, + dropout_rate, + adaptive_scale=False, + init_weights=False): + """Construct an RelPositionMultiHeadedAttention object. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + super().__init__(n_head, n_feat, dropout_rate) + + def align(self, tensor: paddle.Tensor, axes: List[int], ndim=None): + """重新对齐tensor(批量版expand_dims) + axes:原来的第i维对齐新tensor的第axes[i]维; + ndim:新tensor的维度。 + """ + assert len(axes) == tensor.dim() + assert ndim or min(axes) >= 0 + + ndim = ndim or max(axes) + 1 + + # a[0, None, 1] = a[0, np.newaxis, 1] + indices = [None] * ndim + for i in axes: + # slice nothing, a[0, slice(None), 1] = a[0, :, 1] + indices[i] = slice(None) + + return tensor[indices] + + def apply_rotary_position_embeddings(self, sinusoidal, *tensors): + """应用RoPE到tensors中 + 其中,sinusoidal.shape=[B, T, D],tensors为tensor的列表,而 + tensor.shape=[B, T, ..., D], or (B,H,T,D/H) + """ + assert len(tensors) > 0, 'at least one input tensor' + assert all( + [tensor.shape == tensors[0].shape + for tensor in tensors[1:]]), 'all tensors must have the same shape' + + # (B,H,T,D) + ndim = tensors[0].dim() + _, H, T, D = tensors[0].shape + + # sinusoidal shape same with tensors[0] + # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H) + # sinusoidal = self.align(sinusoidal, [0, 1, -1], ndim) + sinusoidal = sinusoidal.reshape((1, T, H, D)).transpose([0, 2, 1, 3]) + + # http://man.hubwiz.com/docset/TensorFlow.docset/Contents/Resources/Documents/api_docs/python/tf/keras/backend/repeat_elements.html + # like np.repeat, x (s1, s2, s3), axis 1, (s1, s2*rep, s3) + # [b,T, ..., d/2] -> [b,T, ..., d] + cos_pos = paddle.repeat_interleave(sinusoidal[..., 1::2], 2, axis=-1) + sin_pos = paddle.repeat_interleave(sinusoidal[..., 0::2], 2, axis=-1) + outputs = [] + for tensor in tensors: + # x2 = [-x2, x1, -x4, x3, ..., -x_d, x_{d-1}] + tensor2 = paddle.stack([-tensor[..., 1::2], tensor[..., ::2]], ndim) + tensor2 = paddle.reshape(tensor2, paddle.shape(tensor)) + + # 公式 34, out = x * cos_pos + x2 * sin_pos + outputs.append(tensor * cos_pos + tensor2 * sin_pos) + return outputs[0] if len(outputs) == 1 else outputs + + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + pos_emb: paddle.Tensor=paddle.empty([0]), + cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. 
+ Ref: https://github.com/facebookresearch/llama/blob/main/llama/model.py + Args: + query (paddle.Tensor): Query tensor (#batch, time1, size). + key (paddle.Tensor): Key tensor (#batch, time2, size). + value (paddle.Tensor): Value tensor (#batch, time2, size). + mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (paddle.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (paddle.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + paddle.Tensor: Output tensor (#batch, time1, d_model). + paddle.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) + + # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index + # q_t always is chunk_size + q_t = q.shape[2] + q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q) + # k will increase when in streaming decoding. + k = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], k) + + # when export onnx model, for 1st chunk, we feed + # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode) + # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode). + # In all modes, `if cache.size(0) > 0` will alwayse be `True` + # and we will always do splitting and + # concatnation(this will simplify onnx export). Note that + # it's OK to concat & split zero-shaped tensors(see code below). + # when export jit model, for 1st chunk, we always feed + # cache(0, 0, 0, 0) since jit supports dynamic if-branch. + # >>> a = torch.ones((1, 2, 0, 4)) + # >>> b = torch.ones((1, 2, 3, 4)) + # >>> c = torch.cat((a, b), dim=2) + # >>> torch.equal(b, c) # True + # >>> d = torch.split(a, 2, dim=-1) + # >>> torch.equal(d[0], d[1]) # True + if cache.shape[0] > 0: + # last dim `d_k * 2` for (key, val) + key_cache, value_cache = paddle.split(cache, 2, axis=-1) + k = paddle.concat([key_cache, k], axis=2) + v = paddle.concat([value_cache, v], axis=2) + # We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. + new_cache = paddle.concat((k, v), axis=-1) + + # dot(q, k) + scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask), new_cache diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index f41a7b5d4..1e9f01018 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -85,18 +85,21 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): reverse (bool, optional): Not used. Defaults to False. 
""" nn.Layer.__init__(self) - self.d_model = d_model + self.d_model = paddle.to_tensor(d_model) self.max_len = max_len self.xscale = paddle.to_tensor(math.sqrt(self.d_model)) self.dropout = nn.Dropout(p=dropout_rate) + self.base = paddle.to_tensor(10000.0) self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D] position = paddle.arange( 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] + # base^{-2(i-1)/d)}, i \in (1,2...,d/2) div_term = paddle.exp( - paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * - -(math.log(10000.0) / self.d_model)) + -paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * + (paddle.log(self.base) / self.d_model)) + # [B,T,D] self.pe[:, :, 0::2] = paddle.sin(position * div_term) self.pe[:, :, 1::2] = paddle.cos(position * div_term) @@ -161,6 +164,98 @@ class RelPositionalEncoding(PositionalEncoding): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) + x = x * self.xscale pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) + + +# RotaryRelPositionalEncoding is same to RelPositionalEncoding +class ScaledRotaryRelPositionalEncoding(RelPositionalEncoding): + """Scaled Rotary Relative positional encoding module. + POSITION INTERPOLATION: : https://arxiv.org/pdf/2306.15595v2.pdf + """ + + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int=5000, + scale=1): + """ + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int, optional): [Maximum input length.]. Defaults to 5000. + scale (int): Interpolation max input length to `scale * max_len` positions. + """ + super().__init__(d_model, dropout_rate, max_len, reverse=True) + self.pscale = paddle.to_tensor(scale) + self.max_len = max_len * scale + + def sinusoidal_embeddings(self, + pos: paddle.Tensor, + dim: paddle.Tensor, + base=10000) -> paddle.Tensor: + """计算pos位置的dim维sinusoidal编码""" + assert dim % 2 == 0 + # (d/2,) + indices = paddle.arange(0, dim // 2, dtype=pos.dtype) + indices = paddle.pow(paddle.cast(base, pos.dtype), -2 * indices / dim) + # pos (1, T), indices (d/2,) -> (1, T, d/2) + embeddings = paddle.einsum('...,d->...d', pos, indices) + # (1, T, d/2, 2) + embeddings = paddle.stack( + [paddle.sin(embeddings), paddle.cos(embeddings)], axis=-1) + # (1, T, d) + embeddings = paddle.flatten(embeddings, start_axis=-2, stop_axis=-1) + return embeddings + + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute positional encoding. + Args: + x (paddle.Tensor): Input tensor (batch, time, `*`). + Returns: + paddle.Tensor: Encoded tensor (batch, time, `*`). + paddle.Tensor: Positional embedding tensor (1, time, `*`). + """ + x = x * self.xscale + + B, T, D = x.shape + assert D == self.d_model + + # postion interploation + start = 0 + end = T * self.pscale + assert end <= self.max_len + position = paddle.arange(start, end, dtype=x.dtype).unsqueeze(0) + position *= 1.0 / self.pscale + pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base) + + pos_emb = pe[:, offset:offset + x.shape[1]] + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: + """ For getting encoding in a streaming fashion + Attention!!!!! 
+ we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. + Args: + offset (int): start offset + size (int): requried size of position encoding + Returns: + paddle.Tensor: Corresponding position encoding, #[1, T, D]. + """ + # postion interploation + start = offset + end = (offset + size) * self.pscale + assert end <= self.max_len + position = paddle.arange( + start, end, dtype=paddle.get_default_dtype()).unsqueeze(0) + position *= 1.0 / self.pscale + + pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base) + + return self.dropout(pe) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index d90d69d77..27d7ffbd7 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -28,6 +28,7 @@ from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention +from paddlespeech.s2t.modules.attention import RoPERelPositionMultiHeadedAttention from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule from paddlespeech.s2t.modules.embedding import NoPositionalEncoding from paddlespeech.s2t.modules.embedding import PositionalEncoding @@ -115,6 +116,8 @@ class BaseEncoder(nn.Layer): pos_enc_class = PositionalEncoding elif pos_enc_layer_type == "rel_pos": pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "rope_pos": + pos_enc_class = RelPositionalEncoding elif pos_enc_layer_type == "no_pos": pos_enc_class = NoPositionalEncoding else: @@ -230,14 +233,14 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, _, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers, _, cache_t1, _ = att_cache.shape chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size - # only used when using `RelPositionMultiHeadedAttention` + # only used when using `RelPositionMultiHeadedAttention` and `RoPERelPositionMultiHeadedAttention` pos_emb = self.embed.position_encoding( offset=offset - cache_t1, size=attention_key_size) @@ -474,21 +477,35 @@ class ConformerEncoder(BaseEncoder): activation = get_activation(activation_type) # self-attention module definition - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, output_size, - attention_dropout_rate) + encoder_dim = output_size + if pos_enc_layer_type == "abs_pos": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + elif pos_enc_layer_type == "rel_pos": + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + elif pos_enc_layer_type == "rope_pos": + encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate) + else: + raise ValueError( + f"pos_enc_layer_type {pos_enc_layer_type} not supported.") + # feed-forward module definition positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (output_size, 
linear_units, dropout_rate, + positionwise_layer_args = (encoder_dim, linear_units, dropout_rate, activation) # convolution module definition convolution_layer = ConvolutionModule - convolution_layer_args = (output_size, cnn_module_kernel, activation, + convolution_layer_args = (encoder_dim, cnn_module_kernel, activation, cnn_module_norm, causal) self.encoders = nn.LayerList([ ConformerEncoderLayer( - size=output_size, + size=encoder_dim, self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args), feed_forward=positionwise_layer(*positionwise_layer_args), feed_forward_macaron=positionwise_layer( @@ -580,15 +597,23 @@ class SqueezeformerEncoder(nn.Layer): activation = get_activation(activation_type) # self-attention module definition - if pos_enc_layer_type != "rel_pos": + if pos_enc_layer_type == "abs_pos": encoder_selfattn_layer = MultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, output_size, attention_dropout_rate) - else: + elif pos_enc_layer_type == "rel_pos": encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, encoder_dim, attention_dropout_rate, adaptive_scale, init_weights) + elif pos_enc_layer_type == "rope_pos": + encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate, + adaptive_scale, init_weights) + else: + raise ValueError( + f"pos_enc_layer_type {pos_enc_layer_type} not supported.") # feed-forward module definition positionwise_layer = PositionwiseFeedForward diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index ecba95e85..0499e742b 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -48,7 +48,7 @@ class TransformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward (nn.Layer): Feed-forward module instance. `PositionwiseFeedForward`, instance can be used as the argument. @@ -147,7 +147,7 @@ class ConformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward (nn.Layer): Feed-forward module instance. `PositionwiseFeedForward` instance can be used as the argument. @@ -298,7 +298,7 @@ class SqueezeformerEncoderLayer(nn.Layer): Args: size (int): Input dimension. self_attn (paddle.nn.Layer): Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` + `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention` instance can be used as the argument. feed_forward1 (paddle.nn.Layer): Feed-forward module instance. `PositionwiseFeedForward` instance can be used as the argument. diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py deleted file mode 100644 index 06587c749..000000000 --- a/paddlespeech/s2t/training/gradclip.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle -from paddle.fluid import core -from paddle.fluid import layers -from paddle.fluid.dygraph import base as imperative_base - -from paddlespeech.s2t.utils.log import Log - -__all__ = ["ClipGradByGlobalNormWithLog"] - -logger = Log(__name__).getlog() - - -class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): - def __init__(self, clip_norm): - super().__init__(clip_norm) - - def __repr__(self): - return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})" - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - for i, (p, g) in enumerate(params_grads): - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = paddle.square(merge_grad) - sum_square = paddle.sum(square) - sum_square_list.append(sum_square) - - # debug log, not dump all since slow down train process - if i < 10: - logger.debug( - f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }") - - # all parameters have been filterd out - if len(sum_square_list) == 0: - return params_grads - - global_norm_var = paddle.concat(sum_square_list) - global_norm_var = paddle.sum(global_norm_var) - global_norm_var = paddle.sqrt(global_norm_var) - - # debug log - logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") - - max_global_norm = paddle.full( - shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm) - clip_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=global_norm_var, y=max_global_norm)) - for i, (p, g) in enumerate(params_grads): - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = paddle.multiply(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) - - # debug log, not dump all since slow down train process - if i < 10: - logger.debug( - f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}" - ) - - return params_and_grads diff --git a/paddlespeech/s2t/training/optimizer/__init__.py b/paddlespeech/s2t/training/optimizer/__init__.py index aafdc5b6a..90281e1ed 100644 --- a/paddlespeech/s2t/training/optimizer/__init__.py +++ b/paddlespeech/s2t/training/optimizer/__init__.py @@ -19,7 +19,7 @@ from typing import Text import paddle from paddle.optimizer import Optimizer from paddle.regularizer import L2Decay -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog + from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import instance_class from paddlespeech.s2t.utils.log import Log @@ -100,10 +100,9 @@ class OptimizerFactory(): assert "parameters" in args, "parameters not in args." assert "learning_rate" in args, "learning_rate not in args." 
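# Editor's note: an illustrative sketch, not part of the patch. With the
# custom ClipGradByGlobalNormWithLog deleted above, OptimizerFactory (in the
# lines just below) falls back to Paddle's built-in clipper and a plain float
# weight decay. A minimal equivalent optimizer setup could look like this; the
# Linear model and the hyper-parameter values are stand-ins for illustration.
import paddle

model = paddle.nn.Linear(256, 256)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    parameters=model.parameters(),
    weight_decay=1e-6,   # plain float instead of paddle.regularizer.L2Decay
    grad_clip=clip)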
- grad_clip = ClipGradByGlobalNormWithLog( + grad_clip = paddle.nn.ClipGradByGlobalNorm( args['grad_clip']) if "grad_clip" in args else None - weight_decay = L2Decay( - args['weight_decay']) if "weight_decay" in args else None + weight_decay = args.get("weight_decay", None) if weight_decay: logger.info(f'') if grad_clip: diff --git a/paddlespeech/s2t/training/optimizer/adadelta.py b/paddlespeech/s2t/training/optimizer/adadelta.py index 900b697c5..7c3950a90 100644 --- a/paddlespeech/s2t/training/optimizer/adadelta.py +++ b/paddlespeech/s2t/training/optimizer/adadelta.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle -from paddle.fluid import framework +from paddle import framework from paddle.optimizer import Optimizer __all__ = [] diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 0995a55da..9dd31a08b 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.onnx_infer import get_sess from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py index a46b84bd9..0cfb20354 100644 --- a/paddlespeech/server/engine/tts/online/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py @@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 20b98fae6..3a6461f8c 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler'] diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 57fe82a9c..7d93c026e 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,6 +18,5 @@ from . import exps from . import frontend from . import models from . import modules -from . import ssml from . import training from . 
import utils diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/paddlespeech/t2s/assets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/assets/sentences_mix.txt b/paddlespeech/t2s/assets/sentences_mix.txt index 06e97d14a..bfa0db636 100644 --- a/paddlespeech/t2s/assets/sentences_mix.txt +++ b/paddlespeech/t2s/assets/sentences_mix.txt @@ -5,4 +5,5 @@ 005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。 006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中! 007 我喜欢 eat apple, 你喜欢 drink milk。 -008 我们要去云南 team building, 非常非常 happy. \ No newline at end of file +008 我们要去云南 team building, 非常非常 happy. +009 AI for Sceience 平台。 \ No newline at end of file diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 97626db0b..24f2be7d5 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -44,10 +44,17 @@ from paddlespeech.t2s.utils import str2bool def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + if args.ngpu > 0 and paddle.is_compiled_with_cuda(): + paddle.set_device("gpu") + elif args.nxpu > 0 and paddle.is_compiled_with_xpu(): + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: paddle.set_device("cpu") else: - paddle.set_device("gpu") + raise ValueError( + "Please make sure that the paddle you installed matches the device type you set, " + "and that ngpu and nxpu cannot be negative at the same time.") + world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() @@ -183,7 +190,12 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if ngpu=0 and nxpu > 0, use xpu. 
if ngpu=0 and nxpu=0, use cpu.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 31fe14490..8a5269825 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu"], + choices=["gpu", "cpu", "xpu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 57c79dee1..9a07df64d 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -33,8 +33,8 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import * from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static -from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.sing_frontend import SingFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -99,14 +99,23 @@ def norm(data, mean, std): return (data - mean) / std -def get_chunks(data, block_size: int, pad_size: int): - data_len = data.shape[1] +def get_chunks(mel, chunk_size: int, pad_size: int): + """ + Split mel by chunk size with left and right context. + + Args: + mel (paddle.Tensor): mel spectrogram, shape (B, T, D) + chunk_size (int): chunk size + pad_size (int): size for left and right context. 
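+    Returns:
+        List[paddle.Tensor]: mel chunks, each extended with up to `pad_size`
+            frames of left/right context taken from the neighbouring chunks.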
+ """ + T = mel.shape[1] + n = math.ceil(T / chunk_size) + chunks = [] - n = math.ceil(data_len / block_size) for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - chunks.append(data[:, start:end, :]) + start = max(0, i * chunk_size - pad_size) + end = min((i + 1) * chunk_size + pad_size, T) + chunks.append(mel[:, start:end, :]) return chunks @@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": - items = re.split(r"\s+", line.strip(), 1) + items = re.split(r"\s+", line.strip(), maxsplit=1) + assert len(items) == 2 utt_id = items[0] - if lang in {'zh', 'canton'}: - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + sentence = items[1] sentences.append((utt_id, sentence)) return sentences @@ -319,6 +324,7 @@ def run_frontend( input_ids = {} if text.strip() != "" and re.match(r".*?.*?.*", text, re.DOTALL): + # using ssml input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, @@ -359,6 +365,7 @@ def run_frontend( outs.update({'is_slurs': is_slurs}) else: print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!") + outs.update({'phone_ids': phone_ids}) return outs diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 6189522db..e7cf7850e 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -219,7 +219,13 @@ def parse_args(): ) # other parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." + ) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") parser.add_argument( @@ -235,12 +241,14 @@ def parse_args(): def main(): args = parse_args() - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: + if args.ngpu > 0: paddle.set_device("gpu") + elif args.nxpu > 0: + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: + paddle.set_device("cpu") else: - print("ngpu should >= 0 !") + print("ngpu or nxpu should >= 0 !") evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 0c7b34b09..c63a5fbe9 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -13,6 +13,7 @@ # limitations under the License. 
import argparse from pathlib import Path +from pprint import pprint import paddle import soundfile as sf @@ -78,6 +79,7 @@ def evaluate(args): # whether dygraph to static if args.inference_dir: + print("convert am and voc to static model.") # acoustic model am_inference = am_to_static( am_inference=am_inference, @@ -92,6 +94,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = False # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) @@ -102,13 +105,19 @@ def evaluate(args): if am_name == 'speedyspeech': get_tone_ids = True + # wav samples N = 0 + # inference time cost T = 0 + + # [(uid, text), ] if am_name == 'diffsinger': sentences = get_sentences_svs(text_file=args.text) else: sentences = get_sentences(text_file=args.text, lang=args.lang) + for utt_id, sentence in sentences: + print(f"{utt_id} {sentence}") with timer() as t: if am_name == "diffsinger": text = "" @@ -116,6 +125,8 @@ def evaluate(args): else: text = sentence svs_input = None + + # frontend frontend_dict = run_frontend( frontend=frontend, text=text, @@ -124,25 +135,33 @@ def evaluate(args): lang=args.lang, svs_input=svs_input) phone_ids = frontend_dict['phone_ids'] + # pprint(f"{utt_id} {phone_ids}") + with paddle.no_grad(): flags = 0 for i in range(len(phone_ids)): + # sub phone, split by `sp` or punctuation. part_phone_ids = phone_ids[i] + # acoustic model if am_name == 'fastspeech2': # multi speaker if am_dataset in {"aishell3", "vctk", "mix", "canton"}: - spk_id = paddle.to_tensor(args.spk_id) + # multi-speaker + spk_id = paddle.to_tensor([args.spk_id]) mel = am_inference(part_phone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = frontend_dict['tone_ids'][i] if am_dataset in {"aishell3", "vctk", "mix"}: - spk_id = paddle.to_tensor(args.spk_id) + # multi-speaker + spk_id = paddle.to_tensor([args.spk_id]) mel = am_inference(part_phone_ids, part_tone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids, part_tone_ids) elif am_name == 'tacotron2': mel = am_inference(part_phone_ids) @@ -155,6 +174,7 @@ def evaluate(args): note=part_note_ids, note_dur=part_note_durs, is_slur=part_is_slurs, ) + # vocoder wav = voc_inference(mel) if flags == 0: @@ -162,17 +182,23 @@ def evaluate(args): flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + wav = wav_all.numpy() N += wav.size T += t.elapse + + # samples per second speed = wav.size / t.elapse + # generate one second wav need `RTF` seconds rtf = am_config.fs / speed print( f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) + sf.write( str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") @@ -273,7 +299,13 @@ def parse_args(): default=None, help="dir to save inference models") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + parser.add_argument( + "--nxpu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." 
+ ) parser.add_argument( "--text", type=str, @@ -303,12 +335,14 @@ def parse_args(): def main(): args = parse_args() - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: + if args.ngpu > 0: paddle.set_device("gpu") + elif args.nxpu > 0: + paddle.set_device("xpu") + elif args.ngpu == 0 and args.nxpu == 0: + paddle.set_device("cpu") else: - print("ngpu should >= 0 !") + print("ngpu or nxpu should >= 0 !") evaluate(args) diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 2ebd5ecc2..4e82e53ff 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -27,7 +27,7 @@ import yaml from yacs.config import CfgNode as Configuration from paddlespeech.t2s.datasets.get_feats import LogMelFBank -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English def get_lj_sentences(file_name, frontend): diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index 0cd7d224e..279407b38 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -21,7 +21,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.en_frontend import English from paddlespeech.t2s.models.transformer_tts import TransformerTTS from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow diff --git a/paddlespeech/t2s/frontend/__init__.py b/paddlespeech/t2s/frontend/__init__.py index 64015435e..a8f77d552 100644 --- a/paddlespeech/t2s/frontend/__init__.py +++ b/paddlespeech/t2s/frontend/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. from .generate_lexicon import * from .normalizer import * -from .phonectic import * from .punctuation import * +from .ssml import * from .tone_sandhi import * from .vocab import * from .zh_normalization import * diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 7a81b645d..9b2b11b3d 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddlespeech.t2s.frontend.phonectic import Phonetics """ A phonology system with ARPABET symbols and limited punctuations. The G2P conversion is done by g2p_en. @@ -19,55 +18,68 @@ conversion is done by g2p_en. Note that g2p_en does not handle words with hypen well. So make sure the input sentence is first normalized. """ -from paddlespeech.t2s.frontend.vocab import Vocab from g2p_en import G2p +from paddlespeech.t2s.frontend.phonectic import Phonetics +from paddlespeech.t2s.frontend.vocab import Vocab + class ARPABET(Phonetics): - """A phonology for English that uses ARPABET as the phoneme vocabulary. + """A phonology for English that uses ARPABET without stress as the phoneme vocabulary. 
+ + 47 symbols = 39 phones + 4 punctuations + 4 special tokens( ) + + The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker: + 0 — No stress + 1 — Primary stress + 2 — Secondary stress + + Phoneme Set: + Phoneme Example Translation + ------- ------- ----------- + AA odd AA D + AE at AE T + AH hut HH AH T + AO ought AO T + AW cow K AW + AY hide HH AY D + B be B IY + CH cheese CH IY Z + D dee D IY + DH thee DH IY + EH Ed EH D + ER hurt HH ER T + EY ate EY T + F fee F IY + G green G R IY N + HH he HH IY + IH it IH T + IY eat IY T + JH gee JH IY + K key K IY + L lee L IY + M me M IY + N knee N IY + NG ping P IH NG + OW oat OW T + OY toy T OY + P pee P IY + R read R IY D + S sea S IY + SH she SH IY + T tea T IY + TH theta TH EY T AH + UH hood HH UH D + UW two T UW + V vee V IY + W we W IY + Y yield Y IY L D + Z zee Z IY + ZH seizure S IY ZH ER + See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. - Phoneme Example Translation - ------- ------- ----------- - AA odd AA D - AE at AE T - AH hut HH AH T - AO ought AO T - AW cow K AW - AY hide HH AY D - B be B IY - CH cheese CH IY Z - D dee D IY - DH thee DH IY - EH Ed EH D - ER hurt HH ER T - EY ate EY T - F fee F IY - G green G R IY N - HH he HH IY - IH it IH T - IY eat IY T - JH gee JH IY - K key K IY - L lee L IY - M me M IY - N knee N IY - NG ping P IH NG - OW oat OW T - OY toy T OY - P pee P IY - R read R IY D - S sea S IY - SH she SH IY - T tea T IY - TH theta TH EY T AH - UH hood HH UH D - UW two T UW - V vee V IY - W we W IY - Y yield Y IY L D - Z zee Z IY - ZH seizure S IY ZH ER """ + # 39 phonemes phonemes = [ 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', @@ -76,6 +88,8 @@ class ARPABET(Phonetics): ] punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + # vowels carry a lexical stress marker: + # 0 unstressed(无重音), 1 primary stress(主重音)和 2 secondary stress(次重音) _stress_to_no_stress_ = { 'AA0': 'AA', 'AA1': 'AA', @@ -124,7 +138,12 @@ class ARPABET(Phonetics): 'UW2': 'UW' } + def __repr__(self): + fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): + # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) @@ -139,6 +158,7 @@ class ARPABET(Phonetics): Returns: List[str]: The list of pronunciation sequence. """ + # g2p and remove vowel stress phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) ] @@ -158,6 +178,7 @@ class ARPABET(Phonetics): Returns: List[int]: The list of pronunciation id sequence. """ + # phonemes to ids ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -189,11 +210,16 @@ class ARPABET(Phonetics): def vocab_size(self): """ Vocab size. """ - # 47 = 39 phones + 4 punctuations + 4 special tokens + # 47 = 39 phones + 4 punctuations + 4 special tokens( ) return len(self.vocab) class ARPABETWithStress(Phonetics): + """ + A phonology for English that uses ARPABET with stress as the phoneme vocabulary. 
+ + 77 symbols = 69 phones + 4 punctuations + 4 special tokens + """ phonemes = [ 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', @@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics): punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + def __repr__(self): + fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py index f2c7175fe..bbb7bcf00 100644 --- a/paddlespeech/t2s/frontend/canton_frontend.py +++ b/paddlespeech/t2s/frontend/canton_frontend.py @@ -29,7 +29,8 @@ INITIALS = [ INITIALS += ['sp', 'spl', 'spn', 'sil'] -def get_lines(cantons: List[str]): +def jyuping_to_phonemes(cantons: List[str]): + # jyuping to inital and final phones = [] for canton in cantons: for consonant in INITIALS: @@ -47,7 +48,7 @@ def get_lines(cantons: List[str]): class CantonFrontend(): def __init__(self, phone_vocab_path: str): self.text_normalizer = TextNormalizer() - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" self.vocab_phones = {} if phone_vocab_path: @@ -61,8 +62,11 @@ class CantonFrontend(): merge_sentences: bool=True) -> List[List[str]]: phones_list = [] for sentence in sentences: + # jyuping + # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.' phones_str = ToJyutping.get_jyutping_text(sentence) - phones_split = get_lines(phones_str.split(' ')) + # phonemes + phones_split = jyuping_to_phonemes(phones_str.split(' ')) phones_list.append(phones_split) return phones_list @@ -78,8 +82,11 @@ class CantonFrontend(): sentence: str, merge_sentences: bool=True, print_info: bool=False) -> List[List[str]]: + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # G2P phonemes = self._g2p(sentences, merge_sentences=merge_sentences) + if print_info: print("----------------------------") print("text norm results:") @@ -88,6 +95,7 @@ class CantonFrontend(): print("g2p results:") print(phonemes) print("----------------------------") + return phonemes def get_input_ids(self, @@ -98,9 +106,9 @@ class CantonFrontend(): phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, print_info=print_info) + result = {} temp_phone_ids = [] - for phones in phonemes: if phones: phone_ids = self._p2id(phones) @@ -108,6 +116,8 @@ class CantonFrontend(): if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py new file mode 100644 index 000000000..c58bed7d3 --- /dev/null +++ b/paddlespeech/t2s/frontend/en_frontend.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .phonectic import English diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index b8c16097c..2ebfe135e 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -18,9 +18,9 @@ from typing import List import numpy as np import paddle -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor +from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor +from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend class MixFrontend(): @@ -28,10 +28,9 @@ class MixFrontend(): g2p_model="pypinyin", phone_vocab_path=None, tone_vocab_path=None): - - self.zh_frontend = Frontend( + self.zh_frontend = ZhFrontend( phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) - self.en_frontend = English(phone_vocab_path=phone_vocab_path) + self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path) self.sp_id = self.zh_frontend.vocab_phones["sp"] self.sp_id_numpy = np.array([self.sp_id]) self.sp_id_tensor = paddle.to_tensor([self.sp_id]) @@ -55,15 +54,12 @@ class MixFrontend(): else: return False - def get_segment(self, text: str) -> List[str]: + def split_by_lang(self, text: str) -> List[str]: # sentence --> [ch_part, en_part, ch_part, ...] segments = [] types = [] - flag = 0 - temp_seg = "" - temp_lang = "" - # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point. + # Determine the type of each character. type: chinese, alphabet, other. for ch in text: if self.is_chinese(ch): types.append("zh") @@ -74,31 +70,31 @@ class MixFrontend(): assert len(types) == len(text) - for i in range(len(types)): + flag = 0 + temp_seg = "" + temp_lang = "" + + for i in range(len(text)): # find the first char of the seg if flag == 0: temp_seg += text[i] temp_lang = types[i] flag = 1 - else: if temp_lang == "other": - if types[i] == temp_lang: - temp_seg += text[i] - else: - temp_seg += text[i] + # text start is not lang. + temp_seg += text[i] + if types[i] != temp_lang: temp_lang = types[i] - else: - if types[i] == temp_lang: - temp_seg += text[i] - elif types[i] == "other": + if types[i] == temp_lang or types[i] == "other": + # merge same lang or other temp_seg += text[i] else: + # change lang segments.append((temp_seg, temp_lang)) temp_seg = text[i] - temp_lang = types[i] - flag = 1 + temp_lang = types[i] # new lang segments.append((temp_seg, temp_lang)) @@ -110,76 +106,95 @@ class MixFrontend(): get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - ''' 1. 添加SSML支持,先列出 文字 和 标签内容, - 然后添加到tmpSegments数组里 - ''' - d_inputs = MixTextProcessor.get_dom_split(sentence) - tmpSegments = [] - for instr in d_inputs: - ''' 暂时只支持 say-as ''' - if instr.lower().startswith("" segments.append(tuple(currentSeg)) + # en segments.append(seg) + # reset currentSeg = ["", ""] else: + # zh if currentSeg[0] == '': + # first see currentSeg[0] = seg[0] currentSeg[1] = seg[1] else: + # merge zh currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + # last zh currentSeg[0] = "" + currentSeg[0] + "" segments.append(tuple(currentSeg)) phones_list = [] result = {} + # 008 我们要去云南 team building, 非常非常 happy. 
+ # seg ('我们要去云南 ', 'zh') + # seg ('team building, ', 'en') + # seg ('非常非常 ', 'zh') + # seg ('happy.', 'en') + # [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')] for seg in segments: content = seg[0] lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=False, to_tensor=to_tensor) + + if not content: + continue + + if lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=False, to_tensor=to_tensor) + else: + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + # process ssml + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) else: - ''' 3. 把带speak tag的中文和普通文字分开处理 - ''' - if content.strip() != "" and \ - re.match(r".*?.*?.*", content, re.DOTALL): - input_ids = self.zh_frontend.get_input_ids_ssml( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - if add_sp: - if to_tensor: - input_ids["phone_ids"][-1] = paddle.concat( - [input_ids["phone_ids"][-1], self.sp_id_tensor]) - else: - input_ids["phone_ids"][-1] = np.concatenate( - (input_ids["phone_ids"][-1], self.sp_id_numpy)) + # process plain text + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + + if add_sp: + # add sp between zh and en + if to_tensor: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + else: + input_ids["phone_ids"][-1] = np.concatenate( + (input_ids["phone_ids"][-1], self.sp_id_numpy)) - for phones in input_ids["phone_ids"]: - phones_list.append(phones) + phones_list.extend(input_ids["phone_ids"]) if merge_sentences: merge_list = paddle.concat(phones_list) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index af86d9b80..d6c66f1e0 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -47,15 +47,34 @@ class Phonetics(ABC): class English(Phonetics): """ Normalize the input text sequence and convert into pronunciation id sequence. + + https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py + + phonemes = ["", "", "", ""] + [ + 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', + 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', + 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', + 'EY2', 'F', 'G', 'HH', + 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', + 'M', 'N', 'NG', 'OW0', 'OW1', + 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', + 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'] """ + LEXICON = { + # key using lowercase + "AI".lower(): [["EY0", "AY1"]], + } + def __init__(self, phone_vocab_path=None): self.backend = G2p() + self.backend.cmu.update(English.LEXICON) self.phonemes = list(self.backend.phonemes) self.punctuations = get_punctuations("en") self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab_phones = {} - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" 
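        # Editor's note (illustrative, not part of the patch): entries in
        # English.LEXICON override g2p_en's CMU lookup, because
        # self.backend.cmu.update(English.LEXICON) above merges them into the
        # backend's pronunciation dict. Keys are lowercase words; values are
        # lists of candidate pronunciations, each a list of ARPABET phonemes
        # with stress markers. A quick standalone check (the "tts" entry is a
        # hypothetical example, not from the patch):
        #
        #     from g2p_en import G2p
        #     g2p = G2p()
        #     g2p.cmu.update({"ai": [["EY0", "AY1"]],
        #                     "tts": [["T", "IY1", "T", "IY1", "EH1", "S"]]})
        #     print(g2p("AI TTS"))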
self.text_normalizer = TextNormalizer() if phone_vocab_path: with open(phone_vocab_path, 'rt', encoding='utf-8') as f: @@ -86,8 +105,8 @@ class English(Phonetics): sentence: str, merge_sentences: bool=False, to_tensor: bool=True) -> paddle.Tensor: - result = {} sentences = self.text_normalizer._split(sentence, lang="en") + phones_list = [] temp_phone_ids = [] for sentence in sentences: @@ -118,7 +137,10 @@ class English(Phonetics): if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + + result = {} result["phone_ids"] = temp_phone_ids + return result def numericalize(self, phonemes): diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py new file mode 100644 index 000000000..9a757e204 --- /dev/null +++ b/paddlespeech/t2s/frontend/polyphonic.py @@ -0,0 +1,36 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import yaml + + +class Polyphonic(): + def __init__(self): + with open( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'polyphonic.yaml'), + 'r', + encoding='utf-8') as polyphonic_file: + # 解析yaml + polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) + self.polyphonic_words = polyphonic_dict["polyphonic"] + + def correct_pronunciation(self, word, pinyin): + # 词汇被词典收录则返回纠正后的读音 + if word in self.polyphonic_words.keys(): + pinyin = self.polyphonic_words[word] + # 否则返回原读音 + return pinyin diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 6885035e7..f52b1cf58 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -47,4 +47,8 @@ polyphonic: 恶行: ['e4','xing2'] 唉: ['ai4'] 扎实: ['zha1','shi2'] - 干将: ['gan4','jiang4'] \ No newline at end of file + 干将: ['gan4','jiang4'] + 陈威行: ['chen2', 'wei1', 'hang2'] + 郭晟: ['guo1', 'sheng4'] + 中标: ['zhong4', 'biao1'] + 抗住: ['kang2', 'zhu4'] \ No newline at end of file diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py index c2aecf273..fff72a10c 100644 --- a/paddlespeech/t2s/frontend/sing_frontend.py +++ b/paddlespeech/t2s/frontend/sing_frontend.py @@ -29,7 +29,7 @@ class SingFrontend(): pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line. phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line. """ - self.punc = '[:,;。?!“”‘’\':,;.?!]' + self.punc = '[、:,;。?!“”‘’\':,;.?!]' self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'} if pinyin_phone_path: diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py similarity index 89% rename from paddlespeech/t2s/ssml/__init__.py rename to paddlespeech/t2s/frontend/ssml/__init__.py index 9b4db053b..b1b9d726f 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/frontend/ssml/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py similarity index 84% rename from paddlespeech/t2s/ssml/xml_processor.py rename to paddlespeech/t2s/frontend/ssml/xml_processor.py index 892ca371e..1d216c31b 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py @@ -1,4 +1,17 @@ # -*- coding: utf-8 -*- +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import re import xml.dom.minidom import xml.parsers.expat @@ -17,7 +30,6 @@ Note: xml 有5种特殊字符, &<>"' ' ' 例如: "姓名" - ''' @@ -61,17 +73,29 @@ class MixTextProcessor(): patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) mat = re.match(patn, mixstr) if mat: + # pre pre_xml = mat.group(1) + # between ... in_xml = mat.group(2) + # post after_xml = mat.group(3) - ctlist.append([pre_xml, []]) + # pre with none syllable + if pre_xml: + ctlist.append([pre_xml, []]) + + # between with syllable + # [(sub sentence, [syllables]), ...] dom = DomXml(in_xml) pinyinlist = dom.get_pinyins_for_xml() ctlist = ctlist + pinyinlist - ctlist.append([after_xml, []]) + + # post with none syllable + if after_xml: + ctlist.append([after_xml, []]) else: ctlist.append([mixstr, []]) + return ctlist @classmethod @@ -86,17 +110,21 @@ class MixTextProcessor(): in_xml = mat.group(2) after_xml = mat.group(3) - ctlist.append(pre_xml) + if pre_xml: + ctlist.append(pre_xml) + dom = DomXml(in_xml) tags = dom.get_text_and_sayas_tags() ctlist.extend(tags) - - ctlist.append(after_xml) - return ctlist + + if after_xml: + ctlist.append(after_xml) else: ctlist.append(mixstr) + return ctlist + class DomXml(): def __init__(self, xmlstr): self.tdom = parseString(xmlstr) #Document diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 42f7b8b2f..690f69aa2 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -20,6 +20,9 @@ from pypinyin import Style class ToneSandhi(): + def __repr__(self): + return "MandarinToneSandhi" + def __init__(self): self.must_neural_tone_words = { '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', @@ -65,9 +68,22 @@ class ToneSandhi(): '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得', '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打', - '考考', '整整', '莘莘', '落地', '算子', '家家户户' + '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青' } - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" 
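    # Editor's note (illustrative, not part of the patch): a minimal
    # third-tone sandhi example. For a two-character word whose finals both
    # carry tone 3, modified_tone() turns the first final into tone 2:
    #
    #     sandhi = ToneSandhi()
    #     sandhi.modified_tone("你好", "l", ["i3", "ao3"])   # -> ['i2', 'ao3']
    #
    # ("l" stands in for the jieba part-of-speech tag; the result of this
    # particular example does not depend on it.)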
+ + def _split_word(self, word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword):] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[:-len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. @@ -154,18 +170,8 @@ class ToneSandhi(): finals[i] = finals[i][:-1] + "4" return finals - def _split_word(self, word: str) -> List[str]: - word_list = jieba.cut_for_search(word) - word_list = sorted(word_list, key=lambda i: len(i), reverse=False) - first_subword = word_list[0] - first_begin_idx = word.find(first_subword) - if first_begin_idx == 0: - second_subword = word[len(first_subword):] - new_word_list = [first_subword, second_subword] - else: - second_subword = word[:-len(first_subword)] - new_word_list = [second_subword, first_subword] - return new_word_list + def _all_tone_three(self, finals: List[str]) -> bool: + return all(x[-1] == "3" for x in finals) def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: @@ -207,9 +213,6 @@ class ToneSandhi(): return finals - def _all_tone_three(self, finals: List[str]) -> bool: - return all(x[-1] == "3" for x in finals) - # merge "不" and the word behind it # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: @@ -336,6 +339,9 @@ class ToneSandhi(): def pre_merge_for_modify( self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + """ + seg: [(word, pos), ...] 
+ """ seg = self._merge_bu(seg) seg = self._merge_yi(seg) seg = self._merge_reduplication(seg) @@ -346,7 +352,11 @@ class ToneSandhi(): def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - + """ + word: 分词 + pos: 词性 + finals: 带调韵母, [final1, ..., finaln] + """ finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 35b97a93a..1431bc6d8 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -14,6 +14,7 @@ import os import re from operator import itemgetter +from pprint import pprint from typing import Dict from typing import List @@ -30,10 +31,11 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon +from paddlespeech.t2s.frontend.polyphonic import Polyphonic from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer -from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -41,6 +43,9 @@ INITIALS = [ ] INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] +# 0 for None, 5 for neutral +TONES = ["0", "1", "2", "3", "4", "5"] + def intersperse(lst, item): result = [item] * (len(lst) * 2 + 1) @@ -49,34 +54,19 @@ def intersperse(lst, item): def insert_after_character(lst, item): + """ + inset `item` after finals. + """ result = [item] + for phone in lst: result.append(phone) if phone not in INITIALS: # finals has tones # assert phone[-1] in "12345" result.append(item) - return result - - -class Polyphonic(): - def __init__(self): - with open( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'polyphonic.yaml'), - 'r', - encoding='utf-8') as polyphonic_file: - # 解析yaml - polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) - self.polyphonic_words = polyphonic_dict["polyphonic"] - def correct_pronunciation(self, word, pinyin): - # 词汇被词典收录则返回纠正后的读音 - if word in self.polyphonic_words.keys(): - pinyin = self.polyphonic_words[word] - # 否则返回原读音 - return pinyin + return result class Frontend(): @@ -85,10 +75,8 @@ class Frontend(): phone_vocab_path=None, tone_vocab_path=None, use_rhy=False): - self.mix_ssml_processor = MixTextProcessor() - self.tone_modifier = ToneSandhi() - self.text_normalizer = TextNormalizer() - self.punc = ":,;。?!“”‘’':,;.?!" + + self.punc = "、:,;。?!“”‘’':,;.?!" 
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4'] self.phrases_dict = { '开户行': [['ka1i'], ['hu4'], ['hang2']], @@ -108,28 +96,7 @@ class Frontend(): '嘞': [['lei5']], '掺和': [['chan1'], ['huo5']] } - self.use_rhy = use_rhy - if use_rhy: - self.rhy_predictor = RhyPredictor() - print("Rhythm predictor loaded.") - # g2p_model can be pypinyin and g2pM and g2pW - self.g2p_model = g2p_model - if self.g2p_model == "g2pM": - self.g2pM_model = G2pM() - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - elif self.g2p_model == "g2pW": - # use pypinyin as backup for non polyphonic characters in g2pW - self._init_pypinyin() - self.corrector = Polyphonic() - self.g2pM_model = G2pM() - self.g2pW_model = G2PWOnnxConverter( - style='pinyin', enable_non_tradional_chinese=True) - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - else: - self._init_pypinyin() self.must_erhua = { "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" } @@ -154,13 +121,51 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) + # SSML + self.mix_ssml_processor = MixTextProcessor() + # tone sandhi + self.tone_modifier = ToneSandhi() + # TN + self.text_normalizer = TextNormalizer() + + # prosody + self.use_rhy = use_rhy + if use_rhy: + self.rhy_predictor = RhyPredictor() + print("Rhythm predictor loaded.") + + # g2p + assert g2p_model in ('pypinyin', 'g2pM', 'g2pW') + self.g2p_model = g2p_model + if self.g2p_model == "g2pM": + self.g2pM_model = G2pM() + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + elif self.g2p_model == "g2pW": + # use pypinyin as backup for non polyphonic characters in g2pW + self._init_pypinyin() + self.corrector = Polyphonic() + self.g2pM_model = G2pM() + self.g2pW_model = G2PWOnnxConverter( + style='pinyin', enable_non_tradional_chinese=True) + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + else: + self._init_pypinyin() + def _init_pypinyin(self): + """ + Load pypinyin G2P module. + """ large_pinyin.load() load_phrases_dict(self.phrases_dict) # 调整字的拼音顺序 load_single_dict({ord(u'地'): u'de,di4'}) def _get_initials_finals(self, word: str) -> List[List[str]]: + """ + Get word initial and final by pypinyin or g2pM + """ initials = [] finals = [] if self.g2p_model == "pypinyin": @@ -171,11 +176,14 @@ class Frontend(): for c, v in zip(orig_initials, orig_finals): if re.match(r'i\d', v): if c in ['z', 'c', 's']: + # zi, ci, si v = re.sub('i', 'ii', v) elif c in ['zh', 'ch', 'sh', 'r']: + # zhi, chi, shi v = re.sub('i', 'iii', v) initials.append(c) finals.append(v) + elif self.g2p_model == "g2pM": pinyins = self.g2pM_model(word, tone=True, char_split=False) for pinyin in pinyins: @@ -192,58 +200,123 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required initials.append(pinyin) finals.append(pinyin) + return initials, finals + def _merge_erhua(self, + initials: List[str], + finals: List[str], + word: str, + pos: str) -> List[List[str]]: + """ + Do erhub. 
+ """ + # fix er1 + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': + finals[i] = 'er2' + + # 发音 + if word not in self.must_erhua and (word in self.not_erhua or + pos in {"a", "j", "nr"}): + return initials, finals + + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + + # 不发音 + new_initials = [] + new_finals = [] + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn in { + "er2", "er5" + } and word[-2:] not in self.not_erhua and new_finals: + new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] + else: + new_initials.append(initials[i]) + new_finals.append(phn) + + return new_initials, new_finals + # if merge_sentences, merge all sentences into one phone sequence def _g2p(self, sentences: List[str], merge_sentences: bool=True, with_erhua: bool=True) -> List[List[str]]: + """ + Return: list of list phonemes. + [['w', 'o3', 'm', 'en2', 'sp'], ...] + """ segments = sentences phones_list = [] + + # split by punctuation for seg in segments: if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) - phones = [] - # Replace all English words in the sentence + + # remove all English words in the sentence seg = re.sub('[a-zA-Z]+', '', seg) + + # add prosody mark if self.use_rhy: seg = self.rhy_predictor.get_prediction(seg) + + # [(word, pos), ...] seg_cut = psg.lcut(seg) - initials = [] - finals = [] + # fix wordseg bad case for sandhi seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) + # 为了多音词获得更好的效果,这里采用整句预测 + phones = [] + initials = [] + finals = [] if self.g2p_model == "g2pW": try: + # undo prosody if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) + + # g2p pinyins = self.g2pW_model(seg)[0] except Exception: - # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 + # g2pW 模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 print("[%s] not in g2pW dict,use g2pM" % seg) pinyins = self.g2pM_model(seg, tone=True, char_split=False) + + # do prosody if self.use_rhy: rhy_text = self.rhy_predictor.get_prediction(seg) final_py = self.rhy_predictor.pinyin_align(pinyins, rhy_text) pinyins = final_py + pre_word_length = 0 for word, pos in seg_cut: sub_initials = [] sub_finals = [] now_word_length = pre_word_length + len(word) + + # skip english word if pos == 'eng': pre_word_length = now_word_length continue + word_pinyins = pinyins[pre_word_length:now_word_length] - # 矫正发音 + + # 多音字消歧 word_pinyins = self.corrector.correct_pronunciation( word, word_pinyins) + for pinyin, char in zip(word_pinyins, word): if pinyin is None: pinyin = char + pinyin = pinyin.replace("u:", "v") + if pinyin in self.pinyin2phone: initial_final_list = self.pinyin2phone[ pinyin].split(" ") @@ -257,28 +330,41 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required sub_initials.append(pinyin) sub_finals.append(pinyin) + pre_word_length = now_word_length + # tone sandhi sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) else: + # pypinyin, g2pM for word, pos in seg_cut: if pos == 'eng': + # skip english word continue + + # g2p sub_initials, sub_finals = self._get_initials_finals(word) + # tone sandhi sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = 
self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) + + # sum(iterable[, start]) initials = sum(initials, []) finals = sum(finals, []) @@ -287,111 +373,34 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + # replace punctuation by `sp` if c and c in self.punc: phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: phones.append(v) - phones_list.append(phones) - if merge_sentences: - merge_list = sum(phones_list, []) - # rm the last 'sp' to avoid the noise at the end - # cause in the training data, no 'sp' in the end - if merge_list[-1] == 'sp': - merge_list = merge_list[:-1] - phones_list = [] - phones_list.append(merge_list) - return phones_list - def _split_word_to_char(self, words): - res = [] - for x in words: - res.append(x) - return res - - # if using ssml, have pingyin specified, assign pinyin to words - def _g2p_assign(self, - words: List[str], - pinyin_spec: List[str], - merge_sentences: bool=True) -> List[List[str]]: - phones_list = [] - initials = [] - finals = [] - - words = self._split_word_to_char(words[0]) - for pinyin, char in zip(pinyin_spec, words): - sub_initials = [] - sub_finals = [] - pinyin = pinyin.replace("u:", "v") - #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu - if pinyin in self.pinyin2phone: - initial_final_list = self.pinyin2phone[pinyin].split(" ") - if len(initial_final_list) == 2: - sub_initials.append(initial_final_list[0]) - sub_finals.append(initial_final_list[1]) - elif len(initial_final_list) == 1: - sub_initials.append('') - sub_finals.append(initial_final_list[1]) - else: - # If it's not pinyin (possibly punctuation) or no conversion is required - sub_initials.append(pinyin) - sub_finals.append(pinyin) - initials.append(sub_initials) - finals.append(sub_finals) + phones_list.append(phones) - initials = sum(initials, []) - finals = sum(finals, []) - phones = [] - for c, v in zip(initials, finals): - # NOTE: post process for pypinyin outputs - # we discriminate i, ii and iii - if c and c not in self.punc: - phones.append(c) - if c and c in self.punc: - phones.append('sp') - if v and v not in self.punc and v not in self.rhy_phns: - phones.append(v) - phones_list.append(phones) + # merge split sub sentence into one sentence. 
if merge_sentences: + # sub sentence phonemes merge_list = sum(phones_list, []) # rm the last 'sp' to avoid the noise at the end # cause in the training data, no 'sp' in the end if merge_list[-1] == 'sp': merge_list = merge_list[:-1] + + # sentence phonemes phones_list = [] phones_list.append(merge_list) - return phones_list - def _merge_erhua(self, - initials: List[str], - finals: List[str], - word: str, - pos: str) -> List[List[str]]: - # fix er1 - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': - finals[i] = 'er2' - if word not in self.must_erhua and (word in self.not_erhua or - pos in {"a", "j", "nr"}): - return initials, finals - # "……" 等情况直接返回 - if len(finals) != len(word): - return initials, finals - - assert len(finals) == len(word) - - new_initials = [] - new_finals = [] - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn in { - "er2", "er5" - } and word[-2:] not in self.not_erhua and new_finals: - new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] - else: - new_finals.append(phn) - new_initials.append(initials[i]) - return new_initials, new_finals + return phones_list def _p2id(self, phonemes: List[str]) -> np.ndarray: + """ + Phoneme to Index + """ # replace unk phone with sp phonemes = [ phn if phn in self.vocab_phones else "sp" for phn in phonemes @@ -400,6 +409,9 @@ class Frontend(): return np.array(phone_ids, np.int64) def _t2id(self, tones: List[str]) -> np.ndarray: + """ + Tone to Index. + """ # replace unk phone with sp tones = [tone if tone in self.vocab_tones else "0" for tone in tones] tone_ids = [self.vocab_tones[item] for item in tones] @@ -407,6 +419,9 @@ class Frontend(): def _get_phone_tone(self, phonemes: List[str], get_tone_ids: bool=False) -> List[List[str]]: + """ + Get tone from phonemes. + """ phones = [] tones = [] if get_tone_ids and self.vocab_tones: @@ -423,13 +438,14 @@ class Frontend(): -1] == 'r' and phone not in self.vocab_phones and phone[: -1] in self.vocab_phones: phones.append(phone[:-1]) - phones.append("er") tones.append(tone) + phones.append("er") tones.append("2") else: phones.append(phone) tones.append(tone) else: + # initals with 0 tone. 
phones.append(full_phone) tones.append('0') else: @@ -443,6 +459,7 @@ class Frontend(): phones.append("er2") else: phones.append(phone) + return phones, tones def get_phonemes(self, @@ -451,10 +468,16 @@ class Frontend(): with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P + """ + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # Prosody & WS & g2p & tone sandhi phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) - # change all tones to `1` + + # simulate robot pronunciation, change all tones to `1` if robot: new_phonemes = [] for sentence in phonemes: @@ -466,6 +489,7 @@ class Frontend(): new_sentence.append(item) new_phonemes.append(new_sentence) phonemes = new_phonemes + if print_info: print("----------------------------") print("text norm results:") @@ -476,25 +500,104 @@ class Frontend(): print("----------------------------") return phonemes - #@an added for ssml pinyin + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + """ + Replace phoneme by SSML + """ + phones_list = [] + initials = [] + finals = [] + + # to charactor list + words = self._split_word_to_char(words[0]) + + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + + phones = [] + for c, v in zip(initials, finals): + # c for consonant, v for vowel + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + # replace punc to `sp` + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: + phones.append(v) + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + return phones_list + def get_phonemes_ssml(self, ssml_inputs: list, merge_sentences: bool=True, with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P with SSML support. 
+ """ all_phonemes = [] for word_pinyin_item in ssml_inputs: phonemes = [] + + # ['你喜欢', []] -> 你喜欢 [] sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + # g2p word w/o specified phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) else: - # phonemes should be pinyin_spec + # word phonemes specified by phonemes = self._g2p_assign( sentences, pinyin_spec, merge_sentences=merge_sentences) @@ -512,17 +615,24 @@ class Frontend(): new_phonemes.append(new_sentence) all_phonemes = new_phonemes + if merge_sentences: + all_phonemes = [sum(all_phonemes, [])] + if print_info: print("----------------------------") print("text norm results:") print(sentences) print("----------------------------") print("g2p results:") - print(all_phonemes[0]) + print(all_phonemes) print("----------------------------") - return [sum(all_phonemes, [])] + + return all_phonemes def add_sp_if_no(self, phonemes): + """ + Prosody mark #4 added at sentence end. + """ if not phonemes[-1][-1].startswith('sp'): phonemes[-1].append('sp4') return phonemes @@ -542,8 +652,11 @@ class Frontend(): merge_sentences=merge_sentences, print_info=print_info, robot=robot) + + # add #4 for sentence end. if self.use_rhy: phonemes = self.add_sp_if_no(phonemes) + result = {} phones = [] tones = [] @@ -551,28 +664,33 @@ class Frontend(): temp_tone_ids = [] for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result - # @an added for ssml def get_input_ids_ssml( self, sentence: str, @@ -584,12 +702,15 @@ class Frontend(): blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - l_inputs = MixTextProcessor.get_pinyin_split(sentence) + # split setence by SSML tag. 
+ texts = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( - l_inputs, + texts, merge_sentences=merge_sentences, print_info=print_info, robot=robot) + result = {} phones = [] tones = [] @@ -599,21 +720,26 @@ class Frontend(): for part_phonemes in phonemes: phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result diff --git a/runtime/README.md b/runtime/README.md index 553bb29ad..0e9c243e9 100644 --- a/runtime/README.md +++ b/runtime/README.md @@ -2,7 +2,7 @@ ## Environment We develop under: -* python - 3.7 +* python - >=3.8 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 @@ -98,7 +98,7 @@ please install paddlepaddle >= 2.4rc ``` -cd $YOUR_ENV_PATH/lib/python3.7/site-packages/paddle/fluid +cd $YOUR_ENV_PATH/lib/python3.8/site-packages/paddle/fluid patchelf --set-soname libpaddle.so libpaddle.so ``` diff --git a/runtime/tools/venv.sh b/runtime/tools/venv.sh index 3952988c6..2aa7e5095 100755 --- a/runtime/tools/venv.sh +++ b/runtime/tools/venv.sh @@ -1,5 +1,5 @@ #!/bin/bash set -ex -PYTHON=python3.7 +PYTHON=python3.8 test -d venv || virtualenv -p ${PYTHON} venv diff --git a/setup.py b/setup.py index 07b411bd0..af7c4dc3d 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,8 @@ base = [ "hyperpyyaml", "inflect", "jsonlines", + # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x + "numpy==1.23.5", "librosa==0.8.1", "scipy>=1.4.0", "loguru", @@ -260,6 +262,7 @@ setup_info = dict( long_description=read("README.md"), long_description_content_type="text/markdown", keywords=[ + "SSL" "speech", "asr", "tts", @@ -268,12 +271,19 @@ setup_info = dict( "text frontend", "MFA", "paddlepaddle", + "paddleaudio", + "streaming asr", + "streaming tts", "beam search", "ctcdecoder", "deepspeech2", + "wav2vec2", + "hubert", + "wavlm", "transformer", "conformer", "fastspeech2", + "hifigan", "gan vocoders", ], python_requires='>=3.7', diff --git a/tests/unit/tts/test_enfrontend.py b/tests/unit/tts/test_enfrontend.py new file mode 100644 index 000000000..4f8c49305 --- /dev/null +++ b/tests/unit/tts/test_enfrontend.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend + +if __name__ == '__main__': + + fe = EnFrontend() + + text = "AI for Sceience" + phonemes = fe.phoneticize(text) + print(text) + print(phonemes) + + text = "eight" + phonemes = fe.phoneticize(text) + print(text) + print(phonemes) diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py new file mode 100644 index 000000000..5751dd2a7 --- /dev/null +++ b/tests/unit/tts/test_mixfrontend.py @@ -0,0 +1,444 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import tempfile + +from paddlespeech.t2s.frontend.mix_frontend import MixFrontend + +# mix zh & en phonemes +phone_id_str = """ + 0 + 1 +AA0 2 +AA1 3 +AA2 4 +AE0 5 +AE1 6 +AE2 7 +AH0 8 +AH1 9 +AH2 10 +AO0 11 +AO1 12 +AO2 13 +AW0 14 +AW1 15 +AW2 16 +AY0 17 +AY1 18 +AY2 19 +B 20 +CH 21 +D 22 +DH 23 +EH0 24 +EH1 25 +EH2 26 +ER0 27 +ER1 28 +ER2 29 +EY0 30 +EY1 31 +EY2 32 +F 33 +G 34 +HH 35 +IH0 36 +IH1 37 +IH2 38 +IY0 39 +IY1 40 +IY2 41 +JH 42 +K 43 +L 44 +M 45 +N 46 +NG 47 +OW0 48 +OW1 49 +OW2 50 +OY0 51 +OY1 52 +OY2 53 +P 54 +R 55 +S 56 +SH 57 +T 58 +TH 59 +UH0 60 +UH1 61 +UH2 62 +UW0 63 +UW1 64 +UW2 65 +V 66 +W 67 +Y 68 +Z 69 +ZH 70 +a1 71 +a2 72 +a3 73 +a4 74 +a5 75 +ai1 76 +ai2 77 +ai3 78 +ai4 79 +ai5 80 +air2 81 +air3 82 +air4 83 +an1 84 +an2 85 +an3 86 +an4 87 +an5 88 +ang1 89 +ang2 90 +ang3 91 +ang4 92 +ang5 93 +angr2 94 +angr4 95 +anr1 96 +anr3 97 +anr4 98 +ao1 99 +ao2 100 +ao3 101 +ao4 102 +ao5 103 +aor1 104 +aor3 105 +aor4 106 +aor5 107 +ar2 108 +ar3 109 +ar4 110 +ar5 111 +b 112 +c 113 +ch 114 +d 115 +e1 116 +e2 117 +e3 118 +e4 119 +e5 120 +ei1 121 +ei2 122 +ei3 123 +ei4 124 +ei5 125 +eir4 126 +en1 127 +en2 128 +en3 129 +en4 130 +en5 131 +eng1 132 +eng2 133 +eng3 134 +eng4 135 +eng5 136 +engr4 137 +enr1 138 +enr2 139 +enr3 140 +enr4 141 +enr5 142 +er1 143 +er2 144 +er3 145 +er4 146 +er5 147 +f 148 +g 149 +h 150 +i1 151 +i2 152 +i3 153 +i4 154 +i5 155 +ia1 156 +ia2 157 +ia3 158 +ia4 159 +ia5 160 +ian1 161 +ian2 162 +ian3 163 +ian4 164 +ian5 165 +iang1 166 +iang2 167 +iang3 168 +iang4 169 +iang5 170 +iangr4 171 +ianr1 172 +ianr2 173 +ianr3 174 +ianr4 175 +ianr5 176 +iao1 177 +iao2 178 +iao3 179 +iao4 180 +iao5 181 +iaor1 182 +iaor2 183 +iaor3 184 +iaor4 185 +iar1 186 +iar3 187 +iar4 188 +ie1 189 +ie2 190 +ie3 191 +ie4 192 +ie5 193 +ii1 194 +ii2 195 +ii3 196 +ii4 197 +ii5 198 +iii1 199 +iii2 200 +iii3 201 +iii4 202 +iii5 203 +iiir1 204 +iiir4 205 +iir2 206 +in1 207 +in2 208 +in3 209 +in4 210 +in5 211 +ing1 212 +ing2 213 +ing3 214 +ing4 215 +ing5 216 +ingr1 217 +ingr2 218 +ingr3 219 +ingr4 220 +inr1 221 +inr4 222 +io1 223 +io3 224 +io5 225 +iong1 226 +iong2 227 +iong3 228 +iong4 229 +iong5 230 +iou1 231 +iou2 232 +iou3 233 +iou4 234 +iou5 235 +iour1 236 +iour2 237 +iour3 238 +iour4 239 +ir1 240 +ir2 241 +ir3 242 +ir4 243 +ir5 244 +j 245 +k 246 +l 247 +m 248 +n 249 +o1 250 +o2 251 +o3 252 +o4 253 +o5 254 +ong1 255 +ong2 256 +ong3 257 +ong4 258 +ong5 259 +ongr4 260 
+or2 261 +ou1 262 +ou2 263 +ou3 264 +ou4 265 +ou5 266 +our2 267 +our3 268 +our4 269 +our5 270 +p 271 +q 272 +r 273 +s 274 +sh 275 +sil 276 +sp 277 +spl 278 +spn 279 +t 280 +u1 281 +u2 282 +u3 283 +u4 284 +u5 285 +ua1 286 +ua2 287 +ua3 288 +ua4 289 +ua5 290 +uai1 291 +uai2 292 +uai3 293 +uai4 294 +uai5 295 +uair4 296 +uan1 297 +uan2 298 +uan3 299 +uan4 300 +uan5 301 +uang1 302 +uang2 303 +uang3 304 +uang4 305 +uang5 306 +uangr4 307 +uanr1 308 +uanr2 309 +uanr3 310 +uanr4 311 +uanr5 312 +uar1 313 +uar2 314 +uar4 315 +uei1 316 +uei2 317 +uei3 318 +uei4 319 +uei5 320 +ueir1 321 +ueir2 322 +ueir3 323 +ueir4 324 +uen1 325 +uen2 326 +uen3 327 +uen4 328 +uen5 329 +ueng1 330 +ueng2 331 +ueng3 332 +ueng4 333 +uenr1 334 +uenr2 335 +uenr3 336 +uenr4 337 +uo1 338 +uo2 339 +uo3 340 +uo4 341 +uo5 342 +uor1 343 +uor2 344 +uor3 345 +uor5 346 +ur1 347 +ur2 348 +ur3 349 +ur4 350 +ur5 351 +v1 352 +v2 353 +v3 354 +v4 355 +v5 356 +van1 357 +van2 358 +van3 359 +van4 360 +van5 361 +vanr1 362 +vanr2 363 +vanr3 364 +vanr4 365 +ve1 366 +ve2 367 +ve3 368 +ve4 369 +ve5 370 +ver3 371 +ver4 372 +vn1 373 +vn2 374 +vn3 375 +vn4 376 +vn5 377 +vnr2 378 +vr3 379 +x 380 +z 381 +zh 382 +, 383 +. 384 +? 385 +! 386 + 387 +""" + +if __name__ == '__main__': + with tempfile.NamedTemporaryFile(mode='wt') as f: + phone_ids = phone_id_str.split() + for phone, id in zip(phone_ids[::2], phone_ids[1::2]): + f.write(f"{phone} {id}") + f.write('\n') + f.flush() + + frontend = MixFrontend(phone_vocab_path=f.name) + + text = "hello, 我爱北京天安们,what about you." + print(text) + # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + text = "hello?!!我爱北京天安们,what about you." + print(text) + # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + text = " hello,我爱北京天安们,what about you." + print(text) + # [(' hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')] + segs = frontend.split_by_lang(text) + print(segs) + + # 对于SSML的xml标记处理不好。需要先解析SSML,后处理中英的划分。 + text = "我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。" + print(text) + # [('', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干', 'en'), ('死的。', 'en')] + segs = frontend.split_by_lang(text) + print(segs) diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py new file mode 100644 index 000000000..4c3e9d538 --- /dev/null +++ b/tests/unit/tts/test_ssml.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor + +if __name__ == '__main__': + text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。thank you." + + # SSML: 13 + # 0 ['你好吗,', []] + # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []] + # 2 ['倒', ['dao3']] + # 3 ['在沙滩上,沙滩上倒了一堆', []] + # 4 ['土', ['tu3']] + # 5 ['。想象', []] + # 6 ['干干', ['gan1', 'gan1']] + # 7 ['的树干', []] + # 8 ['倒', ['dao3']] + # 9 ['了,里面有个干尸,不知是被谁', []] + # 10 ['干', ['gan4']] + # 11 ['死的。', []] + # 12 ['thank you.', []] + inputs = MixTextProcessor.get_pinyin_split(text) + print(f"SSML get_pinyin_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML get_dom_split: 13 + # 0 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪 + # 2 + # 3 在沙滩上,沙滩上倒了一堆 + # 4 + # 5 。 想象 + # 6 干干 + # 7 的树干 + # 8 + # 9 了, 里面有个干尸,不知是被谁 + # 10 + # 11 死的。 + # 12 thank you. + inputs = MixTextProcessor.get_dom_split(text) + print(f"SSML get_dom_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML object.get_pinyin_split: 246 + # 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。 + outs = MixTextProcessor().get_xml_content(text) + print(f"SSML object.get_pinyin_split: {len(outs)}") + print(outs) + print() + + # SSML object.get_content_split: 30 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干 + # 倒了, 里面有个干尸,不知是被谁死的。 + # 2 thank you. + outs = MixTextProcessor().get_content_split(text) + print(f"SSML object.get_content_split: {len(outs)}") + for i, sub in enumerate(outs): + print(i, sub) + print() + + import json + import xmltodict + text = "我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。" + ssml = xmltodict.parse(text) + print(json.dumps(ssml)) + print(ssml['speak'].keys()) + print(ssml['speak']['#text']) + print(ssml['speak']['say-as']) diff --git a/tools/Makefile b/tools/Makefile index a5a4485da..c6c667cd0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,5 +1,5 @@ SHELL:= /bin/bash -PYTHON:= python3.7 +PYTHON:= python3.8 CXX ?= g++ CC ?= gcc # used for sph2pipe
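
A note on the frontend changes above: the reworked `_merge_erhua` folds a trailing 儿 into the previous syllable's final (e.g. `ua1` becomes `uar1`) instead of keeping a separate `er` phone, except for words and POS tags that should not be erhua-ised. Below is a minimal standalone sketch of that transformation, not the frontend's actual method; the `MUST_ERHUA` / `NOT_ERHUA` sets are assumed placeholder entries standing in for the frontend's `self.must_erhua` / `self.not_erhua` word lists.

```python
# Sketch of the erhua merge behaviour shown in the _merge_erhua hunk above.
from typing import List, Tuple

MUST_ERHUA = {"小院儿"}       # assumed example entries
NOT_ERHUA = {"女儿", "婴儿"}   # assumed example entries

def merge_erhua(initials: List[str], finals: List[str],
                word: str, pos: str) -> Tuple[List[str], List[str]]:
    # er1 is normalised to er2 before merging
    if finals and word[-1] == "儿" and finals[-1] == "er1":
        finals[-1] = "er2"

    # keep the standalone 儿 for words / POS tags that are not erhua
    if word not in MUST_ERHUA and (word in NOT_ERHUA or pos in {"a", "j", "nr"}):
        return initials, finals
    # e.g. "……": no per-character pinyin, return unchanged
    if len(finals) != len(word):
        return initials, finals

    new_initials, new_finals = [], []
    for i, phn in enumerate(finals):
        if (i == len(finals) - 1 and word[i] == "儿"
                and phn in {"er2", "er5"}
                and word[-2:] not in NOT_ERHUA and new_finals):
            # fold 儿 into the previous final: "ua1" -> "uar1"
            new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
        else:
            new_initials.append(initials[i])
            new_finals.append(phn)
    return new_initials, new_finals

print(merge_erhua(["h", ""], ["ua1", "er5"], "花儿", "n"))
# (['h'], ['uar1'])
```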
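The `_p2id` helper maps any phone missing from the vocabulary to `sp`, and `_g2p` with `merge_sentences=True` concatenates the per-sentence phone lists and drops a trailing `sp`, since the training data has no sentence-final `sp`. A toy sketch of both behaviours, using an assumed miniature vocabulary rather than the real phone table:

```python
# Sketch of the vocab lookup and sentence-merge behaviour in the diff above.
import numpy as np

vocab = {"sp": 0, "n": 1, "i3": 2, "h": 3, "ao3": 4}  # toy vocabulary

def p2id(phonemes):
    # unknown phones fall back to "sp", as in _p2id
    phonemes = [p if p in vocab else "sp" for p in phonemes]
    return np.array([vocab[p] for p in phonemes], np.int64)

def merge(phones_list):
    merged = sum(phones_list, [])        # concat sub-sentence phone lists
    if merged and merged[-1] == "sp":    # rm the trailing 'sp'
        merged = merged[:-1]
    return [merged]

phones_list = [["n", "i3", "h", "ao3", "sp"]]
print(merge(phones_list))               # [['n', 'i3', 'h', 'ao3']]
print(p2id(merge(phones_list)[0]))      # [1 2 3 4]
```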
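The new `tests/unit/tts/test_mixfrontend.py` builds its mixed zh/en phone vocabulary from `phone_id_str`, one `<phone> <id>` pair per line, and passes the resulting file to `MixFrontend(phone_vocab_path=...)`. A small sketch of writing and re-reading that file format; the three entries below are just an assumed excerpt of the full table:

```python
# Sketch of the "<phone> <id>" vocab file format used by the new test.
import tempfile

pairs = [("AA0", 2), ("a1", 71), ("sp", 277)]  # assumed excerpt

with tempfile.NamedTemporaryFile(mode="w+t", suffix=".txt") as f:
    for phone, idx in pairs:
        f.write(f"{phone} {idx}\n")
    f.flush()
    f.seek(0)

    vocab = {}
    for line in f:
        phone, idx = line.split()
        vocab[phone] = int(idx)

print(vocab)   # {'AA0': 2, 'a1': 71, 'sp': 277}
```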