diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py
index e69de29b..595add0a 100644
--- a/paddlespeech/t2s/assets/__init__.py
+++ b/paddlespeech/t2s/assets/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 93146df0..cafd065a 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -117,7 +117,7 @@ def evaluate(args):
         sentences = get_sentences(text_file=args.text, lang=args.lang)
 
     for utt_id, sentence in sentences:
-        print(f"{utt_id} {sentence} ...")
+        print(f"{utt_id} {sentence}")
         with timer() as t:
             if am_name == "diffsinger":
                 text = ""
@@ -135,7 +135,7 @@ def evaluate(args):
                 lang=args.lang,
                 svs_input=svs_input)
             phone_ids = frontend_dict['phone_ids']
-            # pprint(f"process: {utt_id} {phone_ids}")
+            # pprint(f"{utt_id} {phone_ids}")
 
             with paddle.no_grad():
                 flags = 0
diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py
index dd3484d5..bbb7bcf0 100644
--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]):
 class CantonFrontend():
     def __init__(self, phone_vocab_path: str):
         self.text_normalizer = TextNormalizer()
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
 
         self.vocab_phones = {}
         if phone_vocab_path:
diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py
index 81991e0d..c58bed7d 100644
--- a/paddlespeech/t2s/frontend/en_frontend.py
+++ b/paddlespeech/t2s/frontend/en_frontend.py
@@ -1 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .phonectic import English
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index 628984cb..2ebfe135 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -106,76 +106,95 @@ class MixFrontend():
                       get_tone_ids: bool=False,
                       add_sp: bool=True,
                       to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
-        ''' 1. 添加SSML支持，先列出 文字 和 <say-as>标签内容，
-                然后添加到tmpSegments数组里
-        '''
-        d_inputs = MixTextProcessor.get_dom_split(sentence)
-        tmpSegments = []
-        for instr in d_inputs:
-            ''' 暂时只支持 say-as '''
-            if instr.lower().startswith("<say-as"):
-                tmpSegments.append((instr, "zh"))
+        # XML Document Object Model (DOM)
+        doms = MixTextProcessor.get_dom_split(sentence)
+
+        lang_splits = []
+        for dom in doms:
+            if dom.lower().startswith("<say-as pinyin="):
+                # `<say-as pinyin=` for zh lang
+                lang_splits.append((dom, "zh"))
             else:
-                tmpSegments.extend(self.split_by_lang(instr))
-        ''' 2. 把zh的merge到一起，避免合成结果中间停顿
-        '''
+                # process zh, en and zh/en
+                lang_splits.extend(self.split_by_lang(dom))
+
+        # merge adjacent zh segments
         segments = []
         currentSeg = ["", ""]
-        for seg in tmpSegments:
+        for seg in lang_splits:
             if seg[1] == "en" or seg[1] == "other":
                 if currentSeg[0] == '':
+                    # no zh text buffered: emit this segment directly
                     segments.append(seg)
                 else:
+                    # zh
                     currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
                     segments.append(tuple(currentSeg))
+                    # en
                     segments.append(seg)
+                    # reset
                     currentSeg = ["", ""]
             else:
+                # zh
                 if currentSeg[0] == '':
+                    # first zh segment: start a new buffer
                     currentSeg[0] = seg[0]
                     currentSeg[1] = seg[1]
                 else:
+                    # append to the buffered zh text
                     currentSeg[0] = currentSeg[0] + seg[0]
+
         if currentSeg[0] != '':
+            # last zh
             currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
             segments.append(tuple(currentSeg))
 
         phones_list = []
         result = {}
 
+        # 008 我们要去云南 team building, 非常非常 happy.
+        # seg ('我们要去云南 ', 'zh')
+        # seg ('team building, ', 'en')
+        # seg ('非常非常 ', 'zh')
+        # seg ('happy.', 'en')
+        # [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
         for seg in segments:
             content = seg[0]
             lang = seg[1]
-            if content != '':
-                if lang == "en":
-                    input_ids = self.en_frontend.get_input_ids(
-                        content, merge_sentences=False, to_tensor=to_tensor)
+
+            if not content:
+                continue
+
+            if lang == "en":
+                input_ids = self.en_frontend.get_input_ids(
+                    content, merge_sentences=False, to_tensor=to_tensor)
+            else:
+                if content.strip() != "" and \
+                    re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
+                    # process ssml
+                    input_ids = self.zh_frontend.get_input_ids_ssml(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
                 else:
-                    ''' 3. 把带speak tag的中文和普通文字分开处理
-                    '''
-                    if content.strip() != "" and \
-                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
-                        input_ids = self.zh_frontend.get_input_ids_ssml(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                    else:
-                        input_ids = self.zh_frontend.get_input_ids(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                if add_sp:
-                    if to_tensor:
-                        input_ids["phone_ids"][-1] = paddle.concat(
-                            [input_ids["phone_ids"][-1], self.sp_id_tensor])
-                    else:
-                        input_ids["phone_ids"][-1] = np.concatenate(
-                            (input_ids["phone_ids"][-1], self.sp_id_numpy))
+                    # process plain text
+                    input_ids = self.zh_frontend.get_input_ids(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
+
+            if add_sp:
+                # add sp between zh and en
+                if to_tensor:
+                    input_ids["phone_ids"][-1] = paddle.concat(
+                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
+                else:
+                    input_ids["phone_ids"][-1] = np.concatenate(
+                        (input_ids["phone_ids"][-1], self.sp_id_numpy))
 
-                for phones in input_ids["phone_ids"]:
-                    phones_list.append(phones)
+            phones_list.extend(input_ids["phone_ids"])
 
         if merge_sentences:
             merge_list = paddle.concat(phones_list)
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index 2112ff4a..b3f64dc5 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -55,7 +55,7 @@ class English(Phonetics):
         self.punctuations = get_punctuations("en")
         self.vocab = Vocab(self.phonemes + self.punctuations)
         self.vocab_phones = {}
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
         self.text_normalizer = TextNormalizer()
         if phone_vocab_path:
             with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py
new file mode 100644
index 00000000..885064a6
--- /dev/null
+++ b/paddlespeech/t2s/frontend/polyphonic.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import yaml
+
+
+class Polyphonic():
+    """Lookup table of fixed pronunciations for polyphonic words.
+
+    The table is read from the ``polyphonic`` mapping of ``polyphonic.yaml``,
+    which is located in the same directory as this module.
+    """
+
+    def __init__(self):
+        yaml_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), 'polyphonic.yaml')
+        with open(yaml_path, 'r', encoding='utf-8') as polyphonic_file:
+            # Parse the YAML dictionary.
+            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
+        self.polyphonic_words = polyphonic_dict["polyphonic"]
+
+    def correct_pronunciation(self, word, pinyin):
+        """Return the dictionary pronunciation of ``word`` if one is listed.
+
+        Args:
+            word: Word to look up in the polyphonic dictionary.
+            pinyin: Pronunciation already determined for ``word``.
+
+        Returns:
+            The dictionary entry for ``word`` when it is listed, otherwise
+            ``pinyin`` unchanged.
+        """
+        # Pure lookup with no console output.
+        return self.polyphonic_words.get(word, pinyin)
diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml
index 50659afb..f52b1cf5 100644
--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -48,4 +48,7 @@ polyphonic:
     唉: ['ai4']
     扎实: ['zha1','shi2']
     干将: ['gan4','jiang4']
-    陈威行: ['chen2', 'wei1', 'hang2']
\ No newline at end of file
+    陈威行: ['chen2', 'wei1', 'hang2']
+    郭晟: ['guo1', 'sheng4']
+    中标: ['zhong4', 'biao1']
+    抗住: ['kang2', 'zhu4']
\ No newline at end of file
diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py
index c2aecf27..fff72a10 100644
--- a/paddlespeech/t2s/frontend/sing_frontend.py
+++ b/paddlespeech/t2s/frontend/sing_frontend.py
@@ -29,7 +29,7 @@ class SingFrontend():
             pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
             phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
         """
-        self.punc = '[：，；。？！“”‘’\':,;.?!]'
+        self.punc = '[、：，；。？！“”‘’\':,;.?!]'
 
         self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
         if pinyin_phone_path:
diff --git a/paddlespeech/t2s/frontend/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py
index 9b4db053..b1b9d726 100644
--- a/paddlespeech/t2s/frontend/ssml/__init__.py
+++ b/paddlespeech/t2s/frontend/ssml/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/frontend/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py
index 3a5177d1..1d216c31 100644
--- a/paddlespeech/t2s/frontend/ssml/xml_processor.py
+++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py
@@ -1,4 +1,17 @@
 # -*- coding: utf-8 -*-
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import re
 import xml.dom.minidom
 import xml.parsers.expat
@@ -68,7 +81,8 @@ class MixTextProcessor():
             after_xml = mat.group(3)
 
             # pre with none syllable
-            ctlist.append([pre_xml, []])
+            if pre_xml:
+                ctlist.append([pre_xml, []])
 
             # between with syllable
             # [(sub sentence, [syllables]), ...]
@@ -77,9 +91,11 @@ class MixTextProcessor():
             ctlist = ctlist + pinyinlist
 
             # post with none syllable
-            ctlist.append([after_xml, []])
+            if after_xml:
+                ctlist.append([after_xml, []])
         else:
             ctlist.append([mixstr, []])
+
         return ctlist
 
     @classmethod
@@ -94,15 +110,18 @@ class MixTextProcessor():
             in_xml = mat.group(2)
             after_xml = mat.group(3)
 
-            ctlist.append(pre_xml)
+            if pre_xml:
+                ctlist.append(pre_xml)
+
             dom = DomXml(in_xml)
             tags = dom.get_text_and_sayas_tags()
             ctlist.extend(tags)
 
-            ctlist.append(after_xml)
-            return ctlist
+            if after_xml:
+                ctlist.append(after_xml)
         else:
             ctlist.append(mixstr)
+
         return ctlist
 
 
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 5902540c..690f69aa 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -68,9 +68,9 @@ class ToneSandhi():
             '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
             '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
             '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
-            '考考', '整整', '莘莘', '落地', '算子', '家家户户'
+            '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
         }
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
 
     def _split_word(self, word: str) -> List[str]:
         word_list = jieba.cut_for_search(word)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 2e0b2ffd..1431bc6d 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
 
 from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.polyphonic import Polyphonic
 from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
 from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -68,26 +69,6 @@ def insert_after_character(lst, item):
     return result
 
 
-class Polyphonic():
-    def __init__(self):
-        with open(
-                os.path.join(
-                    os.path.dirname(os.path.abspath(__file__)),
-                    'polyphonic.yaml'),
-                'r',
-                encoding='utf-8') as polyphonic_file:
-            # 解析yaml
-            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
-        self.polyphonic_words = polyphonic_dict["polyphonic"]
-
-    def correct_pronunciation(self, word, pinyin):
-        # 词汇被词典收录则返回纠正后的读音
-        if word in self.polyphonic_words.keys():
-            pinyin = self.polyphonic_words[word]
-        # 否则返回原读音
-        return pinyin
-
-
 class Frontend():
     def __init__(self,
                  g2p_model="g2pW",
@@ -95,7 +76,7 @@ class Frontend():
                  tone_vocab_path=None,
                  use_rhy=False):
 
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
         self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
         self.phrases_dict = {
             '开户行': [['ka1i'], ['hu4'], ['hang2']],
@@ -567,6 +548,7 @@ class Frontend():
 
         phones = []
         for c, v in zip(initials, finals):
+            # c for consonant, v for vowel
             # NOTE: post process for pypinyin outputs
             # we discriminate i, ii and iii
             if c and c not in self.punc:
@@ -633,16 +615,19 @@ class Frontend():
                 new_phonemes.append(new_sentence)
             all_phonemes = new_phonemes
 
+        if merge_sentences:
+            all_phonemes = [sum(all_phonemes, [])]
+
         if print_info:
             print("----------------------------")
             print("text norm results:")
             print(sentences)
             print("----------------------------")
             print("g2p results:")
-            print(all_phonemes[0])
+            print(all_phonemes)
             print("----------------------------")
 
-        return [sum(all_phonemes, [])]
+        return all_phonemes
 
     def add_sp_if_no(self, phonemes):
         """
diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py
index fdfebf46..24167338 100644
--- a/tests/unit/tts/test_mixfrontend.py
+++ b/tests/unit/tts/test_mixfrontend.py
@@ -423,7 +423,7 @@ if __name__ == '__main__':
         segs = frontend.split_by_lang(text)
         print(segs)
 
-        # 对于SSML的xml标记处理不好。
+        # 对于SSML的xml标记处理不好。需要先解析SSML，后处理中英的划分。
         text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸，不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
         print(text)
         # [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸，不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]