diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f72b44ac..44bbd5ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,12 +26,12 @@ repos: - --no-sort-keys - --autofix - id: check-merge-conflict - - id: flake8 - aergs: - - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 - - --builtins=G,request - - --jobs=1 - exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ + # - id: flake8 + # args: + # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 + # - --builtins=G,request + # - --jobs=1 + # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks rev: v1.0.1 diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index f7821384..93146df0 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -115,9 +115,9 @@ def evaluate(args): sentences = get_sentences_svs(text_file=args.text) else: sentences = get_sentences(text_file=args.text, lang=args.lang) - pprint(f"inputs: {sentences}") for utt_id, sentence in sentences: + print(f"{utt_id} {sentence} ...") with timer() as t: if am_name == "diffsinger": text = "" diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py index f2c7175f..dd3484d5 100644 --- a/paddlespeech/t2s/frontend/canton_frontend.py +++ b/paddlespeech/t2s/frontend/canton_frontend.py @@ -29,7 +29,8 @@ INITIALS = [ INITIALS += ['sp', 'spl', 'spn', 'sil'] -def get_lines(cantons: List[str]): +def jyuping_to_phonemes(cantons: List[str]): + # jyutping to initial and final phones = [] for canton in cantons: for consonant in INITIALS: @@ -61,8 +62,11 @@ class CantonFrontend(): merge_sentences: bool=True) -> List[List[str]]: phones_list = [] for sentence in sentences: + # jyutping + # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 
ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.' phones_str = ToJyutping.get_jyutping_text(sentence) - phones_split = get_lines(phones_str.split(' ')) + # phonemes + phones_split = jyuping_to_phonemes(phones_str.split(' ')) phones_list.append(phones_split) return phones_list @@ -78,8 +82,11 @@ class CantonFrontend(): sentence: str, merge_sentences: bool=True, print_info: bool=False) -> List[List[str]]: + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # G2P phonemes = self._g2p(sentences, merge_sentences=merge_sentences) + if print_info: print("----------------------------") print("text norm results:") @@ -88,6 +95,7 @@ class CantonFrontend(): print("g2p results:") print(phonemes) print("----------------------------") + return phonemes def get_input_ids(self, @@ -98,9 +106,9 @@ class CantonFrontend(): phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, print_info=print_info) + result = {} temp_phone_ids = [] - for phones in phonemes: if phones: phone_ids = self._p2id(phones) @@ -108,6 +116,8 @@ class CantonFrontend(): if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result diff --git a/paddlespeech/t2s/frontend/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py index 3e713d5d..3a5177d1 100644 --- a/paddlespeech/t2s/frontend/ssml/xml_processor.py +++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py @@ -17,7 +17,6 @@ Note: xml 有5种特殊字符, &<>"' ' ' 例如: "姓名" - ''' @@ -61,14 +60,23 @@ class MixTextProcessor(): patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) mat = re.match(patn, mixstr) if mat: + # pre pre_xml = mat.group(1) + # between ... in_xml = mat.group(2) + # post after_xml = mat.group(3) + # pre with none syllable ctlist.append([pre_xml, []]) + + # between with syllable + # [(sub sentence, [syllables]), ...] 
dom = DomXml(in_xml) pinyinlist = dom.get_pinyins_for_xml() ctlist = ctlist + pinyinlist + + # post with none syllable ctlist.append([after_xml, []]) else: ctlist.append([mixstr, []]) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 498a09fa..2e0b2ffd 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -14,6 +14,7 @@ import os import re from operator import itemgetter +from pprint import pprint from typing import Dict from typing import List @@ -41,6 +42,9 @@ INITIALS = [ ] INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] +# 0 for None, 5 for neutral +TONES = ["0", "1", "2", "3", "4", "5"] + def intersperse(lst, item): result = [item] * (len(lst) * 2 + 1) @@ -597,11 +601,13 @@ class Frontend(): all_phonemes = [] for word_pinyin_item in ssml_inputs: phonemes = [] - print("ssml inputs:", word_pinyin_item) + + # ['你喜欢', []] -> 你喜欢 [] sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) - print('ssml g2p:', sentence, pinyin_spec) + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: # g2p word w/o specified phonemes = self._g2p( @@ -635,6 +641,7 @@ class Frontend(): print("g2p results:") print(all_phonemes[0]) print("----------------------------") + return [sum(all_phonemes, [])] def add_sp_if_no(self, phonemes): @@ -711,10 +718,10 @@ class Frontend(): to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: # split setence by SSML tag. 
- l_inputs = MixTextProcessor.get_pinyin_split(sentence) + texts = MixTextProcessor.get_pinyin_split(sentence) phonemes = self.get_phonemes_ssml( - l_inputs, + texts, merge_sentences=merge_sentences, print_info=print_info, robot=robot) diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py new file mode 100644 index 00000000..382558a4 --- /dev/null +++ b/tests/unit/tts/test_ssml.py @@ -0,0 +1,61 @@ +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor + +if __name__ == '__main__': + text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。thank you." + + # SSML: 13 + # 0 ['你好吗,', []] + # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []] + # 2 ['倒', ['dao3']] + # 3 ['在沙滩上,沙滩上倒了一堆', []] + # 4 ['土', ['tu3']] + # 5 ['。想象', []] + # 6 ['干干', ['gan1', 'gan1']] + # 7 ['的树干', []] + # 8 ['倒', ['dao3']] + # 9 ['了,里面有个干尸,不知是被谁', []] + # 10 ['干', ['gan4']] + # 11 ['死的。', []] + # 12 ['thank you.', []] + inputs = MixTextProcessor.get_pinyin_split(text) + print(f"SSML get_pinyin_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML get_dom_split: 13 + # 0 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪 + # 2 + # 3 在沙滩上,沙滩上倒了一堆 + # 4 + # 5 。 想象 + # 6 干干 + # 7 的树干 + # 8 + # 9 了, 里面有个干尸,不知是被谁 + # 10 + # 11 死的。 + # 12 thank you. + inputs = MixTextProcessor.get_dom_split(text) + print(f"SSML get_dom_split: {len(inputs)}") + for i, sub in enumerate(inputs): + print(i, sub) + print() + + # SSML object.get_pinyin_split: 246 + # 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。 + outs = MixTextProcessor().get_xml_content(text) + print(f"SSML object.get_pinyin_split: {len(outs)}") + print(outs) + print() + + # SSML object.get_content_split: 30 你好吗, + # 1 我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干 + # 倒了, 里面有个干尸,不知是被谁死的。 + # 2 thank you. 
+ outs = MixTextProcessor().get_content_split(text) + print(f"SSML object.get_content_split: {len(outs)}") + for i, sub in enumerate(outs): + print(i, sub) + print()