From 4ac206e22ff2c7c669e4b4c2b6f74f842020aca6 Mon Sep 17 00:00:00 2001
From: tianhao zhang <15600919271@163.com>
Date: Fri, 16 Sep 2022 02:38:17 +0000
Subject: [PATCH 1/3] update wenetspeech RESULT.md, test=doc

---
 examples/wenetspeech/asr1/RESULTS.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md
index af84a5f6e..f22c652e6 100644
--- a/examples/wenetspeech/asr1/RESULTS.md
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_greedy_search | 16 | 0.078918 |  
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 |  
 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention_rescoring | 16 | 0.054401 |
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER |  
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention | -1 | 0.050767 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_greedy_search | -1 | 0.061884 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 |  
+| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug  | aishell1 | attention_rescoring | -1 |  0.052110 |

From eac362057c3db60a2b60ef49eb51867187050a18 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Fri, 16 Sep 2022 16:00:52 +0800
Subject: [PATCH 2/3] add typehint for g2pw (#2390)

---
 paddlespeech/t2s/frontend/g2pw/__init__.py |  2 +-
 paddlespeech/t2s/frontend/g2pw/dataset.py  | 66 +++++++++++-----------
 paddlespeech/t2s/frontend/g2pw/onnx_api.py | 50 +++++++++-------
 paddlespeech/t2s/frontend/g2pw/utils.py    | 11 ++--
 4 files changed, 71 insertions(+), 58 deletions(-)

diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py
index 0eaeee5df..89b3af3ca 100644
--- a/paddlespeech/t2s/frontend/g2pw/__init__.py
+++ b/paddlespeech/t2s/frontend/g2pw/__init__.py
@@ -1 +1 @@
-from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter
+from .onnx_api import G2PWOnnxConverter
diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py
index 98af5f463..8a1c2e0bf 100644
--- a/paddlespeech/t2s/frontend/g2pw/dataset.py
+++ b/paddlespeech/t2s/frontend/g2pw/dataset.py
@@ -15,6 +15,10 @@
 Credits
     This code is modified from https://github.com/GitYCC/g2pW
 """
+from typing import Dict
+from typing import List
+from typing import Tuple
+
 import numpy as np
 
 from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map
@@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁'
 
 
 def prepare_onnx_input(tokenizer,
-                       labels,
-                       char2phonemes,
-                       chars,
-                       texts,
-                       query_ids,
-                       phonemes=None,
-                       pos_tags=None,
-                       use_mask=False,
-                       use_char_phoneme=False,
-                       use_pos=False,
-                       window_size=None,
-                       max_len=512):
+                       labels: List[str],
+                       char2phonemes: Dict[str, List[int]],
+                       chars: List[str],
+                       texts: List[str],
+                       query_ids: List[int],
+                       use_mask: bool=False,
+                       window_size: int=None,
+                       max_len: int=512) -> Dict[str, np.array]:
     if window_size is not None:
-        truncated_texts, truncated_query_ids = _truncate_texts(window_size,
-                                                               texts, query_ids)
-
+        truncated_texts, truncated_query_ids = _truncate_texts(
+            window_size=window_size, texts=texts, query_ids=query_ids)
     input_ids = []
     token_type_ids = []
     attention_masks = []
@@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer,
         query_id = (truncated_query_ids if window_size else query_ids)[idx]
 
         try:
-            tokens, text2token, token2text = tokenize_and_map(tokenizer, text)
+            tokens, text2token, token2text = tokenize_and_map(
+                tokenizer=tokenizer, text=text)
         except Exception:
             print(f'warning: text "{text}" is invalid')
             return {}
 
         text, query_id, tokens, text2token, token2text = _truncate(
-            max_len, text, query_id, tokens, text2token, token2text)
+            max_len=max_len,
+            text=text,
+            query_id=query_id,
+            tokens=tokens,
+            text2token=text2token,
+            token2text=token2text)
 
         processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
 
@@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer,
     return outputs
 
 
-def _truncate_texts(window_size, texts, query_ids):
+def _truncate_texts(window_size: int, texts: List[str],
+                    query_ids: List[int]) -> Tuple[List[str], List[int]]:
     truncated_texts = []
     truncated_query_ids = []
     for text, query_id in zip(texts, query_ids):
@@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids):
     return truncated_texts, truncated_query_ids
 
 
-def _truncate(max_len, text, query_id, tokens, text2token, token2text):
+def _truncate(max_len: int,
+              text: str,
+              query_id: int,
+              tokens: List[str],
+              text2token: List[int],
+              token2text: List[Tuple[int]]):
     truncate_len = max_len - 2
     if len(tokens) <= truncate_len:
         return (text, query_id, tokens, text2token, token2text)
@@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text):
     ], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
 
 
-def prepare_data(sent_path, lb_path=None):
-    raw_texts = open(sent_path).read().rstrip().split('\n')
-    query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts]
-    texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts]
-    if lb_path is None:
-        return texts, query_ids
-    else:
-        phonemes = open(lb_path).read().rstrip().split('\n')
-        return texts, query_ids, phonemes
-
-
-def get_phoneme_labels(polyphonic_chars):
+def get_phoneme_labels(polyphonic_chars: List[List[str]]
+                       ) -> Tuple[List[str], Dict[str, List[int]]]:
     labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
     char2phonemes = {}
     for char, phoneme in polyphonic_chars:
@@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars):
     return labels, char2phonemes
 
 
-def get_char_phoneme_labels(polyphonic_chars):
+def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
+                            ) -> Tuple[List[str], Dict[str, List[int]]]:
     labels = sorted(
         list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
     char2phonemes = {}
diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
index 180e8ae15..ad32c4050 100644
--- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py
+++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
@@ -17,6 +17,10 @@ Credits
 """
 import json
 import os
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple
 
 import numpy as np
 import onnxruntime
@@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME
 model_version = '1.1'
 
 
-def predict(session, onnx_input, labels):
+def predict(session, onnx_input: Dict[str, Any],
+            labels: List[str]) -> Tuple[List[str], List[float]]:
     all_preds = []
     all_confidences = []
     probs = session.run([], {
@@ -61,10 +66,10 @@ def predict(session, onnx_input, labels):
 
 class G2PWOnnxConverter:
     def __init__(self,
-                 model_dir=MODEL_HOME,
-                 style='bopomofo',
-                 model_source=None,
-                 enable_non_tradional_chinese=False):
+                 model_dir: os.PathLike=MODEL_HOME,
+                 style: str='bopomofo',
+                 model_source: str=None,
+                 enable_non_tradional_chinese: bool=False):
         uncompress_path = download_and_decompress(
             g2pw_onnx_models['G2PWModel'][model_version], model_dir)
 
@@ -76,7 +81,8 @@ class G2PWOnnxConverter:
             os.path.join(uncompress_path, 'g2pW.onnx'),
             sess_options=sess_options)
         self.config = load_config(
-            os.path.join(uncompress_path, 'config.py'), use_default=True)
+            config_path=os.path.join(uncompress_path, 'config.py'),
+            use_default=True)
 
         self.model_source = model_source if model_source else self.config.model_source
         self.enable_opencc = enable_non_tradional_chinese
@@ -103,9 +109,9 @@ class G2PWOnnxConverter:
             .strip().split('\n')
         ]
         self.labels, self.char2phonemes = get_char_phoneme_labels(
-            self.polyphonic_chars
+            polyphonic_chars=self.polyphonic_chars
         ) if self.config.use_char_phoneme else get_phoneme_labels(
-            self.polyphonic_chars)
+            polyphonic_chars=self.polyphonic_chars)
 
         self.chars = sorted(list(self.char2phonemes.keys()))
 
@@ -146,7 +152,7 @@ class G2PWOnnxConverter:
         if self.enable_opencc:
             self.cc = OpenCC('s2tw')
 
-    def _convert_bopomofo_to_pinyin(self, bopomofo):
+    def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
         tone = bopomofo[-1]
         assert tone in '12345'
         component = self.bopomofo_convert_dict.get(bopomofo[:-1])
@@ -156,7 +162,7 @@ class G2PWOnnxConverter:
             print(f'Warning: "{bopomofo}" cannot convert to pinyin')
             return None
 
-    def __call__(self, sentences):
+    def __call__(self, sentences: List[str]) -> List[List[str]]:
         if isinstance(sentences, str):
             sentences = [sentences]
 
@@ -169,23 +175,25 @@ class G2PWOnnxConverter:
             sentences = translated_sentences
 
         texts, query_ids, sent_ids, partial_results = self._prepare_data(
-            sentences)
+            sentences=sentences)
         if len(texts) == 0:
             # sentences no polyphonic words
             return partial_results
 
         onnx_input = prepare_onnx_input(
-            self.tokenizer,
-            self.labels,
-            self.char2phonemes,
-            self.chars,
-            texts,
-            query_ids,
+            tokenizer=self.tokenizer,
+            labels=self.labels,
+            char2phonemes=self.char2phonemes,
+            chars=self.chars,
+            texts=texts,
+            query_ids=query_ids,
             use_mask=self.config.use_mask,
-            use_char_phoneme=self.config.use_char_phoneme,
             window_size=None)
 
-        preds, confidences = predict(self.session_g2pW, onnx_input, self.labels)
+        preds, confidences = predict(
+            session=self.session_g2pW,
+            onnx_input=onnx_input,
+            labels=self.labels)
         if self.config.use_char_phoneme:
             preds = [pred.split(' ')[1] for pred in preds]
 
@@ -195,7 +203,9 @@ class G2PWOnnxConverter:
 
         return results
 
-    def _prepare_data(self, sentences):
+    def _prepare_data(
+            self, sentences: List[str]
+    ) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
         texts, query_ids, sent_ids, partial_results = [], [], [], []
         for sent_id, sent in enumerate(sentences):
             # pypinyin works well for Simplified Chinese than Traditional Chinese
diff --git a/paddlespeech/t2s/frontend/g2pw/utils.py b/paddlespeech/t2s/frontend/g2pw/utils.py
index ad02c4c1d..ba9ce51ba 100644
--- a/paddlespeech/t2s/frontend/g2pw/utils.py
+++ b/paddlespeech/t2s/frontend/g2pw/utils.py
@@ -15,10 +15,11 @@
 Credits
     This code is modified from https://github.com/GitYCC/g2pW
 """
+import os
 import re
 
 
-def wordize_and_map(text):
+def wordize_and_map(text: str):
     words = []
     index_map_from_text_to_word = []
     index_map_from_word_to_text = []
@@ -54,8 +55,8 @@ def wordize_and_map(text):
     return words, index_map_from_text_to_word, index_map_from_word_to_text
 
 
-def tokenize_and_map(tokenizer, text):
-    words, text2word, word2text = wordize_and_map(text)
+def tokenize_and_map(tokenizer, text: str):
+    words, text2word, word2text = wordize_and_map(text=text)
 
     tokens = []
     index_map_from_token_to_text = []
@@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text):
     return tokens, index_map_from_text_to_token, index_map_from_token_to_text
 
 
-def _load_config(config_path):
+def _load_config(config_path: os.PathLike):
     import importlib.util
     spec = importlib.util.spec_from_file_location('__init__', config_path)
     config = importlib.util.module_from_spec(spec)
@@ -130,7 +131,7 @@ default_config_dict = {
 }
 
 
-def load_config(config_path, use_default=False):
+def load_config(config_path: os.PathLike, use_default: bool=False):
     config = _load_config(config_path)
     if use_default:
         for attr, val in default_config_dict.items():

From e6cbcca3e220b3b2ae869055f0771b48958b512b Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Fri, 16 Sep 2022 16:23:47 +0800
Subject: [PATCH 3/3] fix ERNIE-SAT README, test=doc (#2392)

---
 examples/aishell3/ernie_sat/README.md      | 13 ++++++-------
 examples/aishell3_vctk/ernie_sat/README.md | 13 ++++++-------
 examples/vctk/ernie_sat/README.md          | 11 +++++------
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md
index 707ee1381..eb867ab75 100644
--- a/examples/aishell3/ernie_sat/README.md
+++ b/examples/aishell3/ernie_sat/README.md
@@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 dataset
+# ERNIE-SAT with AISHELL3 dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新：
-- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
 
 <p align="center">
     <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />
diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md
index a849488d5..d55af6756 100644
--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
@@ -1,11 +1,10 @@
-# ERNIE-SAT with AISHELL3 and VCTK dataset
+# ERNIE-SAT with AISHELL3 and VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新：
-- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
 
 <p align="center">
     <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />
diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md
index 0a2f9359e..94c7ae25d 100644
--- a/examples/vctk/ernie_sat/README.md
+++ b/examples/vctk/ernie_sat/README.md
@@ -1,11 +1,10 @@
 # ERNIE-SAT with VCTK dataset
+ERNIE-SAT is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 
-ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型，其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景，该项目供研究使用。
-
-## 模型框架
-ERNIE-SAT 中我们提出了两项创新：
-- 在预训练过程中将中英双语对应的音素作为输入，实现了跨语言、个性化的软音素映射
-- 采用语言和语音的联合掩码学习实现了语言和语音的对齐
+## Model Framework
+In ERNIE-SAT, we propose two innovations:
+- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping
+- The joint mask learning of speech and text is used to realize the alignment of speech and text
 
 <p align="center">
     <img src="https://user-images.githubusercontent.com/24568452/186110814-1b9c6618-a0ab-4c0c-bb3d-3d860b0e8cc2.png" />