add typehint for g2pw (#2390)

pull/2406/head
TianYuan 3 years ago committed by GitHub
parent 68c2ec7563
commit eac362057c

paddlespeech/t2s/frontend/g2pw/__init__.py

@@ -1 +1 @@
-from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter
+from .onnx_api import G2PWOnnxConverter
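The change above only switches the package `__init__` to a relative import; the public import path is unchanged. Both spellings below resolve to the same class from user code:

    # Equivalent imports after this change.
    from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
    from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter  # noqa: F811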

paddlespeech/t2s/frontend/g2pw/dataset.py

@@ -15,6 +15,10 @@
 Credits
     This code is modified from https://github.com/GitYCC/g2pW
 """
+from typing import Dict
+from typing import List
+from typing import Tuple
+
 import numpy as np

 from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map
@@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁'
 def prepare_onnx_input(tokenizer,
-                       labels,
-                       char2phonemes,
-                       chars,
-                       texts,
-                       query_ids,
-                       phonemes=None,
-                       pos_tags=None,
-                       use_mask=False,
-                       use_char_phoneme=False,
-                       use_pos=False,
-                       window_size=None,
-                       max_len=512):
+                       labels: List[str],
+                       char2phonemes: Dict[str, List[int]],
+                       chars: List[str],
+                       texts: List[str],
+                       query_ids: List[int],
+                       use_mask: bool=False,
+                       window_size: int=None,
+                       max_len: int=512) -> Dict[str, np.array]:
     if window_size is not None:
-        truncated_texts, truncated_query_ids = _truncate_texts(window_size,
-                                                               texts, query_ids)
+        truncated_texts, truncated_query_ids = _truncate_texts(
+            window_size=window_size, texts=texts, query_ids=query_ids)
     input_ids = []
     token_type_ids = []
     attention_masks = []
@@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer,
         query_id = (truncated_query_ids if window_size else query_ids)[idx]

         try:
-            tokens, text2token, token2text = tokenize_and_map(tokenizer, text)
+            tokens, text2token, token2text = tokenize_and_map(
+                tokenizer=tokenizer, text=text)
         except Exception:
             print(f'warning: text "{text}" is invalid')
             return {}

         text, query_id, tokens, text2token, token2text = _truncate(
-            max_len, text, query_id, tokens, text2token, token2text)
+            max_len=max_len,
+            text=text,
+            query_id=query_id,
+            tokens=tokens,
+            text2token=text2token,
+            token2text=token2text)

         processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
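For reference, the triple returned by tokenize_and_map (and re-indexed by _truncate below) maps between character and token positions. The values here are an assumed illustration, not output of a real tokenizer:

    # Assumed illustration for a text 'ab中' split into two tokens.
    tokens = ['ab', '中']           # subword tokens
    text2token = [0, 0, 1]          # character index -> covering token index
    token2text = [(0, 2), (2, 3)]   # token index -> (start, end) character span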
@@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer,
     return outputs


-def _truncate_texts(window_size, texts, query_ids):
+def _truncate_texts(window_size: int, texts: List[str],
+                    query_ids: List[int]) -> Tuple[List[str], List[int]]:
     truncated_texts = []
     truncated_query_ids = []
     for text, query_id in zip(texts, query_ids):
@@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids):
     return truncated_texts, truncated_query_ids


-def _truncate(max_len, text, query_id, tokens, text2token, token2text):
+def _truncate(max_len: int,
+              text: str,
+              query_id: int,
+              tokens: List[str],
+              text2token: List[int],
+              token2text: List[Tuple[int]]):
     truncate_len = max_len - 2
     if len(tokens) <= truncate_len:
         return (text, query_id, tokens, text2token, token2text)
@@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text):
     ], [(s - start, e - start) for s, e in token2text[token_start:token_end]])


-def prepare_data(sent_path, lb_path=None):
-    raw_texts = open(sent_path).read().rstrip().split('\n')
-    query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts]
-    texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts]
-    if lb_path is None:
-        return texts, query_ids
-    else:
-        phonemes = open(lb_path).read().rstrip().split('\n')
-        return texts, query_ids, phonemes
-
-
-def get_phoneme_labels(polyphonic_chars):
+def get_phoneme_labels(polyphonic_chars: List[List[str]]
+                       ) -> Tuple[List[str], Dict[str, List[int]]]:
     labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
     char2phonemes = {}
     for char, phoneme in polyphonic_chars:
@@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars):
     return labels, char2phonemes


-def get_char_phoneme_labels(polyphonic_chars):
+def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
+                            ) -> Tuple[List[str], Dict[str, List[int]]]:
     labels = sorted(
         list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
     char2phonemes = {}
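A minimal worked example of the label tables these two functions build, assuming the loop body appends the index of each phoneme in `labels` (the toy table is illustrative, not the shipped polyphonic-character list):

    polyphonic_chars = [['行', 'xing2'], ['行', 'hang2'],
                        ['了', 'le5'], ['了', 'liao3']]

    labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
    # labels == ['hang2', 'le5', 'liao3', 'xing2']

    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        char2phonemes.setdefault(char, []).append(labels.index(phoneme))
    # char2phonemes == {'行': [3, 0], '了': [1, 2]} — indices into `labels`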

paddlespeech/t2s/frontend/g2pw/onnx_api.py

@@ -17,6 +17,10 @@ Credits
 """
 import json
 import os
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple

 import numpy as np
 import onnxruntime
@@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME
 model_version = '1.1'


-def predict(session, onnx_input, labels):
+def predict(session, onnx_input: Dict[str, Any],
+            labels: List[str]) -> Tuple[List[str], List[float]]:
     all_preds = []
     all_confidences = []
     probs = session.run([], {
@@ -61,10 +66,10 @@ def predict(session, onnx_input, labels):
 class G2PWOnnxConverter:
     def __init__(self,
-                 model_dir=MODEL_HOME,
-                 style='bopomofo',
-                 model_source=None,
-                 enable_non_tradional_chinese=False):
+                 model_dir: os.PathLike=MODEL_HOME,
+                 style: str='bopomofo',
+                 model_source: str=None,
+                 enable_non_tradional_chinese: bool=False):
         uncompress_path = download_and_decompress(
             g2pw_onnx_models['G2PWModel'][model_version], model_dir)
@@ -76,7 +81,8 @@
             os.path.join(uncompress_path, 'g2pW.onnx'),
             sess_options=sess_options)
         self.config = load_config(
-            os.path.join(uncompress_path, 'config.py'), use_default=True)
+            config_path=os.path.join(uncompress_path, 'config.py'),
+            use_default=True)
         self.model_source = model_source if model_source else self.config.model_source
         self.enable_opencc = enable_non_tradional_chinese
@@ -103,9 +109,9 @@
             .strip().split('\n')
         ]
         self.labels, self.char2phonemes = get_char_phoneme_labels(
-            self.polyphonic_chars
+            polyphonic_chars=self.polyphonic_chars
         ) if self.config.use_char_phoneme else get_phoneme_labels(
-            self.polyphonic_chars)
+            polyphonic_chars=self.polyphonic_chars)
         self.chars = sorted(list(self.char2phonemes.keys()))
@@ -146,7 +152,7 @@
         if self.enable_opencc:
             self.cc = OpenCC('s2tw')

-    def _convert_bopomofo_to_pinyin(self, bopomofo):
+    def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
         tone = bopomofo[-1]
         assert tone in '12345'
         component = self.bopomofo_convert_dict.get(bopomofo[:-1])
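The body keeps the original logic: the trailing digit is the tone and the remainder is looked up in the model's bopomofo-to-pinyin table. A standalone sketch with a stand-in one-entry table; the real dict ships with the model, and the success-branch return is an assumption based on the failure branch in the next hunk:

    bopomofo_convert_dict = {'ㄅㄚ': 'ba'}  # hypothetical single entry

    def convert_bopomofo_to_pinyin(bopomofo: str) -> str:
        tone = bopomofo[-1]
        assert tone in '12345'
        component = bopomofo_convert_dict.get(bopomofo[:-1])
        if component:
            return component + tone  # assumed success branch
        print(f'Warning: "{bopomofo}" cannot convert to pinyin')
        return None

    assert convert_bopomofo_to_pinyin('ㄅㄚ1') == 'ba1'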
@@ -156,7 +162,7 @@
             print(f'Warning: "{bopomofo}" cannot convert to pinyin')
             return None

-    def __call__(self, sentences):
+    def __call__(self, sentences: List[str]) -> List[List[str]]:
         if isinstance(sentences, str):
             sentences = [sentences]
@@ -169,23 +175,25 @@
             sentences = translated_sentences

         texts, query_ids, sent_ids, partial_results = self._prepare_data(
-            sentences)
+            sentences=sentences)
         if len(texts) == 0:
             # sentences no polyphonic words
             return partial_results

         onnx_input = prepare_onnx_input(
-            self.tokenizer,
-            self.labels,
-            self.char2phonemes,
-            self.chars,
-            texts,
-            query_ids,
+            tokenizer=self.tokenizer,
+            labels=self.labels,
+            char2phonemes=self.char2phonemes,
+            chars=self.chars,
+            texts=texts,
+            query_ids=query_ids,
             use_mask=self.config.use_mask,
-            use_char_phoneme=self.config.use_char_phoneme,
             window_size=None)

-        preds, confidences = predict(self.session_g2pW, onnx_input, self.labels)
+        preds, confidences = predict(
+            session=self.session_g2pW,
+            onnx_input=onnx_input,
+            labels=self.labels)
         if self.config.use_char_phoneme:
             preds = [pred.split(' ')[1] for pred in preds]
@@ -195,7 +203,9 @@
         return results

-    def _prepare_data(self, sentences):
+    def _prepare_data(
+            self, sentences: List[str]
+    ) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
         texts, query_ids, sent_ids, partial_results = [], [], [], []
         for sent_id, sent in enumerate(sentences):
             # pypinyin works well for Simplified Chinese than Traditional Chinese
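Taken together, the typed signatures in this file spell out the end-to-end contract: construct the converter (which downloads and loads the ONNX model), then call it on a batch of sentences to get one pronunciation list per sentence. A hedged usage sketch; the sample sentence is an assumption:

    from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter

    converter = G2PWOnnxConverter(style='bopomofo')
    # A bare str is wrapped into a list by __call__, per the isinstance check above.
    results = converter(['然而他相当重视这件事'])
    print(results[0])  # List[str]: one pronunciation entry per character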

paddlespeech/t2s/frontend/g2pw/utils.py

@@ -15,10 +15,11 @@
 Credits
     This code is modified from https://github.com/GitYCC/g2pW
 """
+import os
 import re


-def wordize_and_map(text):
+def wordize_and_map(text: str):
     words = []
     index_map_from_text_to_word = []
     index_map_from_word_to_text = []
@@ -54,8 +55,8 @@ def wordize_and_map(text):
     return words, index_map_from_text_to_word, index_map_from_word_to_text


-def tokenize_and_map(tokenizer, text):
-    words, text2word, word2text = wordize_and_map(text)
+def tokenize_and_map(tokenizer, text: str):
+    words, text2word, word2text = wordize_and_map(text=text)
     tokens = []
     index_map_from_token_to_text = []
@@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text):
     return tokens, index_map_from_text_to_token, index_map_from_token_to_text


-def _load_config(config_path):
+def _load_config(config_path: os.PathLike):
     import importlib.util
     spec = importlib.util.spec_from_file_location('__init__', config_path)
     config = importlib.util.module_from_spec(spec)
@@ -130,7 +131,7 @@ default_config_dict = {
 }


-def load_config(config_path, use_default=False):
+def load_config(config_path: os.PathLike, use_default: bool=False):
     config = _load_config(config_path)
     if use_default:
         for attr, val in default_config_dict.items():
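Since `load_config` now advertises `os.PathLike`, here is a runnable sketch of the loading path shown above. The `exec_module` call and the "fill only missing attributes" behavior of the `use_default` branch are assumptions beyond the lines in this diff, and the defaults table is a stand-in:

    import importlib.util
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        config_path = os.path.join(tmp, 'config.py')
        with open(config_path, 'w') as f:
            f.write("model_source = 'bert-base-chinese'\n")  # stand-in content

        # mirrors _load_config above
        spec = importlib.util.spec_from_file_location('__init__', config_path)
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)  # assumed step after module_from_spec

        # mirrors the use_default branch, with a stand-in defaults table
        default_config_dict = {'use_mask': True, 'window_size': None}
        for attr, val in default_config_dict.items():
            if not hasattr(config, attr):
                setattr(config, attr, val)

        assert config.model_source == 'bert-base-chinese'
        assert config.use_mask is True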
