From 4ac206e22ff2c7c669e4b4c2b6f74f842020aca6 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Fri, 16 Sep 2022 02:38:17 +0000 Subject: [PATCH 1/3] update wenetspeech RESULT.md, test=doc --- examples/wenetspeech/asr1/RESULTS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index af84a5f6e..f22c652e6 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -46,3 +46,10 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.078918 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.079080 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.054401 | + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.050767 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | +| conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | From eac362057c3db60a2b60ef49eb51867187050a18 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Sep 2022 16:00:52 +0800 Subject: [PATCH 2/3] add typehint for g2pw (#2390) --- paddlespeech/t2s/frontend/g2pw/__init__.py | 2 +- paddlespeech/t2s/frontend/g2pw/dataset.py | 66 +++++++++++----------- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 50 +++++++++------- paddlespeech/t2s/frontend/g2pw/utils.py | 11 ++-- 4 files changed, 71 insertions(+), 58 deletions(-) diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py index 0eaeee5df..89b3af3ca 100644 --- a/paddlespeech/t2s/frontend/g2pw/__init__.py +++ b/paddlespeech/t2s/frontend/g2pw/__init__.py @@ -1 +1 @@ -from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter +from .onnx_api import G2PWOnnxConverter diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py index 98af5f463..8a1c2e0bf 100644 --- a/paddlespeech/t2s/frontend/g2pw/dataset.py +++ b/paddlespeech/t2s/frontend/g2pw/dataset.py @@ -15,6 +15,10 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ +from typing import Dict +from typing import List +from typing import Tuple + import numpy as np from paddlespeech.t2s.frontend.g2pw.utils import tokenize_and_map @@ -23,22 +27,17 @@ ANCHOR_CHAR = '▁' def prepare_onnx_input(tokenizer, - labels, - char2phonemes, - chars, - texts, - query_ids, - phonemes=None, - pos_tags=None, - use_mask=False, - use_char_phoneme=False, - use_pos=False, - window_size=None, - max_len=512): + labels: List[str], + char2phonemes: Dict[str, List[int]], + chars: List[str], + texts: List[str], + query_ids: List[int], + use_mask: bool=False, + window_size: int=None, + max_len: int=512) -> Dict[str, np.array]: if window_size is not None: - truncated_texts, truncated_query_ids = _truncate_texts(window_size, - texts, query_ids) - + truncated_texts, truncated_query_ids = _truncate_texts( + window_size=window_size, texts=texts, query_ids=query_ids) input_ids = [] token_type_ids = [] attention_masks = [] @@ -51,13 +50,19 @@ def prepare_onnx_input(tokenizer, query_id = (truncated_query_ids if window_size else query_ids)[idx] try: - tokens, text2token, token2text = tokenize_and_map(tokenizer, text) + tokens, text2token, token2text = tokenize_and_map( + tokenizer=tokenizer, text=text) except Exception: print(f'warning: text "{text}" is invalid') return {} text, query_id, tokens, text2token, token2text = _truncate( - max_len, text, query_id, tokens, text2token, token2text) + max_len=max_len, + text=text, + query_id=query_id, + tokens=tokens, + text2token=text2token, + token2text=token2text) processed_tokens = ['[CLS]'] + tokens + ['[SEP]'] @@ -91,7 +96,8 @@ def prepare_onnx_input(tokenizer, return outputs -def _truncate_texts(window_size, texts, query_ids): +def _truncate_texts(window_size: int, texts: List[str], + query_ids: List[int]) -> Tuple[List[str], List[int]]: truncated_texts = [] truncated_query_ids = [] for text, query_id in zip(texts, query_ids): @@ -105,7 +111,12 @@ def _truncate_texts(window_size, texts, query_ids): return truncated_texts, truncated_query_ids -def _truncate(max_len, text, query_id, tokens, text2token, token2text): +def _truncate(max_len: int, + text: str, + query_id: int, + tokens: List[str], + text2token: List[int], + token2text: List[Tuple[int]]): truncate_len = max_len - 2 if len(tokens) <= truncate_len: return (text, query_id, tokens, text2token, token2text) @@ -132,18 +143,8 @@ def _truncate(max_len, text, query_id, tokens, text2token, token2text): ], [(s - start, e - start) for s, e in token2text[token_start:token_end]]) -def prepare_data(sent_path, lb_path=None): - raw_texts = open(sent_path).read().rstrip().split('\n') - query_ids = [raw.index(ANCHOR_CHAR) for raw in raw_texts] - texts = [raw.replace(ANCHOR_CHAR, '') for raw in raw_texts] - if lb_path is None: - return texts, query_ids - else: - phonemes = open(lb_path).read().rstrip().split('\n') - return texts, query_ids, phonemes - - -def get_phoneme_labels(polyphonic_chars): +def get_phoneme_labels(polyphonic_chars: List[List[str]] + ) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars]))) char2phonemes = {} for char, phoneme in polyphonic_chars: @@ -153,7 +154,8 @@ def get_phoneme_labels(polyphonic_chars): return labels, char2phonemes -def get_char_phoneme_labels(polyphonic_chars): +def get_char_phoneme_labels(polyphonic_chars: List[List[str]] + ) -> Tuple[List[str], Dict[str, List[int]]]: labels = sorted( list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars]))) char2phonemes = {} diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 180e8ae15..ad32c4050 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -17,6 +17,10 @@ Credits """ import json import os +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple import numpy as np import onnxruntime @@ -37,7 +41,8 @@ from paddlespeech.utils.env import MODEL_HOME model_version = '1.1' -def predict(session, onnx_input, labels): +def predict(session, onnx_input: Dict[str, Any], + labels: List[str]) -> Tuple[List[str], List[float]]: all_preds = [] all_confidences = [] probs = session.run([], { @@ -61,10 +66,10 @@ def predict(session, onnx_input, labels): class G2PWOnnxConverter: def __init__(self, - model_dir=MODEL_HOME, - style='bopomofo', - model_source=None, - enable_non_tradional_chinese=False): + model_dir: os.PathLike=MODEL_HOME, + style: str='bopomofo', + model_source: str=None, + enable_non_tradional_chinese: bool=False): uncompress_path = download_and_decompress( g2pw_onnx_models['G2PWModel'][model_version], model_dir) @@ -76,7 +81,8 @@ class G2PWOnnxConverter: os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options) self.config = load_config( - os.path.join(uncompress_path, 'config.py'), use_default=True) + config_path=os.path.join(uncompress_path, 'config.py'), + use_default=True) self.model_source = model_source if model_source else self.config.model_source self.enable_opencc = enable_non_tradional_chinese @@ -103,9 +109,9 @@ class G2PWOnnxConverter: .strip().split('\n') ] self.labels, self.char2phonemes = get_char_phoneme_labels( - self.polyphonic_chars + polyphonic_chars=self.polyphonic_chars ) if self.config.use_char_phoneme else get_phoneme_labels( - self.polyphonic_chars) + polyphonic_chars=self.polyphonic_chars) self.chars = sorted(list(self.char2phonemes.keys())) @@ -146,7 +152,7 @@ class G2PWOnnxConverter: if self.enable_opencc: self.cc = OpenCC('s2tw') - def _convert_bopomofo_to_pinyin(self, bopomofo): + def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str: tone = bopomofo[-1] assert tone in '12345' component = self.bopomofo_convert_dict.get(bopomofo[:-1]) @@ -156,7 +162,7 @@ class G2PWOnnxConverter: print(f'Warning: "{bopomofo}" cannot convert to pinyin') return None - def __call__(self, sentences): + def __call__(self, sentences: List[str]) -> List[List[str]]: if isinstance(sentences, str): sentences = [sentences] @@ -169,23 +175,25 @@ class G2PWOnnxConverter: sentences = translated_sentences texts, query_ids, sent_ids, partial_results = self._prepare_data( - sentences) + sentences=sentences) if len(texts) == 0: # sentences no polyphonic words return partial_results onnx_input = prepare_onnx_input( - self.tokenizer, - self.labels, - self.char2phonemes, - self.chars, - texts, - query_ids, + tokenizer=self.tokenizer, + labels=self.labels, + char2phonemes=self.char2phonemes, + chars=self.chars, + texts=texts, + query_ids=query_ids, use_mask=self.config.use_mask, - use_char_phoneme=self.config.use_char_phoneme, window_size=None) - preds, confidences = predict(self.session_g2pW, onnx_input, self.labels) + preds, confidences = predict( + session=self.session_g2pW, + onnx_input=onnx_input, + labels=self.labels) if self.config.use_char_phoneme: preds = [pred.split(' ')[1] for pred in preds] @@ -195,7 +203,9 @@ class G2PWOnnxConverter: return results - def _prepare_data(self, sentences): + def _prepare_data( + self, sentences: List[str] + ) -> Tuple[List[str], List[int], List[int], List[List[str]]]: texts, query_ids, sent_ids, partial_results = [], [], [], [] for sent_id, sent in enumerate(sentences): # pypinyin works well for Simplified Chinese than Traditional Chinese diff --git a/paddlespeech/t2s/frontend/g2pw/utils.py b/paddlespeech/t2s/frontend/g2pw/utils.py index ad02c4c1d..ba9ce51ba 100644 --- a/paddlespeech/t2s/frontend/g2pw/utils.py +++ b/paddlespeech/t2s/frontend/g2pw/utils.py @@ -15,10 +15,11 @@ Credits This code is modified from https://github.com/GitYCC/g2pW """ +import os import re -def wordize_and_map(text): +def wordize_and_map(text: str): words = [] index_map_from_text_to_word = [] index_map_from_word_to_text = [] @@ -54,8 +55,8 @@ def wordize_and_map(text): return words, index_map_from_text_to_word, index_map_from_word_to_text -def tokenize_and_map(tokenizer, text): - words, text2word, word2text = wordize_and_map(text) +def tokenize_and_map(tokenizer, text: str): + words, text2word, word2text = wordize_and_map(text=text) tokens = [] index_map_from_token_to_text = [] @@ -82,7 +83,7 @@ def tokenize_and_map(tokenizer, text): return tokens, index_map_from_text_to_token, index_map_from_token_to_text -def _load_config(config_path): +def _load_config(config_path: os.PathLike): import importlib.util spec = importlib.util.spec_from_file_location('__init__', config_path) config = importlib.util.module_from_spec(spec) @@ -130,7 +131,7 @@ default_config_dict = { } -def load_config(config_path, use_default=False): +def load_config(config_path: os.PathLike, use_default: bool=False): config = _load_config(config_path) if use_default: for attr, val in default_config_dict.items(): From e6cbcca3e220b3b2ae869055f0771b48958b512b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 16 Sep 2022 16:23:47 +0800 Subject: [PATCH 3/3] fix ERNIE-SAT README, test=doc (#2392) --- examples/aishell3/ernie_sat/README.md | 13 ++++++------- examples/aishell3_vctk/ernie_sat/README.md | 13 ++++++------- examples/vctk/ernie_sat/README.md | 11 +++++------ 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md index 707ee1381..eb867ab75 100644 --- a/examples/aishell3/ernie_sat/README.md +++ b/examples/aishell3/ernie_sat/README.md @@ -1,11 +1,10 @@ -# ERNIE-SAT with AISHELL3 dataset +# ERNIE-SAT with VCTK dataset +ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning. -ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 - -## 模型框架 -ERNIE-SAT 中我们提出了两项创新: -- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 -- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 +## Model Framework +In ERNIE-SAT, we propose two innovations: +- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping +- The joint mask learning of speech and text is used to realize the alignment of speech and text

diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md index a849488d5..d55af6756 100644 --- a/examples/aishell3_vctk/ernie_sat/README.md +++ b/examples/aishell3_vctk/ernie_sat/README.md @@ -1,11 +1,10 @@ -# ERNIE-SAT with AISHELL3 and VCTK dataset +# ERNIE-SAT with VCTK dataset +ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning. -ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 - -## 模型框架 -ERNIE-SAT 中我们提出了两项创新: -- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 -- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 +## Model Framework +In ERNIE-SAT, we propose two innovations: +- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping +- The joint mask learning of speech and text is used to realize the alignment of speech and text

diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md index 0a2f9359e..94c7ae25d 100644 --- a/examples/vctk/ernie_sat/README.md +++ b/examples/vctk/ernie_sat/README.md @@ -1,11 +1,10 @@ # ERNIE-SAT with VCTK dataset +ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning. -ERNIE-SAT 是可以同时处理中英文的跨语言的语音-语言跨模态大模型,其在语音编辑、个性化语音合成以及跨语言的语音合成等多个任务取得了领先效果。可以应用于语音编辑、个性化合成、语音克隆、同传翻译等一系列场景,该项目供研究使用。 - -## 模型框架 -ERNIE-SAT 中我们提出了两项创新: -- 在预训练过程中将中英双语对应的音素作为输入,实现了跨语言、个性化的软音素映射 -- 采用语言和语音的联合掩码学习实现了语言和语音的对齐 +## Model Framework +In ERNIE-SAT, we propose two innovations: +- In the pretraining process, the phonemes corresponding to Chinese and English are used as input to achieve cross-language and personalized soft phoneme mapping +- The joint mask learning of speech and text is used to realize the alignment of speech and text