From 13a7fa9808d0faaa1589e0ef0659c537bd4d5dbb Mon Sep 17 00:00:00 2001 From: "david.95" Date: Fri, 14 Oct 2022 15:37:33 +0800 Subject: [PATCH 01/20] enable chinese words' pinyin specified in text of ssml formats, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 6 +- paddlespeech/t2s/frontend/zh_frontend.py | 156 ++++++++++++++++++++++ paddlespeech/t2s/ssml/xml_processor.py | 163 +++++++++++++++++++++++ 3 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 paddlespeech/t2s/ssml/xml_processor.py diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb78..f9d1cd1b5 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import math import os +import re from pathlib import Path from typing import Any from typing import Dict @@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import + # remove [W:onnxruntime: xxx] from ort ort.set_default_logger_severity(3) @@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = line.strip().split() + items = re.split(r"\s+", line.strip(), 1) utt_id = items[0] if lang == 'zh': sentence = "".join(items[1:]) @@ -180,7 +182,7 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids( + input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 722eed601..25558780b 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -13,6 +13,7 @@ # limitations under the License. import os import re +from operator import itemgetter from typing import Dict from typing import List @@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -81,6 +83,7 @@ class Frontend(): g2p_model="g2pW", phone_vocab_path=None, tone_vocab_path=None): + self.mix_ssml_processor = MixTextProcessor() self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" @@ -143,6 +146,7 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) + self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() @@ -281,6 +285,65 @@ class Frontend(): phones_list.append(merge_list) return phones_list + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + phones_list = [] + initials = [] + finals = [] + + words = self._split_word_to_char(words[0]) + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + phones = [] + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc: + phones.append(v) + phones_list.append(phones) + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + return phones_list + def _merge_erhua(self, initials: List[str], finals: List[str], @@ -396,6 +459,52 @@ class Frontend(): print("----------------------------") return phonemes + #@an added for ssml pinyin + def get_phonemes_ssml(self, + ssml_inputs: list, + merge_sentences: bool=True, + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: + all_phonemes = [] + for word_pinyin_item in ssml_inputs: + phonemes = [] + sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + phonemes = self._g2p( + sentences, + merge_sentences=merge_sentences, + with_erhua=with_erhua) + else: + # phonemes should be pinyin_spec + phonemes = self._g2p_assign( + sentences, pinyin_spec, merge_sentences=merge_sentences) + + all_phonemes = all_phonemes + phonemes + + if robot: + new_phonemes = [] + for sentence in all_phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = item[:-1] + "1" + new_sentence.append(item) + new_phonemes.append(new_sentence) + all_phonemes = new_phonemes + + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(all_phonemes[0]) + print("----------------------------") + return [sum(all_phonemes, [])] + def get_input_ids(self, sentence: str, merge_sentences: bool=True, @@ -405,6 +514,7 @@ class Frontend(): add_blank: bool=False, blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -437,3 +547,49 @@ class Frontend(): if temp_phone_ids: result["phone_ids"] = temp_phone_ids return result + + # @an added for ssml + def get_input_ids_ssml( + self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False, + add_blank: bool=False, + blank_token: str="", + to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + + l_inputs = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( + l_inputs, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: + tone_ids = self._t2id(tones) + if to_tensor: + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + # if use paddle.to_tensor() in onnxruntime, the first time will be too low + if to_tensor: + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids + return result diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py new file mode 100644 index 000000000..54f24f59f --- /dev/null +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +import re +import xml.dom.minidom +import xml.parsers.expat +from xml.dom.minidom import Node +from xml.dom.minidom import parseString +''' +Note: xml 有5种特殊字符, &<>"' +其一,采用特殊标签,将包含特殊字符的字符串封装起来。 +例如: + +其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为: +& & +< < +> > +" " +' ' +例如: +"姓名" + +''' + + +class MixTextProcessor(): + def __repr__(self): + print("@an MixTextProcessor class") + + def get_xml_content(self, mixstr): + '''返回字符串的 xml 内容''' + xmlptn = re.compile(r".*?", re.M | re.S) + ctn = re.search(xmlptn, mixstr) + if ctn: + return ctn.group(0) + else: + return None + + def get_content_split(self, mixstr): + ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为xml 中tag 属性带空格 + ''' + ctlist = [] + # print("Testing:",mixstr[:20]) + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + ctlist.append(in_xml) + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist + + @classmethod + def get_pinyin_split(self, mixstr): + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append([pre_xml, []]) + dom = DomXml(in_xml) + pinyinlist = dom.get_pinyins_for_xml() + ctlist = ctlist + pinyinlist + ctlist.append([after_xml, []]) + else: + ctlist.append([mixstr, []]) + return ctlist + + +class DomXml(): + def __init__(self, xmlstr): + print("Parse xml str:", xmlstr) + self.tdom = parseString(xmlstr) #Document + # print("tdom:",type(self.tdom)) + self.root = self.tdom.documentElement #Element + # print("root:",type(self.root)) + self.rnode = self.tdom.childNodes #NodeList + # print("rnode:",type(self.rnode)) + pass + + def get_text(self): + '''返回xml 内容的所有文本内容的 列表''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_xmlchild_list(self): + '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + print(res) + return res + + def get_pinyins_for_xml(self): + '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + t = re.sub(r"\s+", "", x1.value) + res.append([t, []]) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x2.data) + res.append([t, []]) + else: + # print("x2",x2,x2.tagName) + if x2.hasAttribute('pinyin'): + pinyin_value = x2.getAttribute("pinyin") + pinyins = pinyin_value.split(" ") + for x3 in x2.childNodes: + # print('x3',x3) + if isinstance(x3, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x3.data) + res.append([t, pinyins]) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_all_tags(self, tag_name): + '''获取所有的tag 及属性值''' + alltags = self.root.getElementsByTagName(tag_name) + for x in alltags: + if x.hasAttribute('pinyin'): # pinyin + print(x.tagName, 'pinyin', + x.getAttribute('pinyin'), x.firstChild.data) From 278c7a41a83412f02bc4b0b98832c5076f0940cf Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 14:59:23 +0800 Subject: [PATCH 02/20] add module define to fix ci, test=tts --- paddlespeech/t2s/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 7d93c026e..57fe82a9c 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,5 +18,6 @@ from . import exps from . import frontend from . import models from . import modules +from . import ssml from . import training from . import utils From 29508f400b23211c9e7380800e2d02c9a16a426f Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:44:29 +0800 Subject: [PATCH 03/20] to fix CI issue, test=tts --- paddlespeech/t2s/ssml/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 paddlespeech/t2s/ssml/__init__.py diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py new file mode 100644 index 000000000..e69de29bb From f56cc08b18f5fb6fc3254db4dd40ec3597d34f36 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:55:07 +0800 Subject: [PATCH 04/20] add license content, test=tts --- paddlespeech/t2s/ssml/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index e69de29bb..abf198b97 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 1067088debd49ba308fc55a8c55d1d04f211ff51 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:18:27 +0800 Subject: [PATCH 05/20] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index abf198b97..f344250d2 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .xml_processor import * From 89e9ea69ebb884d5ba13d02c66c29475a153f2ea Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:29:46 +0800 Subject: [PATCH 06/20] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index f344250d2..9b4db053b 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .xml_processor import * From f295d2d4450099f2cf8b7e2d417a9c9599230563 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 18:00:13 +0800 Subject: [PATCH 07/20] remove useless code --- paddlespeech/t2s/frontend/zh_frontend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 25558780b..e30286986 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -146,7 +146,6 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) - self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() From 050d766915c01a59fd4880dfb263dbc30605944f Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 19 Oct 2022 05:31:18 +0000 Subject: [PATCH 08/20] fix u2pp model --- docs/source/released_model.md | 2 +- paddlespeech/cli/asr/infer.py | 4 ++-- paddlespeech/resource/model_alias.py | 1 - paddlespeech/resource/pretrained_models.py | 26 +++------------------- 4 files changed, 6 insertions(+), 27 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index a2456f1fe..586f17c34 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 437f64631..004143361 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -52,7 +52,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='conformer_u2pp_wenetspeech', + default='conformer_u2pp_online_wenetspeech', choices=[ tag[:tag.index('-')] for tag in self.task_resource.pretrained_models.keys() @@ -470,7 +470,7 @@ class ASRExecutor(BaseExecutor): @stats_wrapper def __call__(self, audio_file: os.PathLike, - model: str='conformer_u2pp_wenetspeech', + model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', sample_rate: int=16000, config: os.PathLike=None, diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py index f5ec655b7..8e9ecc4ba 100644 --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -25,7 +25,6 @@ model_alias = { "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"], "conformer": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"], - "conformer_u2pp": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_u2pp_online": ["paddlespeech.s2t.models.u2:U2Model"], "transformer": ["paddlespeech.s2t.models.u2:U2Model"], "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"], diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index efd6bb3f2..df50a6a9d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -68,32 +68,12 @@ asr_dynamic_pretrained_models = { '', }, }, - "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', - 'md5': - '662b347e1d2131b7a4dc5398365e2134', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10', - 'model': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'params': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'lm_url': - '', - 'lm_md5': - '', - }, - }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz', 'md5': - '3100fc1eac5779486cab859366992d0b', + '62d230c1bf27731192aa9d3b8deca300', 'cfg_path': 'model.yaml', 'ckpt_path': From cb76e664017f15b7963eca0e126e5429f0a58ba9 Mon Sep 17 00:00:00 2001 From: dahu1 <707133607@qq.com> Date: Wed, 19 Oct 2022 15:54:08 +0800 Subject: [PATCH 09/20] =?UTF-8?q?1.token=E9=85=8D=E7=BD=AE=E4=B8=8D?= =?UTF-8?q?=E5=86=99=E6=AD=BB=EF=BC=8C2.text=E6=98=BE=E7=A4=BA=E4=B8=8D?= =?UTF-8?q?=E4=B9=B1=E7=A0=81,=20test=3Dasr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text/exps/ernie_linear/punc_restore.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddlespeech/text/exps/ernie_linear/punc_restore.py b/paddlespeech/text/exps/ernie_linear/punc_restore.py index 2cb4d0719..98804606c 100644 --- a/paddlespeech/text/exps/ernie_linear/punc_restore.py +++ b/paddlespeech/text/exps/ernie_linear/punc_restore.py @@ -25,8 +25,6 @@ DefinedClassifier = { 'ErnieLinear': ErnieLinear, } -tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') - def _clean_text(text, punc_list): text = text.lower() @@ -35,7 +33,7 @@ def _clean_text(text, punc_list): return text -def preprocess(text, punc_list): +def preprocess(text, punc_list, tokenizer): clean_text = _clean_text(text, punc_list) assert len(clean_text) > 0, f'Invalid input string: {text}' tokenized_input = tokenizer( @@ -51,7 +49,8 @@ def test(args): with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) print("========Args========") - print(yaml.safe_dump(vars(args))) + print(yaml.safe_dump(vars(args), allow_unicode=True)) + # print(args) print("========Config========") print(config) @@ -61,10 +60,16 @@ def test(args): punc_list.append(line.strip()) model = DefinedClassifier[config["model_type"]](**config["model"]) + # print(model) + + pretrained_token = config['data_params']['pretrained_token'] + tokenizer = ErnieTokenizer.from_pretrained(pretrained_token) + # tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + state_dict = paddle.load(args.checkpoint) model.set_state_dict(state_dict["main_params"]) model.eval() - _inputs = preprocess(args.text, punc_list) + _inputs = preprocess(args.text, punc_list, tokenizer) seq_len = _inputs['seq_len'] input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0) seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0) From 3ac7ac253f66c46f01aa11be3de95d6177f47107 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 09:29:11 +0800 Subject: [PATCH 10/20] fix review issue,test=tts --- paddlespeech/t2s/ssml/xml_processor.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index 54f24f59f..b39121347 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -35,8 +35,8 @@ class MixTextProcessor(): return None def get_content_split(self, mixstr): - ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 - 不能去除空格,因为xml 中tag 属性带空格 + ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为 xml 中tag 属性带空格 ''' ctlist = [] # print("Testing:",mixstr[:20]) @@ -77,17 +77,12 @@ class MixTextProcessor(): class DomXml(): def __init__(self, xmlstr): - print("Parse xml str:", xmlstr) self.tdom = parseString(xmlstr) #Document - # print("tdom:",type(self.tdom)) self.root = self.tdom.documentElement #Element - # print("root:",type(self.root)) self.rnode = self.tdom.childNodes #NodeList - # print("rnode:",type(self.rnode)) - pass def get_text(self): - '''返回xml 内容的所有文本内容的 列表''' + '''返回 xml 内容的所有文本内容的列表''' res = [] for x1 in self.rnode: @@ -107,7 +102,7 @@ class DomXml(): return res def get_xmlchild_list(self): - '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + '''返回 xml 内容的列表,包括所有文本内容(不带 tag)''' res = [] for x1 in self.rnode: @@ -127,7 +122,7 @@ class DomXml(): return res def get_pinyins_for_xml(self): - '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + '''返回 xml 内容,字符串和拼音的 list ''' res = [] for x1 in self.rnode: @@ -155,7 +150,7 @@ class DomXml(): return res def get_all_tags(self, tag_name): - '''获取所有的tag 及属性值''' + '''获取所有的 tag 及属性值''' alltags = self.root.getElementsByTagName(tag_name) for x in alltags: if x.hasAttribute('pinyin'): # pinyin From 7d5ae651ce92d0bd953f0de54b81d00cf951b01d Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:07:21 +0800 Subject: [PATCH 11/20] add readme thanks --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49e40624d..0abb3fd69 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. diff --git a/README_cn.md b/README_cn.md index bf3ff4dfd..0c3af5dd4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。 +- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From ec1f9edd562275e2d2799c16e36a304bae172e1c Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:11:26 +0800 Subject: [PATCH 12/20] add space after punctions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0abb3fd69..d02ac4c6b 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From da525d346f0a78fc1b6f11db408a5ce1a76c5610 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:17:17 +0000 Subject: [PATCH 13/20] fix uvicorn's version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e551d9fa6..3353cdada 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] requirements = { "install": From 63c80121e2c5691145a2bc8c49cf1a2b277c7067 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:33:07 +0000 Subject: [PATCH 14/20] fix uvicorn's bug --- paddlespeech/server/bin/paddlespeech_server.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 10a91d9be..1b1792bd1 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -113,7 +113,7 @@ class ServerExecutor(BaseExecutor): """ config = get_config(config_file) if self.init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) + uvicorn.run(app, host=config.host, port=config.port) @cli_server_register( diff --git a/setup.py b/setup.py index 3353cdada..e551d9fa6 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] requirements = { "install": From ce153d915e512c5ab38e7791fb733540189ebfb1 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Thu, 20 Oct 2022 07:54:00 +0000 Subject: [PATCH 15/20] update u2pp result.md --- examples/wenetspeech/asr1/RESULTS.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index f22c652e6..cd480163e 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -53,3 +53,22 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | + + +## U2PP Steaming Pretrained Model + +Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.057031 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.068826 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.069111 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.059213 | + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.049256 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.052086 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.052267 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.047198 | From ed0138c6e324a87e31a23138bafe6f878ed8f4e9 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 18:09:41 +0800 Subject: [PATCH 16/20] add condition check if a ssml input and filter space line, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 36 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index f9d1cd1b5..41663891e 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -105,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = re.split(r"\s+", line.strip(), 1) - utt_id = items[0] - if lang == 'zh': - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + if line.strip() != "": + items = re.split(r"\s+", line.strip(), 1) + utt_id = items[0] + if lang == 'zh': + sentence = "".join(items[1:]) + elif lang == 'en': + sentence = " ".join(items[1:]) + elif lang == 'mix': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) return sentences @@ -182,11 +183,20 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids_ssml( - text, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + input_ids = {} + if text.strip() != "" and re.match(r".*?.*?.*", text, + re.DOTALL): + input_ids = frontend.get_input_ids_ssml( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = frontend.get_input_ids( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] if get_tone_ids: tone_ids = input_ids["tone_ids"] From 4dfb3365f637b28b30f0359dd641f571800eb2a8 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:23:17 +0800 Subject: [PATCH 17/20] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d02ac4c6b..4ed1a022c 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. From 7693bd1812086d2b5d5a19646e704a6155cb1103 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:24:40 +0800 Subject: [PATCH 18/20] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ed1a022c..3b26ff9b5 100644 --- a/README.md +++ b/README.md @@ -924,7 +924,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From 9c68c2061e1b595deac62229a2f29f9f0659ff17 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:29:13 +0800 Subject: [PATCH 19/20] Update README_cn.md --- README_cn.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README_cn.md b/README_cn.md index 0c3af5dd4..9a4549898 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,7 +164,8 @@ ### 近期更新 -- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对ASR任务对wav2vec2.0 的fine-tuning. + - 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 +- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。 - 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech 网页应用](./demos/speech_web)。 - ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。 - ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。 @@ -928,7 +929,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 +- 非常感谢 [david-95](https://github.com/david-95) 修复 TTS 句尾多标点符号出错的问题,贡献补充多条程序和数据。为 TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From 09a735af2449a2205a6006287e6bd1e98b355c37 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:32:47 +0800 Subject: [PATCH 20/20] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3b26ff9b5..26f13d00e 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. @@ -924,8 +924,8 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. -- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). +- Many thanks to [david-95](https://github.com/david-95) for fixing multi-punctuation bug、contributing to multiple program and data, and adding [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. +- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving TTS Chinses Frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function.