From 66a8beb27f7ee8b537b635513c8ac63606ae6e48 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 28 Feb 2022 09:47:06 +0000 Subject: [PATCH] update text frontend, test=tts --- README.md | 1 + README_cn.md | 1 + examples/aishell3/tts3/README.md | 4 +- examples/aishell3/tts3/conf/conformer.yaml | 110 ++++++++++++++++++ examples/other/g2p/README.md | 2 +- paddlespeech/t2s/frontend/tone_sandhi.py | 6 +- paddlespeech/t2s/frontend/zh_frontend.py | 22 ++++ .../frontend/zh_normalization/chronology.py | 10 +- .../t2s/frontend/zh_normalization/num.py | 7 +- .../zh_normalization/text_normlization.py | 9 ++ setup.py | 1 + 11 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 examples/aishell3/tts3/conf/conformer.yaml diff --git a/README.md b/README.md index 46730797b..e96d07107 100644 --- a/README.md +++ b/README.md @@ -561,6 +561,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. - Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. - Many thanks to [kslz](https://github.com/745165806) for supplementary Chinese documents. +- Many thanks to [awmmmm](https://github.com/awmmmm) for contributing fastspeech2 aishell3 conformer pretrained model. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. 
diff --git a/README_cn.md b/README_cn.md index 9782240a6..32d5c518e 100644 --- a/README_cn.md +++ b/README_cn.md @@ -556,6 +556,7 @@ year={2021} - 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 - 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 - 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。 +- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 281ad836b..d02ad1b63 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -225,7 +225,9 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution) FastSpeech2 checkpoint contains files listed below. 
diff --git a/examples/aishell3/tts3/conf/conformer.yaml b/examples/aishell3/tts3/conf/conformer.yaml new file mode 100644 index 000000000..ea73593d7 --- /dev/null +++ b/examples/aishell3/tts3/conf/conformer.yaml @@ -0,0 +1,110 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. + + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 32 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnet + postnet_filts: 5 # filter size of conv layers in postnet + 
postnet_chans: 256 # number of channels of conv layers in postnet + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: conformer # encoder type + decoder_type: conformer # decoder type + conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type + conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type + conformer_activation_type: swish # conformer activation type + use_macaron_style_in_conformer: true # whether to use macaron style in conformer + use_cnn_in_conformer: true # whether to use CNN in conformer + conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder + conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder + init_type: xavier_uniform # initialization type + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + 
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index c0f55bd42..141f7f741 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,7 +10,7 @@ Run the command below to get the results of the test. 
```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027124048652822204 +The `avg WER` of g2p is: 0.026014352515701198 ```text ,--------------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 5264e0687..07f7fa2b8 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -63,7 +63,7 @@ class ToneSandhi(): '扫把', '惦记' } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子" + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" } self.punc = ":,;。?!“”‘’':,;.?!" @@ -77,7 +77,9 @@ class ToneSandhi(): # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: + if j - 1 >= 0 and item == word[j - 1] and pos[0] in { + "n", "v", "a" + } and word not in self.must_not_neural_tone_words: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index a905c412d..bb8ed5b49 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -20,7 +20,10 @@ import numpy as np import paddle from g2pM import G2pM from pypinyin import lazy_pinyin +from pypinyin import load_phrases_dict +from pypinyin import load_single_dict from pypinyin import Style +from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi @@ -41,6 +44,8 @@ class Frontend(): self.g2pM_model = G2pM() self.pinyin2phone = generate_lexicon( with_tone=True, with_erhua=False) + else: + self.__init__pypinyin() self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", 
"寻老礼儿", "妥妥儿"} self.not_erhua = { "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", @@ -62,6 +67,23 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) + def __init__pypinyin(self): + large_pinyin.load() + + load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]}) + load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]}) + load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]}) + load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]}) + load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]}) + load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]}) + load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]}) + load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]}) + load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]}) + load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]}) + + # 调整字的拼音顺序 + load_single_dict({ord(u'地'): u'de,di4'}) + def _get_initials_finals(self, word: str) -> List[List[str]]: initials = [] finals = [] diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index bfa7d2b19..ea5189135 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -63,7 +63,10 @@ def replace_time(match) -> str: result = f"{num2str(hour)}点" if minute.lstrip('0'): - result += f"{_time_num2str(minute)}分" + if int(minute) == 30: + result += "半" + else: + result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" @@ -71,7 +74,10 @@ def replace_time(match) -> str: result += "至" result += f"{num2str(hour_2)}点" if minute_2.lstrip('0'): - result += f"{_time_num2str(minute_2)}分" + if int(minute_2) == 30: + result += "半" + else: + result += f"{_time_num2str(minute_2)}分" if second_2 and second_2.lstrip('0'): result += f"{_time_num2str(second_2)}秒" diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py 
b/paddlespeech/t2s/frontend/zh_normalization/num.py index 27a2f8465..a83b42a47 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = '(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') @@ -110,7 +110,7 @@ def replace_default_num(match): # 纯小数 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') # 正整数 + 量词 -RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几])?" + COM_QUANTIFIERS) +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') @@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str: """ number = match.group(1) match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" match_2: str = match_2 if match_2 else "" quantifiers: str = match.group(3) number: str = num2str(number) @@ -151,6 +153,7 @@ def replace_number(match) -> str: # 范围表达式 # match.group(1) and match.group(8) are copy from RE_NUMBER + RE_RANGE = re.compile( r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index f9d1b8cb8..bc663c70d 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -63,11 +63,19 @@ class TextNormalizer(): # Only for pure Chinese here if lang == "zh": text = text.replace(" ", "") + # 过滤掉特殊字符 + text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] return sentences + def _post_replace(self, sentence: str) -> str: + sentence = sentence.replace('/', '每') + sentence = sentence.replace('~', '至') + + return sentence + def normalize_sentence(self, sentence: str) -> str: # basic character conversions sentence = tranditional_to_simplified(sentence) @@ -97,6 +105,7 @@ class TextNormalizer(): sentence) sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) sentence = RE_NUMBER.sub(replace_number, sentence) + sentence = self._post_replace(sentence) return sentence diff --git a/setup.py b/setup.py index 3f3632b37..c1c294376 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ base = [ "paddlespeech_feat", "praatio==5.0.0", "pypinyin", + "pypinyin-dict", "python-dateutil", "pyworld", "resampy==0.2.2",