From dad1cbbcd6cfc8d2530de48cdff3b325b6d2de8c Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Fri, 26 Nov 2021 09:12:29 +0000
Subject: [PATCH 1/2] update text frontend

---
 demos/style_fs2/style_syn.py                  |  4 ++-
 examples/ljspeech/voc1/README.md              |  2 +-
 .../t2s/exps/fastspeech2/inference.py         |  4 ++-
 .../fastspeech2/multi_spk_synthesize_e2e.py   |  4 ++-
 .../t2s/exps/fastspeech2/synthesize_e2e.py    |  4 ++-
 .../exps/fastspeech2/synthesize_e2e_melgan.py |  4 ++-
 .../t2s/exps/speedyspeech/inference.py        |  4 ++-
 .../t2s/exps/speedyspeech/synthesize_e2e.py   |  4 ++-
 paddlespeech/t2s/frontend/zh_frontend.py      |  7 ++++-
 .../frontend/zh_normalization/chronology.py   | 26 +++++++++++++++++++
 .../frontend/zh_normalization/phonecode.py    |  7 +++--
 .../zh_normalization/text_normlization.py     | 10 +++++++
 .../t2s/models/fastspeech2/fastspeech2.py     |  2 +-
 13 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py
index 5b8ce3513..9bd615790 100644
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
     sentences = []
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 13cc6ed7e..3830156f9 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5
 └── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
 ```
 ## Acknowledgement
-We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
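The parsing change above is repeated in every synthesis entry script touched by this patch set: the old "utt_id, sentence = line.strip().split()" raised a ValueError as soon as the sentence itself contained whitespace, while the new code keeps the first field as the utterance id and rejoins the rest. A minimal standalone sketch of the new logic (not part of the patch; the sample line is made up):

# sketch of the patched metadata parsing
def parse_line(line):
    items = line.strip().split()
    utt_id = items[0]
    # patch 1 rejoins the remaining fields with a Chinese comma;
    # patch 2 below later switches this to "" for pure-Chinese text
    sentence = ",".join(items[1:])
    return utt_id, sentence

print(parse_line("001 你好 欢迎使用语音合成"))
# ('001', '你好,欢迎使用语音合成')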
diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py
index 07e9ed7ee..8ea64b993 100644
--- a/paddlespeech/t2s/exps/fastspeech2/inference.py
+++ b/paddlespeech/t2s/exps/fastspeech2/inference.py
@@ -82,7 +82,9 @@ def main():
 
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     for utt_id, sentence in sentences:
diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
index 1839415e9..a2f8ada69 100644
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
@@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
     sentences = []
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
index ff9a41eab..aac2c054e 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
     sentences = []
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
index f0ff5655d..527e5d410 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
@@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config):
     sentences = []
    with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py
index 617848c58..75f937dec 100644
--- a/paddlespeech/t2s/exps/speedyspeech/inference.py
+++ b/paddlespeech/t2s/exps/speedyspeech/inference.py
@@ -87,7 +87,9 @@ def main():
 
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     for utt_id, sentence in sentences:
diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
index 0e64088dc..b04189405 100644
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config):
     sentences = []
     with open(args.text, 'rt') as f:
         for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
+            utt_id = items[0]
+            sentence = ",".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index d49c09378..5b69477da 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -149,9 +149,14 @@ class Frontend():
         if word not in self.must_erhua and (word in self.not_erhua or
                                             pos in {"a", "j", "nr"}):
             return initials, finals
+        # "……" 等情况直接返回
+        if len(finals) != len(word):
+            return initials, finals
+
+        assert len(finals) == len(word)
+
         new_initials = []
         new_finals = []
-        assert len(finals) == len(word)
         for i, phn in enumerate(finals):
             if i == len(finals) - 1 and word[i] == "儿" and phn in {
                     "er2", "er5"
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index b8d711564..8801baa0d 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
                      r':([0-5][0-9])'
                      r'(:([0-5][0-9]))?')
+# 时间范围,如8:30-12:30
+RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
+                           r':([0-5][0-9])'
+                           r'(:([0-5][0-9]))?'
+                           r'(~|-)'
+                           r'([0-1]?[0-9]|2[0-3])'
+                           r':([0-5][0-9])'
+                           r'(:([0-5][0-9]))?')
+
 
 
 def replace_time(match) -> str:
     """
@@ -42,15 +51,32 @@ def replace_time(match) -> str:
     ----------
     str
     """
+
+    is_range = len(match.groups()) > 5
+
     hour = match.group(1)
     minute = match.group(2)
     second = match.group(4)
 
+    if is_range:
+        hour_2 = match.group(6)
+        minute_2 = match.group(7)
+        second_2 = match.group(9)
+
     result = f"{num2str(hour)}点"
     if minute.lstrip('0'):
         result += f"{_time_num2str(minute)}分"
     if second and second.lstrip('0'):
         result += f"{_time_num2str(second)}秒"
+
+    if is_range:
+        result += "至"
+        result += f"{num2str(hour_2)}点"
+        if minute_2.lstrip('0'):
+            result += f"{_time_num2str(minute_2)}分"
+        if second_2 and second_2.lstrip('0'):
+            result += f"{_time_num2str(second_2)}秒"
+
     return result
 
 
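A standalone sketch of how the RE_TIME_RANGE pattern and the extended replace_time cooperate (not part of the patch). The regex is copied from the hunk above; the repo's num2str/_time_num2str helpers are replaced by a naive digit-by-digit reader and seconds are skipped, so the printed reading is only indicative (the real helpers read 30 as 三十 and 12 as 十二):

import re

# same pattern as the RE_TIME_RANGE added above
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
                           r':([0-5][0-9])'
                           r'(:([0-5][0-9]))?'
                           r'(~|-)'
                           r'([0-1]?[0-9]|2[0-3])'
                           r':([0-5][0-9])'
                           r'(:([0-5][0-9]))?')

DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

def toy_num(s):
    # naive digit-by-digit reading, enough to show which groups are used
    return ''.join(DIGITS[c] for c in (s.lstrip('0') or '0'))

def toy_replace_time(match):
    # the range pattern has 9 capture groups, so len(match.groups()) > 5 is how
    # replace_time tells a RE_TIME_RANGE match apart from a plain RE_TIME match
    is_range = len(match.groups()) > 5
    result = toy_num(match.group(1)) + "点"
    if match.group(2).lstrip('0'):
        result += toy_num(match.group(2)) + "分"
    if is_range:
        result += "至" + toy_num(match.group(6)) + "点"
        if match.group(7).lstrip('0'):
            result += toy_num(match.group(7)) + "分"
    return result

print(RE_TIME_RANGE.sub(toy_replace_time, "营业时间8:30-12:30"))
# 营业时间八点三零分至一二点三零分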
diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
index be159c239..b7b69b41b 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile(
 RE_TELEPHONE = re.compile(
     r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
 
+# 全国统一的号码400开头
+RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"400(-)?\d{3}(-)?\d{4}")
+
 
 def phone2str(phone_string: str, mobile=True) -> str:
     if mobile:
         sp_parts = phone_string.strip('+').split()
-        result = ''.join(
+        result = ','.join(
             [verbalize_digit(part, alt_one=True) for part in sp_parts])
         return result
     else:
         sil_parts = phone_string.split('-')
-        result = ''.join(
+        result = ','.join(
             [verbalize_digit(part, alt_one=True) for part in sil_parts])
         return result
diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
index e25e99019..c3885fb9b 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified
 from .chronology import RE_DATE
 from .chronology import RE_DATE2
 from .chronology import RE_TIME
+from .chronology import RE_TIME_RANGE
 from .chronology import replace_date
 from .chronology import replace_date2
 from .chronology import replace_time
@@ -40,6 +41,7 @@ from .num import replace_percentage
 from .num import replace_positive_quantifier
 from .num import replace_range
 from .phonecode import RE_MOBILE_PHONE
+from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
 from .phonecode import RE_TELEPHONE
 from .phonecode import replace_mobile
 from .phonecode import replace_phone
@@ -76,12 +78,19 @@ class TextNormalizer():
 
         # number related NSW verbalization
         sentence = RE_DATE.sub(replace_date, sentence)
         sentence = RE_DATE2.sub(replace_date2, sentence)
+
+        # range first
+        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
         sentence = RE_TIME.sub(replace_time, sentence)
+
         sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
         sentence = RE_FRAC.sub(replace_frac, sentence)
         sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
         sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
+
+        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
+        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
+
         sentence = RE_RANGE.sub(replace_range, sentence)
         sentence = RE_INTEGER.sub(replace_negative_num, sentence)
         sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
@@ -94,5 +103,6 @@ class TextNormalizer():
 
     def normalize(self, text: str) -> List[str]:
         sentences = self._split(text)
+
         sentences = [self.normalize_sentence(sent) for sent in sentences]
         return sentences
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index aa42a83de..cdec03abc 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -307,7 +307,7 @@ class FastSpeech2(nn.Layer):
             num_embeddings=idim,
             embedding_dim=adim,
             padding_idx=self.padding_idx)
-        
+
         if encoder_type == "transformer":
             print("encoder_type is transformer")
             self.encoder = TransformerEncoder(
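Taken together, the text_normlization.py changes in patch 1 mean a sentence with a 400 hotline and an opening-hours range now goes through RE_NATIONAL_UNIFORM_NUMBER and RE_TIME_RANGE before the generic time and number rules. A hedged usage sketch: the import path and the normalize() signature come from the hunks above, but the result shown in the comment is only an expectation, not a captured output, and the input sentence is made up:

from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer

tn = TextNormalizer()
# the range is rewritten before RE_TIME runs, so "-" becomes "至" instead of the
# two times being handled separately; the 400 number is read digit by digit with
# "," pauses inserted by the patched phone2str
print(tn.normalize("服务热线400-666-8800,营业时间8:30-12:30。"))
# expected to be close to:
# ['服务热线四零零,六六六,八八零零,营业时间八点三十分至十二点三十分。']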
From a861e56e91b42b65eaab2781ba615efd4f95ecc3 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Fri, 26 Nov 2021 11:04:29 +0000
Subject: [PATCH 2/2] rm space for pure Chinese

---
 demos/style_fs2/style_syn.py                                    | 2 +-
 paddlespeech/t2s/exps/fastspeech2/inference.py                  | 2 +-
 paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py   | 2 +-
 paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py             | 2 +-
 paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py      | 2 +-
 paddlespeech/t2s/exps/speedyspeech/inference.py                 | 2 +-
 paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py            | 2 +-
 paddlespeech/t2s/frontend/zh_frontend.py                        | 2 ++
 paddlespeech/t2s/frontend/zh_normalization/text_normlization.py | 2 ++
 9 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py
index 9bd615790..0ed87e7cb 100644
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@@ -36,7 +36,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py
index 8ea64b993..1d6ea667a 100644
--- a/paddlespeech/t2s/exps/fastspeech2/inference.py
+++ b/paddlespeech/t2s/exps/fastspeech2/inference.py
@@ -84,7 +84,7 @@ def main():
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     for utt_id, sentence in sentences:
diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
index a2f8ada69..9dc3ab4b6 100644
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
@@ -39,7 +39,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
index aac2c054e..47c8a5e7a 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
@@ -42,7 +42,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
index 527e5d410..4d5d1ac41 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
@@ -42,7 +42,7 @@ def evaluate(args, fastspeech2_config, melgan_config):
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     with open(args.phones_dict, "r") as f:
diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py
index 75f937dec..0ed2e0bf1 100644
--- a/paddlespeech/t2s/exps/speedyspeech/inference.py
+++ b/paddlespeech/t2s/exps/speedyspeech/inference.py
@@ -89,7 +89,7 @@ def main():
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
-            sentence = ",".join(items[1:])
+            sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
     for utt_id, sentence in sentences:
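The second patch revisits the join introduced in patch 1: for pure-Chinese metadata the whitespace between fields is not meaningful, and the inserted "," would otherwise surface as an unwanted pause in the synthesized audio. A standalone before/after sketch with a made-up line:

line = "002 好雨知时节 当春乃发生"
items = line.strip().split()

sentence_patch1 = ",".join(items[1:])   # '好雨知时节,当春乃发生' -> an extra pause at the space
sentence_patch2 = "".join(items[1:])    # '好雨知时节当春乃发生'  -> the space is simply dropped
print(sentence_patch1)
print(sentence_patch2)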
"".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 5b69477da..84852b9ce 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -129,6 +129,8 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + if c and c in self.punc: + phones.append('sp') if v and v not in self.punc: phones.append(v) # add sp between sentence (replace the last punc with sp) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index c3885fb9b..c68caeeb7 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -64,6 +64,8 @@ class TextNormalizer(): List[str] Sentences. """ + # Only for pure Chinese here + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]