diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index 5b8ce3513..0ed87e7cb 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 13cc6ed7e..3830156f9 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement -We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. \ No newline at end of file +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py index 07e9ed7ee..1d6ea667a 100644 --- a/paddlespeech/t2s/exps/fastspeech2/inference.py +++ b/paddlespeech/t2s/exps/fastspeech2/inference.py @@ -82,7 +82,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 1839415e9..9dc3ab4b6 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index ff9a41eab..47c8a5e7a 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index f0ff5655d..4d5d1ac41 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 617848c58..0ed2e0bf1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -87,7 +87,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0e64088dc..403d35088 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index d49c09378..84852b9ce 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -129,6 +129,8 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + if c and c in self.punc: + phones.append('sp') if v and v not in self.punc: phones.append(v) # add sp between sentence (replace the last punc with sp) @@ -149,9 +151,14 @@ class Frontend(): if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): return initials, finals + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + new_initials = [] new_finals = [] - assert len(finals) == len(word) for i, phn in enumerate(finals): if i == len(finals) - 1 and word[i] == "儿" and phn in { "er2", "er5" diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index b8d711564..8801baa0d 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' r':([0-5][0-9])' r'(:([0-5][0-9]))?') +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + def replace_time(match) -> str: """ @@ -42,15 +51,32 @@ def replace_time(match) -> str: ---------- str """ + + is_range = len(match.groups()) > 5 + hour = match.group(1) minute = match.group(2) second = match.group(4) + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + result = f"{num2str(hour)}点" if minute.lstrip('0'): result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index be159c239..b7b69b41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile( RE_TELEPHONE = re.compile( r"(? str: if mobile: sp_parts = phone_string.strip('+').split() - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: sil_parts = phone_string.split('-') - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index e25e99019..c68caeeb7 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified from .chronology import RE_DATE from .chronology import RE_DATE2 from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE from .chronology import replace_date from .chronology import replace_date2 from .chronology import replace_time @@ -40,6 +41,7 @@ from .num import replace_percentage from .num import replace_positive_quantifier from .num import replace_range from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER from .phonecode import RE_TELEPHONE from .phonecode import replace_mobile from .phonecode import replace_phone @@ -62,6 +64,8 @@ class TextNormalizer(): List[str] Sentences. """ + # Only for pure Chinese here + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] @@ -76,12 +80,19 @@ class TextNormalizer(): # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) sentence = RE_TIME.sub(replace_time, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) sentence = RE_FRAC.sub(replace_frac, sentence) sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + sentence = RE_RANGE.sub(replace_range, sentence) sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) @@ -94,5 +105,6 @@ class TextNormalizer(): def normalize(self, text: str) -> List[str]: sentences = self._split(text) + sentences = [self.normalize_sentence(sent) for sent in sentences] return sentences diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index aa42a83de..cdec03abc 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -307,7 +307,7 @@ class FastSpeech2(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + if encoder_type == "transformer": print("encoder_type is transformer") self.encoder = TransformerEncoder(