[TTS]Cantonese FastSpeech2 e2e infer, test=tts (#2927)

3 years ago · 1af9bd47d9
parent 004a4d6096
commit 1af9bd47d9
7 changed files with 191 additions and 15 deletions
--- a/examples/canton/tts3/local/synthesize_e2e.sh
+++ b/examples/canton/tts3/local/synthesize_e2e.sh
@ -0,0 +1,53 @@
 #!/bin/bash
 # End-to-end synthesis of the Cantonese test sentences with a trained
 # fastspeech2_canton acoustic model and a pretrained aishell3 vocoder.
 #
 # Usage: synthesize_e2e.sh <config_path> <train_output_path> <ckpt_name>
 #
 # BIN_DIR is read from the environment; it is not defined in this script.
 # All expansions are quoted so that paths containing spaces or glob
 # characters are passed to python3 as single arguments.
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 # A stage N runs only when stage <= N <= stop_stage. With both set to 0,
 # only the pwgan stage below is executed.
 stage=0
 stop_stage=0
 # NOTE: both stages write into the same ${train_output_path}/test_e2e and
 # ${train_output_path}/inference directories.
 # pwgan
 if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_canton \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_aishell3 \
        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --lang=canton \
        --text="${BIN_DIR}/../sentences_canton.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --inference_dir="${train_output_path}/inference"
 fi
 # hifigan
 if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_canton \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=canton \
        --text="${BIN_DIR}/../sentences_canton.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --inference_dir="${train_output_path}/inference"
 fi
--- a/examples/canton/tts3/run.sh
+++ b/examples/canton/tts3/run.sh
@ -9,7 +9,8 @@ stop_stage=100
 conf_path=conf/default.yaml
 train_output_path=exp/default
-ckpt_name=snapshot_iter_112793.pdz
+
 ckpt_name=snapshot_iter_280000.pdz
 # with the following command, you can choose the stage range you want to run
 # such as `./run.sh --stage 0 --stop-stage 0`
--- a/paddlespeech/t2s/exps/sentences_canton.txt
+++ b/paddlespeech/t2s/exps/sentences_canton.txt
@ -0,0 +1,7 @@
 001 白云山爬过一次嘅，好远啊，爬上去都成两个钟
 002 睇书咯，番屋企，而家好多人好少睇书噶喎
 003 因为如果唔考试嘅话，工资好低噶
 004 冇固定噶，你中意休边日就边日噶
 005 即系太迟嘅话咧，落班太迟嘅话就喺出边食啲咯
 006 是非有公理，慎言莫冒犯别人
 007 遇上冷风雨，休太认真
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@ -33,6 +33,7 @@ from paddlespeech.t2s.datasets.am_batch_fn import *
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
 from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
@ -111,7 +112,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
            if line.strip() != "":
                items = re.split(r"\s+", line.strip(), 1)
                utt_id = items[0]
-                if lang == 'zh':
+                if lang in {'zh', 'canton'}:
                    sentence = "".join(items[1:])
                elif lang == 'en':
                    sentence = " ".join(items[1:])
@ -132,8 +133,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
    converters = {}
    if am_name == 'fastspeech2':
        fields = ["utt_id", "text"]
-        if am_dataset in {"aishell3", "vctk",
+        if am_dataset in {"aishell3", "vctk", "mix",
-                          "mix"} and speaker_dict is not None:
+                          "canton"} and speaker_dict is not None:
            print("multiple speaker fastspeech2!")
            fields += ["spk_id"]
        elif voice_cloning:
@ -177,8 +178,8 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
    converters = {}
    if am_name == 'fastspeech2':
        fields = ["utt_id", "text"]
-        if am_dataset in {"aishell3", "vctk",
+        if am_dataset in {"aishell3", "vctk", "mix",
-                          "mix"} and speaker_dict is not None:
+                          "canton"} and speaker_dict is not None:
            print("multiple speaker fastspeech2!")
            collate_fn = fastspeech2_multi_spk_batch_fn_static
            fields += ["spk_id"]
@ -266,6 +267,8 @@ def get_frontend(lang: str='zh',
            phone_vocab_path=phones_dict,
            tone_vocab_path=tones_dict,
            use_rhy=use_rhy)
    elif lang == 'canton':
        frontend = CantonFrontend(phone_vocab_path=phones_dict)
    elif lang == 'en':
        frontend = English(phone_vocab_path=phones_dict)
    elif lang == 'mix':
@ -302,6 +305,10 @@ def run_frontend(frontend: object,
        if get_tone_ids:
            tone_ids = input_ids["tone_ids"]
            outs.update({'tone_ids': tone_ids})
    elif lang == 'canton':
        input_ids = frontend.get_input_ids(
            text, merge_sentences=merge_sentences, to_tensor=to_tensor)
        phone_ids = input_ids["phone_ids"]
    elif lang == 'en':
        input_ids = frontend.get_input_ids(
            text, merge_sentences=merge_sentences, to_tensor=to_tensor)
@ -311,7 +318,7 @@ def run_frontend(frontend: object,
            text, merge_sentences=merge_sentences, to_tensor=to_tensor)
        phone_ids = input_ids["phone_ids"]
    else:
-        print("lang should in {'zh', 'en', 'mix'}!")
+        print("lang should in {'zh', 'en', 'mix', 'canton'}!")
    outs.update({'phone_ids': phone_ids})
    return outs
@ -411,8 +418,8 @@ def am_to_static(am_inference,
    am_name = am[:am.rindex('_')]
    am_dataset = am[am.rindex('_') + 1:]
    if am_name == 'fastspeech2':
-        if am_dataset in {"aishell3", "vctk",
+        if am_dataset in {"aishell3", "vctk", "mix",
-                          "mix"} and speaker_dict is not None:
+                          "canton"} and speaker_dict is not None:
            am_inference = jit.to_static(
                am_inference,
                input_spec=[
@ -424,8 +431,8 @@ def am_to_static(am_inference,
                am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
    elif am_name == 'speedyspeech':
-        if am_dataset in {"aishell3", "vctk",
+        if am_dataset in {"aishell3", "vctk", "mix",
-                          "mix"} and speaker_dict is not None:
+                          "canton"} and speaker_dict is not None:
            am_inference = jit.to_static(
                am_inference,
                input_spec=[
@ -575,7 +582,7 @@ def get_am_output(
    get_tone_ids = False
    if am_name == 'speedyspeech':
        get_tone_ids = True
-    if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
+    if am_dataset in {"aishell3", "vctk", "mix", "canton"} and speaker_dict:
        get_spk_id = True
        spk_id = np.array([spk_id])
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@ -136,7 +136,8 @@ def parse_args():
        choices=[
            'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
            'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc',
-            'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix'
+            'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix',
            'fastspeech2_canton'
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -119,7 +119,7 @@ def evaluate(args):
                    # acoustic model
                    if am_name == 'fastspeech2':
                        # multi speaker
-                        if am_dataset in {"aishell3", "vctk", "mix"}:
+                        if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
                            spk_id = paddle.to_tensor(args.spk_id)
                            mel = am_inference(part_phone_ids, spk_id)
                        else:
@ -167,7 +167,8 @@ def parse_args():
        choices=[
            'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
            'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
-            'tacotron2_csmsc', 'tacotron2_ljspeech', 'fastspeech2_mix'
+            'tacotron2_csmsc', 'tacotron2_ljspeech', 'fastspeech2_mix',
            'fastspeech2_canton'
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@ -0,0 +1,106 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Dict
 from typing import List
 import numpy as np
 import paddle
 import ToJyutping
 from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
 # Prefixes recognised as the initial (onset) part of a romanised syllable.
 INITIALS = [
    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
 ]
 # Special symbols handled like initials (presumably pause/silence markers of
 # the training phone set -- TODO confirm against the phone_id_map).
 INITIALS += ['sp', 'spl', 'spn', 'sil']


 def get_lines(cantons: List[str]):
    """Split romanised syllables into a flat list of phones.

    Each syllable is cut once, at the end of the longest entry of
    ``INITIALS`` it starts with, giving ``[initial, final]``.

    Args:
        cantons: Romanised syllables, e.g. ``['gwong2', 'dung1']``.

    Returns:
        A flat list of phone strings in input order.

    Notes:
        * Only the longest matching initial is used, so ``'gwong2'`` yields
          ``['gw', 'ong2']`` and not an additional ``['g', 'wong2']``.
        * A syllable without any listed initial (e.g. ``'aa3'``) is kept
          whole instead of being dropped.
        * Empty tokens are skipped and an empty final is never emitted.
    """
    # Longest first so that 'gw' wins over 'g', 'ng' over 'n', etc.
    # Sorted on every call so in-place changes to INITIALS are honoured.
    initials = sorted(INITIALS, key=len, reverse=True)
    phones = []
    for canton in cantons:
        if not canton:
            # e.g. ''.split(' ') produces [''] for an empty transcription
            continue
        for consonant in initials:
            if canton.startswith(consonant):
                final = canton[len(consonant):]
                phones.append(consonant)
                if final:
                    phones.append(final)
                break
        else:
            # zero-initial syllable: the whole token is the final
            phones.append(canton)
    return phones
 class CantonFrontend():
    """Text frontend turning Cantonese text into phone-id sequences.

    Pipeline: ``TextNormalizer.normalize`` -> ``ToJyutping.get_jyutping_text``
    -> ``get_lines`` (syllable to phones) -> vocabulary lookup.
    """

    def __init__(self, phone_vocab_path: str):
        """Create the frontend and load the phone vocabulary.

        Args:
            phone_vocab_path: Path to a UTF-8 text file holding one
                whitespace-separated ``<phone> <id>`` pair per line. Blank
                lines are ignored. If falsy, the vocabulary stays empty.

        Raises:
            ValueError: If a non-blank line does not consist of exactly two
                fields or the id is not an integer.
        """
        self.text_normalizer = TextNormalizer()
        self.punc = "：，；。？！“”‘’':,;.?!"
        self.vocab_phones = {}
        if phone_vocab_path:
            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # tolerate blank / trailing empty lines
                        continue
                    phn, phn_id = fields
                    self.vocab_phones[phn] = int(phn_id)

    def _g2p(self, sentences: List[str],
             merge_sentences: bool=True) -> List[List[str]]:
        """Convert normalized sentences to phone sequences.

        Args:
            sentences: Sentences to romanise.
            merge_sentences: If True, all sentences are concatenated into a
                single phone sequence; otherwise one sequence per sentence
                is returned.

        Returns:
            A list of phone lists (a single-element list when merged).
        """
        phones_list = [
            get_lines(ToJyutping.get_jyutping_text(sentence).split(' '))
            for sentence in sentences
        ]
        if merge_sentences:
            merged = [phn for phones in phones_list for phn in phones]
            phones_list = [merged]
        return phones_list

    def _p2id(self, phonemes: List[str]) -> np.ndarray:
        """Map phones to vocabulary ids as an int64 array.

        Phones missing from the vocabulary are replaced by ``"sp"``.

        Raises:
            KeyError: If an unknown phone occurs and ``"sp"`` itself is not
                in the vocabulary.
        """
        vocab = self.vocab_phones
        # replace unk phone with sp
        phone_ids = [
            vocab[phn] if phn in vocab else vocab["sp"] for phn in phonemes
        ]
        return np.array(phone_ids, np.int64)

    def get_phonemes(self,
                     sentence: str,
                     merge_sentences: bool=True,
                     print_info: bool=False) -> List[List[str]]:
        """Normalize ``sentence`` and convert it to phone sequences.

        Args:
            sentence: Raw input text.
            merge_sentences: Forwarded to ``_g2p``.
            print_info: If True, print the normalization and g2p results.

        Returns:
            The phone sequences produced by ``_g2p``.
        """
        sentences = self.text_normalizer.normalize(sentence)
        phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
        if print_info:
            print("----------------------------")
            print("text norm results:")
            print(sentences)
            print("----------------------------")
            print("g2p results:")
            print(phonemes)
            print("----------------------------")
        return phonemes

    def get_input_ids(self,
                      sentence: str,
                      merge_sentences: bool=True,
                      print_info: bool=False,
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
        """Convert ``sentence`` to model input ids.

        Args:
            sentence: Raw input text.
            merge_sentences: Forwarded to ``get_phonemes``.
            print_info: Forwarded to ``get_phonemes``.
            to_tensor: If True each id sequence is wrapped with
                ``paddle.to_tensor``; otherwise numpy arrays are returned.

        Returns:
            A dict with key ``"phone_ids"`` holding one id sequence per
            non-empty phone sequence. The key is absent when no phones were
            produced at all.
        """
        phonemes = self.get_phonemes(
            sentence, merge_sentences=merge_sentences, print_info=print_info)
        result = {}
        temp_phone_ids = []
        for phones in phonemes:
            if not phones:
                continue
            phone_ids = self._p2id(phones)
            # if use paddle.to_tensor() in onnxruntime, the first time will be too slow
            if to_tensor:
                phone_ids = paddle.to_tensor(phone_ids)
            temp_phone_ids.append(phone_ids)
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result