Merge pull request #2040 from yt605155624/add_blank

[TTS]add blank between characters for vits
3 years ago · 02734141ce
parent 56eb1f0ebf 1731976e4e
commit 02734141ce
22 changed files with 136 additions and 201 deletions
--- a/examples/csmsc/vits/conf/default.yaml
+++ b/examples/csmsc/vits/conf/default.yaml
@ -178,6 +178,8 @@ generator_first: False # whether to start updating generator first
 ##########################################################
 #                OTHER TRAINING SETTING                  #
 ##########################################################
-max_epoch: 1000           # number of epochs
 num_snapshots: 10            # max number of snapshots to keep while training
+train_max_steps: 250000      # Number of training steps. == total_iters / ngpus, total_iters = 1000000
+save_interval_steps: 1000    # Interval steps to save checkpoint.
+eval_interval_steps: 250     # Interval steps to evaluate the network.
 seed: 777                    # random seed number
--- a/examples/csmsc/vits/local/preprocess.sh
+++ b/examples/csmsc/vits/local/preprocess.sh
@ -4,6 +4,7 @@ stage=0
 stop_stage=100

 config_path=$1
+add_blank=$2

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -44,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --feats-stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt \
+        --add-blank=${add_blank} \
        --skip-wav-copy

    python3 ${BIN_DIR}/normalize.py \
@ -52,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --feats-stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt \
+        --add-blank=${add_blank} \
        --skip-wav-copy

    python3 ${BIN_DIR}/normalize.py \
@ -60,5 +63,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --feats-stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt \
+        --add-blank=${add_blank} \
        --skip-wav-copy
 fi
--- a/examples/csmsc/vits/local/synthesize_e2e.sh
+++ b/examples/csmsc/vits/local/synthesize_e2e.sh
@ -3,9 +3,12 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+add_blank=$4
+
 stage=0
 stop_stage=0

+
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@ -14,5 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --phones_dict=dump/phone_id_map.txt \
        --output_dir=${train_output_path}/test_e2e \
-        --text=${BIN_DIR}/../sentences.txt
+        --text=${BIN_DIR}/../sentences.txt \
+        --add-blank=${add_blank}
 fi
--- a/examples/csmsc/vits/run.sh
+++ b/examples/csmsc/vits/run.sh
@ -10,6 +10,7 @@ stop_stage=100
 conf_path=conf/default.yaml
 train_output_path=exp/default
 ckpt_name=snapshot_iter_153.pdz
+add_blank=true

 # with the following command, you can choose the stage range you want to run
 # such as `./run.sh --stage 0 --stop-stage 0`
@ -18,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
-    ./local/preprocess.sh ${conf_path} || exit -1
+    ./local/preprocess.sh ${conf_path} ${add_blank}|| exit -1
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@ -32,5 +33,5 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank}|| exit -1
 fi
--- a/examples/ljspeech/voc0/local/synthesize.sh
+++ b/examples/ljspeech/voc0/local/synthesize.sh
@ -8,5 +8,4 @@ python ${BIN_DIR}/synthesize.py \
    --input=${input_mel_path} \
    --output=${train_output_path}/wavs/ \
    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
-    --ngpu=1 \
-    --verbose
+    --ngpu=1
--- a/paddlespeech/t2s/exps/fastspeech2/normalize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py
@ -58,30 +58,8 @@ def main():
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")
-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
-    args = parser.parse_args()

-    # set logger
-    if args.verbose > 1:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    elif args.verbose > 0:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-        logging.warning('Skip DEBUG/INFO messages')
+    args = parser.parse_args()

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@ -209,11 +209,6 @@ def main():

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

@ -248,10 +243,6 @@ def main():
    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
--- a/paddlespeech/t2s/exps/gan_vocoder/normalize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/normalize.py
@ -47,30 +47,8 @@ def main():
        default=False,
        action="store_true",
        help="whether to skip the copy of wav files.")
-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
-    args = parser.parse_args()

-    # set logger
-    if args.verbose > 1:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    elif args.verbose > 0:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-        logging.warning('Skip DEBUG/INFO messages')
+    args = parser.parse_args()

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@ -167,11 +167,6 @@ def main():
        required=True,
        help="directory to dump feature files.")
    parser.add_argument("--config", type=str, help="vocoder config file.")
-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")
    parser.add_argument(
@ -197,10 +192,6 @@ def main():
    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    sentences, speaker_set = get_phn_dur(dur_file)
    merge_silence(sentences)

--- a/paddlespeech/t2s/exps/speedyspeech/normalize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py
@ -50,11 +50,6 @@ def main():
        "--tones-dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")
-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")

    parser.add_argument(
        "--use-relative-path",
@ -63,24 +58,6 @@ def main():
        help="whether use relative path in metadata")
    args = parser.parse_args()

-    # set logger
-    if args.verbose > 1:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    elif args.verbose > 0:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-        logging.warning('Skip DEBUG/INFO messages')
-
    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@ -195,11 +195,6 @@ def main():

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

@ -230,10 +225,6 @@ def main():
    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@ -184,11 +184,6 @@ def main():

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

@ -223,10 +218,6 @@ def main():
    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
--- a/paddlespeech/t2s/exps/transformer_tts/normalize.py
+++ b/paddlespeech/t2s/exps/transformer_tts/normalize.py
@ -51,30 +51,8 @@ def main():
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")
-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
-    args = parser.parse_args()

-    # set logger
-    if args.verbose > 1:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    elif args.verbose > 0:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-        logging.warning('Skip DEBUG/INFO messages')
+    args = parser.parse_args()

    # check directory existence
    dumpdir = Path(args.dumpdir).resolve()
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@ -186,11 +186,6 @@ def main():
        type=str,
        help="yaml format configuration file.")

-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

@ -210,10 +205,6 @@ def main():
        _C = Configuration(_C)
        config = _C.clone()

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    phone_id_map_path = dumpdir / "phone_id_map.txt"
    speaker_id_map_path = dumpdir / "speaker_id_map.txt"

--- a/paddlespeech/t2s/exps/vits/normalize.py
+++ b/paddlespeech/t2s/exps/vits/normalize.py
@ -16,6 +16,7 @@ import argparse
 import logging
 from operator import itemgetter
 from pathlib import Path
+from typing import List

 import jsonlines
 import numpy as np
@ -23,6 +24,50 @@ from sklearn.preprocessing import StandardScaler
 from tqdm import tqdm

 from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.utils import str2bool
+
+INITIALS = [
+    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
+    'r', 'z', 'c', 's', 'j', 'q', 'x',
+    'y', 'w', 'sp', 'spl', 'spn', 'sil',
+]
+
+
+def intersperse(lst, item):
+    """Return [item, x0, item, x1, ..., xn, item] for lst = [x0, ..., xn]."""
+    pairs = ([item, elem] for elem in lst)
+    return [tok for pair in pairs for tok in pair] + [item]
+
+
+def insert_after_character(lst, item):
+    """Prefix `item`; repeat it after each final (phone not in INITIALS)."""
+    result = [item]
+    for phone in lst:
+        is_final = phone not in INITIALS
+        if is_final and not phone.endswith(tuple("12345")):  # finals carry a tone
+            raise ValueError("final without tone digit: " + repr(phone))
+        result += [phone, item] if is_final else [phone]
+    return result
+
+
+def add_blank(phones: List[str],
+              filed: str="character",
+              blank_token: str="<pad>"):
+    """Insert `blank_token` into a phone sequence.
+
+    `filed` (sic) selects the mode; any other value returns `phones` as is.
+    "phone": a blank before, between and after all phones, e.g.
+        ["n", "i3"] -> ["<pad>", "n", "<pad>", "i3", "<pad>"]
+    "character": a blank first, then one after each phone not in INITIALS,
+        e.g. ["n", "i3", "h", "ao3"] ->
+        ["<pad>", "n", "i3", "<pad>", "h", "ao3", "<pad>"]
+    The input list itself is never modified.
+    """
+    if filed == "phone":
+        phones = intersperse(phones, blank_token)
+    elif filed == "character":
+        phones = insert_after_character(phones, blank_token)
+    return phones


 def main():
@ -58,29 +103,12 @@ def main():
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
-    args = parser.parse_args()
+        "--add-blank",
+        type=str2bool,
+        default=True,
+        help="whether to add blank between phones")

-    # set logger
-    if args.verbose > 1:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    elif args.verbose > 0:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
-        )
-        logging.warning('Skip DEBUG/INFO messages')
+    args = parser.parse_args()

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
@ -135,13 +163,19 @@ def main():
        else:
            wav_path = wave

-        phone_ids = [vocab_phones[p] for p in item['phones']]
+        phones = item['phones']
+        text_lengths = item['text_lengths']
+        if args.add_blank:
+            phones = add_blank(phones, filed="character")
+            text_lengths = len(phones)
+
+        phone_ids = [vocab_phones[p] for p in phones]
        spk_id = vocab_speaker[item["speaker"]]

        record = {
            "utt_id": item['utt_id'],
            "text": phone_ids,
-            "text_lengths": item['text_lengths'],
+            "text_lengths": text_lengths,
            'feats': str(feats_path),
            "feats_lengths": item['feats_lengths'],
            "wave": str(wav_path),
--- a/paddlespeech/t2s/exps/vits/preprocess.py
+++ b/paddlespeech/t2s/exps/vits/preprocess.py
@ -197,11 +197,6 @@ def main():

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

-    parser.add_argument(
-        "--verbose",
-        type=int,
-        default=1,
-        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

@ -236,10 +231,6 @@ def main():
    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

-    if args.verbose > 1:
-        print(vars(args))
-        print(config)
-
    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
--- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py
@ -23,6 +23,7 @@ from yacs.config import CfgNode
 from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.models.vits import VITS
+from paddlespeech.t2s.utils import str2bool


 def evaluate(args):
@ -55,6 +56,7 @@ def evaluate(args):
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    merge_sentences = False
+    add_blank = args.add_blank

    N = 0
    T = 0
@ -62,7 +64,9 @@ def evaluate(args):
        with timer() as t:
            if args.lang == 'zh':
                input_ids = frontend.get_input_ids(
-                    sentence, merge_sentences=merge_sentences)
+                    sentence,
+                    merge_sentences=merge_sentences,
+                    add_blank=add_blank)
                phone_ids = input_ids["phone_ids"]
            elif args.lang == 'en':
                input_ids = frontend.get_input_ids(
@ -125,6 +129,12 @@ def parse_args():
        help="text to synthesize, a 'utt_id sentence' pair per line.")
    parser.add_argument("--output_dir", type=str, help="output dir.")

+    parser.add_argument(
+        "--add-blank",
+        type=str2bool,
+        default=True,
+        help="whether to add blank between phones")
+
    args = parser.parse_args()
    return args

--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@ -211,13 +211,18 @@ def train_sp(args, config):
        generator_first=config.generator_first,
        output_dir=output_dir)

-    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+    trainer = Trainer(
+        updater,
+        stop_trigger=(config.train_max_steps, "iteration"),
+        out=output_dir)

    if dist.get_rank() == 0:
-        trainer.extend(evaluator, trigger=(1, "epoch"))
-        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
        trainer.extend(
-        Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+            evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+        trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
+    trainer.extend(
+        Snapshot(max_size=config.num_snapshots),
+        trigger=(config.save_interval_steps, 'iteration'))

    print("Trainer Done!")
    trainer.run()
--- a/paddlespeech/t2s/exps/waveflow/preprocess.py
+++ b/paddlespeech/t2s/exps/waveflow/preprocess.py
@ -143,8 +143,6 @@ if __name__ == "__main__":
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
-    parser.add_argument(
-        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
@ -153,8 +151,5 @@ if __name__ == "__main__":
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
-    if args.verbose:
-        print(config.data)
-        print(args)

    create_dataset(config.data, args.input, args.output)
--- a/paddlespeech/t2s/exps/waveflow/synthesize.py
+++ b/paddlespeech/t2s/exps/waveflow/synthesize.py
@ -72,8 +72,6 @@ if __name__ == "__main__":
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
-    parser.add_argument(
-        "-v", "--verbose", action="store_true", help="print msg")

    args = parser.parse_args()
    if args.config:
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@ -29,6 +29,29 @@ from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
 from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer

+INITIALS = [
+    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
+    'r', 'z', 'c', 's', 'j', 'q', 'x',
+    'y', 'w', 'sp', 'spl', 'spn', 'sil',
+]
+
+
+def intersperse(lst, item):
+    """Return [item, x0, item, x1, ..., xn, item] for lst = [x0, ..., xn]."""
+    pairs = ([item, elem] for elem in lst)
+    return [tok for pair in pairs for tok in pair] + [item]
+
+
+def insert_after_character(lst, item):
+    """Prefix `item`, then repeat it after every phone not in INITIALS.
+
+    No check is made that a non-initial phone ends with a tone digit.
+    """
+    padded = [item]
+    for phone in lst:
+        padded += [phone] if phone in INITIALS else [phone, item]
+    return padded
+

 class Frontend():
    def __init__(self,
@ -280,12 +303,15 @@ class Frontend():
            print("----------------------------")
        return phonemes

-    def get_input_ids(self,
+    def get_input_ids(
+            self,
            sentence: str,
            merge_sentences: bool=True,
            get_tone_ids: bool=False,
            robot: bool=False,
-                      print_info: bool=False) -> Dict[str, List[paddle.Tensor]]:
+            print_info: bool=False,
+            add_blank: bool=False,
+            blank_token: str="<pad>") -> Dict[str, List[paddle.Tensor]]:
        phonemes = self.get_phonemes(
            sentence,
            merge_sentences=merge_sentences,
@ -299,6 +325,10 @@ class Frontend():
        for part_phonemes in phonemes:
            phones, tones = self._get_phone_tone(
                part_phonemes, get_tone_ids=get_tone_ids)
+
+            if add_blank:
+                phones = insert_after_character(phones, blank_token)
+
            if tones:
                tone_ids = self._t2id(tones)
                tone_ids = paddle.to_tensor(tone_ids)
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@ -227,11 +227,7 @@ class VITS(nn.Layer):
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
            forward_generator (bool): Whether to forward generator.
        Returns:
-            Dict[str, Any]:
-                - loss (Tensor): Loss scalar tensor.
-                - stats (Dict[str, float]): Statistics to be monitored.
-                - weight (Tensor): Weight tensor to summarize losses.
-                - optim_idx (int): Optimizer index (0 for G and 1 for D).
+        
        """
        if forward_generator:
            return self._forward_generator(