add tts tutorial

4 years ago · 0fcc5005a2
parent e5edc83a43
commit 0fcc5005a2
26 changed files with 2134 additions and 24 deletions
--- a/demos/metaverse/Lamarr.png
+++ b/demos/metaverse/Lamarr.png
--- a/demos/metaverse/path.sh
+++ b/demos/metaverse/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+# Environment setup for this demo; meant to be sourced from the demo directory.
+# MAIN_ROOT is resolved two levels above the current working directory.
+# All expansions are quoted so a checkout path containing spaces still works.
+export MAIN_ROOT="$(realpath "${PWD}/../../")"
+
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+
+# BIN_DIR points at the experiment scripts of the selected TTS model
+MODEL=fastspeech2
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
--- a/demos/metaverse/run.sh
+++ b/demos/metaverse/run.sh
@ -0,0 +1,61 @@
+#!/bin/bash
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# with the following command, you can choice the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+mkdir download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # install PaddleGAN
+    git clone https://github.com/PaddlePaddle/PaddleGAN.git
+    pip install -e PaddleGAN/
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 
+    # download pretrained PaddleGAN model
+    wget -P download https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams
+fi 
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # download pretrained tts models and unzip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -d download download/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 ${BIN_DIR}/synthesize_e2e.py \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text=sentences.txt \
+        --output-dir=output/wavs \
+        --inference-dir=output/inference \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+    # output/inference is not needed here, which save the static models
+    rm -rf output/inference
+fi
+
+if [ ${stage} -le  4 ] && [ ${stop_stage} -ge 4 ]; then
+    # We only test one audio here, cause it's slow
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 PaddleGAN/applications/tools/wav2lip.py \
+        --checkpoint_path download/wav2lip_hq.pdparams \
+        --face Lamarr.png \
+        --audio output/wavs/000.wav \
+        --outfile output/tts_lips.mp4 \
+        --face_enhancement
+fi
--- a/demos/metaverse/sentences.txt
+++ b/demos/metaverse/sentences.txt
@ -0,0 +1 @@
+000 谁知青蛙一落地，竟变成了一位英俊的王子。于是遵照国王的意思，他做了公主的亲密伴侣。
--- a/demos/story_talker/imgs/000.jpg
+++ b/demos/story_talker/imgs/000.jpg
--- a/demos/story_talker/ocr.py
+++ b/demos/story_talker/ocr.py
@ -0,0 +1,74 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import re
+from pathlib import Path
+
+import paddle
+from paddleocr import draw_ocr
+from paddleocr import PaddleOCR
+from PIL import Image
+
+
+def evaluate(args, ocr):
+    img_dir = Path(args.img_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    img_out_dir = output_dir / "imgs"
+    img_out_dir.mkdir(parents=True, exist_ok=True)
+    with open(output_dir / "sentences.txt", "w") as wf:
+        for name in os.listdir(img_dir):
+            id = name.split(".")[0]
+            img_path = img_dir / name
+            result = ocr.ocr(str(img_path), cls=True)
+            # draw result
+            image = Image.open(img_path).convert('RGB')
+            boxes = [line[0] for line in result]
+            txts = [line[1][0] for line in result]
+            scores = [line[1][1] for line in result]
+            im_show = draw_ocr(
+                image, boxes, txts, scores, font_path=args.font_path)
+            im_show = Image.fromarray(im_show)
+            paragraph = "".join(txts)
+            # 过滤出中文结果
+            pattern = re.compile(r'[^(\u4e00-\u9fa5)+，。？、]')
+            sentence = re.sub(pattern, '', paragraph)
+            im_show.save(img_out_dir / name)
+            wf.write(id + " " + sentence + "\n")
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(
+        description="Synthesize with fastspeech2 & parallel wavegan.")
+    parser.add_argument("--img-dir", default="imgs", type=str, help="img_dir.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="output",
+        help="output sentences path.")
+    parser.add_argument(
+        "--font-path", type=str, default="simfang.ttf", help="font path")
+    args = parser.parse_args()
+
+    paddle.set_device("gpu")
+    # need to run only once to download and load model into memory
+    ocr = PaddleOCR(use_angle_cls=True, lang='ch')
+
+    evaluate(args, ocr)
+
+
+if __name__ == "__main__":
+    main()
--- a/demos/story_talker/path.sh
+++ b/demos/story_talker/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+# Environment setup for this demo; meant to be sourced from the demo directory.
+# MAIN_ROOT is resolved two levels above the current working directory.
+# All expansions are quoted so a checkout path containing spaces still works.
+export MAIN_ROOT="$(realpath "${PWD}/../../")"
+
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+
+# BIN_DIR points at the experiment scripts of the selected TTS model
+MODEL=fastspeech2
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
--- a/demos/story_talker/run.sh
+++ b/demos/story_talker/run.sh
@ -0,0 +1,50 @@
+#!/bin/bash
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# with the following command, you can choice the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+mkdir download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # install PaddleOCR
+    pip install "paddleocr>=2.0.1"
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # download pretrained tts models and unzip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -d download download/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # run ocr
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 ocr.py --img-dir=imgs --output-dir=output --font-path=simfang.ttf
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 ${BIN_DIR}/synthesize_e2e.py \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text=output/sentences.txt \
+        --output-dir=output/wavs \
+        --inference-dir=output/inference \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+    # output/inference is not needed here, which save the static models
+    rm -rf output/inference
+fi
--- a/demos/story_talker/simfang.ttf
+++ b/demos/story_talker/simfang.ttf
--- a/demos/style_fs2/path.sh
+++ b/demos/style_fs2/path.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+# Environment setup for this demo; meant to be sourced from the demo directory.
+# MAIN_ROOT is resolved two levels above the current working directory.
+# All expansions are quoted so a checkout path containing spaces still works.
+export MAIN_ROOT="$(realpath "${PWD}/../../")"
+
+export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
+# BIN_DIR points at the experiment scripts of the selected TTS model
+MODEL=fastspeech2
+export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+# with the following command, you can choice the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+mkdir download
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # download pretrained tts models and unzip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
+    unzip -d download download/pwg_baker_ckpt_0.4.zip
+    wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
+    unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # run tts
+    CUDA_VISIBLE_DEVICES=${gpus} \
+    python3 style_syn.py \
+        --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+        --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+        --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+        --fastspeech2-pitch-stat=download/fastspeech2_nosil_baker_ckpt_0.4/pitch_stats.npy \
+        --fastspeech2-energy-stat=download/fastspeech2_nosil_baker_ckpt_0.4/energy_stats.npy \
+        --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output-dir=output \
+        --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+fi
--- a/demos/style_fs2/sentences.txt
+++ b/demos/style_fs2/sentences.txt
@ -0,0 +1 @@
+000 谁知青蛙一落地，竟变成了一位英俊的王子。于是遵照国王的意思，他做了公主的亲密伴侣。
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@ -0,0 +1,284 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
+from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
+from paddlespeech.t2s.models.parallel_wavegan import PWGInference
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+class StyleFastSpeech2Inference(FastSpeech2Inference):
+    def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
+        super().__init__(normalizer, model)
+        self.pitch_mean, self.pitch_std = np.load(pitch_stats_path)
+        self.pitch_mean = paddle.to_tensor(self.pitch_mean)
+        self.pitch_std = paddle.to_tensor(self.pitch_std)
+        self.energy_mean, self.energy_std = np.load(energy_stats_path)
+        self.energy_mean = paddle.to_tensor(self.energy_mean)
+        self.energy_std = paddle.to_tensor(self.energy_std)
+
+    def denorm(self, data, mean, std):
+        return data * std + mean
+
+    def norm(self, data, mean, std):
+        return (data - mean) / std
+
+    def forward(self,
+                text,
+                durations=None,
+                pitch=None,
+                energy=None,
+                robot=False):
+        """
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T,).
+        speech : Tensor, optional
+            Feature sequence to extract style (N, idim).
+        durations : Tensor, optional (int64)
+            Groundtruth of duration (T,) or 
+            float/int (represents ratio)
+        pitch : Tensor, optional
+            Groundtruth of token-averaged pitch (T, 1) or
+            float/int (represents ratio)
+        energy : Tensor, optional
+            Groundtruth of token-averaged energy (T, 1) or 
+            float (represents ratio)
+        robot : bool, optional
+            Weather output robot style
+        Returns
+        ----------
+        Tensor
+            Output sequence of features (L, odim).
+        """
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text, durations=None, pitch=None, energy=None)
+
+        # set duration
+        if isinstance(durations, float):
+            durations = durations * d_outs
+        elif isinstance(durations, paddle.Tensor):
+            durations = durations
+
+        if robot:
+            # set normed pitch to zeros have the same effect with set denormd ones to mean
+            pitch = paddle.zeros(p_outs.shape)
+
+        # set pitch, can overwrite robot set    
+        if isinstance(pitch, (int, float)):
+            p_Hz = paddle.exp(
+                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
+            p_HZ = pitch * p_Hz
+            pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
+        elif isinstance(pitch, paddle.Tensor):
+            pitch = pitch
+
+        # set energy
+        if isinstance(energy, (int, float)):
+            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
+            e_dnorm = energy * e_dnorm
+            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
+        elif isinstance(energy, paddle.Tensor):
+            energy = energy
+
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text,
+            durations=durations,
+            pitch=pitch,
+            energy=energy,
+            use_teacher_forcing=True)
+
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
+
+
+def evaluate(args, fastspeech2_config, pwg_config):
+
+    # construct dataset for evaluation
+    sentences = []
+    with open(args.text, 'rt') as f:
+        for line in f:
+            utt_id, sentence = line.strip().split()
+            sentences.append((utt_id, sentence))
+
+    with open(args.phones_dict, "r") as f:
+        phn_id = [line.strip().split() for line in f.readlines()]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+
+    odim = fastspeech2_config.n_mels
+    model = FastSpeech2(
+        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+
+    model.set_state_dict(
+        paddle.load(args.fastspeech2_checkpoint)["main_params"])
+    model.eval()
+
+    vocoder = PWGGenerator(**pwg_config["generator_params"])
+    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
+    vocoder.remove_weight_norm()
+    vocoder.eval()
+    print("model done!")
+
+    frontend = Frontend(phone_vocab_path=args.phones_dict)
+    print("frontend done!")
+
+    stat = np.load(args.fastspeech2_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    fastspeech2_normalizer = ZScore(mu, std)
+
+    stat = np.load(args.pwg_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    pwg_normalizer = ZScore(mu, std)
+
+    fastspeech2_inference = StyleFastSpeech2Inference(
+        fastspeech2_normalizer, model, args.fastspeech2_pitch_stat,
+        args.fastspeech2_energy_stat)
+    fastspeech2_inference.eval()
+
+    pwg_inference = PWGInference(pwg_normalizer, vocoder)
+    pwg_inference.eval()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    styles = ["normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice"]
+    for style in styles:
+        robot = False
+        durations = None
+        pitch = None
+        energy = None
+
+        if style == "robot":
+            # all tones in phones be `1`
+            # all pitch should be the same, we use mean here
+            robot = True
+        if style == "1.2xspeed":
+            durations = 1 / 1.2
+        if style == "0.8xspeed":
+            durations = 1 / 0.8
+        if style == "child_voice":
+            pitch = 1.3
+        sub_output_dir = output_dir / style
+        sub_output_dir.mkdir(parents=True, exist_ok=True)
+        for utt_id, sentence in sentences:
+            input_ids = frontend.get_input_ids(
+                sentence, merge_sentences=True, robot=robot)
+            phone_ids = input_ids["phone_ids"][0]
+
+            with paddle.no_grad():
+                mel = fastspeech2_inference(
+                    phone_ids,
+                    durations=durations,
+                    pitch=pitch,
+                    energy=energy,
+                    robot=robot)
+                wav = pwg_inference(mel)
+
+            sf.write(
+                str(sub_output_dir / (utt_id + ".wav")),
+                wav.numpy(),
+                samplerate=fastspeech2_config.fs)
+            print(f"{style}_{utt_id} done!")
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(
+        description="Synthesize with fastspeech2 & parallel wavegan.")
+    parser.add_argument(
+        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
+    parser.add_argument(
+        "--fastspeech2-checkpoint",
+        type=str,
+        help="fastspeech2 checkpoint to load.")
+    parser.add_argument(
+        "--fastspeech2-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+    )
+    parser.add_argument(
+        "--fastspeech2-pitch-stat",
+        type=str,
+        help="mean and standard deviation used to normalize pitch when training fastspeech2"
+    )
+    parser.add_argument(
+        "--fastspeech2-energy-stat",
+        type=str,
+        help="mean and standard deviation used to normalize energy when training fastspeech2."
+    )
+    parser.add_argument(
+        "--pwg-config", type=str, help="parallel wavegan config file.")
+    parser.add_argument(
+        "--pwg-checkpoint",
+        type=str,
+        help="parallel wavegan generator parameters to load.")
+    parser.add_argument(
+        "--pwg-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
+    )
+    parser.add_argument(
+        "--phones-dict",
+        type=str,
+        default="phone_id_map.txt",
+        help="phone vocabulary file.")
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
+
+    args = parser.parse_args()
+
+    if args.ngpu == 0:
+        paddle.set_device("cpu")
+    elif args.ngpu > 0:
+        paddle.set_device("gpu")
+    else:
+        print("ngpu should >= 0 !")
+
+    with open(args.fastspeech2_config) as f:
+        fastspeech2_config = CfgNode(yaml.safe_load(f))
+    with open(args.pwg_config) as f:
+        pwg_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(fastspeech2_config)
+    print(pwg_config)
+
+    evaluate(args, fastspeech2_config, pwg_config)
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/tutorial/tts/source/fastpitch.png
+++ b/docs/tutorial/tts/source/fastpitch.png
--- a/docs/tutorial/tts/source/fastspeech2.png
+++ b/docs/tutorial/tts/source/fastspeech2.png
--- a/docs/tutorial/tts/source/ocr.wav
+++ b/docs/tutorial/tts/source/ocr.wav
--- a/docs/tutorial/tts/source/ocr_result.jpg
+++ b/docs/tutorial/tts/source/ocr_result.jpg
--- a/docs/tutorial/tts/source/pwgan.png
+++ b/docs/tutorial/tts/source/pwgan.png
--- a/docs/tutorial/tts/source/signal_pipeline.png
+++ b/docs/tutorial/tts/source/signal_pipeline.png
--- a/docs/tutorial/tts/source/text_frontend_struct.png
+++ b/docs/tutorial/tts/source/text_frontend_struct.png
--- a/docs/tutorial/tts/source/tts_lips.mp4
+++ b/docs/tutorial/tts/source/tts_lips.mp4
--- a/docs/tutorial/tts/source/tts_pipeline.png
+++ b/docs/tutorial/tts/source/tts_pipeline.png
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@ -219,18 +219,45 @@ class Frontend():
    def get_phonemes(self,
                     sentence: str,
                     merge_sentences: bool=True,
-                     with_erhua: bool=True) -> List[List[str]]:
+                     with_erhua: bool=True,
+                     print_info: bool=False,
+                     robot: bool=False) -> List[List[str]]:
        sentences = self.text_normalizer.normalize(sentence)
        phonemes = self._g2p(
            sentences, merge_sentences=merge_sentences, with_erhua=with_erhua)
+        # change all tones to `1`
+        if robot:
+            new_phonemes = []
+            for sentence in phonemes:
+                new_sentence = []
+                for item in sentence:
+                    # `er` only has tone `2`
+                    if item[-1] in "12345" and item != "er2":
+                        item = item[:-1] + "1"
+                    new_sentence.append(item)
+                new_phonemes.append(new_sentence)
+            phonemes = new_phonemes
+        if print_info:
+            print("----------------------------")
+            print("text norm results:")
+            print(sentences)
+            print("----------------------------")
+            print("g2p results:")
+            print(phonemes)
+            print("----------------------------")
        return phonemes

-    def get_input_ids(
-            self,
-            sentence: str,
-            merge_sentences: bool=True,
-            get_tone_ids: bool=False) -> Dict[str, List[paddle.Tensor]]:
-        phonemes = self.get_phonemes(sentence, merge_sentences=merge_sentences)
+    def get_input_ids(self,
+                      sentence: str,
+                      merge_sentences: bool=True,
+                      get_tone_ids: bool=False,
+                      print_info: bool=False,
+                      robot: bool=False) -> Dict[str, List[paddle.Tensor]]:
+        phonemes = self.get_phonemes(
+            sentence,
+            merge_sentences=merge_sentences,
+            print_info=print_info,
+            robot=robot)
        result = {}
        phones = []
        tones = []
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -420,10 +420,9 @@ class FastSpeech2(nn.Layer):

        if is_inference:
            # (B, Tmax)
+            d_outs = self.duration_predictor.inference(hs, d_masks)
            if ds is not None:
                d_outs = ds
-            else:
-                d_outs = self.duration_predictor.inference(hs, d_masks)
            if ps is not None:
                p_outs = ps
            if es is not None:
@ -513,9 +512,9 @@ class FastSpeech2(nn.Layer):
        spembs : Tensor, optional
            Speaker embedding vector (spk_embed_dim,).
        spk_id : Tensor, optional(int64)
-            Speaker embedding vector (spk_embed_dim).
+            Batch of padded spk ids  (1,).
        tone_id : Tensor, optional(int64)
-            Batch of padded tone ids  (B, Tmax).
+            Batch of padded tone ids  (T,).

        Returns
        ----------
@ -526,9 +525,7 @@ class FastSpeech2(nn.Layer):
        x = paddle.cast(text, 'int64')
        y = speech
        spemb = spembs
-        if durations is not None:
-            d = paddle.cast(durations, 'int64')
-        p, e = pitch, energy
+        d, p, e = durations, pitch, energy
        # setup batch axis
        ilens = paddle.shape(x)[0]

@ -539,8 +536,9 @@ class FastSpeech2(nn.Layer):

        if spemb is not None:
            spembs = spemb.unsqueeze(0)
-        else:
-            spembs = None
+
+        if tone_id is not None:
+            tone_id = tone_id.unsqueeze(0)

        if use_teacher_forcing:
            # use groundtruth of duration, pitch, and energy
@ -549,7 +547,7 @@ class FastSpeech2(nn.Layer):
            es = e.unsqueeze(0) if e is not None else None
            # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
            # (1, L, odim)
-            _, outs, d_outs, *_ = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._forward(
                xs,
                ilens,
                ys,
@ -562,7 +560,7 @@ class FastSpeech2(nn.Layer):
                is_inference=True)
        else:
            # (1, L, odim)
-            _, outs, d_outs, *_ = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._forward(
                xs,
                ilens,
                ys,
@ -571,8 +569,7 @@ class FastSpeech2(nn.Layer):
                spembs=spembs,
                spk_id=spk_id,
                tone_id=tone_id)
-
-        return outs[0]
+        return outs[0], d_outs[0], p_outs[0], e_outs[0]

    def _integrate_with_spk_embed(self, hs, spembs):
        """Integrate speaker embedding with hidden states.
@ -683,7 +680,8 @@ class FastSpeech2Inference(nn.Layer):
        self.acoustic_model = model

    def forward(self, text, spk_id=None):
-        normalized_mel = self.acoustic_model.inference(text, spk_id=spk_id)
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text, spk_id=spk_id)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel

--- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
-import logging
-
 from paddle import nn

 from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
@ -122,7 +120,6 @@ class Encoder(nn.Layer):
                "rel_selfattn",
                "legacy_rel_selfattn",
        ]:
-            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = [
                (attention_heads, attention_dim, attention_dropout_rate, )
				`@ -0,0 +1 @@`
				`000 谁知青蛙一落地，竟变成了一位英俊的王子。于是遵照国王的意思，他做了公主的亲密伴侣。`