Merge pull request #992 from yt605155624/fix_docs

[TTS] add tts tutorial
Hui Zhang 3 years ago committed by GitHub
commit fe29f74a1c

Binary file not shown (image, 441 KiB).

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,61 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# With the following options you can choose the stage range to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# install PaddleGAN
git clone https://github.com/PaddlePaddle/PaddleGAN.git
pip install -e PaddleGAN/
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained PaddleGAN model
wget -P download https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=sentences.txt \
--output-dir=output/wavs \
--inference-dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
    # output/inference (which saves the exported static models) is not needed here
rm -rf output/inference
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # We only test one audio clip here because Wav2Lip inference is slow
CUDA_VISIBLE_DEVICES=${gpus} \
python3 PaddleGAN/applications/tools/wav2lip.py \
--checkpoint_path download/wav2lip_hq.pdparams \
--face Lamarr.png \
--audio output/wavs/000.wav \
--outfile output/tts_lips.mp4 \
--face_enhancement
fi

@ -0,0 +1 @@
000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。

Binary file not shown (image, 1.5 MiB).

@ -0,0 +1,74 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re
from pathlib import Path
import paddle
from paddleocr import draw_ocr
from paddleocr import PaddleOCR
from PIL import Image
def evaluate(args, ocr):
img_dir = Path(args.img_dir)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
img_out_dir = output_dir / "imgs"
img_out_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "sentences.txt", "w") as wf:
for name in os.listdir(img_dir):
            utt_id = name.split(".")[0]  # use the image file stem as the utterance id
img_path = img_dir / name
result = ocr.ocr(str(img_path), cls=True)
# draw result
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(
image, boxes, txts, scores, font_path=args.font_path)
im_show = Image.fromarray(im_show)
paragraph = "".join(txts)
            # keep only Chinese characters and common Chinese punctuation in the OCR result
pattern = re.compile(r'[^(\u4e00-\u9fa5)+,。?、]')
sentence = re.sub(pattern, '', paragraph)
im_show.save(img_out_dir / name)
            wf.write(utt_id + " " + sentence + "\n")
def main():
    # parse args, then run OCR and write the recognized sentences
parser = argparse.ArgumentParser(
description="Synthesize with fastspeech2 & parallel wavegan.")
parser.add_argument("--img-dir", default="imgs", type=str, help="img_dir.")
parser.add_argument(
"--output-dir",
type=str,
default="output",
help="output sentences path.")
parser.add_argument(
"--font-path", type=str, default="simfang.ttf", help="font path")
args = parser.parse_args()
paddle.set_device("gpu")
# need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True, lang='ch')
evaluate(args, ocr)
if __name__ == "__main__":
main()

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,50 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# With the following options you can choose the stage range to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# install PaddleOCR
pip install "paddleocr>=2.0.1"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# run ocr
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ocr.py --img-dir=imgs --output-dir=output --font-path=simfang.ttf
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=output/sentences.txt \
--output-dir=output/wavs \
--inference-dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
    # output/inference (which saves the exported static models) is not needed here
rm -rf output/inference
fi

Binary file not shown.

@ -0,0 +1,12 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,38 @@
#!/bin/bash
source path.sh
gpus=0
stage=0
stop_stage=100
# With the following options you can choose the stage range to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p download
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# download pretrained tts models and unzip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip
unzip -d download download/pwg_baker_ckpt_0.4.zip
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# run tts
CUDA_VISIBLE_DEVICES=${gpus} \
python3 style_syn.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--fastspeech2-pitch-stat=download/fastspeech2_nosil_baker_ckpt_0.4/pitch_stats.npy \
--fastspeech2-energy-stat=download/fastspeech2_nosil_baker_ckpt_0.4/energy_stats.npy \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=output \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

@ -0,0 +1 @@
000 谁知青蛙一落地,竟变成了一位英俊的王子。于是遵照国王的意思,他做了公主的亲密伴侣。

@ -0,0 +1,328 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import Union
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
class StyleFastSpeech2Inference(FastSpeech2Inference):
def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
super().__init__(normalizer, model)
pitch_mean, pitch_std = np.load(pitch_stats_path)
self.pitch_mean = paddle.to_tensor(pitch_mean)
self.pitch_std = paddle.to_tensor(pitch_std)
energy_mean, energy_std = np.load(energy_stats_path)
self.energy_mean = paddle.to_tensor(energy_mean)
self.energy_std = paddle.to_tensor(energy_std)
def denorm(self, data, mean, std):
return data * std + mean
def norm(self, data, mean, std):
return (data - mean) / std
def forward(self,
text: paddle.Tensor,
durations: Union[paddle.Tensor, np.ndarray]=None,
durations_scale: Union[int, float]=None,
durations_bias: Union[int, float]=None,
pitch: Union[paddle.Tensor, np.ndarray]=None,
pitch_scale: Union[int, float]=None,
pitch_bias: Union[int, float]=None,
energy: Union[paddle.Tensor, np.ndarray]=None,
energy_scale: Union[int, float]=None,
energy_bias: Union[int, float]=None,
                robot: bool=False):
        """
        Parameters
        ----------
        text : Tensor(int64)
            Input sequence of phone ids (T,).
        durations : paddle.Tensor/np.ndarray, optional (int64)
            Ground-truth durations (T,); if given, it overrides durations_scale and durations_bias.
        durations_scale : int/float, optional
            Scale factor applied to the predicted durations.
        durations_bias : int/float, optional
            Bias added to the predicted durations.
        pitch : paddle.Tensor/np.ndarray, optional
            Ground-truth token-averaged pitch (T, 1); if given, it overrides pitch_scale and pitch_bias.
        pitch_scale : int/float, optional
            Scale factor in the denormalized Hz domain.
        pitch_bias : int/float, optional
            Bias in the denormalized Hz domain.
        energy : paddle.Tensor/np.ndarray, optional
            Ground-truth token-averaged energy (T, 1); if given, it overrides energy_scale and energy_bias.
        energy_scale : int/float, optional
            Scale factor in the denormalized domain.
        energy_bias : int/float, optional
            Bias in the denormalized domain.
        robot : bool, optional
            Whether to output a robot-style timbre.
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, durations=None, pitch=None, energy=None)
# priority: groundtruth > scale/bias > previous output
# set durations
if isinstance(durations, np.ndarray):
durations = paddle.to_tensor(durations)
elif isinstance(durations, paddle.Tensor):
durations = durations
elif durations_scale or durations_bias:
durations_scale = durations_scale if durations_scale is not None else 1
durations_bias = durations_bias if durations_bias is not None else 0
durations = durations_scale * d_outs + durations_bias
else:
durations = d_outs
if robot:
            # setting the normalized pitch to zeros has the same effect as
            # setting the denormalized pitch to its mean
            pitch = paddle.zeros(p_outs.shape)
        # set pitch; this can override the robot setting above
if isinstance(pitch, np.ndarray):
pitch = paddle.to_tensor(pitch)
elif isinstance(pitch, paddle.Tensor):
pitch = pitch
elif pitch_scale or pitch_bias:
pitch_scale = pitch_scale if pitch_scale is not None else 1
pitch_bias = pitch_bias if pitch_bias is not None else 0
            p_Hz = paddle.exp(
                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
            p_Hz = pitch_scale * p_Hz + pitch_bias
            pitch = self.norm(
                paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
else:
pitch = p_outs
# set energy
if isinstance(energy, np.ndarray):
energy = paddle.to_tensor(energy)
elif isinstance(energy, paddle.Tensor):
energy = energy
elif energy_scale or energy_bias:
energy_scale = energy_scale if energy_scale is not None else 1
energy_bias = energy_bias if energy_bias is not None else 0
e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
e_dnorm = energy_scale * e_dnorm + energy_bias
energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
else:
energy = e_outs
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text,
durations=durations,
pitch=pitch,
energy=energy,
use_teacher_forcing=True)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
def evaluate(args, fastspeech2_config, pwg_config):
# construct dataset for evaluation
sentences = []
with open(args.text, 'rt') as f:
for line in f:
utt_id, sentence = line.strip().split()
sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
print("model done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
stat = np.load(args.fastspeech2_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
stat = np.load(args.pwg_stat)
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
fastspeech2_inference = StyleFastSpeech2Inference(
fastspeech2_normalizer, model, args.fastspeech2_pitch_stat,
args.fastspeech2_energy_stat)
fastspeech2_inference.eval()
pwg_inference = PWGInference(pwg_normalizer, vocoder)
pwg_inference.eval()
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
styles = ["normal", "robot", "1.2xspeed", "0.8xspeed", "child_voice"]
for style in styles:
robot = False
durations = None
durations_scale = None
durations_bias = None
pitch = None
pitch_scale = None
pitch_bias = None
energy = None
energy_scale = None
energy_bias = None
if style == "robot":
            # all tones in the phones are set to `1`
            # all pitch values should be the same, so we use the mean here
robot = True
if style == "1.2xspeed":
durations_scale = 1 / 1.2
if style == "0.8xspeed":
durations_scale = 1 / 0.8
if style == "child_voice":
pitch_scale = 1.3
sub_output_dir = output_dir / style
sub_output_dir.mkdir(parents=True, exist_ok=True)
for utt_id, sentence in sentences:
input_ids = frontend.get_input_ids(
sentence, merge_sentences=True, robot=robot)
phone_ids = input_ids["phone_ids"][0]
with paddle.no_grad():
mel = fastspeech2_inference(
phone_ids,
durations=durations,
durations_scale=durations_scale,
durations_bias=durations_bias,
pitch=pitch,
pitch_scale=pitch_scale,
pitch_bias=pitch_bias,
energy=energy,
energy_scale=energy_scale,
energy_bias=energy_bias,
robot=robot)
wav = pwg_inference(mel)
sf.write(
str(sub_output_dir / (utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{style}_{utt_id} done!")
def main():
    # parse args and config, then synthesize with the different styles
parser = argparse.ArgumentParser(
description="Synthesize with fastspeech2 & parallel wavegan.")
parser.add_argument(
"--fastspeech2-config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--fastspeech2-checkpoint",
type=str,
help="fastspeech2 checkpoint to load.")
parser.add_argument(
"--fastspeech2-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
)
parser.add_argument(
"--fastspeech2-pitch-stat",
type=str,
help="mean and standard deviation used to normalize pitch when training fastspeech2"
)
parser.add_argument(
"--fastspeech2-energy-stat",
type=str,
help="mean and standard deviation used to normalize energy when training fastspeech2."
)
parser.add_argument(
"--pwg-config", type=str, help="parallel wavegan config file.")
parser.add_argument(
"--pwg-checkpoint",
type=str,
help="parallel wavegan generator parameters to load.")
parser.add_argument(
"--pwg-stat",
type=str,
help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
)
parser.add_argument(
"--phones-dict",
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument("--verbose", type=int, default=1, help="verbose.")
args = parser.parse_args()
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
        print("ngpu should be >= 0 !")
with open(args.fastspeech2_config) as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
with open(args.pwg_config) as f:
pwg_config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(fastspeech2_config)
print(pwg_config)
evaluate(args, fastspeech2_config, pwg_config)
if __name__ == "__main__":
main()

@ -13,7 +13,7 @@ In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
</p>
@ -160,7 +160,7 @@ The deepspeech2 offline model is similar to the deepspeech2 online model. The
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
</p>

@ -1,3 +1,4 @@
# Released Models
## Speech-To-Text Models
@ -28,28 +29,29 @@ Language Model | Training Data | Token-based | Size | Descriptions
## Text-To-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
Model Type | Dataset | Example Link | Pretrained Models | Static Models | Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)|||
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|||
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip.](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)|||
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB|
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)|||
|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB|
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
:-------------:| :------------:| :-----: | :-----:
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

@ -642,8 +642,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
Multi-Speaker TTS
-------------------
PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset.
PaddleSpeech also supports Multi-Speaker TTS. Below are audio demos generated by FastSpeech2 + ParallelWaveGAN trained on the AISHELL-3 Multi-Speaker TTS dataset; each row is a different speaker.
.. raw:: html
@ -651,19 +650,381 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
<th align="center"> Origin </th>
<th align="center"> Target Timbre </th>
<th align="center"> Generated </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/0.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/0_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/1.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/1_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/2.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/2_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/3.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/3_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/4.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/4_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/5.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/5_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/6.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/6_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/7.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/7_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/8.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/8_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/9.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/9_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/10.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/10_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/11.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/11_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/12.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/12_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/13.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/13_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/14.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/14_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/15.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/15_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/16.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/16_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/17.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/17_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/18.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/18_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/19.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/19_002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>
<br>
<br>
Duration control in FastSpeech2
Style control in FastSpeech2
--------------------------------------
In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``, we provide the audio demos of duration control here. ``duration`` means the duration of phonemes, when we reduce duration, the speed of audios will increase, and when we incerase ``duration``, the speed of audios will reduce.
In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``.
We provide audio demos of duration control here. ``duration`` means the duration of phonemes: when we reduce ``duration``, the speech speeds up, and when we increase ``duration``, the speech slows down.
The ``duration`` of different phonemes in a sentence can use different scale ratios (e.g. when you want to slow down one word while keeping the speed of the others). Here we use a fixed scale ratio for all phonemes to control the ``speed`` of the audio.
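
As a concrete illustration, here is a minimal sketch of how a fixed duration scale is applied. It is based on the ``StyleFastSpeech2Inference`` helper from the ``style_syn.py`` example added with these demos; ``fastspeech2_inference``, ``pwg_inference`` and ``phone_ids`` are assumed to be set up as in that script.

.. code-block:: python

    # 1.2x speed: shorten every predicted phoneme duration by a factor of 1/1.2
    mel = fastspeech2_inference(phone_ids, durations_scale=1 / 1.2)
    # 0.8x speed: lengthen every predicted phoneme duration by a factor of 1/0.8
    mel = fastspeech2_inference(phone_ids, durations_scale=1 / 0.8)
    wav = pwg_inference(mel)
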
@ -892,6 +1253,174 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
<br>
<br>
We provide audio demos of pitch control here.
When we set the pitch of a sentence to its mean value and set the ``tones`` of its phones to ``1``, we get a ``robot-style`` timbre.
When we raise the pitch of an adult female voice (with a fixed scale ratio), we get a ``child-style`` timbre.
The ``pitch`` of different phonemes in a sentence can also use different scale ratios.
The normal audios are in the second column of the previous table.
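
For reference, a minimal sketch of the two styles above, again using the ``StyleFastSpeech2Inference`` helper from the ``style_syn.py`` example (``frontend``, ``fastspeech2_inference`` and ``sentence`` are assumed to be initialized as in that script).

.. code-block:: python

    # robot style: the text frontend sets all tones to `1` and the predicted
    # pitch is flattened to its mean (normalized pitch set to zeros)
    phone_ids = frontend.get_input_ids(
        sentence, merge_sentences=True, robot=True)["phone_ids"][0]
    mel = fastspeech2_inference(phone_ids, robot=True)

    # child style: raise the denormalized pitch by a fixed ratio
    mel = fastspeech2_inference(phone_ids, pitch_scale=1.3)
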
.. raw:: html
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Robot </th>
<th align="center"> Child </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>
<br>
<br>
Chinese TTS with/without text frontend
--------------------------------------

@ -6,4 +6,4 @@ Model | Generator Loss |Discriminator Loss
Parallel Wave GAN| adversarial loss <br> Feature Matching | Multi-Scale Discriminator |
Mel GAN |adversarial loss <br> Multi-resolution STFT loss | adversarial loss|
Multi-Band Mel GAN | adversarial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|
HiFi GAN |adversial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminato |
HiFi GAN |adversarial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator|

@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures.
- Acoustic decoder (N Frames -> N Frames).
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
</div>
- Sequence to sequence acoustic model:
- M Tokens -> N Frames.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
</div>
### Tacotron2
@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures.
- CBHG postprocess.
- Vocoder: Griffin-Lim.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
</div>
**Advantage of Tacotron:**
@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures.
- The alignment matrix of previous time is considered at the step `t` of decoder.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
</div>
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0).
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0).
### TransformerTTS
**Disadvantages of the Tacotrons:**
@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer.
- Positional Encoding.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer.png" width=500 /> <br>
</div>
#### Transformer TTS
@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- Uniform scale position encoding may have a negative impact on input or output sequences.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
</div>
**Disadvantages of Transformer TTS:**
@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- The ability to perceive local information is weak, and local information is more related to pronunciation.
- Stability is worse than Tacotron2.
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1).
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1).
### FastSpeech2
@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
• Can be generated in parallel (decoding time is less affected by sequence length)
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
</div>
#### FastPitch
[FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
</div>
#### FastSpeech2
@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
</div>
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3), We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3). We use the token-averaged pitch and energy values introduced in FastPitch rather than the frame-level ones in FastSpeech2.
### SpeedySpeech
[SpeedySpeech](https://arxiv.org/abs/2008.03802) simplifies the teacher-student architecture of FastSpeech and provides a fast and stable training procedure.
@ -223,10 +223,10 @@ You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example
- Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
</div>
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2).
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2).
## Vocoders
In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle
- It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smaller than WaveGlow (87.9M).
- It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
### Parallel WaveGAN
[Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
@ -289,7 +289,7 @@ You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examp
- Multi-resolution STFT loss.
<div align="left">
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/pwg.png" width=600 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/PaddleSpeech/develop/docs/images/pwg.png" width=600 /> <br>
</div>
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1).
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1).
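
Below is a minimal sketch of running the pretrained Parallel WaveGAN generator as a vocoder. It follows the `style_syn.py` example added in this repository and assumes the `pwg_baker_ckpt_0.4` checkpoint has been downloaded and unzipped, and that `mel` is a log-mel spectrogram produced by an acoustic model:

```python
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator, PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore

# build the generator from the training config and load the pretrained weights
with open("pwg_baker_ckpt_0.4/pwg_default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**config["generator_params"])
vocoder.set_state_dict(
    paddle.load("pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz")["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()

# wrap the generator with the mel normalizer computed during vocoder training
mu, std = np.load("pwg_baker_ckpt_0.4/pwg_stats.npy")
pwg_inference = PWGInference(ZScore(paddle.to_tensor(mu), paddle.to_tensor(std)), vocoder)

with paddle.no_grad():
    wav = pwg_inference(mel)  # mel: log-mel spectrogram from the acoustic model
```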

@ -18,7 +18,7 @@ The models in PaddleSpeech TTS have the following mapping relationship:
## Quick Start
Let's take a FastSpeech2 + Parallel WaveGAN with CSMSC dataset for instance. (./examples/csmsc/)(https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc)
Let's take FastSpeech2 + Parallel WaveGAN with the CSMSC dataset as an example: [./examples/csmsc/](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc)
### Train Parallel WaveGAN with CSMSC
- Go to directory

@ -1,5 +1,5 @@
# Chinese Rule Based Text Frontend
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see exapmle in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend).
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS, see the example in [examples/other/text_frontend/](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/text_frontend).
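
As a quick illustration, here is a sketch based on the `Frontend` class used by the `style_syn.py` example in this repository (the input sentence and the `phone_id_map.txt` path are placeholders). The text frontend converts raw Chinese text into phone IDs for the acoustic model:

```python
from paddlespeech.t2s.frontend.zh_frontend import Frontend

# phone_id_map.txt is shipped with the pretrained FastSpeech2 checkpoints
frontend = Frontend(phone_vocab_path="phone_id_map.txt")

input_ids = frontend.get_input_ids("你好,欢迎使用飞桨。", merge_sentences=True)
phone_ids = input_ids["phone_ids"][0]  # paddle.Tensor of phone IDs, fed to the acoustic model
```
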
A text frontend module mainly includes:
- Text Segmentation

Binary file not shown (image, 47 KiB).

Binary file not shown (image, 117 KiB).

Binary file not shown (image, 1.5 MiB).

Binary file not shown.

Binary file not shown (image, 108 KiB).

Binary file not shown (image, 224 KiB).

Binary file not shown (image, 1.5 MiB).

Binary file not shown (image, 581 KiB).

Binary file not shown (image, 368 KiB).

@ -0,0 +1,958 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Story Talker"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recognize text in an image with PaddleOCR"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from PIL import Image\n",
"img_path = 'source/frog_prince.jpg'\n",
"im = Image.open(img_path)\n",
"im.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Audio synthesized with TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import IPython.display as dp\n",
"dp.Audio(\"source/ocr.wav\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>For the full implementation, please refer to: https://github.com/DeepSpeech/demos/story_talker/run.sh</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The Metaverse is coming: build your own virtual human!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Lip-synced video generated with PaddleGAN"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from IPython.display import HTML\n",
"html_str = '''\n",
"<video controls width=\"650\" height=\"365\" src=\"{}\">animation</video>\n",
"'''.format(\"output/tts_lips.mp4\")\n",
"dp.display(HTML(html_str))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>For the full implementation, please refer to: https://github.com/DeepSpeech/demos/metaverse/run.sh</font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction\n",
"<br></br>\n",
"In recent years, with advances in deep learning algorithms and increasingly rich hardware resources, **Text-To-Speech (TTS)** technology has been widely used in intelligent voice assistants, virtual entertainment, and other fields. This tutorial combines the necessary background knowledge with hands-on practice, so that you can complete a TTS task with PaddlePaddle and combine it with Optical Character Recognition (OCR) and Natural Language Processing (NLP) to \"listen\" to a book or make a celebrity speak.\n",
"\n",
"<br></br>\n",
"## Background Knowledge\n",
"<br></br>\n",
"To better understand the elements of a TTS task, we first briefly review the history of TTS. If you are already familiar with it, or want to get to the code as soon as possible, skip to Chapter 2.\n",
"\n",
"<br></br>\n",
"### Definition\n",
"<br></br>\n",
"<!----\n",
"Note: \n",
"1. This sentence is adapted from [Mu Li, Dive into Deep Learning](https://zh-v2.d2l.ai/chapter_introduction/index.html)\n",
"2. Revised with reference to A Survey on Neural Speech Synthesis.\n",
"---> \n",
"<font size=4> Text-to-speech, also known as speech synthesis, converts a piece of text into the corresponding audio according to given requirements, which means that the output is much longer than the input. TTS is an interdisciplinary task that involves linguistics, acoustics, digital signal processing, and machine learning. Although recognizing the content of a low-quality audio file is easy for humans, it is far from easy for computers.\n",
"</font>\n",
"\n",
"> Note: could a reference be provided here? 2021/11/09\n",
"<br></br>\n",
"\n",
"Depending on the application, speech synthesis research in a broader sense includes:\n",
"- <font size=4>Voice Transformation/Conversion</font>\n",
" - Speaker conversion\n",
" - Speech-to-Singing conversion\n",
" - Emotion conversion\n",
" - Accent conversion\n",
"- <font size=4>Singing Synthesis</font>\n",
" - <font size=4>Text/Lyric-to-Singing conversion</font>\n",
"- <font size=4>Visual Speech Synthesis</font>\n",
"\n",
"<br></br>\n",
"### History\n",
"<br></br>\n",
"<!--\n",
"The following is adapted from Wikipedia https://en.wikipedia.org/wiki/Speech_synthesis\n",
"--->\n",
"#### Mechanical speech synthesis (19th century and earlier)\n",
"Before the Second Industrial Revolution, speech synthesis was mainly mechanical phoneme synthesis. In 1779 the German-Danish scientist Christian Gottlieb Kratzenstein built a model of the human vocal tract that could produce five long vowels. In 1791 Wolfgang von Kempelen added models of the lips and tongue, enabling it to produce consonants as well as vowels.\n",
"#### Electronic speech synthesis (1930s)\n",
"Bell Labs invented the vocoder in the 1930s, automatically decomposing speech into pitch and resonance. Homer Dudley refined this technology into a keyboard-operated synthesizer, which was exhibited at the 1939 New York World's Fair.\n",
"#### Computer-based speech synthesis\n",
"The first computer-based speech synthesis systems appeared in the 1950s. In 1961, John Larry Kelly and Louis Gerstman used an IBM 704 computer to synthesize speech, which became one of Bell Labs' most famous achievements. In 1975, one of the first-generation synthesis systems, MUSA (MUltichannel Speaking Automation), was released; it consisted of stand-alone hardware and accompanying software, and a second version released in 1978 could also sing a cappella. In the 1990s the mainstream approach combined systems from MIT and Bell Labs with natural language processing models.\n",
"> Note: insert a timeline figure here\n",
"#### Current mainstream approaches\n",
"\n",
"- <font size=4>Statistical parametric speech synthesis</font>\n",
" - <font size=4>Hidden Markov Model (HMM)</font>\n",
" - <font size=4>Deep Neural Network (DNN)</font>\n",
"- <font size=4>Concatenative (unit-selection) speech synthesis</font>\n",
" \n",
"- <font size=4>Hybrid approaches</font>\n",
" - <font size=4>Parameter-trajectory-guided unit selection</font>\n",
"- <font size=4>End-to-end neural speech synthesis</font>\n",
" - <font size=4>Acoustic model + vocoder</font>\n",
" - <font size=4>\"Fully\" end-to-end approaches</font>\n",
"\n",
"<br></br>\n",
"## Deep-learning-based speech synthesis\n",
"<br></br>\n",
"### Speech synthesis basics\n",
"<br></br>\n",
"![Signal processing pipeline](source/signal_pipeline.png)\n",
"<br></br>\n",
"<font size=4>A TTS pipeline consists of three main modules: a <font color=\"#ff0000\">**Text Frontend**</font>, an <font color=\"#ff0000\">**Acoustic Model**</font>, and a <font color=\"#ff0000\">**Vocoder**</font>:</font>\n",
"- <font size=4>The text frontend converts raw text into characters/phonemes.</font>\n",
"- <font size=4>The acoustic model converts characters/phonemes into acoustic features, such as linear spectrograms, mel spectrograms, or LPC features.</font>\n",
"- <font size=4>The vocoder converts the acoustic features into a waveform.</font>\n",
"<br></br>\n",
"<img style=\"float: center;\" src=\"source/tts_pipeline.png\" width=\"85%\"/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 实践\n",
"<br></br>\n",
"<font size=4>环境安装请参考: https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md</font>\n",
"\n",
"<br></br>\n",
"\n",
"<font size=4>使用 **PaddleSpeech** 提供的预训练模型合成一句中文。</font>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 0 准备"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 获取预训练模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!mkdir download\n",
"!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip\n",
"!unzip -d download download/pwg_baker_ckpt_0.4.zip\n",
"!wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip\n",
"!unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 查看预训练模型的结构"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!tree download/pwg_baker_ckpt_0.4\n",
"!tree download/fastspeech2_nosil_baker_ckpt_0.4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 导入 Python 包"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import logging\n",
"import sys\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"# PaddleSpeech 项目根目录放到 python 路径中\n",
"sys.path.insert(0,\"../../../\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"from pathlib import Path\n",
"import IPython.display as dp\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import paddle\n",
"import soundfile as sf\n",
"import yaml\n",
"from paddlespeech.t2s.frontend.zh_frontend import Frontend\n",
"from paddlespeech.t2s.models.fastspeech2 import FastSpeech2\n",
"from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference\n",
"from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator\n",
"from paddlespeech.t2s.models.parallel_wavegan import PWGInference\n",
"from paddlespeech.t2s.modules.normalizer import ZScore\n",
"from yacs.config import CfgNode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 设置预训练模型的路径"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"fastspeech2_config = \"download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml\"\n",
"fastspeech2_checkpoint = \"download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz\"\n",
"fastspeech2_stat = \"download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy\"\n",
"pwg_config = \"download/pwg_baker_ckpt_0.4/pwg_default.yaml\"\n",
"pwg_checkpoint = \"download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz\"\n",
"pwg_stat = \"download/pwg_baker_ckpt_0.4/pwg_stats.npy\"\n",
"phones_dict = \"download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt\"\n",
"# 读取 conf 文件并结构化\n",
"with open(fastspeech2_config) as f:\n",
" fastspeech2_config = CfgNode(yaml.safe_load(f))\n",
"with open(pwg_config) as f:\n",
" pwg_config = CfgNode(yaml.safe_load(f))\n",
"print(\"========Config========\")\n",
"print(fastspeech2_config)\n",
"print(\"---------------------\")\n",
"print(pwg_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step1 文本前端\n",
"<br></br>\n",
"\n",
"<font size=4>一个文本前端模块主要包含</font>:\n",
"- <font size=4>分段Text Segmentation</font>\n",
"\n",
"- <font size=4>文本正则化Text Normalization, TN</font>\n",
"\n",
"- <font size=4>分词Word Segmentation, 主要是在中文中)</font>\n",
"\n",
"- <font size=4>词性标注Part-of-Speech, PoS</font>\n",
"- <font size=4>韵律预测Prosody</font>\n",
"- <font size=4>字音转换Grapheme-to-PhonemeG2P</font>\n",
"<br></br>\n",
"<font size=2>Grapheme: **语言**书写系统的最小有意义单位; Phoneme: 区分单词的最小**语音**单位)</font>\n",
" - <font size=4>多音字Polyphone</font>\n",
" - <font size=4>变调Tone Sandhi</font>\n",
" - <font size=4>“一”、“不”变调</font>\n",
" - <font size=4>三声变调</font>\n",
" - <font size=4>轻声变调</font>\n",
" - <font size=4>儿化音</font>\n",
" - <font size=4>方言</font>\n",
"- ...\n",
"<br></br>\n",
"\n",
"<font size=4>(输入给声学模型之前,还需要把音素序列转换为 id</font>\n",
"\n",
"<br></br>\n",
"<font size=4>其中最重要的模块是<font color=\"#ff0000\"> 文本正则化 </font>模块和<font color=\"#ff0000\"> 字音转换TTS 中更常用 G2P代指 </font>模块。</font>\n",
"\n",
"<br></br>\n",
"\n",
"<font size=4>各模块输出示例:</font>\n",
"```text\n",
"• Text: 全国一共有112所211高校\n",
"• Text Normalization: 全国一共有一百一十二所二一一高校\n",
"• Word Segmentation: 全国/一共/有/一百一十二/所/二一一/高校/\n",
"• G2P注意此句中“一”的读音:\n",
" quan2 guo2 yi2 gong4 you3 yi4 bai3 yi1 shi2 er4 suo3 er4 yao1 yao1 gao1 xiao4\n",
" (可以进一步把声母和韵母分开)\n",
" q uan2 g uo2 y i2 g ong4 y ou3 y i4 b ai3 y i1 sh i2 er4 s uo3 er4 y ao1 y ao1 g ao1 x iao4\n",
" (把音调和声韵母分开)\n",
" q uan g uo y i g ong y ou y i b ai y i sh i er s uo er y ao y ao g ao x iao\n",
" 0 2 0 2 0 2 0 4 0 3 ...\n",
"• Prosody (prosodic words #1, prosodic phrases #2, intonation phrases #3, sentence #4):\n",
" 全国#2一共有#2一百#1一十二所#2二一一#1高校#4\n",
" (分词的结果一般是固定的,但是不同人习惯不同,可能有不同的韵律)\n",
"```\n",
"\n",
"<br></br>\n",
"<font size=4>文本前端模块的设计需要融入很多专业的或经验性的知识,人类在读文本的时候可以自然而然地读出正确的发音,但是这些计算机都是不知道的!</font>\n",
"\n",
"<br></br>\n",
"<font size=4>分词:</font>\n",
"```text\n",
"我也想过过过儿过过的生活\n",
"我也想/过过/过儿/过过的/生活\n",
"\n",
"货拉拉拉不拉拉布拉多\n",
"货拉拉/拉不拉/拉布拉多\n",
"\n",
"南京市长江大桥\n",
"南京市长/江大桥\n",
"南京市/长江大桥\n",
"```\n",
"<font size=4>变调和儿化音:</font>\n",
"```\n",
"你要不要和我们一起出去玩?\n",
"你要不2声要和我们一4声起出去玩\n",
"\n",
"不好,我要一个人出去。\n",
"不4声我要一2声个人出去。\n",
"\n",
"(以下每个词的所有字都是三声的,请你读一读,体会一下在读的时候,是否每个字都被读成了三声?)\n",
"纸老虎、虎骨酒、展览馆、岂有此理、手表厂有五种好产品\n",
"```\n",
"<font size=4>多音字(通常需要先正确分词):</font>\n",
"```text\n",
"人要行,干一行行一行,一行行行行行;\n",
"人要是不行,干一行不行一行,一行不行行行不行。\n",
"\n",
"佟大为妻子产下一女\n",
"\n",
"海水朝朝朝朝朝朝朝落\n",
"浮云长长长长长长长消\n",
"```\n",
"<br></br>\n",
"\n",
"<font size=4>PaddleSpeech TTS 文本前端解决方案:</font>\n",
"- <font size=4>文本正则: 规则</font>\n",
"- <font size=4>G2P:</font>\n",
" - <font size=4>多音字模块: pypinyin/g2pM</font>\n",
" - <font size=4>变调模块: 用分词 + 规则</font>\n",
"\n",
"<br></br>\n",
"<font size=4>相关 examples:\n",
" \n",
"https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tn\n",
"https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/g2p</font>\n",
"\n",
"<br></br>\n",
"<font size=4>(未来计划推出基于深度学习的文本前端模块)</font>"
]
},
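{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before building the full PaddleSpeech frontend, here is a minimal, illustrative G2P sketch based only on `pypinyin` (one of the libraries listed above). It is a rough approximation, not the PaddleSpeech implementation: raw pinyin output does not apply the tone-sandhi or “一/不” rules described above, which is exactly why the rule-based modules are needed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Illustrative sketch only; assumes pypinyin is installed (it is a dependency of the PaddleSpeech text frontend).\n",
"from pypinyin import lazy_pinyin, Style\n",
"text = \"全国一共有一百一十二所二一一高校\"\n",
"# Style.TONE3 appends the tone number to each syllable, similar to the notation above.\n",
"print(lazy_pinyin(text, style=Style.TONE3))"
]
},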
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 构造文本前端对象"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# 传入 phones_dict 会把相应的 phones 转换成 phone_ids\n",
"frontend = Frontend(phone_vocab_path=phones_dict)\n",
"print(\"Frontend done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用文本前端"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"input = \"你好,欢迎使用百度飞桨框架进行深度学习研究!\"\n",
"# input = \"我每天中午12:00起床\"\n",
"# input = \"我出生于2005/11/08那天的最低气温达到-10°C\"\n",
"# text norm 时会进行分句merge_sentences 表示把分句的结果合成一条\n",
"# 可以把 merge_sentences 设置为 False, 多个子句并行调用声学模型和声码器提升合成速度\n",
"input_ids = frontend.get_input_ids(input, merge_sentences=True, print_info=True)\n",
"# 由于 merge_sentences=True, input_ids[\"phone_ids\"][0] 即表示整句的 phone_ids\n",
"phone_ids = input_ids[\"phone_ids\"][0]\n",
"print(\"phone_ids:\")\n",
"print(phone_ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step1+ 文本前端深度学习化\n",
"<br></br>\n",
"<img style=\"float: center;\" src=\"source/text_frontend_struct.png\" width=\"100%\"/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step2 声学模型\n",
"<br></br>\n",
"<font size=4>声学模型将字符/音素转换为声学特征如线性频谱图、mel 频谱图、LPC 特征等,声学特征以 “帧” 为单位,一般一帧是 10ms 左右,一个音素一般对应 5~20 帧左右, 声学模型需要解决的是 <font color=\"#ff0000\">“不等长序列间的映射问题”</font>,“不等长”是指,同一个人发不同音素的持续时间不同,同一个人在不同时刻说同一句话的语速可能不同,对应各个音素的持续时间不同,不同人说话的特色不同,对应各个音素的持续时间不同。这是一个困难的“一对多”问题。</font>\n",
"```\n",
"# 卡尔普陪外孙玩滑梯\n",
"000001|baker_corpus|sil 20 k 12 a2 4 er2 10 p 12 u3 12 p 9 ei2 9 uai4 15 s 11 uen1 12 uan2 14 h 10 ua2 11 t 15 i1 16 sil 20\n",
"```\n",
"\n",
"<font size=4>声学模型主要分为自回归模型和非自回归模型,其中自回归模型在 `t` 时刻的预测需要依赖 `t-1` 时刻的输出作为输入,预测时间长,但是音质相对较好,非自回归模型不存在预测上的依赖关系,预测时间快,音质相对较差。</font>\n",
"\n",
"<br></br>\n",
"<font size=4>主流声学模型发展的脉络:</font>\n",
"- <font size=4>自回归模型:</font>\n",
" - <font size=4>Tacotron</font>\n",
" - <font size=4>Tacotron2</font>\n",
" - <font size=4>Transformer TTS</font>\n",
"- <font size=4>非自回归模型:</font>\n",
" - <font size=4>FastSpeech</font>\n",
" - <font size=4>SpeedySpeech</font>\n",
" - <font size=4>FastPitch</font>\n",
" - <font size=4>FastSpeech2</font>\n",
" - ...\n",
" \n",
"<br></br>\n",
"<font size=4>在本教程中,我们使用 `FastSpeech2` 作为声学模型。<font>\n",
"![FastSpeech2](source/fastspeech2.png)\n",
"<font size=4>PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似)。<font>\n",
"![FastPitch](source/fastpitch.png)\n",
"<font size=4>更多关于声学模型的发展及改进的介绍: https://paddlespeech.readthedocs.io/en/latest/tts/models_introduction.html<font>"
]
},
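{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch (illustration only) that parses the Baker duration annotation shown above and converts frame counts to seconds, assuming one frame is roughly 10 ms as described in the text (the exact hop size depends on the config)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Parse the alternating \"phone count phone count ...\" pairs from the annotation above (illustration only).\n",
"ann = \"000001|baker_corpus|sil 20 k 12 a2 4 er2 10 p 12 u3 12 p 9 ei2 9 uai4 15 s 11 uen1 12 uan2 14 h 10 ua2 11 t 15 i1 16 sil 20\"\n",
"utt_id, corpus, dur_str = ann.split(\"|\")\n",
"tokens = dur_str.split()\n",
"phones, frames = tokens[0::2], [int(x) for x in tokens[1::2]]\n",
"frame_shift_s = 0.01  # assumed ~10 ms per frame, per the description above\n",
"for p, n in zip(phones, frames):\n",
"    print(f\"{p}: {n} frames ~ {n * frame_shift_s:.2f} s\")\n",
"print(f\"total: {sum(frames)} frames ~ {sum(frames) * frame_shift_s:.2f} s\")"
]
},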
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 初始化声学模型 FastSpeech2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open(phones_dict, \"r\") as f:\n",
" phn_id = [line.strip().split() for line in f.readlines()]\n",
"vocab_size = len(phn_id)\n",
"print(\"vocab_size:\", vocab_size)\n",
"odim = fastspeech2_config.n_mels\n",
"model = FastSpeech2(\n",
" idim=vocab_size, odim=odim, **fastspeech2_config[\"model\"])\n",
"# 预训练好的参数赋值给模型\n",
"model.set_state_dict(paddle.load(fastspeech2_checkpoint)[\"main_params\"])\n",
"# 推理阶段不启用 batch norm 和 dropout\n",
"model.eval()\n",
"# 读取数据预处理阶段数据集的均值和标准差\n",
"stat = np.load(fastspeech2_stat)\n",
"mu, std = stat\n",
"mu = paddle.to_tensor(mu)\n",
"std = paddle.to_tensor(std)\n",
"fastspeech2_normalizer = ZScore(mu, std)\n",
"# 构造包含 normalize 的新模型\n",
"fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)\n",
"fastspeech2_inference.eval()\n",
"print(\"FastSpeech2 done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用声学模型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with paddle.no_grad():\n",
" mel = fastspeech2_inference(phone_ids)\n",
"print(\"shepe of mel (n_frames x n_mels):\")\n",
"print(mel.shape)\n",
"# 绘制声学模型输出的 mel 频谱\n",
"fig, ax = plt.subplots(figsize=(9, 6))\n",
"im = ax.imshow(mel.T, aspect='auto',origin='lower')\n",
"fig.colorbar(im, ax=ax)\n",
"plt.title('Mel Spectrogram')\n",
"plt.xlabel('Time')\n",
"plt.ylabel('Frequency')\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step3 声码器\n",
"<br></br>\n",
"<font size=4>声码器将声学特征转换为波形。声码器需要解决的是 <font color=\"#ff0000\">“信息缺失的补全问题”</font>。信息缺失是指,在音频波形转换为频谱图的时候,存在**相位信息**的缺失,在频谱图转换为 mel 频谱图的时候,存在**频域压缩**导致的信息缺失假设音频的采样率是16kHZ, 一帧的音频有 10ms也就是说1s 的音频有 16000 个采样点,而 1s 中包含 100 帧,每一帧有 160 个采样点,声码器的作用就是将一个频谱帧变成音频波形的 160 个采样点,所以声码器中一般会包含**上采样**模块。<font>\n",
" \n",
"<br></br>\n",
"<font size=4>与声学模型类似,声码器也分为自回归模型和非自回归模型, 更细致的分类如下:<font>\n",
"\n",
"- <font size=4>Autoregression<font>\n",
" - <font size=4>WaveNet<font>\n",
" - <font size=4>WaveRNN<font>\n",
" - <font size=4>LPCNet<font>\n",
"- <font size=4>Flow<font>\n",
" - <font size=4>WaveFlow<font>\n",
" - <font size=4>WaveGlow<font>\n",
" - <font size=4>FloWaveNet<font>\n",
" - <font size=4>Parallel WaveNet<font>\n",
"- <font size=4>GAN<font>\n",
" - <font size=4>WaveGAN<font>\n",
" - <font size=4>arallel WaveGAN<font>\n",
" - <font size=4>MelGAN<font>\n",
" - <font size=4>HiFi-GAN<font>\n",
"- <font size=4>VAE\n",
" - <font size=4>Wave-VAE<font>\n",
"- <font size=4>Diffusion<font>\n",
" - <font size=4>WaveGrad<font>\n",
" - <font size=4>DiffWave<font>\n",
"\n",
"<br></br>\n",
"<font size=4>PaddleSpeech TTS 主要实现了百度的 `WaveFlow` 和一些主流的 GAN Vocoder, 在本教程中,我们使用 `Parallel WaveGAN` 作为声码器。<font>\n",
"\n",
"<br></br> \n",
"<img style=\"float: center;\" src=\"source/pwgan.png\" width=\"75%\"/> \n",
"\n",
"<br></br>\n",
"<font size=4>各 GAN Vocoder 的生成器和判别器的 Loss 的区别如下表格所示:<font>\n",
" \n",
"Model | Generator Loss |Discriminator Loss\n",
":-------------:| :------------:| :-----\n",
"Parallel Wave GAN| adversial loss <br> Feature Matching | Multi-Scale Discriminator |\n",
"Mel GAN |adversial loss <br> Multi-resolution STFT loss | adversial loss|\n",
"Multi-Band Mel GAN | adversial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|\n",
"HiFi GAN |adversial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator|\n"
]
},
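{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny sanity check of the frame-to-sample arithmetic described above, assuming the FastSpeech2 config exposes the sampling rate `fs` and the hop size `n_shift` (in samples), as the CSMSC/Baker config does."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Illustration only; `n_shift` (hop size) and `fs` are assumed to be present in the config.\n",
"hop_size = fastspeech2_config.n_shift\n",
"frame_shift_ms = 1000 * hop_size / fastspeech2_config.fs\n",
"print(f\"one frame every {frame_shift_ms:.1f} ms, {hop_size} samples per frame\")\n",
"# the vocoder therefore upsamples each mel frame by a factor of hop_size\n",
"print(f\"expected waveform length for the mel above: {mel.shape[0] * hop_size} samples\")"
]
},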
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 初始化声码器 Parallel WaveGAN"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"vocoder = PWGGenerator(**pwg_config[\"generator_params\"])\n",
"# 预训练好的参数赋值给模型\n",
"vocoder.set_state_dict(paddle.load(pwg_checkpoint)[\"generator_params\"])\n",
"vocoder.remove_weight_norm()\n",
"# 推理阶段不启用 batch norm 和 dropout\n",
"vocoder.eval()\n",
"# 读取数据预处理阶段数据集的均值和标准差\n",
"stat = np.load(pwg_stat)\n",
"mu, std = stat\n",
"mu = paddle.to_tensor(mu)\n",
"std = paddle.to_tensor(std)\n",
"pwg_normalizer = ZScore(mu, std)\n",
"# 构造包含 normalize 的新模型\n",
"pwg_inference = PWGInference(pwg_normalizer, vocoder)\n",
"pwg_inference.eval()\n",
"print(\"Parallel WaveGAN done!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用声码器"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"with paddle.no_grad():\n",
" wav = pwg_inference(mel)\n",
"print(\"shepe of wav (time x n_channels):\")\n",
"print(wav.shape)\n",
"# 绘制声码器输出的波形图\n",
"wave_data = wav.numpy().T\n",
"time = np.arange(0, wave_data.shape[1]) * (1.0 / fastspeech2_config.fs)\n",
"fig, ax = plt.subplots(figsize=(9, 6))\n",
"plt.plot(time, wave_data[0])\n",
"plt.title('Waveform')\n",
"plt.xlabel('Time (seconds)')\n",
"plt.ylabel('Amplitude (normed)')\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 播放音频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dp.Audio(wav.numpy().T, rate=fastspeech2_config.fs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 保存音频"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!mkdir output\n",
"sf.write(\n",
" \"output/output.wav\",\n",
" wav.numpy(),\n",
" samplerate=fastspeech2_config.fs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step4 FastSpeech2 进阶 —— 个性化调节\n",
"<br></br>\n",
"<font size=3>FastSpeech2 模型可以个性化地调节音素时长、音调和能量,通过一些简单的调节就可以获得一些有意思的效果<font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 不要听信别人的谗言,我不是什么克隆人。\n",
"print(\"原始音频\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_001.wav\"))\n",
"print(\"speed x 1.2\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_001.wav\"))\n",
"print(\"speed x 0.8\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_001.wav\"))\n",
"print(\"pitch x 1.3(童声)\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav\"))\n",
"print(\"robot\")\n",
"dp.display(dp.Audio(url=\"https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav\"))"
]
},
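{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small, hedged sketch of the \"robot\" effect: the frontend's `robot=True` option (added to `Frontend.get_input_ids` in this version) maps all tones to the first tone before synthesis. It reuses the `frontend`, `fastspeech2_inference` and `pwg_inference` objects built above and is only an illustration, not the exact script behind the demo clips."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Illustration only: synthesize the earlier input sentence with all tones flattened (\"robot\" voice).\n",
"robot_ids = frontend.get_input_ids(input, merge_sentences=True, robot=True)\n",
"robot_phone_ids = robot_ids[\"phone_ids\"][0]\n",
"with paddle.no_grad():\n",
"    robot_wav = pwg_inference(fastspeech2_inference(robot_phone_ids))\n",
"dp.Audio(robot_wav.numpy().T, rate=fastspeech2_config.fs)"
]
},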
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=4>具体实现代码请参考: https://github.com/DeepSpeech/demos/style_fs2/run.sh<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br></br>\n",
"# 用 PaddleSpeech 训练 TTS 模型\n",
"<br></br>\n",
"<font size=3>PaddleSpeech 的 examples 是按照 数据集/模型 的结构安排的:<font>\n",
"```text\n",
"examples \n",
"|-- aishell3\n",
"| |-- README.md\n",
"| |-- tts3\n",
"| `-- vc0\n",
"|-- csmsc\n",
"| |-- README.md\n",
"| |-- tts2\n",
"| |-- tts3\n",
"| |-- voc1\n",
"| `-- voc3\n",
"```\n",
"<font size=3>我们在每个数据集的 README.md 介绍了子目录和模型的对应关系, 在 TTS 中有如下对应关系:<font>\n",
"```text\n",
"tts0 - Tactron2\n",
"tts1 - TransformerTTS\n",
"tts2 - SpeedySpeech\n",
"tts3 - FastSpeech2\n",
"voc0 - WaveFlow\n",
"voc1 - Parallel WaveGAN\n",
"voc2 - MelGAN\n",
"voc3 - MultiBand MelGAN\n",
"```\n",
"<br></br>\n",
"## 基于 CSMCS 数据集训练 FastSpeech2 模型\n",
"```bash\n",
"git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n",
"cd examples/csmsc/tts\n",
"```\n",
"<font size=3>根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置<font>\n",
"```bash\n",
"./run.sh\n",
"```\n",
"<font size=3>`run.sh` 中包含预处理、训练、合成、静态图推理等步骤:</font>\n",
"\n",
"```bash\n",
"#!/bin/bash\n",
"set -e\n",
"source path.sh\n",
"gpus=0,1\n",
"stage=0\n",
"stop_stage=100\n",
"conf_path=conf/default.yaml\n",
"train_output_path=exp/default\n",
"ckpt_name=snapshot_iter_153.pdz\n",
"\n",
"# with the following command, you can choice the stage range you want to run\n",
"# such as `./run.sh --stage 0 --stop-stage 0`\n",
"# this can not be mixed use with `$1`, `$2` ...\n",
"source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n",
"\n",
"if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n",
" # prepare data\n",
" bash ./local/preprocess.sh ${conf_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n",
" # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n",
" # synthesize, vocoder is pwgan\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then\n",
" # synthesize_e2e, vocoder is pwgan\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then\n",
" # inference with static model\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1\n",
"fi\n",
"```\n",
"<br></br>\n",
"## 基于 CSMCS 数据集训练 Parallel WaveGAN 模型\n",
"```bash\n",
"git clone https://github.com/PaddlePaddle/PaddleSpeech.git\n",
"cd examples/csmsc/voc1\n",
"```\n",
"<font size=3>根据 README.md, 下载 CSMCS 数据集和其对应的强制对齐文件, 并放置在对应的位置<font>\n",
"```bash\n",
"./run.sh\n",
"```\n",
"<font size=3>`run.sh` 中包含预处理、训练、合成等步骤:</font>\n",
"```bash\n",
"#!/bin/bash\n",
"set -e\n",
"source path.sh\n",
"gpus=0,1\n",
"stage=0\n",
"stop_stage=100\n",
"conf_path=conf/default.yaml\n",
"train_output_path=exp/default\n",
"ckpt_name=snapshot_iter_5000.pdz\n",
"\n",
"# with the following command, you can choice the stage range you want to run\n",
"# such as `./run.sh --stage 0 --stop-stage 0`\n",
"# this can not be mixed use with `$1`, `$2` ...\n",
"source ${MAIN_ROOT}/utils/parse_options.sh || exit 1\n",
"\n",
"if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then\n",
" # prepare data\n",
" ./local/preprocess.sh ${conf_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then\n",
" # train model, all `ckpt` under `train_output_path/checkpoints/` dir\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1\n",
"fi\n",
"if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then\n",
" # synthesize\n",
" CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1\n",
"fi\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# FAQ\n",
"\n",
"- <font size=3>需要注意的问题<font>\n",
"- <font size=3>经验与分享<font>\n",
"- <font size=3>用户的其他问题<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 作业\n",
"<font size=4>在 CSMSC 数据集上利用 FastSpeech2 和 Parallel WaveGAN 实现一个中文 TTS 系统<font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 关注 PaddleSpeech\n",
"<font size=3>https://github.com/PaddlePaddle/PaddleSpeech/<font>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.0 64-bit ('yt_py37_develop': venv)",
"language": "python",
"name": "python37064bitytpy37developvenv88cd689abeac41d886f9210a708a170b"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "263.594px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
@ -96,17 +96,17 @@ optional arguments:
6. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
unzip pwg_aishell3_ckpt_0.5.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
pwg_aishell3_ckpt_0.5
├── default.yaml # default config used to train parallel wavegan
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
@ -224,14 +224,12 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_aishell3_ckpt_0.5/default.yaml \
--pwg-checkpoint=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--pwg-stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.

@ -1,8 +1,8 @@
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used for the voice cloning task; we refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e).
1. Speaker Encoder: We train a speaker encoder with a speaker verification task. The datasets used here are different from those used for Tacotron2: since transcriptions are not needed, we can use more datasets; refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
2. Synthesizer: Then, we use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input to Tacotron2 and is concatenated with the encoder outputs.
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
3. Vocoder: We use WaveFlow as the neural vocoder; refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0).
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
@ -39,9 +39,9 @@ There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is ve
We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format MFA needs: the text files have the same names as the wavs and use the suffix `.lab`.
We use [lexicon.txt](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

@ -0,0 +1,146 @@
# Parallel WaveGAN with AISHELL-3
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [AISHELL-3](http://www.aishelltech.com/aishell_3).
AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus which could be used to train multi-speaker Text-to-Speech (TTS) systems.
## Dataset
### Download and Extract the dataset
Download AISHELL-3.
```bash
wget https://www.openslr.org/resources/93/data_aishell3.tgz
```
Extract AISHELL-3.
```bash
mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
dump
├── dev
│ ├── norm
│ └── raw
├── test
│ ├── norm
│ └── raw
└── train
├── norm
├── raw
└── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and a `raw` subfolder. The `raw` folder contains the log-magnitude mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and stored in `dump/train/feats_stats.npy`.
There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the id and the spectrogram path of each utterance; a small sketch of reading it is shown below.
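Below is a minimal sketch (not part of the example scripts) of how such a `metadata.jsonl` can be inspected; the `utt_id` and `feats` field names are assumptions about the preprocessing output.
```python
import json

# Peek at the first record of the (assumed) normalized training metadata.
with open("dump/train/norm/metadata.jsonl") as f:
    first = json.loads(next(f))
# "utt_id" and "feats" are assumed field names; adjust them to whatever your dump contains.
print(first.get("utt_id"), first.get("feats"))
```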
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
[--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
[--profiler_options PROFILER_OPTIONS]
Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG config file to overwrite default config.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
benchmark:
arguments related to benchmark.
--batch-size BATCH_SIZE
batch size.
--max-iter MAX_ITER train max steps.
--run-benchmark RUN_BENCHMARK
runing benchmark or not, if True, use the --batch-size
and --max-iter.
--profiler_options PROFILER_OPTIONS
The option of profiler, which should be in format
"key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with parallel wavegan.
optional arguments:
-h, --help show this help message and exit
--config CONFIG parallel wavegan config file.
--checkpoint CHECKPOINT
snapshot to load.
--test-metadata TEST_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use `snapshot_iter_1000000.pdz`.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_aishell3_ckpt_0.5
├── default.yaml # default config used to train parallel wavegan
├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA result of CSMSC and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -89,7 +89,7 @@ optional arguments:
6. `--tones-dict` is the path of the tone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
@ -209,6 +209,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
## Pretrained Model
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip).
SpeedySpeech checkpoint contains files listed below.
```text

@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA result of CSMSC and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -87,7 +87,7 @@ optional arguments:
5. `--phones-dict` is the path of the phone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
@ -200,6 +200,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text

@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
### Get MFA results for silence trim
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -122,7 +122,8 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Static models can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.

@ -85,11 +85,11 @@ usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with parallel wavegan.
Synthesize with multi band melgan.
optional arguments:
-h, --help show this help message and exit
--config CONFIG parallel wavegan config file.
--config CONFIG multi band melgan config file.
--checkpoint CHECKPOINT
snapshot to load.
--test-metadata TEST_METADATA
@ -100,10 +100,23 @@ optional arguments:
--verbose VERBOSE verbose.
```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
1. `--config` multi band melgan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip).
Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip)
Multi Band MelGAN checkpoint contains files listed below.
```text
mb_melgan_baker_ckpt_0.5
├── default.yaml # default config used to train multi band melgan
├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan
└── snapshot_iter_1000000.pdz # generator parameters of multi band melgan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -75,7 +75,7 @@ optional arguments:
config, passing in KEY VALUE pairs
-v, --verbose print msg
```
**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example)
**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example)
## Pretrained Models
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.

@ -78,7 +78,7 @@ optional arguments:
5. `--phones-dict` is the path of the phone vocabulary file.
## Synthesize
We use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder.
We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder.
Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
```bash
unzip waveflow_ljspeech_ckpt_0.3.zip

@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
### Get MFA result of LJSpeech-1.1 and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
@ -86,7 +86,7 @@ optional arguments:
5. `--phones-dict` is the path of the phone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_ljspeech_ckpt_0.5.zip

@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
### Get MFA results for silence trim
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.

@ -1,5 +1,5 @@
# Speaker Encoder
This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [examples/aishell3/vc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0). The trained speaker encoder is used to extract utterance embeddings from utterances.
This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [examples/aishell3/vc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0). The trained speaker encoder is used to extract utterance embeddings from utterances.
## Model
The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used.

@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle
### Get MFA result of VCTK and Extract it
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
1. `p315`, because no txt for it.
2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.
@ -88,7 +88,7 @@ optional arguments:
4. `--phones-dict` is the path of the phone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder.
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder.
Download the pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip) and unzip it.
```bash

@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handl
### Get MFA results for silence trim
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) of our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/DeepSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)):
1. `p315`, because no txt for it.
2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.

@ -87,26 +87,27 @@ def evaluate(args, fastspeech2_config, pwg_config):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# only test the number 0 speaker
spk_id = 0
for utt_id, sentence in sentences:
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
for part_phone_ids in phone_ids:
with paddle.no_grad():
mel = fastspeech2_inference(
part_phone_ids, spk_id=paddle.to_tensor(spk_id))
temp_wav = pwg_inference(mel)
if flags == 0:
wav = temp_wav
flags = 1
else:
wav = paddle.concat([wav, temp_wav])
sf.write(
str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{spk_id}_{utt_id} done!")
spk_ids = list(range(20))
for spk_id in spk_ids:
for utt_id, sentence in sentences[:2]:
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
for part_phone_ids in phone_ids:
with paddle.no_grad():
mel = fastspeech2_inference(
part_phone_ids, spk_id=paddle.to_tensor(spk_id))
temp_wav = pwg_inference(mel)
if flags == 0:
wav = temp_wav
flags = 1
else:
wav = paddle.concat([wav, temp_wav])
sf.write(
str(output_dir / (str(spk_id) + "_" + utt_id + ".wav")),
wav.numpy(),
samplerate=fastspeech2_config.fs)
print(f"{spk_id}_{utt_id} done!")
def main():

@ -30,9 +30,9 @@ from paddlespeech.t2s.models.melgan import MelGANGenerator
def main():
parser = argparse.ArgumentParser(
description="Synthesize with parallel wavegan.")
description="Synthesize with multi band melgan.")
parser.add_argument(
"--config", type=str, help="parallel wavegan config file.")
"--config", type=str, help="multi band melgan config file.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")

@ -219,18 +219,45 @@ class Frontend():
def get_phonemes(self,
sentence: str,
merge_sentences: bool=True,
with_erhua: bool=True) -> List[List[str]]:
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
sentences = self.text_normalizer.normalize(sentence)
phonemes = self._g2p(
sentences, merge_sentences=merge_sentences, with_erhua=with_erhua)
# change all tones to `1`
if robot:
new_phonemes = []
for sentence in phonemes:
new_sentence = []
for item in sentence:
# `er` only have tone `2`
if item[-1] in "12345" and item != "er2":
item = item[:-1] + "1"
new_sentence.append(item)
new_phonemes.append(new_sentence)
phonemes = new_phonemes
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
print(phonemes)
print("----------------------------")
return phonemes
def get_input_ids(
self,
sentence: str,
merge_sentences: bool=True,
get_tone_ids: bool=False) -> Dict[str, List[paddle.Tensor]]:
phonemes = self.get_phonemes(sentence, merge_sentences=merge_sentences)
def get_input_ids(self,
sentence: str,
merge_sentences: bool=True,
get_tone_ids: bool=False,
robot: bool=False,
print_info: bool=False) -> Dict[str, List[paddle.Tensor]]:
phonemes = self.get_phonemes(
sentence,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
result = {}
phones = []
tones = []

@ -513,9 +513,9 @@ class FastSpeech2(nn.Layer):
spembs : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
Speaker embedding vector (spk_embed_dim).
Batch of padded spk ids (1,).
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
Batch of padded tone ids (T,).
Returns
----------
@ -526,9 +526,7 @@ class FastSpeech2(nn.Layer):
x = paddle.cast(text, 'int64')
y = speech
spemb = spembs
if durations is not None:
d = paddle.cast(durations, 'int64')
p, e = pitch, energy
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
@ -539,8 +537,9 @@ class FastSpeech2(nn.Layer):
if spemb is not None:
spembs = spemb.unsqueeze(0)
else:
spembs = None
if tone_id is not None:
tone_id = tone_id.unsqueeze(0)
if use_teacher_forcing:
# use groundtruth of duration, pitch, and energy
@ -549,7 +548,7 @@ class FastSpeech2(nn.Layer):
es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
_, outs, d_outs, *_ = self._forward(
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
ilens,
ys,
@ -562,7 +561,7 @@ class FastSpeech2(nn.Layer):
is_inference=True)
else:
# (1, L, odim)
_, outs, d_outs, *_ = self._forward(
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
ilens,
ys,
@ -571,8 +570,7 @@ class FastSpeech2(nn.Layer):
spembs=spembs,
spk_id=spk_id,
tone_id=tone_id)
return outs[0]
return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spembs):
"""Integrate speaker embedding with hidden states.
@ -683,7 +681,8 @@ class FastSpeech2Inference(nn.Layer):
self.acoustic_model = model
def forward(self, text, spk_id=None):
normalized_mel = self.acoustic_model.inference(text, spk_id=spk_id)
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, spk_id=spk_id)
logmel = self.normalizer.inverse(normalized_mel)
return logmel

@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import logging
from paddle import nn
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
@ -122,7 +120,6 @@ class Encoder(nn.Layer):
"rel_selfattn",
"legacy_rel_selfattn",
]:
logging.info("encoder self-attention layer type = self-attention")
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )

@ -51,7 +51,7 @@ soxbindings.done:
touch soxbindings.done
mfa.done:
test -d montreal-forced-aligner || $(WGET) https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
test -d montreal-forced-aligner || $(WGET) https://paddlespeech.bj.bcebos.com/Parakeet/montreal-forced-aligner_linux.tar.gz
tar xvf montreal-forced-aligner_linux.tar.gz
touch mfa.done
