Merge pull request #932 from yt605155624/merge_parakeet

[tts] refactor parakeet example
3 years ago · e395462419
parent 05288cd381 20226b4fdd
commit e395462419
246 changed files with 2360 additions and 1632 deletions
--- a/deepspeech/init.py
+++ b/deepspeech/init.py
@ -355,7 +355,6 @@ if not hasattr(paddle.Tensor, 'tolist'):
        "register user tolist to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'tolist', tolist)

-
 ########### hack paddle.nn #############
 from paddle.nn import Layer
 from typing import Optional
@ -506,5 +505,3 @@ if not hasattr(paddle.nn, 'LayerDict'):
    logger.debug(
        "register user LayerDict to paddle.nn, remove this when fixed!")
    setattr(paddle.nn, 'LayerDict', LayerDict)
-
-
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@ -12,12 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
-import json
-from pathlib import Path
-
 import jsonlines
 import paddle
-import yaml
 from yacs.config import CfgNode

 from .beam_search import BatchBeamSearch
@ -79,8 +75,7 @@ def recog_v2(args):
        sort_in_input_length=False,
        preprocess_conf=confs.collator.augmentation_config
        if args.preprocess_conf is None else args.preprocess_conf,
-        preprocess_args={"train": False},
-    )
+        preprocess_args={"train": False}, )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
@ -113,8 +108,7 @@ def recog_v2(args):
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
-        length_bonus=args.penalty,
-    )
+        length_bonus=args.penalty, )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(char_list),
@ -123,8 +117,7 @@ def recog_v2(args):
        sos=model.sos,
        eos=model.eos,
        token_list=char_list,
-        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
-    )
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )

    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
@ -171,9 +164,10 @@ def recog_v2(args):
                logger.info(f'feat: {feat.shape}')
                enc = model.encode(paddle.to_tensor(feat).to(dtype))
                logger.info(f'eout: {enc.shape}')
-                nbest_hyps = beam_search(x=enc,
-                                         maxlenratio=args.maxlenratio,
-                                         minlenratio=args.minlenratio)
+                nbest_hyps = beam_search(
+                    x=enc,
+                    maxlenratio=args.maxlenratio,
+                    minlenratio=args.minlenratio)
                nbest_hyps = [
                    h.asdict()
                    for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
@ -183,9 +177,8 @@ def recog_v2(args):

                item = new_js[name]['output'][0]  # 1-best
                ref = item['text']
-                rec_text = item['rec_text'].replace('▁',
-                                                    ' ').replace('<eos>',
-                                                                 '').strip()
+                rec_text = item['rec_text'].replace('▁', ' ').replace(
+                    '<eos>', '').strip()
                rec_tokenid = list(map(int, item['rec_tokenid'].split()))
                f.write({
                    "utt": name,
--- a/deepspeech/decoders/recog_bin.py
+++ b/deepspeech/decoders/recog_bin.py
@ -21,7 +21,7 @@ from distutils.util import strtobool
 import configargparse
 import numpy as np

-from .recog import recog_v2
+from deepspeech.decoders.recog import recog_v2


 def get_parser():
@ -359,7 +359,7 @@ def main(args):
        if args.num_encs == 1:
            # Experimental API that supports custom LMs
            if args.api == "v2":
-                from deepspeech.decoders.recog import recog_v2
+
                recog_v2(args)
            else:
                raise ValueError("Only support --api v2")
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Union
+
 import paddle
 from paddle import nn
-from typing import Union
 from paddle.nn import functional as F
 from typeguard import check_argument_types

--- a/examples/aishell3/README.md
+++ b/examples/aishell3/README.md
@ -1,4 +1,11 @@
 # Aishell3

-* tts0 - fastspeech2
-* vc0 - tactron2 voice clone
+* tts0 - Tacotron2
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
+* vc0 - Tacotron2 Voice Clone with GE2E
--- a/examples/aishell3/tts0/run.sh
+++ b/examples/aishell3/tts0/run.sh
--- a/examples/vctk/fastspeech2/aishell3/README.md
+++ b/examples/vctk/fastspeech2/aishell3/README.md
@ -18,12 +18,23 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ### Get MFA result of AISHELL-3 and Extract it
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
 You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
-### Preprocess the dataset
+
+## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
 Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
 ```bash
-./preprocess.sh
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
 ```text
@ -47,10 +58,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.
 ```text
@ -85,20 +96,8 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.
 7. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
-## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)

-FastSpeech2 checkpoint contains files listed below.
-
-```text
-fastspeech2_nosil_aishell3_ckpt_0.4
-├── default.yaml            # default config used to train fastspeech2
-├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
-├── snapshot_iter_96400.pdz # model parameters and optimizer states
-├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
-└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
-```
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
 Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
@ -111,9 +110,9 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz   # model parameters of parallel wavegan
 └── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -153,22 +152,22 @@ optional arguments:
  --device DEVICE       device type to use.
  --verbose VERBOSE     verbose.
 ```
-
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
-
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
-usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
-                         [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
-                         [--fastspeech2-stat FASTSPEECH2_STAT]
-                         [--pwg-config PWG_CONFIG]
-                         [--pwg-checkpoint PWG_CHECKPOINT]
-                         [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
-                         [--speaker-dict SPEAKER_DICT] [--text TEXT]
-                         [--output-dir OUTPUT_DIR] [--device DEVICE]
-                         [--verbose VERBOSE]
+usage: multi_spk_synthesize_e2e.py [-h]
+                                   [--fastspeech2-config FASTSPEECH2_CONFIG]
+                                   [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
+                                   [--fastspeech2-stat FASTSPEECH2_STAT]
+                                   [--pwg-config PWG_CONFIG]
+                                   [--pwg-checkpoint PWG_CHECKPOINT]
+                                   [--pwg-stat PWG_STAT]
+                                   [--phones-dict PHONES_DICT]
+                                   [--speaker-dict SPEAKER_DICT] [--text TEXT]
+                                   [--output-dir OUTPUT_DIR] [--device DEVICE]
+                                   [--verbose VERBOSE]

 Synthesize with fastspeech2 & parallel wavegan.

@ -204,24 +203,38 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+## Pretrained Model
+Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+
+```text
+fastspeech2_nosil_aishell3_ckpt_0.4
+├── default.yaml            # default config used to train fastspeech2
+├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── snapshot_iter_96400.pdz # model parameters and optimizer states
+├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
+└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
  --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
  --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
  --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences.txt \
+  --text=${BIN_DIR}/../sentences.txt \
  --output-dir=exp/default/test_e2e \
  --device="gpu" \
  --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
  --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt

 ```
-
 ## Future work
 A multi-speaker  vocoder is needed.
--- a/examples/vctk/fastspeech2/aishell3/conf/default.yaml
+++ b/examples/vctk/fastspeech2/aishell3/conf/default.yaml
--- a/examples/vctk/fastspeech2/aishell3/preprocess.sh
+++ b/examples/vctk/fastspeech2/aishell3/preprocess.sh
@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
-        --config=conf/default.yaml 
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
 fi
@ -46,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
--- a/examples/vctk/fastspeech2/vctk/synthesize.sh
+++ b/examples/vctk/fastspeech2/vctk/synthesize.sh
@ -1,15 +1,20 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3  ../synthesize.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\
+python3 ${BIN_DIR}/synthesize.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --test-metadata=dump/test/norm/metadata.jsonl \
-  --output-dir=exp/default/test \
+  --output-dir=${train_output_path}/test \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt \
  --speaker-dict=dump/speaker_id_map.txt
--- a/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh
+++ b/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh
@ -1,15 +1,20 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \
+python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences_en.txt \
-  --output-dir=exp/default/test_e2e \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output-dir=${train_output_path}/test_e2e \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt \
  --speaker-dict=dump/speaker_id_map.txt
--- a/examples/vctk/fastspeech2/aishell3/run.sh
+++ b/examples/vctk/fastspeech2/aishell3/run.sh
@ -1,10 +1,13 @@
 #!/bin/bash

-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=2 \
    --phones-dict=dump/phone_id_map.txt \
    --speaker-dict=dump/speaker_id_map.txt
--- a/examples/aishell3/tts3/path.sh
+++ b/examples/aishell3/tts3/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_482.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments such as `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@ -0,0 +1,89 @@
+# Tacotron2 + AISHELL-3 Voice Cloning
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the Voice Cloning task. We follow the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
+1. Speaker Encoder: We use a Speaker Verification task to train a speaker encoder. The datasets used in this task are different from those used for Tacotron2; because transcriptions are not needed, we use more datasets. Refer to [ge2e](../../other/ge2e).
+2. Synthesizer: Then, we use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2, which will be concatenated with the encoder outputs.
+3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](../../ljspeech/voc0).
+
+## Get Started
+Assume the path to the dataset is `~/datasets/data_aishell3`.
+Assume the path to the MFA result of AISHELL-3 is `./alignment`.
+Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. start a voice cloning inference.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
+```
+#### generate utterance embedding
+Use the pretrained GE2E (speaker encoder) to generate an utterance embedding for each sentence in AISHELL-3. The embeddings are stored with the same file structure as the wav files, in `.npy` format.
+
+```bash
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../../ge2e/inference.py \
+        --input=${input} \
+        --output=${preprocess_path}/embed \
+        --device="gpu" \
+        --checkpoint_path=${ge2e_ckpt_path}
+fi
+```
+
+Computing the utterance embeddings can take several hours.
+####  process wav
+There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You can use a silence removal method based on volume or energy, but the effect is not very good. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence.
+
+We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters. The prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format which MFA needs: the text files have the same names as the wavs and have the suffix `.lab`.
+
+We use [lexicon.txt](./lexicon.txt) as the lexicon.
+
+You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
+
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Process wav ..."
+    python3 ${BIN_DIR}/process_wav.py \
+        --input=${input}/wav \
+        --output=${preprocess_path}/normalized_wav \
+        --alignment=${alignment}
+fi
+```
+
+#### preprocess transcription
+We convert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA: we separate the tones. This is one processing method; of course, you can also segment only initials and vowels.
+
+```bash
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/preprocess_transcription.py \
+        --input=${input} \
+        --output=${preprocess_path}
+fi
+```
+The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`. The former is a text format for easy viewing, and the latter is a binary format for direct reading.
+#### extract mel
+```bash
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    python3 ${BIN_DIR}/extract_mel.py \
+        --input=${preprocess_path}/normalized_wav \
+        --output=${preprocess_path}/mel
+fi
+```
+
+###  Train the model
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+
+Our model removes the stop token prediction in Tacotron2, because stop token prediction suffers from an extremely unbalanced proportion of positive and negative samples, and it is very sensitive to the clipping of audio silence. Instead, we use the point at which the attention peak reaches the last symbol on the encoder side as the termination condition.
+
+In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster.
+### Inference
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
+```
+## Pretrained Model
+[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
--- a/examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png
+++ b/examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png
--- a/examples/aishell3/vc0/local/tacotron2/images/train.png
+++ b/examples/aishell3/vc0/local/tacotron2/images/train.png
--- a/examples/aishell3/vc0/local/tacotron2/images/valid.png
+++ b/examples/aishell3/vc0/local/tacotron2/images/valid.png
--- a/examples/aishell3/vc0/local/preprocess.sh
+++ b/examples/aishell3/vc0/local/preprocess.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+input=$1
+preprocess_path=$2
+alignment=$3
+ge2e_ckpt_path=$4
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../../ge2e/inference.py \
+        --input=${input} \
+        --output=${preprocess_path}/embed \
+        --device="gpu" \
+        --checkpoint_path=${ge2e_ckpt_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Process wav ..."
+    python3 ${BIN_DIR}/process_wav.py \
+        --input=${input}/wav \
+        --output=${preprocess_path}/normalized_wav \
+        --alignment=${alignment}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/preprocess_transcription.py \
+        --input=${input} \
+        --output=${preprocess_path}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    python3 ${BIN_DIR}/extract_mel.py \
+        --input=${preprocess_path}/normalized_wav \
+        --output=${preprocess_path}/mel
+fi
--- a/examples/aishell3/vc0/local/tacotron2/README_cn.md
+++ b/examples/aishell3/vc0/local/tacotron2/README_cn.md
@ -1,112 +0,0 @@
-## Tacotron2 + AISHELL-3 数据集训练语音克隆模型
-
-本实验的内容是利用 AISHELL-3 数据集和 Tacotron 2 模型进行语音克隆任务，使用的模型大体结构和论文 [Transfer Learning from Speaker Veriﬁcation to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) 相同。大致步骤如下：
-
-1. Speaker Encoder: 我们使用了一个 Speaker Verification 任务训练一个 speaker encoder。这部分任务所用的数据集和训练 Tacotron 2 的数据集不同，因为不需要 transcription 的缘故，我们使用了较多的训练数据，可以参考实现 [ge2e](../ge2e)。
-2. Synthesizer: 然后使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 这个 Embedding 作为 Tacotron 模型中的一个额外输入和 encoder outputs 拼接在一起。
-3. Vocoder: 我们使用的声码器是 WaveFlow，参考实验 [waveflow](../waveflow).
-
-## 数据处理
-
-### utterance embedding 的生成
-
-使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 以和音频文件夹同构的方式存储。存储格式是 `.npy` 文件。
-
-首先 cd 到 [ge2e](../ge2e) 文件夹。下载训练好的 [模型](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)，然后运行脚本生成每个句子的 utterance embedding.
-
-```bash
-python inference.py --input=<intput> --output=<output> --device="gpu" --checkpoint_path=<pretrained checkpoint>
-```
-
-其中 input 是只包含音频文件夹的文件。这里可以用 `~/datasets/aishell3/train/wav`，然后 output 是用于存储 utterance embed 的文件夹，这里可以用 `~/datasets/aishell3/train/embed`。Utterance embedding 会以和音频文件夹相同的文件结构存储，格式为 `.npy`.
-
-utterance embedding 的计算可能会用几个小时的时间，请耐心等待。
-
-### 音频处理
-
-因为 AISHELL-3 数据集前后有一些空白，静音片段，而且语音幅值很小，所以我们需要进行空白移除和音量规范化。空白移除可以简单的使用基于音量或者能量的方法，但是效果不是很好，对于不同的句子很难取到一个一致的阈值。我们使用的是先利用 Force Aligner 进行文本和语音的对齐。然后根据对齐结果截除空白。
-
-我们使用的工具是 Montreal Force Aligner 1.0. 因为 aishell 的标注包含拼音标注，所以我们提供给 Montreal Force Aligner 的是拼音 transcription 而不是汉字 transcription. 而且需要把其中的韵律标记(`$` 和 `%`)去除，并且处理成 Montreal Force Alinger 所需要的文件形式。和音频同名的文本文件，扩展名为 `.lab`.
-
-此外还需要准备词典文件。其中包含把拼音序列转换为 phone 序列的映射关系。在这里我们只做声母和韵母的切分，而声调则归为韵母的一部分。我们使用的[词典文件](./lexicon.txt)可以下载。
-
-准备好之后运行训练和对齐。首先下载 [Montreal Force Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1).下载之后解压即可运行。cd 到其中的 bin 文件夹运行命令，即可进行训练和对齐。前三个命令行参数分别是音频文件夹的路径，词典路径和对齐文件输出路径。可以通过`-o` 传入训练得到的模型保存路径。
-
-```bash
-./mfa_train_and_align \
-  ~/datasets/aishell3/train/wav \
-  lexicon.txt \
-  ~/datasets/aishell3/train/alignment \
-  -o aishell3_model \
-  -v
-```
-
-因为训练和对齐的时间比较长。我们提供了对齐后的 [alignment 文件](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz)，其中每个句子对应的文件为 `.TextGrid` 格式的文本。
-
-得到了对齐文件之后，可以运行 `process_wav.py` 脚本来处理音频。
-
-```bash
-python process_wav.py --input=<input> --output=<output> --alignment=<alignment>
-```
-
-默认 input, output, alignment 分别是 `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, `~/datasets/aishell3/train/alignment`.
-
-处理结束后，会将处理好的音频保存在 `<output>` 文件夹中。
-
-### 转录文本处理
-
-把文本转换成为 phone 和 tone 的形式，并存储起来。值得注意的是，这里我们的处理和用于 montreal force aligner 的不一样。我们把声调分了出来。这是一个处理方式，当然也可以只做声母和韵母的切分。
-
-运行脚本处理转录文本。
-
-```bash
-python preprocess_transcription.py --input=<input> --output=<output>
-```
-
-默认的 input 是 `~/datasets/aishell3/train`，其中会包含 `label_train-set.txt` 文件，处理后的结果会 `metadata.yaml` 和 `metadata.pickle`. 前者是文本格式，方便查看，后者是二进制格式，方便直接读取。
-
-### mel 频谱提取
-
-对处理后的音频进行 mel 频谱的提取，并且以和音频文件夹同构的方式存储，存储格式是 `.npy` 文件。
-
-```python
-python extract_mel.py --input=<intput> --output=<output>
-```
-
-input 是处理后的音频所在的文件夹，output 是输出频谱的文件夹。
-
-## 训练
-
-运行脚本训练。
-
-```python
-python train.py --data=<data> --output=<output> --device="gpu"
-```
-
-我们的模型去掉了 tacotron2 模型中的 stop token prediction。因为实践中由于 stop token prediction 是一个正负样例比例极不平衡的问题，每个句子可能有几百帧对应负样例，只有一帧正样例，而且这个 stop token prediction 对音频静音的裁切十分敏感。我们转用 attention 的最高点到达 encoder 侧的最后一个符号为终止条件。
-
-另外，为了加速模型的收敛，我们加上了 guided attention loss, 诱导 encoder-decoder 之间的 alignment 更快地呈现对角线。
-
-可以使用 visualdl 查看训练过程的 log。
-
-```bash
-visualdl --logdir=<output> --host=$HOSTNAME
-```
-
-示例 training loss / validation loss 曲线如下。
-
-![train](./images/train.png)
-
-![valid](./images/valid.png)
-
-<img src="images/alignment-step2000.png" alt="alignment-step2000" style="zoom:50%;" />
-
-大约从训练 2000 步左右就从 validation 过程中产出的 alignement 中可以观察到模糊的对角线。随着训练步数增加，对角线会更加清晰。但因为 validation 也是以 teacher forcing 的方式进行的，所以要在真正的 auto regressive 合成中产出的 alignment 中观察到对角线，需要更长的时间。
-
-## 预训练模型
-
-预训练模型下载链接。[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
-
-## 使用
-
-本实验包含了一个简单的使用示例，用户可以替换作为参考的声音以及文本，用训练好的模型来合成语音。使用方式参考 [notebook](./voice_cloning.ipynb) 上的使用说明。
--- a/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb
+++ b/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb
--- a/examples/aishell3/vc0/local/train.sh
+++ b/examples/aishell3/vc0/local/train.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device="gpu"
--- a/examples/aishell3/vc0/local/voice_cloning.sh
+++ b/examples/aishell3/vc0/local/voice_cloning.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+ge2e_params_path=$1
+tacotron2_params_path=$2
+waveflow_params_path=$3
+vc_input=$4
+vc_output=$5
+
+python3 ${BIN_DIR}/voice_cloning.py \
+        --ge2e_params_path=${ge2e_params_path} \
+        --tacotron2_params_path=${tacotron2_params_path} \
+        --waveflow_params_path=${waveflow_params_path} \
+        --input-dir=${vc_input} \
+        --output-dir=${vc_output}
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=voice_cloning/tacotron2_ge2e
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/aishell3/vc0/run.sh
+++ b/examples/aishell3/vc0/run.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+input=~/datasets/data_aishell3/train
+preprocess_path=dump
+alignment=./alignment
+
+# not include ".pdparams" here
+ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
+train_output_path=output
+# include ".pdparams" here
+ge2e_params_path=${ge2e_ckpt_path}.pdparams
+tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
+# pretrained model
+# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
+waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
+vc_input=ref_audio
+vc_output=syn_audio
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with the use of `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1
+fi
+
+
--- a/examples/csmsc/README.md
+++ b/examples/csmsc/README.md
@ -0,0 +1,11 @@
+
+# CSMSC
+
+* tts0 - Tacotron2
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
--- a/examples/csmsc/speedyspeech/baker/inference.sh
+++ b/examples/csmsc/speedyspeech/baker/inference.sh
@ -1,8 +0,0 @@
-#!/bin/bash
-
-python3 inference.py \
-  --inference-dir=exp/default/inference \
-  --text=../sentences.txt \
-  --output-dir=exp/default/pd_infer_out \
-  --phones-dict=dump/phone_id_map.txt \
-  --tones-dict=dump/tone_id_map.txt
--- a/examples/csmsc/speedyspeech/baker/README.md
+++ b/examples/csmsc/speedyspeech/baker/README.md
@ -1,5 +1,4 @@
 # Speedyspeech with CSMSC
-
 This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).

 ## Dataset
@ -10,12 +9,23 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
 You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.

-## Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
 Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+5. inference using static model.
 ```bash
-./preprocess.sh
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

@ -37,13 +47,12 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
 ```
 Here's the complete help message.
-
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                     [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
@ -81,20 +90,7 @@ optional arguments:
 6. `--phones-dict` is the path of the phone vocabulary file.
 7. `--tones-dict` is the path of the tone vocabulary file.

-## Pretrained Model
-Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
-
-SpeedySpeech checkpoint contains files listed below.
-```text
-speedyspeech_nosil_baker_ckpt_0.5
-├── default.yaml            # default config used to train speedyspeech
-├── feats_stats.npy         # statistics used to normalize spectrogram when training speedyspeech
-├── phone_id_map.txt        # phone vocabulary file when training speedyspeech
-├── snapshot_iter_11400.pdz # model parameters and optimizer states
-└── tone_id_map.txt         # tone vocabulary file when training speedyspeech
-```
-
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
 Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
@ -107,9 +103,9 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz   # model parameters of parallel wavegan
 └── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
@ -152,9 +148,9 @@ optional arguments:
  --device DEVICE       device type to use
  --verbose VERBOSE     verbose
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
@ -203,21 +199,42 @@ optional arguments:
 4. `--output-dir` is the directory to save synthesized audio files.
 5. `--inference-dir` is the directory to save exported model, which can be used with paddle inference.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
-6. `--phones-dict` is the path of the phone vocabulary file.
-7. `--tones-dict` is the path of the tone vocabulary file.
+7. `--phones-dict` is the path of the phone vocabulary file.
+8. `--tones-dict` is the path of the tone vocabulary file.
+
+### Inference
+After the Synthesize step, we will get static models of speedyspeech and pwgan in `${train_output_path}/inference`.
+`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for speedyspeech + pwgan synthesis.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
+```

-You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+## Pretrained Model
+Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
+
+SpeedySpeech checkpoint contains files listed below.
+```text
+speedyspeech_nosil_baker_ckpt_0.5
+├── default.yaml            # default config used to train speedyspeech
+├── feats_stats.npy         # statistics used to normalize spectrogram when training speedyspeech
+├── phone_id_map.txt        # phone vocabulary file when training speedyspeech
+├── snapshot_iter_11400.pdz # model parameters and optimizer states
+└── tone_id_map.txt         # tone vocabulary file when training speedyspeech
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e.py \
  --speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
  --speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
  --speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences.txt \
+  --text=${BIN_DIR}/../sentences.txt \
  --output-dir=exp/default/test_e2e \
  --inference-dir=exp/default/inference \
  --device="gpu" \
--- a/examples/csmsc/speedyspeech/baker/conf/default.yaml
+++ b/examples/csmsc/speedyspeech/baker/conf/default.yaml
--- a/examples/csmsc/tts2/local/inference.sh
+++ b/examples/csmsc/tts2/local/inference.sh
@ -0,0 +1,10 @@
+#!/bin/bash
+
+train_output_path=$1
+
+python3 ${BIN_DIR}/inference.py \
+  --inference-dir=${train_output_path}/inference \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output-dir=${train_output_path}/pd_infer_out \
+  --phones-dict=dump/phone_id_map.txt \
+  --tones-dict=dump/tone_id_map.txt
--- a/examples/csmsc/speedyspeech/baker/preprocess.sh
+++ b/examples/csmsc/speedyspeech/baker/preprocess.sh
@ -1,9 +1,10 @@
 #!/bin/bash

+
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,17 +12,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --use-relative-path=True
@ -38,7 +39,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # normalize and convert phone/tone to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy \
@ -46,7 +47,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy \
@ -54,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy \
--- a/examples/csmsc/speedyspeech/baker/synthesize.sh
+++ b/examples/csmsc/speedyspeech/baker/synthesize.sh
@ -1,16 +1,20 @@
 #!/bin/bash
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ../synthesize.py \
-  --speedyspeech-config=conf/default.yaml \
-  --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
+python3 ${BIN_DIR}/synthesize.py \
+  --speedyspeech-config=${config_path} \
+  --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --speedyspeech-stat=dump/train/feats_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --test-metadata=dump/test/norm/metadata.jsonl \
-  --output-dir=exp/default/test \
-  --inference-dir=exp/default/inference \
+  --output-dir=${train_output_path}/test \
+  --inference-dir=${train_output_path}/inference \
  --phones-dict=dump/phone_id_map.txt \
  --tones-dict=dump/tone_id_map.txt \
  --device="gpu"
--- a/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh
+++ b/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh
@ -1,16 +1,21 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python synthesize_e2e.py \
-  --speedyspeech-config=conf/default.yaml \
-  --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
+python3 ${BIN_DIR}/synthesize_e2e.py \
+  --speedyspeech-config=${config_path} \
+  --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --speedyspeech-stat=dump/train/feats_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences.txt \
-  --output-dir=exp/default/test_e2e \
-  --inference-dir=exp/default/inference \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output-dir=${train_output_path}/test_e2e \
+  --inference-dir=${train_output_path}/inference \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt \
  --tones-dict=dump/tone_id_map.txt
--- a/examples/csmsc/speedyspeech/baker/run.sh
+++ b/examples/csmsc/speedyspeech/baker/run.sh
@ -1,11 +1,14 @@

 #!/bin/bash

-python ../train.py \
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=2 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
--- a/examples/csmsc/tts2/path.sh
+++ b/examples/csmsc/tts2/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=speedyspeech
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with the use of `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
--- a/examples/vctk/fastspeech2/baker/README.md
+++ b/examples/vctk/fastspeech2/baker/README.md
@ -9,13 +9,22 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
 You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.

-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
 Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
-Run the command below to preprocess the dataset.
-
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

@ -40,11 +49,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
@ -78,18 +87,7 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.

-## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
-
-FastSpeech2 checkpoint contains files listed below.
-```text
-fastspeech2_nosil_baker_ckpt_0.4
-├── default.yaml            # default config used to train fastspeech2
-├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
-├── snapshot_iter_76000.pdz # model parameters and optimizer states
-└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
-```
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
 Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
@ -102,9 +100,9 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz   # model parameters of parallel wavegan
 └── pwg_stats.npy                  # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -144,9 +142,9 @@ optional arguments:
  --device DEVICE       device type to use.
  --verbose VERBOSE     verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -191,18 +189,31 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+## Pretrained Model
+Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+```text
+fastspeech2_nosil_baker_ckpt_0.4
+├── default.yaml            # default config used to train fastspeech2
+├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── snapshot_iter_76000.pdz # model parameters and optimizer states
+└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e.py \
  --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
  --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
  --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences.txt \
+  --text=${BIN_DIR}/../sentences.txt \
  --output-dir=exp/default/test_e2e \
  --device="gpu" \
  --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--- a/examples/vctk/fastspeech2/baker/conf/default.yaml
+++ b/examples/vctk/fastspeech2/baker/conf/default.yaml
--- a/examples/vctk/fastspeech2/baker/preprocess.sh
+++ b/examples/vctk/fastspeech2/baker/preprocess.sh
@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
 fi
@ -46,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # normalize and convert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
--- a/examples/vctk/fastspeech2/baker/simple.lexicon
+++ b/examples/vctk/fastspeech2/baker/simple.lexicon
--- a/examples/vctk/fastspeech2/baker/synthesize.sh
+++ b/examples/vctk/fastspeech2/baker/synthesize.sh
@ -1,14 +1,19 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ../synthesize.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \
+python3 ${BIN_DIR}/synthesize.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --test-metadata=dump/test/norm/metadata.jsonl \
-  --output-dir=exp/default/test \
+  --output-dir=${train_output_path}/test \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/vctk/fastspeech2/baker/synthesize_e2e.sh
+++ b/examples/vctk/fastspeech2/baker/synthesize_e2e.sh
@ -1,14 +1,19 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
+python3 ${BIN_DIR}/synthesize_e2e.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences.txt \
-  --output-dir=exp/default/test_e2e \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output-dir=${train_output_path}/test_e2e \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/csmsc/tts3/local/train.sh
+++ b/examples/csmsc/tts3/local/train.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --nprocs=1 \
+    --phones-dict=dump/phone_id_map.txt
--- a/examples/csmsc/tts3/path.sh
+++ b/examples/csmsc/tts3/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with the use of `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md
+++ b/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md
@ -1,6 +1,6 @@
 # Parallel WaveGAN with CSMSC
 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
-## Preprocess the dataset
+## Dataset
 ### Download and Extract the dataset
 Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.

@ -8,12 +8,21 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to  cut silence in the edge of audio.
 You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.

-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
 Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.

@ -30,17 +39,15 @@ dump
    ├── raw
    └── feats_stats.npy
 ```
-
 The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set and are located in `dump/train/feats_stats.npy`.

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the path to the spectrogram of each utterance.

-## Train the model
-
-`./run.sh` calls `../train.py`.
+### Train the model
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 Here's the complete help message.

 ```text
@ -86,25 +93,10 @@ benchmark:
 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.

-## Pretrained Models
-
-Pretrained models can be downloaded here:
-1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.
-
-Parallel WaveGAN checkpoint contains files listed below.
-
-```text
-pwg_baker_ckpt_0.4
-├── pwg_default.yaml              # default config used to train parallel wavegan
-├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
-└── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
-```
-
-## Synthesize
-
-`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
@ -127,10 +119,21 @@ optional arguments:
 ```

 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
-2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
 4. `--output-dir` is the directory to save the synthesized audio files.
 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.

+## Pretrained Models
+Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
+
+Parallel WaveGAN checkpoint contains files listed below.
+
+```text
+pwg_baker_ckpt_0.4
+├── pwg_default.yaml              # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
+└── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
+```
 ## Acknowledgement
 We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
--- a/examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml
+++ b/examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml
--- a/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh
+++ b/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh
@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # extract features
    echo "Extract features ..."
-    python3 ../../preprocess.py \
+    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
   
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
--- a/examples/csmsc/voc1/local/synthesize.sh
+++ b/examples/csmsc/voc1/local/synthesize.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+  --config=${config_path} \
+  --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+  --test-metadata=dump/test/norm/metadata.jsonl \
+  --output-dir=${train_output_path}/test
--- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh
+++ b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh
@ -1,10 +1,13 @@
 #!/bin/bash

+config_path=$1
+train_output_path=$2
+
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
-python ../train.py \
+python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=1
--- a/examples/csmsc/voc1/path.sh
+++ b/examples/csmsc/voc1/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=parallelwave_gan
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
--- a/examples/csmsc/voc1/run.sh
+++ b/examples/csmsc/voc1/run.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_5000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/ljspeech/README.md
+++ b/examples/ljspeech/README.md
@ -3,4 +3,9 @@

 * tts0 - Tacotron2
 * tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
 * voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@ -0,0 +1,87 @@
+# Tacotron2  with LJSpeech
+PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
+
+## Dataset
+We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
+
+```bash
+wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+tar xjvf LJSpeech-1.1.tar.bz2
+```
+## Get Started
+Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize mels.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${preprocess_path}
+```
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
+                [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
+                [--nprocs NPROCS] [--opts ...]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config FILE         path of the config file to overwrite to default config
+                        with.
+  --data DATA_DIR       path to the datatset.
+  --output OUTPUT_DIR   path to save checkpoint and logs.
+  --checkpoint_path CHECKPOINT_PATH
+                        path of the checkpoint to load
+  --device {cpu,gpu}    device type to use, cpu and gpu are supported.
+  --nprocs NPROCS       number of parallel processes to use.
+  --opts ...            options to overwrite --config file and the default
+                        config, passing in KEY VALUE pairs
+```
+
+If you want to train on CPU, just set ``--device=cpu``.
+If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
+By default, training will be resumed from the latest checkpoint in ``--output``. If you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint.
+And if you want to resume from another existing model, you should set ``--checkpoint_path`` to be the checkpoint path you want to load.
+**Note: The checkpoint path cannot contain the file extension.**
+
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from a text file of sentences.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
+                     [--input INPUT] [--output OUTPUT] [--device DEVICE]
+                     [--opts ...] [-v]
+
+generate mel spectrogram with TransformerTTS.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config FILE         extra config to overwrite the default config
+  --checkpoint_path CHECKPOINT_PATH
+                        path of the checkpoint to load.
+  --input INPUT         path of the text sentences
+  --output OUTPUT       path to save outputs
+  --device DEVICE       device type to use.
+  --opts ...            options to overwrite --config file and the default
+                        config, passing in KEY VALUE pairs
+  -v, --verbose         print msg
+```
+**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to convert mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example.)
+
+## Pretrained Models
+Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
+
+1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
+
+2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
--- a/examples/ljspeech/tts0/local/preprocess.sh
+++ b/examples/ljspeech/tts0/local/preprocess.sh
@ -0,0 +1,8 @@
+#!/bin/bash
+
+preprocess_path=$1
+
+python3 ${BIN_DIR}/preprocess.py \
+    --input=~/datasets/LJSpeech-1.1 \
+    --output=${preprocess_path} \
+    -v  \
--- a/examples/ljspeech/tts0/local/synthesize.sh
+++ b/examples/ljspeech/tts0/local/synthesize.sh
@ -0,0 +1,11 @@
+#!/bin/bash
+
+train_output_path=$1
+ckpt_name=$2
+
+python3 ${BIN_DIR}/synthesize.py \
+    --config=${train_output_path}/config.yaml \
+    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
+    --input=${BIN_DIR}/../sentences_en.txt \
+    --output=${train_output_path}/test \
+    --device=gpu
--- a/examples/ljspeech/tts0/local/tacotron2/README.md
+++ b/examples/ljspeech/tts0/local/tacotron2/README.md
@ -1,92 +0,0 @@
-# Tacotron2
-
-PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
-
-## Project Structure
-
-```text
-├── config.py              # default configuration file
-├── ljspeech.py            # dataset and dataloader settings for LJSpeech
-├── preprocess.py          # script to preprocess LJSpeech dataset
-├── synthesize.py          # script to synthesize spectrogram from text
-├── train.py               # script for tacotron2 model training
-├── synthesize.ipynb       # notebook example for end-to-end TTS
-```
-
-## Dataset
-
-We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
-
-```bash
-wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-tar xjvf LJSpeech-1.1.tar.bz2
-```
-
-Then you need to preprocess the data by running ``preprocess.py``, the preprocessed data will be placed in ``--output`` directory.
-
-```bash
-python preprocess.py \
--input=${DATAPATH} \
--output=${PREPROCESSEDDATAPATH} \
-v  \
-```
-
-For more help on arguments
-
-``python preprocess.py --help``.
-
-## Train the model
-
-Tacotron2 model can be trained by running ``train.py``.
-
-```bash
-python train.py \
--data=${PREPROCESSEDDATAPATH} \
--output=${OUTPUTPATH} \
--device=gpu \
-```
-
-If you want to train on CPU, just set ``--device=cpu``.
-If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
-By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load.
-
-**Note: The checkpoint path cannot contain the file extension.**
-
-For more help on arguments
-
-``python train_transformer.py --help``.
-
-## Synthesize
-
-After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``.
-
-```bash
-python synthesize.py \
--config=${CONFIGPATH} \
--checkpoint_path=${CHECKPOINTPATH} \
--input=${TEXTPATH} \
--output=${OUTPUTPATH}
--device=gpu
-```
-
-The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``.
-
-For more help on arguments
-
-``python synthesize.py --help``.
-
-Then you can find the spectrogram files in ``${OUTPUTPATH}``, and then they can be the input of vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files.
-
-
-## Pretrained Models
-
-Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
-
-1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
-
-2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
-
-
-## Notebook: End-to-end TTS
-
-See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.
--- a/examples/ljspeech/tts0/local/train.sh
+++ b/examples/ljspeech/tts0/local/train.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device=gpu \
--- a/examples/ljspeech/tts0/path.sh
+++ b/examples/ljspeech/tts0/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=tacotron2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/ljspeech/tts0/run.sh
+++ b/examples/ljspeech/tts0/run.sh
@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+preprocess_path=preprocessed_ljspeech
+train_output_path=output
+ckpt_name=step-35000
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${preprocess_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize mels with the trained checkpoint
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1
+fi
+
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md
@ -8,12 +8,21 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
 ```bash
 tar xjvf LJSpeech-1.1.tar.bz2
 ```
-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
-Run the command below to preprocess the dataset.
-
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh.
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
 ```text
@ -35,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.
 ```text
@ -71,17 +80,6 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.

-## Pretrained Model
-Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
-
-TransformerTTS  checkpoint contains files listed below.
-```text
-transformer_tts_ljspeech_ckpt_0.4
-├── default.yaml             # default config used to train transformer_tts
-├── phone_id_map.txt         # phone vocabulary file when training transformer_tts
-├── snapshot_iter_201500.pdz # model parameters and optimizer states
-└── speech_stats.npy         # statistics used to normalize spectrogram when training transformer_tts
-```
 ## Synthesize
 We use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder.
 Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
@ -94,9 +92,9 @@ waveflow_ljspeech_ckpt_0.3
 ├── config.yaml           # default config used to train waveflow
 └── step-2000000.pdparams # model parameters of waveflow
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
@ -132,9 +130,9 @@ optional arguments:
  --device DEVICE       device type to use.
  --verbose VERBOSE     verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h]
@ -177,17 +175,30 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences.txt` using pretrained transformer_tts  and waveflow models.
+## Pretrained Model
+Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
+
+TransformerTTS  checkpoint contains files listed below.
+```text
+transformer_tts_ljspeech_ckpt_0.4
+├── default.yaml             # default config used to train transformer_tts
+├── phone_id_map.txt         # phone vocabulary file when training transformer_tts
+├── snapshot_iter_201500.pdz # model parameters and optimizer states
+└── speech_stats.npy         # statistics used to normalize spectrogram when training transformer_tts
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts  and waveflow models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e.py \
  --transformer-tts-config=transformer_tts_ljspeech_ckpt_0.4/default.yaml \
  --transformer-tts-checkpoint=transformer_tts_ljspeech_ckpt_0.4/snapshot_iter_201500.pdz \
  --transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
  --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
  --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=../sentences.txt \
+  --text=${BIN_DIR}/../sentences_en.txt \
  --output-dir=exp/default/test_e2e \
  --device="gpu" \
  --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh
@ -3,12 +3,12 @@
 stage=1
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ../preprocess.py  \
+    python3 ${BIN_DIR}/preprocess.py  \
        --dataset=ljspeech \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dumpdir=dump \
@ -27,21 +27,21 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh
@ -1,13 +1,18 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ../synthesize.py \
-  --transformer-tts-config=conf/default.yaml \
-  --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
+python3 ${BIN_DIR}/synthesize.py \
+  --transformer-tts-config=${config_path} \
+  --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --transformer-tts-stat=dump/train/speech_stats.npy \
  --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
  --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
  --test-metadata=dump/test/norm/metadata.jsonl \
-  --output-dir=exp/default/test \
+  --output-dir=${train_output_path}/test \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh
@ -1,13 +1,18 @@
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
-  --transformer-tts-config=conf/default.yaml \
-  --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
+python3 ${BIN_DIR}/synthesize_e2e.py \
+  --transformer-tts-config=${config_path} \
+  --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --transformer-tts-stat=dump/train/speech_stats.npy \
  --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
  --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-  --text=../sentences.txt \
-  --output-dir=exp/default/test_e2e \
+  --text=${BIN_DIR}/../sentences_en.txt \
+  --output-dir=${train_output_path}/test_e2e \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh
+++ b/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh
@ -1,9 +1,12 @@
 #!/bin/bash

-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=2 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts1/local/transformer_tts/sentences.txt
+++ b/examples/ljspeech/tts1/local/transformer_tts/sentences.txt
@ -1,9 +0,0 @@
-001 Life was like a box of chocolates, you never know what you're gonna get.
-002 With great power there must come great responsibility.
-003 To be or not to be, that’s a question.
-004 A man can be destroyed but not defeated
-005 Do not, for one repulse, give up the purpose that you resolved to effort.
-006 Death is just a part of life, something we're all destined to do.
-007 I think it's hard winning a war with words. 
-008 Don’t argue with the people of strong determination, because they may change the fact!
-009 Love you three thousand times.
--- a/examples/ljspeech/tts1/path.sh
+++ b/examples/ljspeech/tts1/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=transformer_tts
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/ljspeech/tts1/run.sh
+++ b/examples/ljspeech/tts1/run.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_403.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is waveflow
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is waveflow
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/vctk/fastspeech2/ljspeech/README.md
+++ b/examples/vctk/fastspeech2/ljspeech/README.md
@ -9,13 +9,22 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
 You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.

-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
 Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
-Run the command below to preprocess the dataset.
-
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.

@ -40,10 +49,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.
 ```text
@ -78,18 +87,7 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.

-## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
-
-FastSpeech2 checkpoint contains files listed below.
-```text
-fastspeech2_nosil_ljspeech_ckpt_0.5
-├── default.yaml             # default config used to train fastspeech2
-├── phone_id_map.txt         # phone vocabulary file when training fastspeech2
-├── snapshot_iter_100000.pdz # model parameters and optimizer states
-└── speech_stats.npy         # statistics used to normalize spectrogram when training fastspeech2
-```
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/ljspeech/) as the neural vocoder.
 Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
 ```bash
@ -102,9 +100,9 @@ pwg_ljspeech_ckpt_0.5
 ├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
 └── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -144,19 +142,19 @@ optional arguments:
  --device DEVICE       device type to use.
  --verbose VERBOSE     verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
-usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
-                         [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
-                         [--fastspeech2-stat FASTSPEECH2_STAT]
-                         [--pwg-config PWG_CONFIG]
-                         [--pwg-checkpoint PWG_CHECKPOINT]
-                         [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
-                         [--text TEXT] [--output-dir OUTPUT_DIR]
-                         [--device DEVICE] [--verbose VERBOSE]
+usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
+                            [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
+                            [--fastspeech2-stat FASTSPEECH2_STAT]
+                            [--pwg-config PWG_CONFIG]
+                            [--pwg-checkpoint PWG_CHECKPOINT]
+                            [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
+                            [--text TEXT] [--output-dir OUTPUT_DIR]
+                            [--device DEVICE] [--verbose VERBOSE]

 Synthesize with fastspeech2 & parallel wavegan.

@ -191,18 +189,31 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
+## Pretrained Model
+Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+```text
+fastspeech2_nosil_ljspeech_ckpt_0.5
+├── default.yaml             # default config used to train fastspeech2
+├── phone_id_map.txt         # phone vocabulary file when training fastspeech2
+├── snapshot_iter_100000.pdz # model parameters and optimizer states
+└── speech_stats.npy         # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e_en.py \
  --fastspeech2-config=fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
  --fastspeech2-checkpoint=fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
  --fastspeech2-stat=fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
  --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
  --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
-  --text=../sentences_en.txt \
+  --text=${BIN_DIR}/../sentences_en.txt \
  --output-dir=exp/default/test_e2e \
  --device="gpu" \
  --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
--- a/examples/vctk/fastspeech2/ljspeech/conf/default.yaml
+++ b/examples/vctk/fastspeech2/ljspeech/conf/default.yaml
--- a/examples/vctk/fastspeech2/ljspeech/preprocess.sh
+++ b/examples/vctk/fastspeech2/ljspeech/preprocess.sh
@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
        --dataset=ljspeech \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --num-cpu=8 \
        --cut-sil=True
 fi
@ -46,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and covert phone/speaker to id, dev and test should use train's stats
    echo "Normalize ..."
-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

-    python3 ../normalize.py \
+    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
--- a/examples/vctk/fastspeech2/ljspeech/synthesize.sh
+++ b/examples/vctk/fastspeech2/ljspeech/synthesize.sh
@ -1,15 +1,19 @@
-
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ../synthesize.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
+python3 ${BIN_DIR}/synthesize.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
  --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
  --test-metadata=dump/test/norm/metadata.jsonl \
-  --output-dir=exp/default/test \
+  --output-dir=${train_output_path}/test \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh
+++ b/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh
@ -1,15 +1,19 @@
-
 #!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
-  --fastspeech2-config=conf/default.yaml \
-  --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
+python3 ${BIN_DIR}/synthesize_e2e_en.py \
+  --fastspeech2-config=${config_path} \
+  --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
  --fastspeech2-stat=dump/train/speech_stats.npy \
  --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
  --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
  --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
-  --text=../sentences_en.txt \
-  --output-dir=exp/default/test_e2e \
+  --text=${BIN_DIR}/../sentences_en.txt \
+  --output-dir=${train_output_path}/test_e2e \
  --device="gpu" \
  --phones-dict=dump/phone_id_map.txt
--- a/examples/vctk/fastspeech2/ljspeech/run.sh
+++ b/examples/vctk/fastspeech2/ljspeech/run.sh
@ -1,9 +1,12 @@
 #!/bin/bash

-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=1 \
    --phones-dict=dump/phone_id_map.txt
--- a/examples/ljspeech/tts3/path.sh
+++ b/examples/ljspeech/tts3/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/ljspeech/tts3/run.sh
+++ b/examples/ljspeech/tts3/run.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_201.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/ljspeech/voc0/README.md
+++ b/examples/ljspeech/voc0/README.md
@ -0,0 +1,52 @@
+# WaveFlow with LJSpeech
+## Dataset
+### Download the dataset.
+```bash
+wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+```
+### Extract the dataset.
+```bash
+tar xjvf LJSpeech-1.1.tar.bz2
+```
+## Get Started
+Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs from mels.
+```bash
+./run.sh
+```
+### Preprocess the dataset.
+```bash
+./local/preprocess.sh ${preprocess_path}
+```
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+The training script requires 4 command line arguments.
+1. `--data` is the path of the training dataset.
+2. `--output` is the path of the output directory.
+3. `--device` should be "cpu" or "gpu"
+4. `--nprocs` is the number of processes to train the model in parallel.
+
+If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
+
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
+```
+
+Synthesize waveform.
+1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format.
+2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
+3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here.
+4. `--device` specifies the device to run synthesis on.
+
+## Pretrained Model
+Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
--- a/examples/ljspeech/voc0/local/preprocess.sh
+++ b/examples/ljspeech/voc0/local/preprocess.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+
+preprocess_path=$1
+
+python3 ${BIN_DIR}/preprocess.py \
+    --input=~/datasets/LJSpeech-1.1 \
+    --output=${preprocess_path}
--- a/examples/ljspeech/voc0/local/synthesize.sh
+++ b/examples/ljspeech/voc0/local/synthesize.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+
+input_mel_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+python ${BIN_DIR}/synthesize.py \
+    --input=${input_mel_path} \
+    --output=${train_output_path}/wavs/ \
+    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
+    --device="gpu" \
+    --verbose
--- a/examples/ljspeech/voc0/local/train.sh
+++ b/examples/ljspeech/voc0/local/train.sh
@ -0,0 +1,10 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device="gpu" \
+    --nprocs=1
--- a/examples/ljspeech/voc0/local/waveflow/README.md
+++ b/examples/ljspeech/voc0/local/waveflow/README.md
@ -1,52 +0,0 @@
-# WaveFlow with LJSpeech
-
-## Dataset
-
-### Download the datasaet.
-
-```bash
-wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-```
-
-### Extract the dataset.
-
-```bash
-tar xjvf LJSpeech-1.1.tar.bz2
-```
-
-### Preprocess the dataset.
-
-Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
-
-```bash
-python preprocess.py --input=LJSpeech-1.1/  --output=ljspeech_waveflow
-```
-
-## Train the model
-
-The training script requires 4 command line arguments.
-`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.)
-
-`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
-
-```bash
-python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1
-```
-
-If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
-
-## Synthesize
-
-Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
-
-`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here.
-
-`--device` specifies to device to run synthesis on.
-
-```bash
-python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose
-```
-
-## Pretrained Model
-
-Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
--- a/examples/ljspeech/voc0/path.sh
+++ b/examples/ljspeech/voc0/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=waveflow
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/ljspeech/voc0/run.sh
+++ b/examples/ljspeech/voc0/run.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+preprocess_path=preprocessed_ljspeech
+train_output_path=output
+# mel generated by Tacotron2
+input_mel_path=../tts0/output/test
+ckpt_name=step-10000
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${preprocess_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md
+++ b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md
@ -1,22 +1,28 @@
-# Parallel WaveGAN with the LJSpeech-1.1 dataset
-
+# Parallel WaveGAN with the LJSpeech-1.1
 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
-
-## Preprocess the dataset
-
+## Dataset
 ### Download and Extract the datasaet
 Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
-
 ### Get MFA results for silence trim
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to  cut silence in the edge of audio.
 You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.

-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
 Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset,
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.

@ -38,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.

@ -88,23 +94,10 @@ benchmark:
 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.

-## Pretrained Models
-Pretrained models can be downloaded here:
-1. Parallel WaveGAN checkpoint. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip), which is used as a vocoder in the end-to-end inference script.
-
-Parallel WaveGAN checkpoint contains files listed below.
-
-```text
-pwg_ljspeech_ckpt_0.5
-├── pwg_default.yaml              # default config used to train parallel wavegan
-├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
-└── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
-```
-
-## Synthesize
-`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
@ -127,10 +120,21 @@ optional arguments:
 ```

 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
-2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
 4. `--output-dir` is the directory to save the synthesized audio files.
 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.

+## Pretrained Models
+Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
+
+Parallel WaveGAN checkpoint contains files listed below.
+
+```text
+pwg_ljspeech_ckpt_0.5
+├── pwg_default.yaml              # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz  # generator parameters of parallel wavegan
+└── pwg_stats.npy                 # statistics used to normalize spectrogram when training parallel wavegan
+```
 ## Acknowledgement
 We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
--- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml
+++ b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml
--- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh
+++ b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh
@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
-    python3 ../../preprocess.py \
+    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dataset=ljspeech \
        --dumpdir=dump \
        --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
@ -39,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
   
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
--- a/examples/ljspeech/voc1/local/synthesize.sh
+++ b/examples/ljspeech/voc1/local/synthesize.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+  --config=${config_path} \
+  --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+  --test-metadata=dump/test/norm/metadata.jsonl \
+  --output-dir=${train_output_path}/test
--- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh
+++ b/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh
@ -1,10 +1,13 @@
 #!/bin/bash

+config_path=$1
+train_output_path=$2
+
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
-python ../train.py \
+python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
    --nprocs=1
--- a/examples/ljspeech/voc1/path.sh
+++ b/examples/ljspeech/voc1/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=parallelwave_gan
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
--- a/examples/ljspeech/voc1/run.sh
+++ b/examples/ljspeech/voc1/run.sh
@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_5000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/voxceleb/spk0/local/ge2e/README.md
+++ b/examples/voxceleb/spk0/local/ge2e/README.md
@ -1,97 +1,78 @@
 # Speaker Encoder
-
 This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_shell3). The trained speaker encoder is used to extract utterance embeddings from utterances.
-
 ## Model
-
 The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used.

-## File Structure
-
-```text
-ge2e
-├── README.md
-├── README_cn.md
-├── audio_processor.py
-├── config.py
-├── dataset_processors.py
-├── inference.py
-├── preprocess.py
-├── random_cycle.py
-├── speaker_verification_dataset.py
-└── train.py
-```
-
 ## Download Datasets
-
 Currently supported datasets are Librispeech-other-500, VoxCeleb, VoxCeleb2, ai-datatang-200zh, magicdata, which can be downloaded from the corresponding webpages.

 1. Librispeech/train-other-500
-
   An English multispeaker dataset，[URL](https://www.openslr.org/resources/12/train-other-500.tar.gz)，only the `train-other-500` subset is used.
-
 2. VoxCeleb1
-
   An English multispeaker dataset，[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev D should be downloaded, combined and extracted.
-
 3. VoxCeleb2
-
   An English multispeaker dataset, [URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html), Audio Files from Dev A to Dev H should be downloaded, combined and extracted.
-
 4. Aidatatang-200zh
-
   A Mandarin Chinese multispeaker dataset ，[URL](https://www.openslr.org/62/) .
-
 5. magicdata
-
   A Mandarin Chinese multispeaker dataset ，[URL](https://www.openslr.org/68/) .

 If you want to use other datasets, you can also download and preprocess it as long as it meets the requirements described below.

-## Preprocess Datasets
+## Get Started

+```bash
+./run.sh
+```
+
+### Preprocess Datasets
+`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`.
+```bash
+./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names}
+```
+Assume datasets_root is `~/datasets/GE2E`, and it has the following structure (we only use `train-other-500` for simplicity):
+```Text
+GE2E
+├── LibriSpeech
+└── (other datasets)
+```
 Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preprocessed datasets are organized in a file structure described below. The mel spectrogram of each utterance is saved in `.npy` format. The dataset is 2-stratified (speaker-utterance). Since multiple datasets are combined, to avoid conflict in speaker id, dataset name is prepended to the speaker ids.

 ```text
 dataset_root
 ├── dataset01_speaker01/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
+│   ├── utterance01.npy
+│   ├── utterance02.npy
+│   └── utterance03.npy
 ├── dataset01_speaker02/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
+│   ├── utterance01.npy
+│   ├── utterance02.npy
+│   └── utterance03.npy
 ├── dataset02_speaker01/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
+│   ├── utterance01.npy
+│   ├── utterance02.npy
+│   └── utterance03.npy
 └── dataset02_speaker02/
-    ├── utterance01.npy
-    ├── utterance02.npy
-    └── utterance03.npy
+    ├── utterance01.npy
+    ├── utterance02.npy
+    └── utterance03.npy
 ```
+In `${BIN_DIR}/preprocess.py`:
+1. `--datasets_root` is the directory that contains several extracted datasets.
+2. `--output_dir` is the directory to save the preprocessed dataset.
+3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.

-Run the command to preprocess datasets.
-
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
 ```
-
-Here `--datasets_root` is the directory that contains several extracted dataset; `--output_dir` is the directory to save the preprocessed dataset; `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are  librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
-
-## Training
-
-When preprocessing is done, run the command below to train the mdoel.
-
-```bash
-python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
-```
-
- `--data` is the path to the preprocessed dataset.
- `--output` is the directory to save results，usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training.
- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
- `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
+In `${BIN_DIR}/train.py`:
+1. `--data` is the path to the preprocessed dataset.
+2. `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file are loaded before training.
+3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
+4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training. Currently multiprocessing based parallel training is only enabled when using 'gpu' as the device.
+5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.

 Other options are described below.

@ -99,29 +80,23 @@ Other options are described below.
 - `--opts` is command line options to further override config files. It should be the last command line option passed, with multiple key-value pairs separated by spaces.
 - `--checkpoint_path` specifies the checkpoint to load before training, extension is not included. A parameter file ( `.pdparams`) and an optimizer state file ( `.pdopt`) with the same name are used. This option has a higher priority than auto-resuming from the `--output` directory.

-## Pretrained Model
-
-The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps.
-
-Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
-
-## Inference
-
+### Inference
 When training is done, run the command below to generate utterance embedding for each utterance in a dataset.
-
+`./local/inference.sh` calls `${BIN_DIR}/inference.py`.
 ```bash
-python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
+CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name}
 ```
+In `${BIN_DIR}/inference.py`:
+1. `--input` is the path of the dataset used for inference.
+2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corresponding utterance embedding file in `*.npy` format.
+3. `--checkpoint_path` is the path of the checkpoint to use, extension not included.
+4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`.
+5. `--device` and `--opts` have the same meaning as in the training script.

-`--input` is the path of the dataset used for inference.
-
-`--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format.
-
-`--checkpoint_path` is the path of the checkpoint to use, extension not included.
-
-`--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`.
+## Pretrained Model
+The pretrained model is first trained to 1560k steps on Librispeech-other-500 and voxceleb1. It is then trained on aidatatang_200zh and magicdata to 3000k steps.

-`--device` and `--opts` have the same meaning as in the training script.
+Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).

 ## References

--- a/examples/other/ge2e/local/inference.sh
+++ b/examples/other/ge2e/local/inference.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+#generate utterance embedding for each utterance in a dataset.
+infer_input=$1
+infer_output=$2
+train_output_path=$3
+ckpt_name=$4
+
+python3 ${BIN_DIR}/inference.py \
+    --input=${infer_input} \
+    --output=${infer_output} \
+    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
+    --device="gpu"
+
--- a/examples/other/ge2e/local/preprocess.sh
+++ b/examples/other/ge2e/local/preprocess.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+datasets_root=$1
+preprocess_path=$2
+dataset_names=$3
+
+python3 ${BIN_DIR}/preprocess.py \
+    --datasets_root=${datasets_root} \
+    --output_dir=${preprocess_path} \
+    --dataset_names=${dataset_names}
--- a/examples/other/ge2e/local/train.sh
+++ b/examples/other/ge2e/local/train.sh
@ -0,0 +1,10 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device="gpu" \
+    --nprocs=1
--- a/examples/other/ge2e/path.sh
+++ b/examples/other/ge2e/path.sh
@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=ge2e
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
--- a/examples/other/ge2e/run.sh
+++ b/examples/other/ge2e/run.sh
@ -0,0 +1,34 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+datasets_root=~/datasets/GE2E
+preprocess_path=dump
+dataset_names=librispeech_other
+train_output_path=output
+infer_input=infer_input
+infer_output=infer_output
+ckpt_name=step-10000
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} || exit -1
+fi
--- a/examples/other/punctuation_restoration/README.md
+++ b/examples/other/punctuation_restoration/README.md
@ -1,4 +1,3 @@
 # Punctuation Restoration

 Please use [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task.
-
--- a/examples/other/text_frontend/get_g2p_data.py
+++ b/examples/other/text_frontend/get_g2p_data.py
@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 from collections import defaultdict
 from pathlib import Path
--- a/examples/other/text_frontend/get_textnorm_data.py
+++ b/examples/other/text_frontend/get_textnorm_data.py
@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 from pathlib import Path

--- a/Show More
+++ b/Show More