From 35c37ace17c0f0d09c1c53fd25a82c8458d3e1e1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 4 Nov 2021 11:31:38 +0000 Subject: [PATCH] change nprocs to ngpu, add aishell3/voc1 --- examples/aishell3/tts3/README.md | 27 +- examples/aishell3/tts3/local/synthesize.sh | 1 - .../aishell3/tts3/local/synthesize_e2e.sh | 1 - examples/aishell3/tts3/local/train.sh | 2 +- examples/aishell3/vc0/README.md | 2 +- examples/aishell3/vc0/local/preprocess.sh | 1 - examples/aishell3/vc0/local/train.sh | 2 +- examples/aishell3/voc1/conf/default.yaml | 115 ++++++ examples/aishell3/voc1/local/preprocess.sh | 55 +++ examples/aishell3/voc1/local/synthesize.sh | 13 + examples/aishell3/voc1/local/train.sh | 13 + examples/aishell3/voc1/path.sh | 13 + examples/aishell3/voc1/run.sh | 32 ++ examples/csmsc/tts2/README.md | 31 +- examples/csmsc/tts2/local/synthesize.sh | 3 +- examples/csmsc/tts2/local/synthesize_e2e.sh | 1 - examples/csmsc/tts2/local/train.sh | 2 +- examples/csmsc/tts3/README.md | 26 +- examples/csmsc/tts3/local/synthesize.sh | 1 - examples/csmsc/tts3/local/synthesize_e2e.sh | 1 - examples/csmsc/tts3/local/train.sh | 2 +- examples/csmsc/voc1/README.md | 17 +- examples/csmsc/voc1/local/train.sh | 2 +- examples/csmsc/voc3/README.md | 32 +- examples/csmsc/voc3/local/train.sh | 2 +- examples/ljspeech/tts0/README.md | 18 +- examples/ljspeech/tts0/local/synthesize.sh | 2 +- examples/ljspeech/tts0/local/train.sh | 2 +- examples/ljspeech/tts1/README.md | 22 +- examples/ljspeech/tts1/local/synthesize.sh | 1 - .../ljspeech/tts1/local/synthesize_e2e.sh | 1 - examples/ljspeech/tts1/local/train.sh | 2 +- examples/ljspeech/tts3/README.md | 40 +-- examples/ljspeech/tts3/local/synthesize.sh | 1 - .../ljspeech/tts3/local/synthesize_e2e.sh | 1 - examples/ljspeech/tts3/local/train.sh | 2 +- examples/ljspeech/voc0/README.md | 7 +- examples/ljspeech/voc0/local/synthesize.sh | 2 +- examples/ljspeech/voc0/local/train.sh | 3 +- examples/ljspeech/voc1/README.md | 21 +- examples/ljspeech/voc1/local/train.sh | 2 +- examples/other/g2p/README.md | 20 ++ .../{text_frontend => g2p}/get_g2p_data.py | 0 examples/other/g2p/run.sh | 16 + .../other/{text_frontend => g2p}/test_g2p.py | 0 examples/other/ge2e/README.md | 5 +- examples/other/ge2e/local/inference.sh | 2 +- examples/other/ge2e/local/train.sh | 3 +- examples/other/text_frontend/README.md | 38 -- examples/other/text_frontend/make_sclite.sh | 13 - examples/other/text_frontend/run.sh | 25 -- examples/other/tn/README.md | 17 + .../data/textnorm_test_cases.txt | 0 .../get_textnorm_data.py | 0 examples/other/tn/run.sh | 17 + .../{text_frontend => tn}/test_textnorm.py | 0 examples/vctk/tts3/README.md | 26 +- examples/vctk/tts3/local/synthesize.sh | 1 - examples/vctk/tts3/local/synthesize_e2e.sh | 1 - examples/vctk/tts3/local/train.sh | 2 +- examples/vctk/voc1/README.md | 18 +- examples/vctk/voc1/local/train.sh | 2 +- .../fastspeech2/multi_spk_synthesize_e2e.py | 9 +- .../multi_spk_synthesize_e2e_en.py | 9 +- .../t2s/exps/fastspeech2/synthesize.py | 10 +- .../t2s/exps/fastspeech2/synthesize_e2e.py | 9 +- .../t2s/exps/fastspeech2/synthesize_e2e_en.py | 9 +- .../exps/fastspeech2/synthesize_e2e_melgan.py | 9 +- paddlespeech/t2s/exps/fastspeech2/train.py | 12 +- .../multi_band_melgan/synthesize.py | 9 +- .../gan_vocoder/multi_band_melgan/train.py | 12 +- .../parallelwave_gan/synthesize.py | 9 +- .../parallelwave_gan/synthesize_from_wav.py | 9 +- .../gan_vocoder/parallelwave_gan/train.py | 12 +- .../t2s/exps/gan_vocoder/preprocess.py | 16 +- paddlespeech/t2s/exps/ge2e/inference.py | 18 +- 
paddlespeech/t2s/exps/ge2e/train.py | 4 +- .../t2s/exps/speedyspeech/inference.py | 8 +- .../t2s/exps/speedyspeech/synthesize.py | 9 +- .../t2s/exps/speedyspeech/synthesize_e2e.py | 11 +- paddlespeech/t2s/exps/speedyspeech/train.py | 13 +- paddlespeech/t2s/exps/tacotron2/ljspeech.py | 9 +- paddlespeech/t2s/exps/tacotron2/synthesize.py | 9 +- paddlespeech/t2s/exps/tacotron2/train.py | 4 +- .../t2s/exps/transformer_tts/synthesize.py | 9 +- .../exps/transformer_tts/synthesize_e2e.py | 9 +- .../t2s/exps/transformer_tts/train.py | 12 +- .../voice_cloning/tacotron2_ge2e/train.py | 4 +- .../tacotron2_ge2e/voice_cloning.py | 10 +- paddlespeech/t2s/exps/waveflow/synthesize.py | 10 +- paddlespeech/t2s/exps/waveflow/train.py | 4 +- paddlespeech/t2s/frontend/pinyin.py | 333 ------------------ paddlespeech/t2s/training/cli.py | 3 +- paddlespeech/t2s/training/experiment.py | 9 +- tests/benchmark/pwgan/run_benchmark.sh | 4 +- tests/chains/speedyspeech/test.sh | 2 +- 96 files changed, 643 insertions(+), 715 deletions(-) create mode 100644 examples/aishell3/voc1/conf/default.yaml create mode 100755 examples/aishell3/voc1/local/preprocess.sh create mode 100755 examples/aishell3/voc1/local/synthesize.sh create mode 100755 examples/aishell3/voc1/local/train.sh create mode 100755 examples/aishell3/voc1/path.sh create mode 100755 examples/aishell3/voc1/run.sh create mode 100644 examples/other/g2p/README.md rename examples/other/{text_frontend => g2p}/get_g2p_data.py (100%) create mode 100755 examples/other/g2p/run.sh rename examples/other/{text_frontend => g2p}/test_g2p.py (100%) delete mode 100644 examples/other/text_frontend/README.md delete mode 100755 examples/other/text_frontend/make_sclite.sh delete mode 100755 examples/other/text_frontend/run.sh create mode 100644 examples/other/tn/README.md rename examples/other/{text_frontend => tn}/data/textnorm_test_cases.txt (100%) rename examples/other/{text_frontend => tn}/get_textnorm_data.py (100%) create mode 100755 examples/other/tn/run.sh rename examples/other/{text_frontend => tn}/test_textnorm.py (100%) delete mode 100644 paddlespeech/t2s/frontend/pinyin.py diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index c313d922..9f01ff45 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -67,8 +67,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -81,8 +81,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -92,10 +91,9 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 
-5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -122,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -149,8 +147,8 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --verbose VERBOSE verbose. + --ngpu NGPU if ngpu == 0, use cpu. + --verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file. ```bash @@ -166,7 +164,7 @@ usage: multi_spk_synthesize_e2e.py [-h] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -193,7 +191,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model. @@ -201,7 +199,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) @@ -231,7 +229,6 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh index 64361983..e9b893f8 100755 --- a/examples/aishell3/tts3/local/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index 8a979844..e1d84d21 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/tts3/local/train.sh b/examples/aishell3/tts3/local/train.sh index be6051c9..1da72f11 100755 --- a/examples/aishell3/tts3/local/train.sh +++ b/examples/aishell3/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 9a269ed5..c146198c 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -28,7 +28,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/../ge2e/inference.py \ --input=${input} \ --output=${preprocess_path}/embed \ - --device="gpu" \ + --ngpu=1 \ --checkpoint_path=${ge2e_ckpt_path} fi ``` diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index 87cfab32..e14dda53 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -12,7 +12,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${BIN_DIR}/../../ge2e/inference.py \ --input=${input} \ --output=${preprocess_path}/embed \ - --device="gpu" \ --checkpoint_path=${ge2e_ckpt_path} fi diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh index eb968b5f..f062cbbf 100755 --- a/examples/aishell3/vc0/local/train.sh +++ b/examples/aishell3/vc0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml new file mode 100644 index 00000000..ba2d9f2e --- /dev/null +++ b/examples/aishell3/voc1/conf/default.yaml @@ -0,0 +1,115 @@ +# This is the 
hyperparameter configuration file for Parallel WaveGAN.
+# Please make sure this is adjusted for the AISHELL-3 corpus. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+fs: 24000                # Sampling rate.
+n_fft: 2048              # FFT size. (in samples)
+n_shift: 300             # Hop size. (in samples)
+win_length: 1200         # Window length. (in samples)
+                         # If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+n_mels: 80               # Number of mel basis.
+fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+generator_params:
+    in_channels: 1        # Number of input channels.
+    out_channels: 1       # Number of output channels.
+    kernel_size: 3        # Kernel size of dilated convolution.
+    layers: 30            # Number of residual block layers.
+    stacks: 3             # Number of stacks i.e., dilation cycles.
+    residual_channels: 64 # Number of channels in residual conv.
+    gate_channels: 128    # Number of channels in gated conv.
+    skip_channels: 64     # Number of channels in skip conv.
+    aux_channels: 80      # Number of channels for auxiliary feature conv.
+                          # Must be the same as num_mels.
+    aux_context_window: 2 # Context window size for auxiliary feature.
+                          # If set to 2, previous 2 and future 2 frames will be considered.
+    dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
+    use_weight_norm: true # Whether to use weight norm.
+                          # If set to true, it will be applied to all of the conv layers.
+    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size.
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+discriminator_params:
+    in_channels: 1        # Number of input channels.
+    out_channels: 1       # Number of output channels.
+    kernel_size: 3        # Kernel size of conv layers.
+    layers: 10            # Number of conv layers.
+    conv_channels: 64     # Number of conv channels.
+    bias: true            # Whether to use bias parameter in conv.
+    use_weight_norm: true # Whether to use weight norm.
+                          # If set to true, it will be applied to all of the conv layers.
+    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
+    nonlinear_activation_params:      # Nonlinear function parameters.
+        negative_slope: 0.2           # Alpha in LeakyReLU.
+
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+stft_loss_params:
+    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
+    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
+    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
+    window: "hann"                # Window function for STFT-based loss.
+
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+lambda_adv: 4.0  # Loss balancing coefficient.
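+                 # (Assumed from the standard Parallel WaveGAN objective, not verified
+                 #  against this repo's exact trainer code: the generator loss is roughly
+                 #  stft_loss + lambda_adv * adversarial_loss, with the adversarial term
+                 #  enabled only after discriminator_train_start_steps.)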
+ +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 8 # Batch size. +batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size. +pin_memory: true # Whether to pin memory in Pytorch DataLoader. +num_workers: 4 # Number of workers in Pytorch DataLoader. +remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. +allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-6 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 0.0001 # Generator's learning rate. + step_size: 200000 # Generator's scheduler step size. + gamma: 0.5 # Generator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +generator_grad_norm: 10 # Generator's gradient norm. +discriminator_optimizer_params: + epsilon: 1.0e-6 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 0.00005 # Discriminator's learning rate. + step_size: 200000 # Discriminator's scheduler step size. + gamma: 0.5 # Discriminator's scheduler gamma. + # At each step size, lr will be multiplied by this parameter. +discriminator_grad_norm: 1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh new file mode 100755 index 00000000..44cc3dbe --- /dev/null +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/data_aishell3/ \ + --dataset=aishell3 \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." 
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 100755 index 00000000..9f904ac0 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 100755 index 00000000..9695631e --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 100755 index 00000000..1e6647b8 --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/run.sh b/examples/aishell3/voc1/run.sh new file mode 100755 index 00000000..7d0fdb21 --- /dev/null +++ b/examples/aishell3/voc1/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choice the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
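+# illustrative usage (an assumption based on the stages defined below):
+#   ./run.sh --stage 0 --stop-stage 1   # preprocess and train only
+#   ./run.sh --stage 2 --stop-stage 2   # only synthesize from an existing checkpoint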
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index de9e488c..b3c35e1e 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -55,10 +55,10 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] - [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] + [--use-relative-path USE_RELATIVE_PATH] + [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with sigle speaker dataset. @@ -71,8 +71,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --use-relative-path USE_RELATIVE_PATH whether use relative path in metadata @@ -85,10 +84,9 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. -7. `--tones-dict` is the path of the tone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. +6. `--tones-dict` is the path of the tone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -115,7 +113,7 @@ usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with speedyspeech & parallel wavegan. @@ -145,7 +143,7 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use + --ngpu NGPU if ngpu == 0, use cpu. 
--verbose VERBOSE verbose ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -161,8 +159,8 @@ usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG] [--pwg-stat PWG_STAT] [--text TEXT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--output-dir OUTPUT_DIR] - [--inference-dir INFERENCE_DIR] [--device DEVICE] - [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--verbose VERBOSE] + [--ngpu NGPU] Synthesize with speedyspeech & parallel wavegan. @@ -190,15 +188,15 @@ optional arguments: output dir --inference-dir INFERENCE_DIR dir to save inference models - --device DEVICE device type to use --verbose VERBOSE verbose + --ngpu NGPU if ngpu == 0, use cpu. ``` 1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model. 2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model. 3. `--text` is the text file, which contains sentences to synthesize. 4. `--output-dir` is the directory to save synthesized audio files. 5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 7. `--phones-dict` is the path of the phone vocabulary file. 8. `--tones-dict` is the path of the tone vocabulary file. @@ -237,7 +235,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \ --tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt ``` diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index 418ee02e..8be02dfb 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize.py \ --output-dir=${train_output_path}/test \ --inference-dir=${train_output_path}/inference \ --phones-dict=dump/phone_id_map.txt \ - --tones-dict=dump/tone_id_map.txt \ - --device="gpu" + --tones-dict=dump/tone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index c50fa776..3cbc7936 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -16,6 +16,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/tts2/local/train.sh b/examples/csmsc/tts2/local/train.sh index e44c7da5..f0a5a683 100755 --- a/examples/csmsc/tts2/local/train.sh +++ b/examples/csmsc/tts2/local/train.sh @@ -9,7 +9,7 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True diff --git a/examples/csmsc/tts3/README.md 
b/examples/csmsc/tts3/README.md index 7eeb14fc..a45eec13 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -59,8 +59,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -73,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -84,9 +83,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. @@ -113,7 +111,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -140,7 +138,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -155,7 +153,8 @@ usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -179,7 +178,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -188,7 +189,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. 
`--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Inference After Synthesize, we will get static models of fastspeech2 and pwgan in `${train_output_path}/inference`. @@ -224,6 +225,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index 724afb04..e525fc16 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index b6542743..cc27ffb6 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -16,5 +16,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output-dir=${train_output_path}/test_e2e \ --inference-dir=${train_output_path}/inference \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts3/local/train.sh b/examples/csmsc/tts3/local/train.sh index fbbc9a9d..f90db915 100755 --- a/examples/csmsc/tts3/local/train.sh +++ b/examples/csmsc/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 4b6b6c42..c3256b78 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -53,9 +53,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -69,8 +68,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -90,8 +88,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. 
`--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,7 +98,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -114,7 +111,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -122,7 +119,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). diff --git a/examples/csmsc/voc1/local/train.sh b/examples/csmsc/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc1/local/train.sh +++ b/examples/csmsc/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 780a8ccd..757d1a36 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -53,12 +53,9 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] [--verbose VERBOSE] -Train a ParallelWaveGAN model. +Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit @@ -69,29 +66,14 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. 
`--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -101,7 +83,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -114,7 +96,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -122,6 +104,6 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/csmsc/voc3/local/train.sh +++ b/examples/csmsc/voc3/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index e8e3ebff..f33d925d 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -30,8 +30,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} Here's the complete help message. ```text usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR] - [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}] - [--nprocs NPROCS] [--opts ...] + [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...] optional arguments: -h, --help show this help message and exit @@ -41,16 +40,15 @@ optional arguments: --output OUTPUT_DIR path to save checkpoint and logs. --checkpoint_path CHECKPOINT_PATH path of the checkpoint to load - --device {cpu,gpu} device type to use, cpu and gpu are supported. - --nprocs NPROCS number of parallel processes to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs ``` -If you want to train on CPU, just set ``--device=cpu``. -If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU. -By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. -And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. +If you want to train on CPU, just set `--ngpu=0`. +If you want to train on multiple GPUs, just set `--ngpu` as num of GPU. 
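+For example (an illustrative setup, not taken verbatim from this recipe): editing `./local/train.sh` to pass `--ngpu=4` and launching it with `CUDA_VISIBLE_DEVICES=0,1,2,3` runs training on 4 GPUs.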
+By default, training will be resumed from the latest checkpoint in `--output`, if you want to start a new training, please use a new `${OUTPUTPATH}` with no checkpoint. +And if you want to resume from an other existing model, you should set `checkpoint_path` to be the checkpoint path you want to load. **Note: The checkpoint path cannot contain the file extension.** ### Synthesize @@ -60,7 +58,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_n ``` ```text usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH] - [--input INPUT] [--output OUTPUT] [--device DEVICE] + [--input INPUT] [--output OUTPUT] [--ngpu NGPU] [--opts ...] [-v] generate mel spectrogram with TransformerTTS. @@ -72,7 +70,7 @@ optional arguments: path of the checkpoint to load. --input INPUT path of the text sentences --output OUTPUT path to save outputs - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --opts ... options to overwrite --config file and the default config, passing in KEY VALUE pairs -v, --verbose print msg diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh index 91c89dd4..02147803 100755 --- a/examples/ljspeech/tts0/local/synthesize.sh +++ b/examples/ljspeech/tts0/local/synthesize.sh @@ -8,4 +8,4 @@ python3 ${BIN_DIR}/synthesize.py \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ --input=${BIN_DIR}/../sentences_en.txt \ --output=${train_output_path}/test - --device=gpu \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh index b8bcf5cb..a94f955a 100755 --- a/examples/ljspeech/tts0/local/train.sh +++ b/examples/ljspeech/tts0/local/train.sh @@ -6,4 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device=gpu \ \ No newline at end of file + --ngpu=1 \ \ No newline at end of file diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 0385fdce..625f296a 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -53,8 +53,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] Train a TransformerTTS model with LJSpeech TTS dataset. @@ -67,8 +66,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -76,9 +74,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. 
`--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. @@ -104,7 +101,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -127,7 +124,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. @@ -142,7 +139,7 @@ usage: synthesize_e2e.py [-h] [--waveflow-config WAVEFLOW_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -165,7 +162,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model. @@ -173,7 +170,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model can be downloaded here. 
[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) @@ -200,6 +197,5 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts1/local/synthesize.sh b/examples/ljspeech/tts1/local/synthesize.sh index 5d1c9534..9fe837a4 100755 --- a/examples/ljspeech/tts1/local/synthesize.sh +++ b/examples/ljspeech/tts1/local/synthesize.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh index 333a5cd6..046fdb70 100755 --- a/examples/ljspeech/tts1/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh @@ -14,5 +14,4 @@ python3 ${BIN_DIR}/synthesize_e2e.py \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/train.sh b/examples/ljspeech/tts1/local/train.sh index 8527f57f..5e255fb8 100755 --- a/examples/ljspeech/tts1/local/train.sh +++ b/examples/ljspeech/tts1/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=2 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index dc711ce8..0bcdf372 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -58,8 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -72,8 +72,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -83,9 +82,8 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. 
`--phones-dict` is the path of the phone vocabulary file. ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. @@ -112,7 +110,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -139,7 +137,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -147,14 +145,15 @@ optional arguments: CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--inference-dir INFERENCE_DIR] [--ngpu NGPU] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -178,7 +177,9 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --inference-dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -187,7 +188,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) @@ -215,6 +216,5 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt ``` diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh index 32dcde58..9b22abb3 100755 --- a/examples/ljspeech/tts3/local/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh index 28ea3a8f..c723feef 100755 --- a/examples/ljspeech/tts3/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -15,5 +15,4 @@ python3 ${BIN_DIR}/synthesize_e2e_en.py \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh index 847a44e3..d1302f99 100755 --- a/examples/ljspeech/tts3/local/train.sh +++ b/examples/ljspeech/tts3/local/train.sh @@ -8,5 +8,5 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 6163ae42..ad2337ef 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -31,10 +31,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ The training script requires 4 command line arguments. 1. `--data` is the path of the training dataset. 2. `--output` is the path of the output directory. -3. `--device` should be "cpu" or "gpu" -4. `--nprocs` is the number of processes to train the model in parallel. +3. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet. +If you want distributed training, set a larger `--ngpu` (e.g. 4). Note that distributed training with cpu is not supported yet. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels. @@ -46,7 +45,7 @@ Synthesize waveform. 1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. 2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does. 3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here. -4. `--device` specifies to device to run synthesis on. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained Model with residual channel equals 128 can be downloaded here. 
[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh index 055542cf..1d5e1183 100755 --- a/examples/ljspeech/voc0/local/synthesize.sh +++ b/examples/ljspeech/voc0/local/synthesize.sh @@ -8,5 +8,5 @@ python ${BIN_DIR}/synthesize.py \ --input=${input_mel_path} \ --output=${train_output_path}/wavs/ \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" \ + --ngpu=1 \ --verbose \ No newline at end of file diff --git a/examples/ljspeech/voc0/local/train.sh b/examples/ljspeech/voc0/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/ljspeech/voc0/local/train.sh +++ b/examples/ljspeech/voc0/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index ba6eb002..2cc196fb 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -53,11 +53,10 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] - [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -70,8 +69,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -91,8 +89,7 @@ benchmark: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. @@ -102,7 +99,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -115,7 +112,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -123,7 +120,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. 
Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/ljspeech/voc1/local/train.sh +++ b/examples/ljspeech/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md new file mode 100644 index 00000000..14bd0d9d --- /dev/null +++ b/examples/other/g2p/README.md @@ -0,0 +1,20 @@ +# G2P +For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. + +You should Download BZNSYP from it's [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. + +We use `WER` as evaluation criterion. + +# Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg WER` of g2p is: 0.027495061517943988 +```text + ,--------------------------------------------------------------------. + | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+-----------------+-----------------------------------------| + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + `--------------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py similarity index 100% rename from examples/other/text_frontend/get_g2p_data.py rename to examples/other/g2p/get_g2p_data.py diff --git a/examples/other/g2p/run.sh b/examples/other/g2p/run.sh new file mode 100755 index 00000000..214b8b3d --- /dev/null +++ b/examples/other/g2p/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +source path.sh +USE_SCLITE=true + +# test g2p +echo "Start get g2p test data ..." +python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p +echo "Start test g2p ..." +python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite g2p ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all +fi diff --git a/examples/other/text_frontend/test_g2p.py b/examples/other/g2p/test_g2p.py similarity index 100% rename from examples/other/text_frontend/test_g2p.py rename to examples/other/g2p/test_g2p.py diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index 1fa9677a..17e591b3 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -70,8 +70,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_ In `${BIN_DIR}/train.py`: 1. `--data` is the path to the preprocessed dataset. 2. 
`--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. -3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. -4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. Other options are described below. @@ -91,7 +90,7 @@ In `${BIN_DIR}/inference.py`: 2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format. 3. `--checkpoint_path` is the path of the checkpoint to use, extension not included. 4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. -5. `--device` and `--opts` have the same meaning as in the training script. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. diff --git a/examples/other/ge2e/local/inference.sh b/examples/other/ge2e/local/inference.sh index 1beebdfa..431c5309 100755 --- a/examples/other/ge2e/local/inference.sh +++ b/examples/other/ge2e/local/inference.sh @@ -10,5 +10,5 @@ python3 ${BIN_DIR}/inference.py \ --input=${infer_input} \ --output=${infer_output} \ --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ - --device="gpu" + --ngpu=1 diff --git a/examples/other/ge2e/local/train.sh b/examples/other/ge2e/local/train.sh index 5c4defd9..f062cbbf 100755 --- a/examples/other/ge2e/local/train.sh +++ b/examples/other/ge2e/local/train.sh @@ -6,5 +6,4 @@ train_output_path=$2 python3 ${BIN_DIR}/train.py \ --data=${preprocess_path} \ --output=${train_output_path} \ - --device="gpu" \ - --nprocs=1 \ No newline at end of file + --ngpu=1 \ No newline at end of file diff --git a/examples/other/text_frontend/README.md b/examples/other/text_frontend/README.md deleted file mode 100644 index 0bf6e72d..00000000 --- a/examples/other/text_frontend/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Chinese Text Frontend Example -Here's an example for Chinese text frontend, including g2p and text normalization. -## G2P -For g2p, we use BZNSYP's phone label as the ground truth and we delete silence tokens in labels and predicted phones. - -You should Download BZNSYP from it's [Official Website](https://test.data-baker.com/data/index/source) and extract it. Assume the path to the dataset is `~/datasets/BZNSYP`. - -We use `WER` as evaluation criterion. -## Text Normalization -For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. - -We use `CER` as evaluation criterion. -## Start -If you want to use sclite to get more detail information of WER, you should run the command below to make sclite first. -```bash -./make_sclite.sh -``` -Run the command below to get the results of test. 
-```bash -./run.sh -``` -The `avg WER` of g2p is: 0.027495061517943988 -```text - ,--------------------------------------------------------------------. - | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | - `--------------------------------------------------------------------' -``` - -The `avg CER` of text normalization is: 0.006388318503308237 -```text - ,-----------------------------------------------------------------. - | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | - `-----------------------------------------------------------------' -``` diff --git a/examples/other/text_frontend/make_sclite.sh b/examples/other/text_frontend/make_sclite.sh deleted file mode 100755 index db8c921c..00000000 --- a/examples/other/text_frontend/make_sclite.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if [ ! -d "./SCTK" ];then - echo "Clone SCTK ..." - git clone https://github.com/usnistgov/SCTK - echo "Clone SCTK done!" -fi - -if [ ! -d "./SCTK/bin" ];then - echo "Start make SCTK ..." - pushd SCTK && make config && make all && make check && make install && make doc && popd - echo "SCTK make done!" -fi diff --git a/examples/other/text_frontend/run.sh b/examples/other/text_frontend/run.sh deleted file mode 100755 index 9882b057..00000000 --- a/examples/other/text_frontend/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -USE_SCLITE=true - -# test g2p -echo "Start get g2p test data ..." -python3 get_g2p_data.py --root-dir=~/datasets/BZNSYP --output-dir=data/g2p -echo "Start test g2p ..." -python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p - -# test text normalization -echo "Start get text normalization test data ..." -python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm -echo "Start test text normalization ..." -python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm - -# whether use sclite to get more detail information of WER -if [ "$USE_SCLITE" = true ];then - echo "Start sclite g2p ..." - ./SCTK/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all - echo - - echo "Start sclite textnorm ..." - ./SCTK/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all -fi \ No newline at end of file diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md new file mode 100644 index 00000000..dfefccde --- /dev/null +++ b/examples/other/tn/README.md @@ -0,0 +1,17 @@ +# Text Normalization +For text normalization, the test data is `data/textnorm_test_cases.txt`, we use `|` as the separator of raw_data and normed_data. + +We use `CER` as evaluation criterion. +## Start +Run the command below to get the results of test. +```bash +./run.sh +``` +The `avg CER` of text normalization is: 0.006388318503308237 +```text + ,-----------------------------------------------------------------. 
+ | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | + |--------+--------------+-----------------------------------------| + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + `-----------------------------------------------------------------' +``` diff --git a/examples/other/text_frontend/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt similarity index 100% rename from examples/other/text_frontend/data/textnorm_test_cases.txt rename to examples/other/tn/data/textnorm_test_cases.txt diff --git a/examples/other/text_frontend/get_textnorm_data.py b/examples/other/tn/get_textnorm_data.py similarity index 100% rename from examples/other/text_frontend/get_textnorm_data.py rename to examples/other/tn/get_textnorm_data.py diff --git a/examples/other/tn/run.sh b/examples/other/tn/run.sh new file mode 100755 index 00000000..e8298a84 --- /dev/null +++ b/examples/other/tn/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source path.sh + +USE_SCLITE=true + +# test text normalization +echo "Start get text normalization test data ..." +python3 get_textnorm_data.py --test-file=data/textnorm_test_cases.txt --output-dir=data/textnorm +echo "Start test text normalization ..." +python3 test_textnorm.py --input-dir=data/textnorm --output-dir=exp/textnorm + +# whether use sclite to get more detail information of WER +if [ "$USE_SCLITE" = true ];then + echo "Start sclite textnorm ..." + ${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/textnorm/text.ref.clean trn -h ./exp/textnorm/text.tn trn -e utf-8 -o all +fi \ No newline at end of file diff --git a/examples/other/text_frontend/test_textnorm.py b/examples/other/tn/test_textnorm.py similarity index 100% rename from examples/other/text_frontend/test_textnorm.py rename to examples/other/tn/test_textnorm.py diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 717ee7ac..994fe058 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -61,8 +61,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] Train a FastSpeech2 model. @@ -75,8 +75,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu=0, use cpu. --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. @@ -86,9 +85,7 @@ optional arguments: 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -6. `--phones-dict` is the path of the phone vocabulary file. +4. `--phones-dict` is the path of the phone vocabulary file. 
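The training entry points in this patch all follow the same `--ngpu` convention: `ngpu == 0` (or a Paddle build without CUDA) runs on CPU, `ngpu == 1` runs on a single GPU, and `ngpu > 1` spawns one worker process per GPU. A minimal sketch of that dispatch, assuming only the `--ngpu` argument (the real scripts also load a yaml config and build the dataloaders, model and trainer):

```python
import argparse

import paddle
import paddle.distributed as dist


def train_sp(args, config):
    # ngpu == 0, or a Paddle build without CUDA, falls back to the CPU device.
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    # ... the real scripts build dataloaders, the model and the Trainer here ...
    print(f"rank {dist.get_rank()} running on {paddle.get_device()}")


def main():
    parser = argparse.ArgumentParser(description="Train a model.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()
    config = None  # stands in for the yaml config the real scripts load

    # More than one GPU means one spawned worker process per GPU.
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
```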
### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. @@ -116,7 +113,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] [--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -143,7 +140,7 @@ optional arguments: test metadata. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file. @@ -161,7 +158,7 @@ usage: multi_spk_synthesize_e2e_en.py [-h] [--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -187,7 +184,7 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -196,7 +193,7 @@ optional arguments: 3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 4. `--text` is the text file, which contains sentences to synthesize. 5. `--output-dir` is the directory to save synthesized audio files. -6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) @@ -218,14 +215,13 @@ FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_96400.pdz \ + --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ --fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ - --text=${BIN_DIR}/../sentences.txt \ + --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=exp/default/test_e2e \ - --device="gpu" \ --phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt ``` diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index ca112969..8165c858 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/synthesize.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ --output-dir=${train_output_path}/test \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index d919bb08..e0b2a041 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -15,6 +15,5 @@ python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ --text=${BIN_DIR}/../sentences_en.txt \ --output-dir=${train_output_path}/test_e2e \ - --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh index be6051c9..3a507650 100755 --- a/examples/vctk/tts3/local/train.sh +++ b/examples/vctk/tts3/local/train.sh @@ -8,6 +8,6 @@ python3 ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=2 \ + --ngpu=1 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index cbfff32d..d2d2d48c 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -58,9 +58,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE] - [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] + [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -74,8 +73,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device type to use. - --nprocs NPROCS number of processes. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. benchmark: @@ -95,8 +93,8 @@ benchmark: 1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. -4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. -5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + ### Synthesize `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -105,7 +103,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] + [--ngpu NGPU] [--verbose VERBOSE] Synthesize with parallel wavegan. @@ -118,7 +116,7 @@ optional arguments: dev data. --output-dir OUTPUT_DIR output dir. - --device DEVICE device to run. + --ngpu NGPU if ngpu == 0, use cpu. --verbose VERBOSE verbose. ``` @@ -126,7 +124,7 @@ optional arguments: 2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. -5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). 
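The synthesis and inference scripts changed below map `--ngpu` to a Paddle device with the same three-way check everywhere. A minimal sketch of that check, using an illustrative helper name `device_from_ngpu` (the scripts in this patch inline the `if`/`elif`/`else` right after `parser.parse_args()`):

```python
import paddle


def device_from_ngpu(ngpu: int) -> str:
    # Illustrative helper: the synthesize/inference scripts inline this check.
    if ngpu == 0:
        return "cpu"
    if ngpu > 0:
        return "gpu"
    # The scripts only print "ngpu should >= 0 !" here; raising is stricter.
    raise ValueError("ngpu should be >= 0")


if __name__ == "__main__":
    paddle.set_device(device_from_ngpu(0))  # e.g. force CPU synthesis
```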
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh index 1ef860c3..9695631e 100755 --- a/examples/vctk/voc1/local/train.sh +++ b/examples/vctk/voc1/local/train.sh @@ -10,4 +10,4 @@ python ${BIN_DIR}/train.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --config=${config_path} \ --output-dir=${train_output_path} \ - --nprocs=1 + --ngpu=1 diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 98cf9f8f..a90658c6 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -145,12 +145,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index 9e29eea1..b5d0ce17 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -154,12 +154,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 1beac5ce..207275f9 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -145,12 +145,16 @@ def main(): parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index b6a8fc58..ff9a41ea 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -155,12 
+155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py index 7a55fbb1..6e3434a7 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py @@ -145,12 +145,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index 92a43d5c..f0ff5655 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -160,12 +160,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.fastspeech2_config) as f: fastspeech2_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 5662d15d..38ac2fe3 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -43,7 +43,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -174,9 +174,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") 
parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") @@ -187,8 +185,6 @@ def main(): help="speaker id map file for multiple speaker model.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -202,8 +198,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py index 720b08ce..9ea76a83 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py @@ -37,7 +37,7 @@ def main(): parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = MelGANGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 45704607..ca3c0a1f 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -50,7 +50,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -238,14 +238,10 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -259,8 +255,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py index ce90aaf4..f275ed44 100644 --- 
a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -37,7 +37,7 @@ def main(): parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() @@ -53,7 +53,12 @@ def main(): f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" ) - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") generator = PWGGenerator(**config["generator_params"]) state_dict = paddle.load(args.checkpoint) generator.set_state_dict(state_dict["generator_params"]) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index a04a547e..ca2e3f55 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -92,12 +92,17 @@ def main(): parser.add_argument("--input-dir", type=str, help="input dir of wavs.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device to run.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 99801267..42ef8830 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -47,7 +47,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -215,9 +215,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") benchmark_group = parser.add_argument_group( @@ -241,8 +239,6 @@ def main(): ) args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config, 'rt') as f: config = CfgNode(yaml.safe_load(f)) @@ -261,8 +257,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + 
dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 852b0c91..782fbdf2 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -226,8 +226,22 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files else: - print("dataset should in {baker, ljspeech, vctk} now!") + print("dataset should in {baker, ljspeech, vctk, aishell3} now!") train_dump_dir = dumpdir / "train" / "raw" train_dump_dir.mkdir(parents=True, exist_ok=True) diff --git a/paddlespeech/t2s/exps/ge2e/inference.py b/paddlespeech/t2s/exps/ge2e/inference.py index a5733941..eed3b794 100644 --- a/paddlespeech/t2s/exps/ge2e/inference.py +++ b/paddlespeech/t2s/exps/ge2e/inference.py @@ -51,7 +51,13 @@ def _process_utterance(ifpath: Path, def main(config, args): - paddle.set_device(args.device) + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # load model model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, @@ -112,13 +118,6 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, help="path of the checkpoint to load") - # running - parser.add_argument( - "--device", - type=str, - choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - # overwrite extra config and default config parser.add_argument( "--opts", @@ -126,6 +125,9 @@ if __name__ == "__main__": help="options to overwrite --config file and the default config, passing in KEY VALUE pairs" ) + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + args = parser.parse_args() if args.config: config.merge_from_file(args.config) diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/t2s/exps/ge2e/train.py index d3a57c93..55c6daf7 100644 --- a/paddlespeech/t2s/exps/ge2e/train.py +++ b/paddlespeech/t2s/exps/ge2e/train.py @@ -102,8 +102,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 49ce37f2..617848c5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -96,10 +96,10 @@ def main(): input_ids = frontend.get_input_ids( sentence, merge_sentences=True, get_tone_ids=True) - phone_ids = input_ids["phone_ids"].numpy() - tone_ids = input_ids["tone_ids"].numpy() - phones = phone_ids[0] - tones = tone_ids[0] + phone_ids = input_ids["phone_ids"] + tone_ids = input_ids["tone_ids"] + phones = phone_ids[0].numpy() + tones = tone_ids[0].numpy() if args.enable_auto_log: logger.times.stamp() diff 
--git a/paddlespeech/t2s/exps/speedyspeech/synthesize.py b/paddlespeech/t2s/exps/speedyspeech/synthesize.py index 4482c179..67d56ea5 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize.py @@ -155,12 +155,17 @@ def main(): parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0870d466..0e64088d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -170,13 +170,18 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") - parser.add_argument( - "--device", type=str, default="gpu", help="device type to use") parser.add_argument("--verbose", type=int, default=1, help="verbose") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") args, _ = parser.parse_known_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.speedyspeech_config) as f: speedyspeech_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 772a39d7..d9a2fbf4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -43,7 +43,7 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -167,9 +167,7 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): @@ -189,8 +187,7 @@ def main(): # 这里可以多传入 max_epoch 等 args, rest = parser.parse_known_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") + with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -212,8 +209,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/tacotron2/ljspeech.py 
b/paddlespeech/t2s/exps/tacotron2/ljspeech.py index 08db2a64..4facde40 100644 --- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py +++ b/paddlespeech/t2s/exps/tacotron2/ljspeech.py @@ -67,16 +67,19 @@ class LJSpeechCollector(object): # Sort by text_len in descending order texts = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(texts, text_lens), key=lambda x: x[1], reverse=True) ] mels = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(mels, text_lens), key=lambda x: x[1], reverse=True) ] mel_lens = [ - i for i, _ in sorted( + i + for i, _ in sorted( zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True) ] diff --git a/paddlespeech/t2s/exps/tacotron2/synthesize.py b/paddlespeech/t2s/exps/tacotron2/synthesize.py index 613fec02..c73c32d2 100644 --- a/paddlespeech/t2s/exps/tacotron2/synthesize.py +++ b/paddlespeech/t2s/exps/tacotron2/synthesize.py @@ -25,7 +25,12 @@ from paddlespeech.t2s.utils import display def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") # model frontend = EnglishCharacter() @@ -77,7 +82,7 @@ if __name__ == "__main__": parser.add_argument("--input", type=str, help="path of the text sentences") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index a5f08360..8198348f 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -199,8 +199,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py index 82fd8f15..666c3b72 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -117,12 +117,17 @@ def main(): parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index 993749f0..ba197f43 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -136,12 +136,17 @@ def main(): help="text to synthesize, a 'utt_id sentence' pair per line.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device 
type to use.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") with open(args.transformer_tts_config) as f: transformer_tts_config = CfgNode(yaml.safe_load(f)) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 7d9020a3..163339f4 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -42,7 +42,7 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if not paddle.is_compiled_with_cuda(): + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: paddle.set_device("cpu") else: paddle.set_device("gpu") @@ -164,16 +164,12 @@ def main(): parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") - parser.add_argument( - "--nprocs", type=int, default=1, help="number of processes.") + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") args = parser.parse_args() - if args.device == "cpu" and args.nprocs > 1: - raise RuntimeError("Multiprocess training on CPU is not supported.") with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) @@ -187,8 +183,8 @@ def main(): ) # dispatch - if args.nprocs > 1: - dist.spawn(train_sp, (args, config), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) else: train_sp(args, config) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py index ceae1360..34660c75 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py @@ -241,8 +241,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.gpus: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index c76ce007..2f005e72 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -140,8 +140,9 @@ def main(): "--tacotron2_params_path", type=str, help="tacotron2 params path.") parser.add_argument( "--waveflow_params_path", type=str, help="waveflow params path.") + parser.add_argument( - "--device", type=str, default="gpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--input-dir", @@ -151,7 +152,12 @@ def main(): args = parser.parse_args() - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu 
should >= 0 !") voice_cloning(args) diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py index 4f07aa4e..53715b01 100644 --- a/paddlespeech/t2s/exps/waveflow/synthesize.py +++ b/paddlespeech/t2s/exps/waveflow/synthesize.py @@ -25,7 +25,13 @@ from paddlespeech.t2s.utils import layer_tools def main(config, args): - paddle.set_device(args.device) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + model = ConditionalWaveFlow.from_pretrained(config, args.checkpoint_path) layer_tools.recursively_remove_weight_norm(model) model.eval() @@ -60,7 +66,7 @@ if __name__ == "__main__": help="path of directory containing mel spectrogram (in .npy format)") parser.add_argument("--output", type=str, help="path to save outputs") parser.add_argument( - "--device", type=str, default="cpu", help="device type to use.") + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") parser.add_argument( "--opts", nargs=argparse.REMAINDER, diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py index 9d1df13c..d500336a 100644 --- a/paddlespeech/t2s/exps/waveflow/train.py +++ b/paddlespeech/t2s/exps/waveflow/train.py @@ -139,8 +139,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 1 and args.device == "gpu": - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/frontend/pinyin.py b/paddlespeech/t2s/frontend/pinyin.py deleted file mode 100644 index f99129ce..00000000 --- a/paddlespeech/t2s/frontend/pinyin.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A Simple Chinese Phonology using pinyin symbols. -The G2P conversion converts pinyin string to symbols. Also it can handle string -in Chinese chracters, but due to the complexity of chinese G2P, we can leave -text -> pinyin to other part of a TTS system. Other NLP techniques may be used -(e.g. tokenization, tagging, NER...) 
-""" -import re -from itertools import product - -from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin -from pypinyin.core import DefaultConverter -from pypinyin.core import Pinyin -from pypinyin.core import Style - -from paddlespeech.t2s.frontend.phonectic import Phonetics -from paddlespeech.t2s.frontend.vocab import Vocab - -_punctuations = [',', '。', '?', '!'] -_initials = [ - 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', - 'ch', 'sh', 'r', 'z', 'c', 's' -] -_finals = [ - 'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', - 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian', 'ien', - 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen', 'uang', 'ueng', - 'v', 've', 'van', 'ven', 'veng' -] -_ernized_symbol = ['&r'] -_phones = _initials + _finals + _ernized_symbol + _punctuations -_tones = ['0', '1', '2', '3', '4', '5'] - -_toned_finals = [final + tone for final, tone in product(_finals, _tones[1:])] -_toned_phonems = _initials + _toned_finals + _ernized_symbol + _punctuations - - -class ParakeetConverter(NeutralToneWith5Mixin, DefaultConverter): - pass - - -class ParakeetPinyin(Phonetics): - def __init__(self): - self.vocab_phonemes = Vocab(_phones) - self.vocab_tones = Vocab(_tones) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes, tones = _convert_to_parakeet_style_pinyin(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - start = self.vocab_tones.start_symbol - end = self.vocab_tones.end_symbol - phonemes = [start] + tones + [end] - - phonemes = [ - item for item in phonemes if item in self.vocab_phonemes.stoi - ] - tones = [item for item in tones if item in self.vocab_tones.stoi] - return phonemes, tones - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes, tones = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes, tones - - def numericalize(self, phonemes, tones): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab_phonemes.lookup(item) for item in phonemes] - tone_ids = [self.vocab_tones.lookup(item) for item in tones] - return phoneme_ids, tone_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes, tones = self.phoneticize( - sentence, add_start_end=add_start_end) - phoneme_ids, tone_ids = self.numericalize(phonemes, tones) - return phoneme_ids, tone_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 70 = 62 phones + 4 punctuations + 4 special tokens - return len(self.vocab_phonemes) - - @property - def tone_vocab_size(self): - # 10 = 1 non tone + 5 tone + 4 special tokens - return len(self.vocab_tones) - - -class ParakeetPinyinWithTone(Phonetics): - def __init__(self): - self.vocab = Vocab(_toned_phonems) - self.pinyin_backend = Pinyin(ParakeetConverter()) - - def convert_pypinyin_tone3(self, syllables, add_start_end=False): - phonemes = _convert_to_parakeet_style_pinyin_with_tone(syllables) - - if add_start_end: - start = self.vocab_phonemes.start_symbol - end = self.vocab_phonemes.end_symbol - phonemes = [start] + phonemes + [end] - - phonemes = [item for item in phonemes if item in self.vocab.stoi] - return phonemes - - def phoneticize(self, sentence, add_start_end=False): - """ Normalize the input text sequence and convert it into pronunciation sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. - """ - syllables = self.pinyin_backend.lazy_pinyin( - sentence, style=Style.TONE3, strict=True) - phonemes = self.convert_pypinyin_tone3( - syllables, add_start_end=add_start_end) - return phonemes - - def numericalize(self, phonemes): - """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - - Returns - ---------- - List[int] - The list of pronunciation id sequence. - """ - phoneme_ids = [self.vocab.lookup(item) for item in phonemes] - return phoneme_ids - - def __call__(self, sentence, add_start_end=False): - """ Convert the input text sequence into pronunciation id sequence. - - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. - """ - phonemes = self.phoneticize(sentence, add_start_end=add_start_end) - phoneme_ids = self.numericalize(phonemes) - return phoneme_ids - - @property - def vocab_size(self): - """ Vocab size. 
- """ - # 230 = 222 phones + 4 punctuations + 4 special tokens - return len(self.vocab) - - -def _convert_to_parakeet_convension(syllable): - # from pypinyin.Style.TONE3 to parakeet convension - tone = syllable[-1] - syllable = syllable[:-1] - - # expansion of o -> uo - syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable) - - # expansion for iong, ong - syllable = syllable.replace("iong", "veng").replace("ong", "ueng") - - # expansion for ing, in - syllable = syllable.replace("ing", "ieng").replace("in", "ien") - - # expansion for un, ui, iu - syllable = syllable.replace("un", "uen") \ - .replace("ui", "uei") \ - .replace("iu", "iou") - - # rule for variants of i - syllable = syllable.replace("zi", "zii") \ - .replace("ci", "cii") \ - .replace("si", "sii") \ - .replace("zhi", "zhiii") \ - .replace("chi", "chiii") \ - .replace("shi", "shiii") \ - .replace("ri", "riii") - - # rule for y preceding i, u - syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i") - - # rule for w - syllable = syllable.replace("wu", "u").replace("w", "u") - - # rule for v following j, q, x - syllable = syllable.replace("ju", "jv") \ - .replace("qu", "qv") \ - .replace("xu", "xv") - - return syllable + tone - - -def _split_syllable(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables, tones - return [syllable], ['0'] - - syllable = _convert_to_parakeet_convension(syllable) - - tone = syllable[-1] - syllable = syllable[:-1] - - phones = [] - tones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - tones.append('0') - phones.append(syllable[2:]) - tones.append(tone) - elif syllable[0] in _initials: - phones.append(syllable[0]) - tones.append('0') - phones.append(syllable[1:]) - tones.append(tone) - else: - phones.append(syllable) - tones.append(tone) - return phones, tones - - -def _convert_to_parakeet_style_pinyin(syllables): - phones, tones = [], [] - for syllable in syllables: - p, t = _split_syllable(syllable) - phones.extend(p) - tones.extend(t) - return phones, tones - - -def _split_syllable_with_tone(syllable: str): - global _punctuations - - if syllable in _punctuations: - # syllables - return [syllable] - - syllable = _convert_to_parakeet_convension(syllable) - - phones = [] - - global _initials - if syllable[:2] in _initials: - phones.append(syllable[:2]) - phones.append(syllable[2:]) - elif syllable[0] in _initials: - phones.append(syllable[0]) - phones.append(syllable[1:]) - else: - phones.append(syllable) - return phones - - -def _convert_to_parakeet_style_pinyin_with_tone(syllables): - phones = [] - for syllable in syllables: - p = _split_syllable_with_tone(syllable) - phones.extend(p) - return phones diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py index 3b9fd42e..a0710fd7 100644 --- a/paddlespeech/t2s/training/cli.py +++ b/paddlespeech/t2s/training/cli.py @@ -53,8 +53,7 @@ def default_argument_parser(): parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") # running - parser.add_argument("--device", type=str, choices=["cpu", "gpu"], help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + parser.add_argument("--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") # overwrite extra config and default config parser.add_argument("--opts", nargs=argparse.REMAINDER, help="options to overwrite --config file and the default config, passing in KEY 
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
index 7a6a7e99..c9e7f4cc 100644
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@@ -107,7 +107,12 @@ class ExperimentBase(object):
     def setup(self):
         """Setup the experiment.
         """
-        paddle.set_device(self.args.device)
+        if self.args.ngpu == 0:
+            paddle.set_device("cpu")
+        elif self.args.ngpu > 0:
+            paddle.set_device("gpu")
+        else:
+            print("ngpu should >= 0 !")
         if self.parallel:
             self.init_parallel()

@@ -128,7 +133,7 @@
         """A flag indicating whether the experiment should run with
         multiprocessing.
         """
-        return self.args.device == "gpu" and self.args.nprocs > 1
+        return self.args.ngpu > 1

     def init_parallel(self):
         """Init environment for multiprocess training.
diff --git a/tests/benchmark/pwgan/run_benchmark.sh b/tests/benchmark/pwgan/run_benchmark.sh
index be5733da..e60d8798 100755
--- a/tests/benchmark/pwgan/run_benchmark.sh
+++ b/tests/benchmark/pwgan/run_benchmark.sh
@@ -29,8 +29,8 @@ function _train(){
               --run-benchmark=true"

    case ${run_mode} in
-    sp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
-    mp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
+    sp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=1 ${train_cmd}" ;;
+    mp) train_cmd="python3 paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=8 ${train_cmd}"
        log_parse_file="mylog/workerlog.0" ;;
    *) echo "choose run_mode(sp or mp)"; exit 1;
    esac
diff --git a/tests/chains/speedyspeech/test.sh b/tests/chains/speedyspeech/test.sh
index f4441335..ccabc319 100755
--- a/tests/chains/speedyspeech/test.sh
+++ b/tests/chains/speedyspeech/test.sh
@@ -324,7 +324,7 @@ else
            gsu=${gpu//,/ }
            nump=`echo $gsu | wc -w`
            CUDA_VISIBLE_DEVICES=${gpu}
-            cmd="${python} ${run_train} --nprocs=$nump"
+            cmd="${python} ${run_train} --ngpu=$nump"
        else    # train with multi-machine
            cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
        fi