Merge branch 'develop' into server_asr

WilliamZhang06 3 years ago committed by GitHub
commit b8f16ac9b0

@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The string to the left of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and in the log file name,
# e.g. "echo JOB" becomes "echo 3" for the 3rd job and "echo 8" for the 8th job.
# Note that the range must start with a positive number, so you can't use "JOB=0:10", for example.
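# For illustration (a sketch of the expansion):
#   run.pl JOB=1:3 exp/log/echo.JOB.log echo JOB
# runs three jobs in parallel and writes exp/log/echo.1.log, exp/log/echo.2.log
# and exp/log/echo.3.log, each log containing its own job index.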
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured in
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# Used for all the other jobs (e.g. data preparation and feature extraction)
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" to match a "queue" in your environment.
# To list the "queue" names, type "qhost -q".
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" to match the "partition" names in your environment.
# To list the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# It is assumed that you can log in to them without a password, i.e. you have to set up SSH keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi

@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80
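# Note (an illustrative pairing, not stated in this file): 80 log-mel bins at
# 16 kHz plus the 3 pitch features added by make_fbank_pitch.sh give the 83-dim
# input matching "feat_dim: 83" in the transformer_*.yaml configs.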

@ -0,0 +1 @@
--sample-frequency=16000

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.de.train
dev_manifest: data/manifest.de.dev
test_manifest: data/manifest.de.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-de.de_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-de.de_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5
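# Note (a sketch of the assumed Noam schedule, for orientation only):
#   effective_lr(step) = lr * output_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# so with lr: 2.5 and warmup_steps: 25000 the rate warms up linearly for 25k
# steps and then decays proportionally to step^-0.5.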

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.es.train
dev_manifest: data/manifest.es.dev
test_manifest: data/manifest.es.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-es.es_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-es.es_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.fr.train
dev_manifest: data/manifest.fr.dev
test_manifest: data/manifest.fr.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-fr.fr_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-fr.fr_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.it.train
dev_manifest: data/manifest.it.dev
test_manifest: data/manifest.it.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-it.it_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-it.it_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.nl.train
dev_manifest: data/manifest.nl.dev
test_manifest: data/manifest.nl.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-nl.nl_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-nl.nl_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.pt.train
dev_manifest: data/manifest.pt.dev
test_manifest: data/manifest.pt.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-pt.pt_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-pt.pt_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ro.train
dev_manifest: data/manifest.ro.dev
test_manifest: data/manifest.ro.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ro.ro_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ro.ro_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,90 @@
# https://yaml.org/type/float.html
###########################################
# Data #
###########################################
train_manifest: data/manifest.ru.train
dev_manifest: data/manifest.ru.dev
test_manifest: data/manifest.ru.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_1spm/train_sp.en-ru.ru_bpe8000_units_tc.txt
unit_type: 'spm'
spm_model_prefix: data/lang_1spm/train_sp.en-ru.ru_bpe8000_tc
mean_std_filepath: ""
# preprocess_config: conf/augmentation.json
batch_size: 20
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config:
num_workers: 0
subsampling_factor: 1
num_encs: 1
############################################
# Network Architecture #
############################################
cmvn_file: None
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
###########################################
# Training #
###########################################
n_epoch: 40
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
    lr: 2.5
    weight_decay: 0.
scheduler: noam
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 50
checkpoint:
    kbest_n: 50
    latest_n: 5

@ -0,0 +1,19 @@
[
    {
        "type": "specaug",
        "params": {
            "W": 5,
            "warp_mode": "PIL",
            "F": 30,
            "n_freq_masks": 2,
            "T": 40,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": false
        },
        "prob": 1.0
    }
]

@ -0,0 +1,201 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# 2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
set -e
set -u
stage=-1
stop_stage=10
tgt_lang=
nbpe=8000
# bpemode (unigram or bpe)
bpemode=bpe
must_c=
dumpdir=data/dump
do_delta=false
tgt_case=tc
src_case=lc.rm
source ${MAIN_ROOT}/utils/parse_options.sh
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
train_set=train_sp.en-${tgt_lang}.${tgt_lang}
train_dev=dev.en-${tgt_lang}.${tgt_lang}
trans_set=""
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
done
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${must_c} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Link of Must-c v1, https://ict.fbk.eu/must-c/."
exit 1
fi
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data Preparation"
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
local/data_prep.sh ${must_c} ${lang}
done
fi
feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
### Task dependent. You have to design training and dev sets by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 1: Feature Generation"
fbankdir=fbank
# Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
for lang in $(echo ${tgt_lang} | tr '_' ' '); do
for x in train.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/${x} data/make_fbank/${x} ${fbankdir}
done
done
# speed-perturbed
utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
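# The three perturbed copies are distinguished by a prefix on the utterance id:
# "ted_00001_..." becomes "sp0.9-ted_00001_...", "sp1.0-ted_00001_..." and
# "sp1.1-ted_00001_..." (an illustrative sketch of the naming that the utt_map
# loop below relies on).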
steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
for lang in en ${tgt_lang}; do
awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
done
# Divide into source and target languages
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
local/divide_lang.sh ${x} ${tgt_lang}
done
for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
# remove utt having more than 3000 frames
# remove utt having more than 400 characters
for lang in ${tgt_lang} en; do
remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
done
# Match the number of utterances between source and target languages
# extract common lines
cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist
for lang in ${tgt_lang} en; do
reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
done
rm -rf data/${x}.*.tmp
done
# compute global CMVN
compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
# dump features for training
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
${feat_tr_dir}/storage
fi
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
${feat_dt_dir}/storage
fi
dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
${feat_trans_dir}
done
fi
dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt
nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt
bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p data/lang_1spm/
export LC_ALL=C.UTF-8
echo "make a non-linguistic symbol list for all languages"
grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}
cat ${nlsyms}
echo "make a joint source and target dictionary"
echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
offset=$(wc -l < ${dict})
grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt
grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt
spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
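# For illustration (a sketch of the resulting dict): after "<unk> 1", each BPE
# piece is appended with index NR+offset starting from 2, e.g.
#   <unk> 1
#   ▁aber 2   (hypothetical German piece)
#   ▁und 3    (hypothetical German piece)
# so that index 0 stays free for the CTC "blank".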
wc -l ${dict}
echo "make json files"
data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
for ttask in ${trans_set}; do
feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
done
echo "update json (add source references)"
# update json (add source references)
for x in ${train_set} ${train_dev}; do
feat_dir=${dumpdir}/${x}/delta${do_delta}
data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en
update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
x=(${train_set} ${train_dev} ${trans_set})
y=(train dev test)
for (( i=0; i<${#x[*]}; ++i)); do
echo ${x[$i]} ${y[$i]}
feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
data_dir=data/$(echo ${x[$i]} | cut -f 1 -d ".").en-${tgt_lang}.en
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
--manifest-file data/manifest.${tgt_lang}.${y[$i]}
echo "Process done for ${y[$i]} set from ${x[$i]}"
done
fi
echo "MuST-C ${tgt_lang} Data preparation done."
exit 0

@ -0,0 +1,163 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
export LC_ALL=C
source ${MAIN_ROOT}/utils/parse_options.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir>"
echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
exit 1;
fi
tgt_lang=$2
for set in train dev tst-COMMON tst-HE; do
src=$1/en-${tgt_lang}/data/${set}
dst=data/local/en-${tgt_lang}/${set}
[ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;
wav_dir=${src}/wav
trans_dir=${src}/txt
yml=${trans_dir}/${set}.yaml
en=${trans_dir}/${set}.en
tgt=${trans_dir}/${set}.${tgt_lang}
mkdir -p ${dst} || exit 1;
[ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
[ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
[ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
[ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
[ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;
wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}
trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}
# error check
n=$(cat ${yml} | grep duration | wc -l)
n_en=$(cat ${en} | wc -l)
n_tgt=$(cat ${tgt} | wc -l)
[ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1a) Transcriptions and translations preparation
# make basic transcription file (add segments info)
cp ${yml} ${dst}/.yaml0
grep duration ${dst}/.yaml0 > ${dst}/.yaml1
awk '{
duration=$3; offset=$5; spkid=$7;
gsub(",","",duration);
gsub(",","",offset);
gsub(",","",spkid);
gsub("spk.","",spkid);
duration=sprintf("%.7f", duration);
if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
else extendt=0;
offset=sprintf("%.7f", offset);
startt=offset-extendt;
endt=offset+duration+extendt;
printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
}' ${dst}/.yaml1 > ${dst}/.yaml2
# NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them
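# For illustration (assumed arithmetic): a 0.10 s segment gets
# extendt = (0.2 - 0.1) / 2 = 0.05 s added on each side, so every utterance is
# at least 0.2 s long; start/end are then encoded in milliseconds in the
# utterance id, e.g. spk 1, start 3.501 s, end 3.684 s -> ted_00001_0003501_0003684.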
cp ${en} ${dst}/en.org
cp ${tgt} ${dst}/${tgt_lang}.org
for lang in en ${tgt_lang}; do
# normalize punctuation
normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm
# lowercasing
lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc
# remove punctuation
local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm
# tokenization
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}
# save original and cleaned punctuation
lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
| sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
done
# error check
n=$(cat ${dst}/.yaml2 | wc -l)
n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
[ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} lines, found ${n_en}" && exit 1;
[ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} lines, found ${n_tgt}" && exit 1;
# (1c) Make segments files from transcript
# segments file format is: utt-id recording-id start-time end-time, e.g.:
# ted_00001_0003501_0003684 ted_00001 3.50 3.68
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2];
printf("%s cat '${wav_dir}'/%s_%d.wav |\n", spkid, S[1], S[2]);
}' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp
awk '{
segment=$1; split(segment,S,"[_]");
spkid=S[1] "_" S[2]; print $1 " " spkid
}' ${dst}/segments | uniq | sort > ${dst}/utt2spk
cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt
# error check
n_en=$(cat ${dst}/text.tc.en | wc -l)
n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
[ ${n_en} -ne ${n_tgt} ] && echo "Warning: expected ${n_en} lines, found ${n_tgt}" && exit 1;
# Copy stuff into its final locations [this has been moved from the format_data script]
mkdir -p data/${set}.en-${tgt_lang}
# remove duplicated utterances (the same offset)
echo "remove duplicate lines..."
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted' \
| sed 's/^[ \t]*//' > ${dst}/duplicate_lines
cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted' \
| cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}
for l in en ${tgt_lang}; do
for case in tc lc lc.rm; do
cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
done
done
utils/fix_data_dir.sh --utt_extra_files \
"text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
data/${set}.en-${tgt_lang}
# error check
n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
[ ${n_seg} -ne ${n_text} ] && echo "Warning: expected ${n_seg} lines, found ${n_text}" && exit 1;
echo "$0: successfully prepared data in ${dst}"
done

@ -0,0 +1,52 @@
#!/bin/bash
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
. ./path.sh
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <set> <lang>>"
echo "e.g.: $0 dev"
exit 1
fi
set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
mkdir -p data/${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.en/${f}
fi
done
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text # dummy
sort data/${set}/text.tc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.tc
sort data/${set}/text.lc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc
sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.en
if [ -f data/${set}.en/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.en || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.en || exit 1;
fi
# for target language
mkdir -p data/${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
if [ -f data/${set}/${f} ]; then
sort data/${set}/${f} > data/${set}.${lang}/${f}
fi
done
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text # dummy
sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.tc
sort data/${set}/text.lc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc
sort data/${set}/text.lc.rm.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc.rm
utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
if [ -f data/${set}.${lang}/feats.scp ]; then
utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
else
utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
fi

@ -0,0 +1,25 @@
#!/usr/bin/perl
use warnings;
use strict;
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
while(<STDIN>) {
$_ = " $_ ";
# remove punctuation except apostrophe
s/<space>/spacemark/g; # for scoring
s/'/apostrophe/g;
s/[[:punct:]]//g;
s/apostrophe/'/g;
s/spacemark/<space>/g; # for scoring
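# For illustration (assumed behavior): "don't stop, now!" becomes "don't stop now";
# the apostrophe and the <space> mark are shielded by placeholders before the
# [[:punct:]] removal and restored afterwards.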
# remove whitespace
s/\s+/ /g;
s/^\s+//;
s/\s+$//;
print "$_\n";
}

@ -0,0 +1,48 @@
#! /usr/bin/env bash
if [ $# != 4 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
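# For illustration: CUDA_VISIBLE_DEVICES=0,1,2 yields ngpu=3; an empty value
# yields ngpu=0, i.e. CPU-only decoding (assumed awk field-count behavior).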
echo "using $ngpu gpus..."
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
tgt_lang=$4
for type in fullsentence; do
echo "decoding ${type}"
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
echo $PATH
python3 ${MAIN_ROOT}/utils/rsl2trn.py --rsl ${ckpt_prefix}.${type}.rsl \
--hyp ${ckpt_prefix}.${type}.hyp \
--ref ${ckpt_prefix}.${type}.ref
if ! which tokenizer.perl > /dev/null; then
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
exit 1
fi
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.hyp > ${ckpt_prefix}.${type}.hyp.detok
detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.ref > ${ckpt_prefix}.${type}.ref.detok
echo "Detokenized BLEU:"
sacrebleu ${ckpt_prefix}.${type}.ref.detok -i ${ckpt_prefix}.${type}.hyp.detok
done
exit 0

@ -0,0 +1,40 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ckpt_path=$3
mkdir -p exp
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
status=$?
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ ${status} -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0

@ -0,0 +1,29 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
if ! which tokenizer.perl > /dev/null; then
echo "Error: moses is required in this example." >&2
echo "Error: it seems that moses is not installed." >&2
echo "Error: please install moses as follows." >&2
echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
return 1
fi
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh

@ -0,0 +1,39 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=0,1,2,3
stage=0
stop_stage=3
conf_path=conf/transformer_es.yaml
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
avg_num=5
ckpt_path= # (finetune from FAT-ST or ASR pretrained model)
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh --tgt_lang ${lang} --must_c ${must_c_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
fi

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -198,10 +198,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
x=(${feat_tr_dir} ${feat_dt_dir} ${feat_trans_dir})
y=(train dev test)
echo "stage 3: Format the Json Data"
python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
for (( i=0; i<${#x[*]}; ++i)); do
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json \
--manifest-file data/manifest.${y[$i]}
done
fi
echo "Ted En-Zh Data preparation done."
exit 0

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import sys
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -130,7 +132,7 @@ class ASRExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.asr', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Audio file to recognize.')
'--input', type=str, default=None, help='Audio file to recognize.')
self.parser.add_argument(
'--model',
type=str,
@ -180,6 +182,11 @@ class ASRExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
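# Note: ast.literal_eval parses this flag, so pass a Python literal on the
# command line, e.g. --job_dump_result True (illustrative usage; the same
# applies to the other executors below).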
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -469,19 +476,31 @@ class ASRExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
decode_method = parser_args.decode_method
force_yes = parser_args.yes
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model, lang, sample_rate, config, ckpt_path,
res = self(input_, model, lang, sample_rate, config, ckpt_path,
decode_method, force_yes, device)
logger.info('ASR Result: {}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,

@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -77,7 +79,7 @@ class CLSExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.cls', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Audio file to classify.')
'--input', type=str, default=None, help='Audio file to classify.')
self.parser.add_argument(
'--model',
type=str,
@ -109,6 +111,11 @@ class CLSExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -214,7 +221,7 @@ class CLSExecutor(BaseExecutor):
ret = ''
for idx in topk_idx:
label, score = self._label_list[idx], result[idx]
ret += f'{label}: {score}\n'
ret += f'{label} {score} '
return ret
def postprocess(self, topk: int) -> Union[str, os.PathLike]:
@ -234,18 +241,30 @@ class CLSExecutor(BaseExecutor):
label_file = parser_args.label_file
cfg_path = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
topk = parser_args.topk
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model_type, cfg_path, ckpt_path, label_file,
res = self(input_, model_type, cfg_path, ckpt_path, label_file,
topk, device)
logger.info('CLS Result:\n{}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,
@ -259,7 +278,7 @@ class CLSExecutor(BaseExecutor):
"""
Python API to call an executor.
"""
audio_file = os.path.abspath(audio_file)
audio_file = os.path.abspath(os.path.expanduser(audio_file))
paddle.set_device(device)
self._init_from_path(model, config, ckpt_path, label_file)
self.preprocess(audio_file)

@ -12,14 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from abc import ABC
from abc import abstractmethod
from collections import OrderedDict
from typing import Any
from typing import Dict
from typing import List
from typing import Union
import paddle
from .log import logger
class BaseExecutor(ABC):
"""
@ -27,8 +32,8 @@ class BaseExecutor(ABC):
"""
def __init__(self):
self._inputs = dict()
self._outputs = dict()
self._inputs = OrderedDict()
self._outputs = OrderedDict()
@abstractmethod
def _get_pretrained_path(self, tag: str) -> os.PathLike:
@ -100,3 +105,107 @@ class BaseExecutor(ABC):
Python API to call an executor.
"""
pass
def get_task_source(self, input_: Union[str, os.PathLike, None]
) -> Dict[str, Union[str, os.PathLike]]:
"""
Get task input source from command line input.
Args:
input_ (Union[str, os.PathLike, None]): Input from command line.
Returns:
Dict[str, Union[str, os.PathLike]]: A dict with ids and inputs.
"""
if self._is_job_input(input_):
ret = self._get_job_contents(input_)
else:
ret = OrderedDict()
if input_ is None: # Take input from stdin
for i, line in enumerate(sys.stdin):
line = line.strip()
if len(line.split(' ')) == 1:
ret[str(i + 1)] = line
elif len(line.split(' ')) == 2:
id_, info = line.split(' ')
ret[id_] = info
else: # No valid input info from one line.
continue
else:
ret[1] = input_
return ret
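# For illustration (assumed stdin handling): piping a bare line "a.wav" maps to
# {'1': 'a.wav'}, while a line "utt1 a.wav" keeps its explicit id: {'utt1': 'a.wav'}.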
def process_task_results(self,
input_: Union[str, os.PathLike, None],
results: Dict[str, os.PathLike],
job_dump_result: bool=False):
"""
Handle task results and redirect stdout if needed.
Args:
input_ (Union[str, os.PathLike, None]): Input from command line.
results (Dict[str, os.PathLike]): Task outputs.
job_dump_result (bool, optional): if True, dumps job results into file. Defaults to False.
"""
raw_text = self._format_task_results(results)
print(raw_text, end='')
if self._is_job_input(input_) and job_dump_result:
try:
job_output_file = os.path.abspath(input_) + '.done'
sys.stdout = open(job_output_file, 'w')
print(raw_text, end='')
logger.info(f'Results have been saved to: {job_output_file}')
finally:
sys.stdout.close()
sys.stdout = sys.__stdout__  # restore stdout after redirecting it to the job file
def _is_job_input(self, input_: Union[str, os.PathLike]) -> bool:
"""
Check if current input file is a job input or not.
Args:
input_ (Union[str, os.PathLike]): Input file of current task.
Returns:
bool: return `True` for job input, `False` otherwise.
"""
return input_ and os.path.isfile(input_) and input_.endswith('.job')
def _get_job_contents(
self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]:
"""
Read a job input file and return its contents in a dictionary.
Args:
job_input (os.PathLike): The job input file.
Returns:
Dict[str, str]: Contents of job input.
"""
job_contents = OrderedDict()
with open(job_input) as f:
for line in f:
line = line.strip()
if not line:
continue
k, v = line.split(' ')
job_contents[k] = v
return job_contents
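# For illustration (a hypothetical job file): "inputs.job" containing
#   utt1 /data/a.wav
#   utt2 /data/b.wav
# is read as OrderedDict([('utt1', '/data/a.wav'), ('utt2', '/data/b.wav')]).
# Note that values must not contain spaces, since line.split(' ') expects
# exactly two fields per line.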
def _format_task_results(
self, results: Dict[str, Union[str, os.PathLike]]) -> str:
"""
Convert task results to raw text.
Args:
results (Dict[str, str]): A dictionary of task results.
Returns:
str: A string object containing the task results.
"""
ret = ''
for k, v in results.items():
ret += f'{k} {v}\n'
return ret

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import subprocess
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -69,7 +71,7 @@ class STExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog="paddlespeech.st", add_help=True)
self.parser.add_argument(
"--input", type=str, required=True, help="Audio file to translate.")
"--input", type=str, default=None, help="Audio file to translate.")
self.parser.add_argument(
"--model",
type=str,
@ -107,6 +109,11 @@ class STExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help="Choose device to execute model inference.")
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job result into file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -319,17 +326,29 @@ class STExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
audio_file = parser_args.input
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(audio_file, model, src_lang, tgt_lang, sample_rate,
res = self(input_, model, src_lang, tgt_lang, sample_rate,
config, ckpt_path, device)
logger.info("ST Result: {}".format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(self,

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import re
from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Union
@ -80,7 +82,7 @@ class TextExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.text', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Input text.')
'--input', type=str, default=None, help='Input text.')
self.parser.add_argument(
'--task',
type=str,
@ -119,6 +121,11 @@ class TextExecutor(BaseExecutor):
type=str,
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job results into a file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -256,7 +263,6 @@ class TextExecutor(BaseExecutor):
"""
parser_args = self.parser.parse_args(argv)
text = parser_args.input
task = parser_args.task
model_type = parser_args.model
lang = parser_args.lang
@ -264,15 +270,28 @@ class TextExecutor(BaseExecutor):
ckpt_path = parser_args.ckpt_path
punc_vocab = parser_args.punc_vocab
device = parser_args.device
job_dump_result = parser_args.job_dump_result
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
try:
res = self(text, task, model_type, lang, cfg_path, ckpt_path,
res = self(input_, task, model_type, lang, cfg_path, ckpt_path,
punc_vocab, device)
logger.info('Text Result:\n{}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
job_dump_result)
if has_exceptions:
return False
else:
return True
@stats_wrapper
def __call__(

@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from collections import OrderedDict
from typing import Any
from typing import List
from typing import Optional
@ -298,7 +300,7 @@ class TTSExecutor(BaseExecutor):
self.parser = argparse.ArgumentParser(
prog='paddlespeech.tts', add_help=True)
self.parser.add_argument(
'--input', type=str, required=True, help='Input text to generate.')
'--input', type=str, default=None, help='Input text to generate.')
# acoustic model
self.parser.add_argument(
'--am',
@ -397,6 +399,11 @@ class TTSExecutor(BaseExecutor):
self.parser.add_argument(
'--output', type=str, default='output.wav', help='output file name')
self.parser.add_argument(
'--job_dump_result',
type=ast.literal_eval,
default=False,
help='Save job results into a file.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@ -671,7 +678,6 @@ class TTSExecutor(BaseExecutor):
args = self.parser.parse_args(argv)
text = args.input
am = args.am
am_config = args.am_config
am_ckpt = args.am_ckpt
@ -686,12 +692,24 @@ class TTSExecutor(BaseExecutor):
voc_stat = args.voc_stat
lang = args.lang
device = args.device
output = args.output
spk_id = args.spk_id
job_dump_result = args.job_dump_result
task_source = self.get_task_source(args.input)
task_results = OrderedDict()
has_exceptions = False
for id_, input_ in task_source.items():
if len(task_source) > 1:
assert isinstance(args.output,
str) and args.output.endswith('.wav')
output = args.output.replace('.wav', f'_{id_}.wav')
else:
output = args.output
try:
res = self(
text=text,
text=input_,
# acoustic model related
am=am,
am_config=am_config,
@ -710,11 +728,17 @@ class TTSExecutor(BaseExecutor):
lang=lang,
device=device,
output=output)
logger.info('Wave file has been generated: {}'.format(res))
return True
task_results[id_] = res
except Exception as e:
logger.exception(e)
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(args.input, task_results, job_dump_result)
if has_exceptions:
return False
else:
return True
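When a job contains several tasks, one `--output` name cannot hold every generated wave file, so the executor splices the task id into the name (`output.wav` becomes `output_utt1.wav`, and so on). A small sketch of that derivation, using the hypothetical helper `derive_output`:

```python
def derive_output(output: str, task_id: str, multi: bool) -> str:
    """Derive a per-task output path when running a multi-task job."""
    if not multi:
        return output
    assert isinstance(output, str) and output.endswith('.wav')
    return output.replace('.wav', f'_{task_id}.wav')

print(derive_output('output.wav', 'utt1', multi=True))   # output_utt1.wav
print(derive_output('output.wav', 'utt1', multi=False))  # output.wav
```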
@stats_wrapper
def __call__(self,

@ -0,0 +1,33 @@
# PaddleSpeech Server Command Line
([简体中文](./README_cn.md)|English)
This is the simplest way to use PaddleSpeech Server, covering both the server and the client.
## PaddleSpeech Server
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set the service-related configuration parameters in a file similar to `./conf/application.yaml`, along with the model configuration for the speech task used by the service (e.g. `./conf/tts/tts.yaml`).
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```
## PaddleSpeech Client
### Help
```bash
paddlespeech_client help
```
### Access speech recognition services
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./tests/16_audio.wav
```
### Access text to speech services
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
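Both services can also be reached over plain HTTP without the client CLI. The sketch below posts to the ASR endpoint with the same payload fields the client sends; the server address and audio path are placeholders:

```python
import base64
import json

import requests

# Placeholder server address and audio path; adjust to your deployment.
url = "http://127.0.0.1:8090/paddlespeech/asr"
with open("./tests/16_audio.wav", "rb") as f:
    audio = base64.b64encode(f.read()).decode("utf-8")

data = {
    "audio": audio,
    "audio_format": "wav",
    "sample_rate": 16000,
    "lang": "zh_cn",
}
r = requests.post(url=url, data=json.dumps(data))
print(r.json())
```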

@ -0,0 +1,32 @@
# PaddleSpeech Server Command Line Tool
(Simplified Chinese|[English](./README.md))
It provides the simplest way to use the PaddleSpeech speech services: with a single command you can start the server and call a service.
## Server Command Line Usage
### Help
```bash
paddlespeech_server help
```
### Start the server
First, set up the service-related configuration file, similar to `./conf/application.yaml`, and the model configuration for the speech task used by the service, similar to `./conf/tts/tts.yaml`.
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
```
## Client Command Line Usage
### Help
```bash
paddlespeech_client help
```
### Access the speech recognition service
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
### Access the text-to-speech service
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```

@ -24,6 +24,7 @@ import numpy as np
import requests
import soundfile
from ..executor import BaseExecutor
from ..util import cli_client_register
from paddlespeech.server.utils.audio_process import wav2pcm
from paddlespeech.server.utils.util import wav2base64
@ -33,7 +34,7 @@ __all__ = ['TTSClientExecutor', 'ASRClientExecutor']
@cli_client_register(
name='paddlespeech_client.tts', description='visit tts service')
class TTSClientExecutor():
class TTSClientExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -42,7 +43,7 @@ class TTSClientExecutor():
self.parser.add_argument(
'--port', type=int, default=8090, help='server port')
self.parser.add_argument(
'--text',
'--input',
type=str,
default="你好,欢迎使用语音合成服务",
help='A sentence to be synthesized')
@ -60,20 +61,20 @@ class TTSClientExecutor():
self.parser.add_argument(
'--output',
type=str,
default="./out.wav",
default="./output.wav",
help='Synthesized audio file')
# Request and response
def tts_client(self, args):
""" Request and response
Args:
text: A sentence to be synthesized
input: A sentence to be synthesized
output: Path of the synthesized audio file
"""
url = 'http://' + args.server_ip + ":" + str(
args.port) + '/paddlespeech/tts'
request = {
"text": args.text,
"text": args.input,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
@ -119,7 +120,7 @@ class TTSClientExecutor():
@cli_client_register(
name='paddlespeech_client.asr', description='visit asr service')
class ASRClientExecutor():
class ASRClientExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -128,29 +129,34 @@ class ASRClientExecutor():
self.parser.add_argument(
'--port', type=int, default=8090, help='server port')
self.parser.add_argument(
'--audio_file',
'--input',
type=str,
default="./paddlespeech/server/tests/16_audio.wav",
help='Audio file to be recognized')
self.parser.add_argument(
'--sample_rate', type=int, default=16000, help='audio sample rate')
self.parser.add_argument(
'--lang', type=str, default="zh_cn", help='language')
self.parser.add_argument(
'--audio_format', type=str, default="wav", help='audio format')
def execute(self, argv: List[str]) -> bool:
args = self.parser.parse_args(argv)
url = 'http://' + args.server_ip + ":" + str(
args.port) + '/paddlespeech/asr'
audio = wav2base64(args.audio_file)
audio = wav2base64(args.input)
data = {
"audio": audio,
"audio_format": "wav",
"audio_format": args.audio_format,
"sample_rate": args.sample_rate,
"lang": "zh_cn",
"lang": args.lang,
}
time_start = time.time()
try:
r = requests.post(url=url, data=json.dumps(data))
# end timestamp
time_end = time.time()
print(r.json())
print('time cost', time_end - time_start, 's')
except Exception as e:
print(f"Failed to access the speech recognition service: {e}")

@ -17,6 +17,7 @@ from typing import List
import uvicorn
from fastapi import FastAPI
from ..executor import BaseExecutor
from ..util import cli_server_register
from paddlespeech.server.engine.engine_factory import EngineFactory
from paddlespeech.server.restful.api import setup_router
@ -29,8 +30,8 @@ app = FastAPI(
@cli_server_register(
name='paddlespeech_server.server', description='Start the service')
class ServerExecutor():
name='paddlespeech_server.start', description='Start the service')
class ServerExecutor(BaseExecutor):
def __init__(self):
super().__init__()
self.parser = argparse.ArgumentParser()
@ -48,10 +49,8 @@ class ServerExecutor():
def init(self, config) -> bool:
"""system initialization
Args:
config (CfgNode): config object
Returns:
bool:
"""
@ -75,4 +74,4 @@ class ServerExecutor():
config = get_config(args.config_file)
if self.init(config):
uvicorn.run(app, host=config.host, port=config.port, debug=True)
uvicorn.run(app, host=config.host, port=config.port, debug=True)

@ -0,0 +1,38 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from abc import ABC
from abc import abstractmethod
from typing import List
class BaseExecutor(ABC):
"""
An abstract executor of paddlespeech server tasks.
"""
def __init__(self):
self.parser = argparse.ArgumentParser()
@abstractmethod
def execute(self, argv: List[str]) -> bool:
"""
Command line entry. This method can only be accessed by a command line such as `paddlespeech asr`.
Args:
argv (List[str]): Arguments from command line.
Returns:
int: Result of the command execution. `True` for a success and `False` for a failure.
"""
pass
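As a sketch of how a concrete executor would plug into this base class, assuming the `BaseExecutor` above is in scope; the subclass name and behavior are hypothetical:

```python
import argparse
from typing import List

class EchoExecutor(BaseExecutor):
    """Toy executor that prints its --input argument back."""

    def __init__(self):
        super().__init__()
        self.parser = argparse.ArgumentParser(prog='paddlespeech_server.echo')
        self.parser.add_argument('--input', type=str, default=None)

    def execute(self, argv: List[str]) -> bool:
        args = self.parser.parse_args(argv)
        print(args.input)
        return True

EchoExecutor().execute(['--input', 'hello'])  # prints "hello", returns True
```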

@ -0,0 +1,59 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import json
import time
import requests
def readwav2base64(wav_file):
"""
read wave file and covert to base64 string
"""
with open(wav_file, 'rb') as f:
base64_bytes = base64.b64encode(f.read())
base64_string = base64_bytes.decode('utf-8')
return base64_string
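On the receiving side the payload is decoded back to bytes with the standard-library inverse; a quick round-trip check with stand-in bytes:

```python
import base64

original = b'RIFF....WAVEfmt '  # stand-in for real wave-file bytes
encoded = base64.b64encode(original).decode('utf-8')
assert base64.b64decode(encoded) == original  # lossless round trip
```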
def main():
"""
main func
"""
url = "http://127.0.0.1:8090/paddlespeech/asr"
# start timestamp
time_start = time.time()
test_audio_file = "./16_audio.wav"
audio = readwav2base64(test_audio_file)
data = {
"audio": audio,
"audio_format": "wav",
"sample_rate": 16000,
"lang": "zh_cn",
}
r = requests.post(url=url, data=json.dumps(data))
# end timestamp
time_end = time.time()
print('time cost', time_end - time_start, 's')
print(r.json())
if __name__ == "__main__":
main()