Merge pull request #1459 from LittleChenCc/develop

[ST] example of MuST-C
3 years ago · a8c3f6d479
parent ae521d3700 97e2015242
commit a8c3f6d479
24 changed files with 1437 additions and 3 deletions
--- a/examples/mustc/st1/cmd.sh
+++ b/examples/mustc/st1/cmd.sh
@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the job index range must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
+# These options are mapped to backend-specific options, and the mapping
+# is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs fail, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # Used for everything else (i.e. anything other than "*_train.py" and "*_recog.py")
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # This assumes you can log in to them without a password, i.e. you have to set up SSH keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
--- a/examples/mustc/st1/conf/fbank.conf
+++ b/examples/mustc/st1/conf/fbank.conf
@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
--- a/examples/mustc/st1/conf/pitch.conf
+++ b/examples/mustc/st1/conf/pitch.conf
@ -0,0 +1 @@
+--sample-frequency=16000
--- a/examples/mustc/st1/conf/transformer_de.yaml
+++ b/examples/mustc/st1/conf/transformer_de.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.de.train
+dev_manifest: data/manifest.de.dev
+test_manifest: data/manifest.de.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-de.de_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-de.de_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_es.yaml
+++ b/examples/mustc/st1/conf/transformer_es.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.es.train
+dev_manifest: data/manifest.es.dev
+test_manifest: data/manifest.es.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-es.es_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-es.es_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_fr.yaml
+++ b/examples/mustc/st1/conf/transformer_fr.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.fr.train
+dev_manifest: data/manifest.fr.dev
+test_manifest: data/manifest.fr.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-fr.fr_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-fr.fr_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_it.yaml
+++ b/examples/mustc/st1/conf/transformer_it.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.it.train
+dev_manifest: data/manifest.it.dev
+test_manifest: data/manifest.it.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-it.it_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-it.it_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_nl.yaml
+++ b/examples/mustc/st1/conf/transformer_nl.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.nl.train
+dev_manifest: data/manifest.nl.dev
+test_manifest: data/manifest.nl.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-nl.nl_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-nl.nl_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_pt.yaml
+++ b/examples/mustc/st1/conf/transformer_pt.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.pt.train
+dev_manifest: data/manifest.pt.dev
+test_manifest: data/manifest.pt.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-pt.pt_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-pt.pt_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_ro.yaml
+++ b/examples/mustc/st1/conf/transformer_ro.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.ro.train
+dev_manifest: data/manifest.ro.dev
+test_manifest: data/manifest.ro.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-ro.ro_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-ro.ro_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/conf/transformer_ru.yaml
+++ b/examples/mustc/st1/conf/transformer_ru.yaml
@ -0,0 +1,90 @@
+# https://yaml.org/type/float.html
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.ru.train
+dev_manifest: data/manifest.ru.dev
+test_manifest: data/manifest.ru.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_1spm/train_sp.en-ru.ru_bpe8000_units_tc.txt
+unit_type: 'spm'
+spm_model_prefix: data/lang_1spm/train_sp.en-ru.ru_bpe8000_tc
+mean_std_filepath: ""
+# preprocess_config: conf/augmentation.json
+batch_size: 20
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config:
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: None
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    asr_weight: 0.0
+    ctc_weight: 0.0
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+###########################################
+#                Training                 #
+###########################################
+n_epoch: 40
+accum_grad: 2
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 2.5
+  weight_decay: 0.
+scheduler: noam    
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 50
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/mustc/st1/local/augmentation.json
+++ b/examples/mustc/st1/local/augmentation.json
@ -0,0 +1,19 @@
+[
+  {
+    "type": "specaug",
+    "params": {
+      "W": 5,
+      "warp_mode": "PIL",
+      "F": 30,
+      "n_freq_masks": 2,
+      "T": 40,
+      "n_time_masks": 2,
+      "p": 1.0,
+      "adaptive_number_ratio": 0,
+      "adaptive_size_ratio": 0,
+      "max_n_time_masks": 20,
+      "replace_with_zero": false
+    },
+    "prob": 1.0
+  }
+]
--- a/examples/mustc/st1/local/data.sh
+++ b/examples/mustc/st1/local/data.sh
@ -0,0 +1,201 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#           2021 PaddlePaddle
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+set -e
+set -u
+
+stage=-1
+stop_stage=10
+
+# options; run.sh passes --tgt_lang and --must_c, which parse_options.sh below is expected to apply
+tgt_lang=  # target language code(s); several codes may be joined with "_"
+nbpe=8000  # passed to spm_train as --vocab_size
+bpemode=bpe  # passed to spm_train as --model_type
+must_c=  # MuST-C corpus root, handed to local/data_prep.sh
+dumpdir=data/dump  # root of the dump.sh output directories
+do_delta=false  # forwarded to dump.sh as --do_delta
+tgt_case=tc  # text.<case> variant used on the target side
+src_case=lc.rm  # text.<case> variant used on the source (en) side
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+mkdir -p data
+
+train_set=train_sp.en-${tgt_lang}.${tgt_lang}  # speed-perturbed training set, target side
+train_dev=dev.en-${tgt_lang}.${tgt_lang}
+trans_set=""  # space-separated list: one tst-COMMON set per language code
+for lang in $(echo ${tgt_lang} | tr '_' ' '); do
+    trans_set="${trans_set} tst-COMMON.en-${lang}.${lang}"
+done
+
+
+# Stage -1: make sure the MuST-C corpus has been downloaded and unpacked.
+# The path is quoted so that an empty --must_c counts as "missing": unquoted,
+# an empty value collapses the test to `[ ! -e ]`, which is false, and the
+# check would be skipped silently.
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    if [ ! -e "${must_c}" ]; then
+        echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
+        echo "Link of Must-c v1, https://ict.fbk.eu/must-c/."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data Preparation"
+    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
+        local/data_prep.sh ${must_c} ${lang}
+    done
+fi
+
+feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}  # dump.sh output dir for the training set
+feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}  # dump.sh output dir for the dev set
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    ### Task dependent. You have to design training and dev sets by yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 1: Feature Generation"
+    fbankdir=fbank
+    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
+    for lang in $(echo ${tgt_lang} | tr '_' ' '); do
+        for x in train.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do  # NOTE(review): ${lang} is not used in this body
+            steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+                data/${x} data/make_fbank/${x} ${fbankdir}
+        done
+    done
+
+    # speed perturbation: make 0.9/1.0/1.1 copies of the training set, merge them, then extract features again
+    utils/perturb_data_dir_speed.sh 0.9 data/train.en-${tgt_lang} data/temp1.${tgt_lang}
+    utils/perturb_data_dir_speed.sh 1.0 data/train.en-${tgt_lang} data/temp2.${tgt_lang}
+    utils/perturb_data_dir_speed.sh 1.1 data/train.en-${tgt_lang} data/temp3.${tgt_lang}
+    utils/combine_data.sh --extra-files utt2uniq data/train_sp.en-${tgt_lang} \
+        data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
+    rm -r data/temp1.${tgt_lang} data/temp2.${tgt_lang} data/temp3.${tgt_lang}
+    utils/fix_data_dir.sh data/train_sp.en-${tgt_lang}
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+        data/train_sp.en-${tgt_lang} data/make_fbank/train_sp.en-${tgt_lang} ${fbankdir}
+    for lang in en ${tgt_lang}; do  # rebuild each text variant under the sp0.9-/sp1.0-/sp1.1- utterance ids
+        awk -v p="sp0.9-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map  # map: <utt-id> -> sp0.9-<utt-id>
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >data/train_sp.en-${tgt_lang}/text.tc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >data/train_sp.en-${tgt_lang}/text.lc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
+        awk -v p="sp1.0-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map  # utt_map is overwritten; text files are appended to (>>) from here on
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
+        awk -v p="sp1.1-" '{printf("%s %s%s\n", $1, p, $1);}' data/train.en-${tgt_lang}/utt2spk > data/train_sp.en-${tgt_lang}/utt_map
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.tc.${lang} >>data/train_sp.en-${tgt_lang}/text.tc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.${lang}
+        utils/apply_map.pl -f 1 data/train_sp.en-${tgt_lang}/utt_map <data/train.en-${tgt_lang}/text.lc.rm.${lang} >>data/train_sp.en-${tgt_lang}/text.lc.rm.${lang}
+    done
+
+    # Divide into source and target languages (creates data/<set>.en and data/<set>.<tgt_lang>)
+    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang} tst-COMMON.en-${tgt_lang} tst-HE.en-${tgt_lang}; do
+        local/divide_lang.sh ${x} ${tgt_lang}
+    done
+
+    for x in train_sp.en-${tgt_lang} dev.en-${tgt_lang}; do
+        # remove utt having more than 3000 frames
+        # remove utt having more than 400 characters
+        for lang in ${tgt_lang} en; do
+            remove_longshortdata.sh --maxframes 3000 --maxchars 400 data/${x}.${lang} data/${x}.${lang}.tmp
+        done
+
+        # Match the number of utterances between source and target languages
+        # extract common lines: keep only utterance ids present in both filtered directories
+        cut -f 1 -d " " data/${x}.en.tmp/text > data/${x}.${tgt_lang}.tmp/reclist1
+        cut -f 1 -d " " data/${x}.${tgt_lang}.tmp/text > data/${x}.${tgt_lang}.tmp/reclist2
+        comm -12 data/${x}.${tgt_lang}.tmp/reclist1 data/${x}.${tgt_lang}.tmp/reclist2 > data/${x}.en.tmp/reclist
+
+        for lang in ${tgt_lang} en; do
+            reduce_data_dir.sh data/${x}.${lang}.tmp data/${x}.en.tmp/reclist data/${x}.${lang}
+            utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${x}.${lang}
+        done
+        rm -rf data/${x}.*.tmp
+    done
+
+    # compute global CMVN statistics on the training set
+    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark
+
+    # dump features for training
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then  # only taken on *.clsp.jhu.edu hosts
+      utils/create_split_dir.pl \
+          /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_set}/delta${do_delta}/storage \
+          ${feat_tr_dir}/storage
+    fi
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
+      utils/create_split_dir.pl \
+          /export/b{14,15,16,17}/${USER}/espnet-data/egs/must_c/st1/dump/${train_dev}/delta${do_delta}/storage \
+          ${feat_dt_dir}/storage
+    fi
+    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
+        data/${train_set}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_set} ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+        data/${train_dev}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/${train_dev} ${feat_dt_dir}
+    for ttask in ${trans_set}; do  # test sets reuse the training-set cmvn.ark
+        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}; mkdir -p ${feat_trans_dir}
+        dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+            data/${ttask}/feats.scp data/${train_set}/cmvn.ark data/dump_feats/trans/${ttask} \
+            ${feat_trans_dir}
+    done
+fi
+
+dict=data/lang_1spm/${train_set}_${bpemode}${nbpe}_units_${tgt_case}.txt  # "<piece> <id>" vocabulary file
+nlsyms=data/lang_1spm/${train_set}_non_lang_syms_${tgt_case}.txt  # list of "&...;" symbols found in the text
+bpemodel=data/lang_1spm/${train_set}_${bpemode}${nbpe}_${tgt_case}  # sentencepiece --model_prefix
+echo "dictionary: ${dict}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+    echo "stage 2: Dictionary and Json Data Preparation"
+    mkdir -p data/lang_1spm/
+    export LC_ALL=C.UTF-8
+
+    echo "make a non-linguistic symbol list for all languages"
+    grep sp1.0 data/train_sp.en-${tgt_lang}.*/text.${tgt_case} | cut -f 2- -d' ' | grep -o -P '&[^;]*;'| sort | uniq > ${nlsyms}  # only lines of the sp1.0 copy are scanned
+    cat ${nlsyms}
+
+    echo "make a joint source and target dictionary"
+    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
+    offset=$(wc -l < ${dict})  # ids of the pieces appended below start at offset+1
+    grep sp1.0 data/train_sp.en-${tgt_lang}.${tgt_lang}/text.${tgt_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' > data/lang_1spm/input_${tgt_lang}.txt  # target-side text, utterance id dropped
+    grep sp1.0 data/train_sp.en-${tgt_lang}.en/text.${src_case} | cut -f 2- -d' ' | grep -v -e '^\s*$' >> data/lang_1spm/input_${tgt_lang}.txt  # source-side text appended to the same file
+    spm_train --user_defined_symbols="$(tr "\n" "," < ${nlsyms})" --input=data/lang_1spm/input_${tgt_lang}.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
+    spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_1spm/input_${tgt_lang}.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}  # one distinct piece per line, numbered after <unk>
+    wc -l ${dict}
+
+    echo "make json files"
+    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text data/${train_set}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
+        data/${train_set} ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text data/${train_dev}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
+        data/${train_dev} ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
+    for ttask in ${trans_set}; do
+        feat_trans_dir=${dumpdir}/${ttask}/delta${do_delta}
+        data2json.sh --feat ${feat_trans_dir}/feats.scp --text data/${ttask}/text.${tgt_case} --bpecode ${bpemodel}.model --lang ${tgt_lang} \
+            data/${ttask} ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.${tgt_case}.json
+    done
+    echo "update json (add source references)"
+    # update json (add source references); only the train and dev sets are updated
+    for x in ${train_set} ${train_dev}; do
+        feat_dir=${dumpdir}/${x}/delta${do_delta}
+        data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-${tgt_lang}.en  # e.g. train_sp.en-es.es -> data/train_sp.en-es.en
+        update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
+            ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json ${data_dir} ${dict}
+    done
+fi
+
+# Stage 3: convert the json of every data set into a manifest file.
+# The i-th data set in `x` is written to data/manifest.<tgt_lang>.<y[i]>.
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    x=(${train_set} ${train_dev} ${trans_set})
+    y=(train dev test)
+    # trans_set holds one entry per language code in tgt_lang, so `x` can be
+    # longer than `y`. Report that explicitly instead of dying on an unbound
+    # array element under `set -u`.
+    if [ ${#x[@]} -gt ${#y[@]} ]; then
+        echo "Error: ${#x[@]} data sets but only ${#y[@]} manifest names (${y[*]})" >&2
+        exit 1
+    fi
+    for (( i=0; i<${#x[@]}; ++i)); do
+        echo ${x[$i]} ${y[$i]}
+        feat_dir=${dumpdir}/${x[$i]}/delta${do_delta}
+        python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
+                --json-file ${feat_dir}/data_${bpemode}${nbpe}.${tgt_case}.json \
+                --manifest-file data/manifest.${tgt_lang}.${y[$i]}
+        echo "Process done for ${y[$i]} set from ${x[$i]}"
+    done
+fi
+
+
+echo "MuST-C ${tgt_lang} Data preparation done."
+exit 0
--- a/examples/mustc/st1/local/data_prep.sh
+++ b/examples/mustc/st1/local/data_prep.sh
@ -0,0 +1,163 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+export LC_ALL=C
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <src-dir>"
+    echo "e.g.: $0 /n/rd11/corpora_8/MUSTC_v1.0 target_lang"
+    exit 1;
+fi
+
+tgt_lang=$2
+
+for set in train dev tst-COMMON tst-HE; do  # one pass per corpus split
+    src=$1/en-${tgt_lang}/data/${set}  # input: split directory inside the corpus
+    dst=data/local/en-${tgt_lang}/${set}  # working directory for intermediate files
+
+    [ ! -d ${src} ] && echo "$0: no such directory ${src}" && exit 1;
+
+    wav_dir=${src}/wav
+    trans_dir=${src}/txt
+    yml=${trans_dir}/${set}.yaml
+    en=${trans_dir}/${set}.en
+    tgt=${trans_dir}/${set}.${tgt_lang}
+
+    mkdir -p ${dst} || exit 1;
+
+    [ ! -d ${wav_dir} ] && echo "$0: no such directory ${wav_dir}" && exit 1;
+    [ ! -d ${trans_dir} ] && echo "$0: no such directory ${trans_dir}" && exit 1;
+    [ ! -f ${yml} ] && echo "$0: expected file ${yml} to exist" && exit 1;
+    [ ! -f ${en} ] && echo "$0: expected file ${en} to exist" && exit 1;
+    [ ! -f ${tgt} ] && echo "$0: expected file ${tgt} to exist" && exit 1;
+
+    wav_scp=${dst}/wav.scp; [[ -f "${wav_scp}" ]] && rm ${wav_scp}  # this and the next lines delete outputs left by an earlier run
+    trans_en=${dst}/text.en; [[ -f "${trans_en}" ]] && rm ${trans_en}
+    trans_tgt=${dst}/text.${tgt_lang}; [[ -f "${trans_tgt}" ]] && rm ${trans_tgt}
+    utt2spk=${dst}/utt2spk; [[ -f "${utt2spk}" ]] && rm ${utt2spk}
+    spk2utt=${dst}/spk2utt; [[ -f "${spk2utt}" ]] && rm ${spk2utt}
+    segments=${dst}/segments; [[ -f "${segments}" ]] && rm ${segments}
+
+    # error check: the yaml must contain one "duration" line per transcript line and per translation line
+    n=$(cat ${yml} | grep duration | wc -l)
+    n_en=$(cat ${en} | wc -l)
+    n_tgt=$(cat ${tgt} | wc -l)
+    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
+    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;
+
+    # (1a) Transcriptions and translations preparation
+    # make basic transcription file (add segments info): build one utterance id per yaml segment
+    cp ${yml} ${dst}/.yaml0
+    grep duration ${dst}/.yaml0 > ${dst}/.yaml1
+    awk '{
+        duration=$3; offset=$5; spkid=$7;
+        gsub(",","",duration);
+        gsub(",","",offset);
+        gsub(",","",spkid);
+        gsub("spk.","",spkid);
+        duration=sprintf("%.7f", duration);
+        if ( duration < 0.2 ) extendt=sprintf("%.7f", (0.2-duration)/2);
+        else extendt=0;
+        offset=sprintf("%.7f", offset);
+        startt=offset-extendt;
+        endt=offset+duration+extendt;
+        printf("ted_%05d_%07.0f_%07.0f\n", spkid, int(1000*startt+0.5), int(1000*endt+0.5));
+    }' ${dst}/.yaml1 > ${dst}/.yaml2  # ids look like ted_<spk>_<start-ms>_<end-ms>
+    # NOTE: Extend the lengths of short utterances (< 0.2s) rather than exclude them
+
+    cp ${en} ${dst}/en.org
+    cp ${tgt} ${dst}/${tgt_lang}.org
+
+    for lang in en ${tgt_lang}; do  # same text pipeline for the source and the target side
+        # normalize punctuation
+        normalize-punctuation.perl -l ${lang} < ${dst}/${lang}.org > ${dst}/${lang}.norm
+
+        # lowercasing (.lc); the true-case (.tc) variant is a plain copy of the normalized text
+        lowercase.perl < ${dst}/${lang}.norm > ${dst}/${lang}.norm.lc
+        cp ${dst}/${lang}.norm ${dst}/${lang}.norm.tc
+
+        # remove punctuation
+        local/remove_punctuation.pl < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.rm
+
+        # tokenization
+        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.tc > ${dst}/${lang}.norm.tc.tok
+        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc > ${dst}/${lang}.norm.lc.tok
+        tokenizer.perl -l ${lang} -q < ${dst}/${lang}.norm.lc.rm > ${dst}/${lang}.norm.lc.rm.tok
+
+        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.tc.tok | sort > ${dst}/text.tc.${lang}  # "<utt-id> <text>" lines, sorted
+        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.tok | sort > ${dst}/text.lc.${lang}
+        paste -d " " ${dst}/.yaml2 ${dst}/${lang}.norm.lc.rm.tok | sort > ${dst}/text.lc.rm.${lang}
+
+        # save original and cleaned punctuation
+        lowercase.perl < ${dst}/${lang}.org | text2token.py -s 0 -n 1 | tr " " "\n" \
+            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.${lang}
+        lowercase.perl < ${dst}/${lang}.norm.tc | text2token.py -s 0 -n 1 | tr " " "\n" \
+            | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' > ${dst}/punctuation.clean.${lang}
+    done
+
+
+    # error check: tokenization must not have changed the number of lines
+    n=$(cat ${dst}/.yaml2 | wc -l)
+    n_en=$(cat ${dst}/en.norm.tc.tok | wc -l)
+    n_tgt=$(cat ${dst}/${tgt_lang}.norm.tc.tok | wc -l)
+    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1;
+    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1;
+
+
+    # (1c) Make segments files from transcript
+    #segments file format is: utt-id recording-id start-time end-time (seconds), e.g.:
+    #ted_00001_0003501_0003684 ted_00001 3.50 3.68
+    awk '{
+        segment=$1; split(segment,S,"[_]");
+        spkid=S[1] "_" S[2]; startf=S[3]; endf=S[4];
+        printf("%s %s %.2f %.2f\n", segment, spkid, startf/1000, endf/1000);
+    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/segments
+
+    awk '{
+        segment=$1; split(segment,S,"[_]");
+        spkid=S[1] "_" S[2];
+        printf("%s cat '${wav_dir}'/%s_%d.wav |\n", spkid, S[1], S[2]);
+    }' < ${dst}/text.tc.${tgt_lang} | uniq | sort > ${dst}/wav.scp  # "<recording-id> cat <wav_dir>/ted_<N>.wav |" (wav_dir is spliced in by the shell)
+
+    awk '{
+        segment=$1; split(segment,S,"[_]");
+        spkid=S[1] "_" S[2]; print $1 " " spkid
+    }' ${dst}/segments | uniq | sort > ${dst}/utt2spk  # the recording id doubles as the speaker id
+
+    cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort > ${dst}/spk2utt
+
+    # error check: source and target text must have the same number of lines
+    n_en=$(cat ${dst}/text.tc.en | wc -l)
+    n_tgt=$(cat ${dst}/text.tc.${tgt_lang} | wc -l)
+    [ ${n_en} -ne ${n_tgt} ] && echo "Warning: expected ${n_en} data data files, found ${n_tgt}" && exit 1;
+
+    # Copy stuff into its final locations [this has been moved from the format_data script]
+    mkdir -p data/${set}.en-${tgt_lang}
+
+    # remove duplicated utterances (the same offset); NOTE(review): the '1 ted' pattern also matches counts such as 11 or 21
+    echo "remove duplicate lines..."
+    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted' \
+        | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
+    cut -d ' ' -f 1 ${dst}/text.tc.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted' \
+        | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
+    reduce_data_dir.sh ${dst} ${dst}/reclist data/${set}.en-${tgt_lang}  # reclist holds the ids whose "uniq -c" line matched '1 ted'
+    for l in en ${tgt_lang}; do
+        for case in tc lc lc.rm; do
+            cp ${dst}/text.${case}.${l} data/${set}.en-${tgt_lang}/text.${case}.${l}
+        done
+    done
+    utils/fix_data_dir.sh --utt_extra_files \
+        "text.tc.en text.lc.en text.lc.rm.en text.tc.${tgt_lang} text.lc.${tgt_lang} text.lc.rm.${tgt_lang}" \
+        data/${set}.en-${tgt_lang}
+
+    # error check: every remaining segment needs exactly one target text line
+    n_seg=$(cat data/${set}.en-${tgt_lang}/segments | wc -l)
+    n_text=$(cat data/${set}.en-${tgt_lang}/text.tc.${tgt_lang} | wc -l)
+    [ ${n_seg} -ne ${n_text} ] && echo "Warning: expected ${n_seg} data data files, found ${n_text}" && exit 1;
+
+    echo "$0: successfully prepared data in ${dst}"
+done
--- a/examples/mustc/st1/local/divide_lang.sh
+++ b/examples/mustc/st1/local/divide_lang.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+. ./path.sh
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <set> <lang>>"
+    echo "e.g.: $0 dev"
+    exit 1
+fi
+
+set=$1
+lang=$2
+export LC_ALL=en_US.UTF-8
+# Copy stuff intoc its final locations [this has been moved from the format_data script]
+# for En
+mkdir -p data/${set}.en
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f data/${set}/${f} ]; then
+        sort data/${set}/${f} > data/${set}.en/${f}
+    fi
+done
+sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text  # dummy
+sort data/${set}/text.tc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.tc
+sort data/${set}/text.lc.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc
+sort data/${set}/text.lc.rm.en | sed $'s/[^[:print:]]//g' > data/${set}.en/text.lc.rm
+utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.en
+if [ -f data/${set}.en/feats.scp ]; then
+    utils/validate_data_dir.sh data/${set}.en || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav data/${set}.en || exit 1;
+fi
+
+# for target language
+mkdir -p data/${set}.${lang}
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f data/${set}/${f} ]; then
+        sort data/${set}/${f} > data/${set}.${lang}/${f}
+    fi
+done
+sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text  # dummy
+sort data/${set}/text.tc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.tc
+sort data/${set}/text.lc.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc
+sort data/${set}/text.lc.rm.${lang} | sed $'s/[^[:print:]]//g' > data/${set}.${lang}/text.lc.rm
+utils/fix_data_dir.sh --utt_extra_files "text.tc text.lc text.lc.rm" data/${set}.${lang}
+if [ -f data/${set}.${lang}/feats.scp ]; then
+    utils/validate_data_dir.sh data/${set}.${lang} || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav data/${set}.${lang} || exit 1;
+fi
--- a/examples/mustc/st1/local/remove_punctuation.pl
+++ b/examples/mustc/st1/local/remove_punctuation.pl
@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
+# Strip punctuation from UTF-8 text read on STDIN, one line at a time.
+# Apostrophes and the literal token "<space>" are kept; runs of whitespace
+# are collapsed to one blank and leading/trailing whitespace is removed.
+
+use warnings;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+while(<STDIN>) {
+  $_ = " $_ ";
+
+  # remove punctuation except apostrophe and "<space>" (kept for scoring).
+  # The protected tokens are matched by the first alternative and put back
+  # unchanged, so ordinary input words such as "apostrophe" or "spacemark"
+  # are not rewritten the way a placeholder round trip would rewrite them.
+  s/(<space>|')|[[:punct:]]/defined($1) ? $1 : ""/ge;
+
+  # normalize whitespace
+  s/\s+/ /g;
+  s/^\s+//;
+  s/\s+$//;
+
+  print "$_\n";
+}
--- a/examples/mustc/st1/local/test.sh
+++ b/examples/mustc/st1/local/test.sh
@ -0,0 +1,48 @@
+#! /usr/bin/env bash
+
+# Decode with a trained model and report detokenized BLEU.
+# Arguments: config_path decode_config_path ckpt_path_prefix lang
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix lang"
+    exit -1
+fi
+
+# number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when it is empty)
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+tgt_lang=$4
+
+for type in fullsentence; do
+    echo "decoding ${type}"
+    python3 -u ${BIN_DIR}/test.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type}
+
+    # must directly follow the python3 command: $? is its exit status
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+    echo $PATH
+    python3 ${MAIN_ROOT}/utils/rsl2trn.py --rsl ${ckpt_prefix}.${type}.rsl \
+                            --hyp ${ckpt_prefix}.${type}.hyp \
+                            --ref ${ckpt_prefix}.${type}.ref
+    if ! which tokenizer.perl > /dev/null; then
+        echo "Error: it seems that moses is not installed." >&2
+        echo "Error: please install moses as follows." >&2
+        echo "Error: cd ${MAIN_ROOT}/tools && make moses.done" >&2
+        # this script is executed, not sourced, so `return` would only print
+        # an error and fall through to the detokenizer calls
+        exit 1
+    fi
+    detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.hyp > ${ckpt_prefix}.${type}.hyp.detok
+    detokenizer.perl -l ${tgt_lang} -q < ${ckpt_prefix}.${type}.ref > ${ckpt_prefix}.${type}.ref.detok
+    echo "Detokenized BLEU:"
+    sacrebleu ${ckpt_prefix}.${type}.ref.detok -i ${ckpt_prefix}.${type}.hyp.detok
+
+
+done
+
+exit 0
--- a/examples/mustc/st1/local/train.sh
+++ b/examples/mustc/st1/local/train.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Train a model and write its outputs to exp/<ckpt_name>.
+# Arguments: config_path ckpt_name ckpt_path
+# Exit status: 1 when train.py fails, 0 otherwise.
+if [ $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+    exit -1
+fi
+
+# number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when it is empty)
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+ckpt_path=$3
+
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=0
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+fi
+
+python3 -u ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--checkpoint_path "${ckpt_path}" \
+--seed ${seed}
+# Save the status right away: the `if` statement below resets $?, so
+# checking $? after it would never see a training failure.
+train_status=$?
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
+
+if [ ${train_status} -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
--- a/examples/mustc/st1/path.sh
+++ b/examples/mustc/st1/path.sh
@ -0,0 +1,29 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`  # three levels above the current working directory
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${MAIN_ROOT}/tools/moses/scripts/tokenizer:${PATH}  # puts the moses tokenizer scripts on PATH
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8 
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+if ! which tokenizer.perl > /dev/null; then  # tokenizer.perl is used as the marker that moses is available
+    echo "Error: moses is required in this example." >&2
+    echo "Error: it seems that moses is not installed." >&2
+    echo "Error: please install moses as follows." >&2
+    echo "Error: cd ${MAIN_ROOT}/tools && git clone https://github.com/moses-smt/mosesdecoder.git moses" >&2
+    return 1  # "return", not "exit": run.sh sources this file with ". ./path.sh"
+fi
+
+MODEL=u2_st
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin  # directory of the train.py / test.py entry points used by local/*.sh
+
+# Kaldi: its environment files are sourced only when they exist; a missing common_path.sh just prints a warning
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
--- a/examples/mustc/st1/run.sh
+++ b/examples/mustc/st1/run.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+set -e
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+
+gpus=0,1,2,3
+stage=0
+stop_stage=3
+conf_path=conf/transformer_es.yaml
+decode_conf_path=conf/tuning/decode.yaml
+must_c_path=
+lang=es
+avg_num=5
+ckpt_path= #  (finetune from FAT-ST or ASR pretrained model)
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh --tgt_lang ${lang} --must_c ${must_c_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path}  ${ckpt} "${ckpt_path}" 
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num} 
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
+fi
--- a/examples/mustc/st1/steps
+++ b/examples/mustc/st1/steps
@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/steps
--- a/examples/mustc/st1/utils
+++ b/examples/mustc/st1/utils
@ -0,0 +1 @@
+../../../tools/kaldi/egs/wsj/s5/utils
--- a/examples/ted_en_zh/st1/local/data.sh
+++ b/examples/ted_en_zh/st1/local/data.sh
@ -198,10 +198,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    x=(${feat_tr_dir} ${feat_dt_dir} ${feat_trans_dir})
+    y=(train dev test)
    echo "stage 3: Format the Json Data"
-    python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
-    python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
-    python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
+    for (( i=0; i<${#x[*]}; ++i)); do
+        # x[i] is a feature dump dir, y[i] the manifest suffix; each line of the command but the last must end in "\"
+        python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
+         --json-file ${x[$i]}/data_${bpemode}${nbpe}.json \
+         --manifest-file data/manifest.${y[$i]}
+    done
 fi
 echo "Ted En-Zh Data preparation done."
 exit 0
--- a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
+++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py