refactor ds2, cli, server

pull/1997/head
huangyuxin 3 years ago
parent 0fa32e4aae
commit 47dd61e5b2

@@ -15,50 +15,53 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
+num_rnn_layers: 5
 rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
+rnn_direction: bidirect # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
+use_gru: False
 blank_id: 0
+ctc_grad_norm_type: instance

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 80
+n_epoch: 50
 accum_grad: 1
-lr: 2.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
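
The refactored dataloader section follows an ESPnet-style batching scheme: maxlen_in and maxlen_out cap input frames and output tokens per utterance, and batches that contain longer utterances get a proportionally smaller batch size. A minimal sketch of that rule, illustrative only; the actual logic lives in paddlespeech.s2t.io.dataloader.BatchDataLoader and its batching helpers:

```python
# Sketch of the maxlen_in / maxlen_out rule: the longer the utterance relative
# to the caps, the smaller the effective batch size (assumed simplification of
# the real BatchDataLoader behaviour).
def reduced_batch_size(batch_size: int,
                       ilen: int,
                       olen: int,
                       maxlen_in: int = 512,
                       maxlen_out: int = 150,
                       min_batch_size: int = 1) -> int:
    factor = max(int(ilen / maxlen_in), int(olen / maxlen_out))
    return max(min_batch_size, int(batch_size / (1 + factor)))

print(reduced_batch_size(64, ilen=400, olen=30))   # short utterance -> 64
print(reduced_batch_size(64, ilen=600, olen=30))   # 600 frames -> 32
```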

@@ -15,28 +15,26 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear #linear, mfcc, fbank
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #

@@ -54,12 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 65
+n_epoch: 30
 accum_grad: 1
 lr: 5.0e-4
 lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
+dist_sampler: False
 log_interval: 100
 checkpoint:
   kbest_n: 50

@@ -2,9 +2,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 1.9
-beta: 5.0
-beam_size: 300
+alpha: 2.2
+beta: 4.3
+beam_size: 500
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10
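
In decode.yaml, alpha weights the external KenLM language model, beta is the word-insertion bonus, and beam_size is the number of prefixes kept per step of CTC beam search. A hedged sketch of how the two knobs enter a candidate's ranking score; the real scorer is the native ctc_decoders extension bundled with PaddleSpeech:

```python
# Sketch of LM-augmented CTC beam search scoring (illustrative formula, not the
# bundled decoder implementation).
def lm_augmented_score(log_p_ctc: float,
                       log_p_lm: float,
                       word_count: int,
                       alpha: float = 2.2,
                       beta: float = 4.3) -> float:
    # acoustic score + weighted LM score + insertion bonus, used to rank beams
    return log_p_ctc + alpha * log_p_lm + beta * word_count

print(lm_augmented_score(-12.3, -4.1, 5))
```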

@@ -33,12 +33,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --stride_ms=10 \
-    --window_ms=20 \
+    --window_ms=25 \
     --sample_rate=16000 \
-    --use_dB_normalization=True \
+    --use_dB_normalization=False \
     --num_samples=2000 \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
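
compute_mean_std.py estimates per-dimension normalization statistics over a sample of training utterances, now computed on fbank features without dB normalization. A rough sketch of the underlying computation, with random data standing in for real features; the exact JSON layout written by the script may differ:

```python
# Sketch of the CMVN statistics behind data/mean_std.json (assumed layout).
import numpy as np

rng = np.random.default_rng(0)
feats = [rng.standard_normal((int(rng.integers(100, 300)), 161)) for _ in range(200)]
stacked = np.concatenate(feats, axis=0)   # [total_frames, feat_dim=161]
mean = stacked.mean(axis=0)
std = stacked.std(axis=0)
print(mean.shape, std.shape)              # (161,) (161,), applied as (x - mean) / std
```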

@@ -7,8 +7,7 @@ stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml    #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline # offline or online
+avg_num=10
 audio_file=data/demo_01_03.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
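
avg_num controls how many of the saved checkpoints are parameter-averaged into the avg_N checkpoint that the later test and export stages consume. A hedged sketch of that averaging; the shipped utils/avg_model.py is the authoritative implementation, and the file names below are illustrative:

```python
# Sketch of N-checkpoint parameter averaging (assumed paths; real logic in
# utils/avg_model.py).
import paddle

def average_checkpoints(paths):
    avg = None
    for path in paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype('float64') for k, v in state.items()}
        else:
            for k, v in state.items():
                avg[k] += v.astype('float64')
    return {k: (v / len(paths)).astype('float32') for k, v in avg.items()}

ckpts = [f"exp/deepspeech2/checkpoints/epoch_{i}.pdparams" for i in range(41, 51)]
# paddle.save(average_checkpoints(ckpts), "exp/deepspeech2/checkpoints/avg_10.pdparams")
```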

@@ -15,51 +15,51 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 20
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
+rnn_direction: bidirect
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
-share_rnn_weights: True
 blank_id: 0

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
+n_epoch: 15
 accum_grad: 1
-lr: 1.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5

@@ -15,39 +15,36 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 15
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
 rnn_direction: forward
-num_fc_layers: 2
-fc_layers_size_list: 512, 256
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
 blank_id: 0

@@ -55,13 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
-accum_grad: 4
-lr: 1.0e-3
-lr_decay: 0.83
+n_epoch: 65
+accum_grad: 1
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5

@@ -49,12 +49,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
    --num_samples=2000 \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10 \
-    --window_ms=20 \
-    --use_dB_normalization=True \
+    --window_ms=25 \
+    --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"

@@ -4,6 +4,8 @@ if [ $# != 4 ];then
     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
+stage=0
+stop_stage=100

 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

@@ -19,17 +21,44 @@ if [ $? -ne 0 ]; then
     exit 1
 fi

-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # format the reference test file
+    python utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref data/manifest.test-clean.text
+
+    python3 -u ${BIN_DIR}/test.py \
+        --ngpu ${ngpu} \
+        --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
+        --result_file ${ckpt_prefix}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --model_type ${model_type}
+    if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
     exit 1
+    fi
+
+    python utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp ${ckpt_prefix}.rsl.text
+
+    python utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
+fi
+
+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    python utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref_sclite data/manifest.test.text-clean.sclite
+    python utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
+    mkdir -p ${ckpt_prefix}_sclite
+    sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
 fi
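
compute-wer.py with --char=1 scores the hypothesis against the reference at character level, which for Chinese and English character units amounts to CER: edit distance divided by reference length. A minimal sketch of that metric:

```python
# Character error rate as Levenshtein distance over reference length
# (illustrative; the bundled compute-wer.py also prints alignments and counts).
def error_rate(ref: str, hyp: str) -> float:
    r, h = list(ref), list(hyp)
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / max(1, len(r))

print(error_rate("my computer is deep learning", "my computer deep learning"))  # about 0.11
```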

@@ -2,13 +2,12 @@
 set -e
 source path.sh

-gpus=0,1,2,3,4,5,6,7
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=30
-model_type=offline
+avg_num=5
 audio_file=data/demo_002_en.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@@ -43,6 +42,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
+fi
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi

@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
             tag = model_type + '-' + lang + '-' + sample_rate_str
             self.task_resource.set_task_model(tag, version=None)
             self.res_path = self.task_resource.res_dir
             self.cfg_path = os.path.join(
                 self.res_path, self.task_resource.res_dict['cfg_path'])
             self.ckpt_path = os.path.join(

@@ -158,15 +159,18 @@ class ASRExecutor(BaseExecutor):
         self.config.merge_from_file(self.cfg_path)

         with UpdateConfig(self.config):
-            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
-                self.vocab = self.config.vocab_filepath
+            if self.config.spm_model_prefix:
+                self.config.spm_model_prefix = os.path.join(
+                    self.res_path, self.config.spm_model_prefix)
+            self.text_feature = TextFeaturizer(
+                unit_type=self.config.unit_type,
+                vocab=self.config.vocab_filepath,
+                spm_model_prefix=self.config.spm_model_prefix)
+            if "deepspeech2" in model_type:
                 self.config.decode.lang_model_path = os.path.join(
                     MODEL_HOME, 'language_model',
                     self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)

                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(

@@ -174,12 +178,6 @@ class ASRExecutor(BaseExecutor):
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)

             elif "conformer" in model_type or "transformer" in model_type:
-                self.config.spm_model_prefix = os.path.join(
-                    self.res_path, self.config.spm_model_prefix)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.decoding_method = decode_method

             else:

@@ -222,19 +220,7 @@ class ASRExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)

         # Get the object for feature extraction
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-            audio, _ = self.collate_fn_test.process_utterance(
-                audio_file=audio_file, transcript=" ")
-            audio_len = audio.shape[0]
-            audio = paddle.to_tensor(audio, dtype='float32')
-            audio_len = paddle.to_tensor(audio_len)
-            audio = paddle.unsqueeze(audio, axis=0)
-            # vocab_list = collate_fn_test.vocab_list
-            self._inputs["audio"] = audio
-            self._inputs["audio_len"] = audio_len
-            logger.info(f"audio feat shape: {audio.shape}")
-        elif "conformer" in model_type or "transformer" in model_type:
+        if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type:
             logger.info("get the preprocess conf")
             preprocess_conf = self.config.preprocess_config
             preprocess_args = {"train": False}

@@ -242,7 +228,6 @@ class ASRExecutor(BaseExecutor):
             logger.info("read the audio file")
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
             if self.change_format:
                 if audio.shape[1] >= 2:
                     audio = audio.mean(axis=1, dtype=np.int16)

@@ -285,7 +270,7 @@ class ASRExecutor(BaseExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             self.model.decoder.init_decoder(
                 decode_batch_size, self.text_feature.vocab_list,
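
With the branches unified, every model type now goes through the same read-and-preprocess path: the wav is loaded as int16, down-mixed to mono if needed, and then handed to the model-specific preprocess pipeline. A small sketch of just the audio-loading step, with an illustrative file name:

```python
# Sketch of the shared audio-loading step (mirrors the soundfile usage above).
import numpy as np
import soundfile

audio, sample_rate = soundfile.read(
    "data/demo_01_03.wav", dtype="int16", always_2d=True)
if audio.shape[1] >= 2:
    audio = audio.mean(axis=1, dtype=np.int16)  # stereo -> mono
else:
    audio = audio[:, 0]
print(audio.shape, sample_rate)
```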

@@ -23,7 +23,7 @@ model_alias = {
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],

@@ -136,9 +136,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_wenetspeech-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'e393d4d274af0f6967db24fc146e8074',
+            'd1be86a3e786042ab64f05161b5fae62',
             'cfg_path':
             'model.yaml',
             'ckpt_path':

@@ -152,13 +152,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':

@@ -168,9 +168,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '98b87b171b7240b7cae6e07d8d0bc9be',
+            'df5ddeac8b679a470176649ac4b78726',
             'cfg_path':
             'model.yaml',
             'ckpt_path':

@@ -188,13 +188,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_librispeech-en-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'f5666c81ad015c8de03aac2bc92e5762',
+            'ed9e2b008a65268b3484020281ab048c',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_5',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
             'lm_md5':

@@ -207,17 +207,17 @@ asr_static_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'model':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdmodel',
             'params':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdiparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
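
Each pretrained-model entry pairs a download URL with an md5 checksum and in-archive paths for config, checkpoint and language model. A hedged sketch of the typical consume path, download, verify, unpack; PaddleSpeech's own download utilities handle this, and the helper below is illustrative:

```python
# Download a model tarball, check its md5 and unpack it (assumed helper, not
# the PaddleSpeech downloader itself).
import hashlib
import tarfile
import urllib.request

def fetch_and_check(url: str, md5: str, target: str) -> None:
    urllib.request.urlretrieve(url, target)
    with open(target, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    if digest != md5:
        raise RuntimeError(f"md5 mismatch: {digest} != {md5}")
    with tarfile.open(target) as tar:
        tar.extractall(path=target + ".dir")
```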

@@ -35,8 +35,6 @@ if __name__ == "__main__":
     # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help="offline/online")
     parser.add_argument(
         '--nxpu',
         type=int,

@@ -44,7 +42,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args)

     # https://yaml.org/type/float.html

@@ -32,8 +32,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")

@@ -45,7 +43,6 @@ if __name__ == "__main__":
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -38,8 +38,6 @@ if __name__ == "__main__":
     #load jit model from
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,

@@ -50,7 +48,6 @@ if __name__ == "__main__":
         "--enable-auto-log", action="store_true", help="use auto log")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -23,7 +23,6 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils import mp_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint

@@ -113,12 +112,7 @@ class DeepSpeech2Tester_hub():
         config.input_dim = self.collate_fn_test.feature_size
         config.output_dim = self.collate_fn_test.vocab_size

-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)

         self.model = model

@@ -172,8 +166,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument("--audio_file", type=str, help='audio file path')
     # save asr result to
     parser.add_argument(

@@ -184,7 +176,6 @@ if __name__ == "__main__":
         print("Please input the audio file path")
         sys.exit(-1)
     check(args.audio_file)
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -31,8 +31,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,

@@ -40,7 +38,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args, globals())

     # https://yaml.org/type/float.html

@@ -23,16 +23,12 @@ import paddle
 from paddle import distributed as dist
 from paddle import inference
 from paddle.io import DataLoader
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.training.reporter import report
 from paddlespeech.s2t.training.timer import Timer
@@ -136,18 +132,13 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         with UpdateConfig(config):
             if self.train:
-                config.input_dim = self.train_loader.collate_fn.feature_size
-                config.output_dim = self.train_loader.collate_fn.vocab_size
+                config.input_dim = self.train_loader.feat_dim
+                config.output_dim = self.train_loader.vocab_size
             else:
-                config.input_dim = self.test_loader.collate_fn.feature_size
-                config.output_dim = self.test_loader.collate_fn.vocab_size
-
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+                config.input_dim = self.test_loader.feat_dim
+                config.output_dim = self.test_loader.vocab_size
+
+        model = DeepSpeech2Model.from_config(config)

         if self.parallel:
             model = paddle.DataParallel(model)
@@ -175,76 +166,81 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         config.defrost()
         if self.train:
-            # train
-            config.manifest = config.train_manifest
-            train_dataset = ManifestDataset.from_config(config)
-            if self.parallel:
-                batch_sampler = SortagradDistributedBatchSampler(
-                    train_dataset,
-                    batch_size=config.batch_size,
-                    num_replicas=None,
-                    rank=None,
-                    shuffle=True,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-            else:
-                batch_sampler = SortagradBatchSampler(
-                    train_dataset,
-                    shuffle=True,
-                    batch_size=config.batch_size,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-
-            config.keep_transcription_text = False
-            collate_fn_train = SpeechCollator.from_config(config)
-            self.train_loader = DataLoader(
-                train_dataset,
-                batch_sampler=batch_sampler,
-                collate_fn=collate_fn_train,
-                num_workers=config.num_workers)
-
-            # dev
-            config.manifest = config.dev_manifest
-            dev_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = False
-            collate_fn_dev = SpeechCollator.from_config(config)
-            self.valid_loader = DataLoader(
-                dev_dataset,
-                batch_size=int(config.batch_size),
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_dev,
-                num_workers=config.num_workers)
+            # train/valid dataset, return token ids
+            self.train_loader = BatchDataLoader(
+                json_file=config.train_manifest,
+                train_mode=True,
+                sortagrad=config.sortagrad,
+                batch_size=config.batch_size,
+                maxlen_in=config.maxlen_in,
+                maxlen_out=config.maxlen_out,
+                minibatches=config.minibatches,
+                mini_batch_size=self.args.ngpu,
+                batch_count=config.batch_count,
+                batch_bins=config.batch_bins,
+                batch_frames_in=config.batch_frames_in,
+                batch_frames_out=config.batch_frames_out,
+                batch_frames_inout=config.batch_frames_inout,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
+
+            self.valid_loader = BatchDataLoader(
+                json_file=config.dev_manifest,
+                train_mode=False,
+                sortagrad=False,
+                batch_size=config.batch_size,
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=self.args.ngpu,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
             logger.info("Setup train/valid Dataloader!")
         else:
-            # test
-            config.manifest = config.test_manifest
-            test_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = True
-            collate_fn_test = SpeechCollator.from_config(config)
             decode_batch_size = config.get('decode', dict()).get(
                 'decode_batch_size', 1)
-            self.test_loader = DataLoader(
-                test_dataset,
+            # test dataset, return raw text
+            self.test_loader = BatchDataLoader(
+                json_file=config.test_manifest,
+                train_mode=False,
+                sortagrad=False,
                 batch_size=decode_batch_size,
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_test,
-                num_workers=config.num_workers)
-            logger.info("Setup test Dataloader!")
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=1,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=1,
+                subsampling_factor=1,
+                num_encs=1)
+            logger.info("Setup test/align Dataloader!")
 class DeepSpeech2Tester(DeepSpeech2Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type, vocab=None)
+            unit_type=config.unit_type,
+            vocab=config.vocab_filepath)
+        self.vocab_list = self._text_featurizer.vocab_list

     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """
@@ -252,7 +248,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         for text, n in zip(texts, texts_len):
             n = n.numpy().item()
             ids = text[:n]
-            trans.append(''.join([chr(i) for i in ids]))
+            #trans.append(''.join([chr(i) for i in ids]))
+            trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans

     def compute_metrics(self,
@@ -307,8 +304,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        decode_batch_size = self.test_loader.batch_size
+        vocab_list = self.vocab_list
+        decode_batch_size = decode_cfg.decode_batch_size
         self.model.decoder.init_decoder(
             decode_batch_size, vocab_list, decode_cfg.decoding_method,
             decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
@@ -338,17 +335,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     @paddle.no_grad()
     def export(self):
-        if self.args.model_type == 'offline':
-            infer_model = DeepSpeech2InferModel.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        elif self.args.model_type == 'online':
-            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        else:
-            raise Exception("wrong model type")
+        infer_model = DeepSpeech2InferModel.from_pretrained(
+            self.test_loader, self.config, self.args.checkpoint_path)
         infer_model.eval()
-        feat_dim = self.test_loader.collate_fn.feature_size
         static_model = infer_model.export()
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)
@@ -376,10 +365,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        if self.args.model_type == "online":
+        vocab_list = self.vocab_list
+        if self.config.rnn_direction == "forward":
             decode_batch_size = 1
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             decode_batch_size = self.test_loader.batch_size
         else:
             raise Exception("wrong model type")
@@ -412,11 +401,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         self.model.decoder.del_decoder()

     def compute_result_transcripts(self, audio, audio_len):
-        if self.args.model_type == "online":
+        if self.config.rnn_direction == "forward":
             output_probs, output_lens, trans_batch = self.static_forward_online(
                 audio, audio_len, decoder_chunk_size=1)
             result_transcripts = [trans[-1] for trans in trans_batch]
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             output_probs, output_lens = self.static_forward_offline(audio,
                                                                     audio_len)
             batch_size = output_probs.shape[0]
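
Earlier in this file the tester now converts decoded token ids back to text through TextFeaturizer.defeaturize instead of chr(). A toy sketch of that lookup; the vocabulary shown is illustrative, the real one comes from vocab.txt:

```python
# Map token ids back to a transcript through a vocabulary list (toy example of
# the defeaturize step).
vocab_list = ["<blank>", "d", "e", "p", "s", " "]

def defeaturize(ids):
    return "".join(vocab_list[i] for i in ids)

print(defeaturize([1, 2, 2, 3]))  # -> "deep"
```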

@@ -11,161 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle import nn
-from paddle.nn import functional as F
-
-from paddlespeech.s2t.modules.activation import brelu
-from paddlespeech.s2t.modules.mask import make_non_pad_mask
-from paddlespeech.s2t.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-__all__ = ['ConvStack', "conv_output_size"]
+import paddle
+
+from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
+
+
+class Conv2dSubsampling4Pure(Conv2dSubsampling4):
+    def __init__(self, idim: int, odim: int, dropout_rate: float):
+        super().__init__(idim, odim, dropout_rate, None)
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+        self.receptive_field_length = 2 * (
+            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kerel_size_1
+
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
-
-def conv_output_size(I, F, P, S):
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
# Output size after Conv:
# By noting I the length of the input volume size,
# F the length of the filter,
# P the amount of zero padding,
# S the stride,
# then the output size O of the feature map along that dimension is given by:
# O = (I - F + Pstart + Pend) // S + 1
# When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
# When Pstart == Pend == 0
# O = (I - F - S) // S
# https://iq.opengenus.org/output-size-of-convolution/
# Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
# Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
return (I - F + 2 * P - S) // S
# receptive field calculator
# https://fomoro.com/research/article/receptive-field-calculator
# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
# https://distill.pub/2019/computing-receptive-fields/
# Rl-1 = Sl * Rl + (Kl - Sl)
class ConvBn(nn.Layer):
"""Convolution layer with batch normalization.
:param kernel_size: The x dimension of a filter kernel. Or input a tuple for
two image dimension.
:type kernel_size: int|tuple|list
:param num_channels_in: Number of input channels.
:type num_channels_in: int
:param num_channels_out: Number of output channels.
:type num_channels_out: int
:param stride: The x dimension of the stride. Or input a tuple for two
image dimension.
:type stride: int|tuple|list
:param padding: The x dimension of the padding. Or input a tuple for two
image dimension.
:type padding: int|tuple|list
:param act: Activation type, relu|brelu
:type act: string
:return: Batch norm layer after convolution layer.
:rtype: Variable
"""
def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
padding, act):
super().__init__()
assert len(kernel_size) == 2
assert len(stride) == 2
assert len(padding) == 2
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.conv = nn.Conv2D(
num_channels_in,
num_channels_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
weight_attr=None,
bias_attr=False,
data_format='NCHW')
self.bn = nn.BatchNorm2D(
num_channels_out,
weight_attr=None,
bias_attr=None,
data_format='NCHW')
self.act = F.relu if act == 'relu' else brelu
def forward(self, x, x_len):
"""
x(Tensor): audio, shape [B, C, D, T]
"""
         x = self.conv(x)
-        x = self.bn(x)
-        x = self.act(x)
+        #b, c, t, f = paddle.shape(x)  #not work under jit
+        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
+        x_len = ((x_len - 1) // 2 - 1) // 2
x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
) // self.stride[1] + 1
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
return x, x_len
class ConvStack(nn.Layer):
"""Convolution group with stacked convolution layers.
:param feat_size: audio feature dim.
:type feat_size: int
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
"""
def __init__(self, feat_size, num_stacks):
super().__init__()
self.feat_size = feat_size # D
self.num_stacks = num_stacks
self.conv_in = ConvBn(
num_channels_in=1,
num_channels_out=32,
kernel_size=(41, 11), #[D, T]
stride=(2, 3),
padding=(20, 5),
act='brelu')
out_channel = 32
convs = [
ConvBn(
num_channels_in=32,
num_channels_out=out_channel,
kernel_size=(21, 11),
stride=(2, 1),
padding=(10, 5),
act='brelu') for i in range(num_stacks - 1)
]
self.conv_stack = nn.LayerList(convs)
# conv output feat_dim
output_height = (feat_size - 1) // 2 + 1
for i in range(self.num_stacks - 1):
output_height = (output_height - 1) // 2 + 1
self.output_height = out_channel * output_height
def forward(self, x, x_len):
"""
x: shape [B, C, D, T]
x_len : shape [B]
"""
x, x_len = self.conv_in(x, x_len)
for i, conv in enumerate(self.conv_stack):
x, x_len = conv(x, x_len)
        return x, x_len
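
Conv2dSubsampling4Pure reuses the two stride-2 convolutions of Conv2dSubsampling4 but, as its forward shows, returns the flattened convolution output directly instead of going through the base class's output head, so both time and feature length shrink by roughly a factor of four. The constructor's arithmetic, checked with the 161-dimensional features used in these configs:

```python
# Plain integer math mirroring the formulas in Conv2dSubsampling4Pure above.
idim, odim = 161, 32
output_dim = ((idim - 1) // 2 - 1) // 2 * odim            # 1248
receptive_field_length = 2 * (3 - 1) + 3                  # 7 input frames per output frame

def subsampled_len(x_len: int) -> int:
    return ((x_len - 1) // 2 - 1) // 2

print(output_dim, receptive_field_length)
print(subsampled_len(200))                                # 200 frames -> 49
```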

@@ -13,15 +13,14 @@
 # limitations under the License.
 """Deepspeech2 ASR Model"""
 import paddle
+import paddle.nn.functional as F
 from paddle import nn

-from paddlespeech.s2t.models.ds2.conv import ConvStack
-from paddlespeech.s2t.models.ds2.rnn import RNNStack
+from paddlespeech.s2t.models.ds2.conv import Conv2dSubsampling4Pure
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils import layer_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
 from paddlespeech.s2t.utils.log import Log

 logger = Log(__name__).getlog()

 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
@@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True):
+                 rnn_direction='forward',
+                 num_fc_layers=2,
+                 fc_layers_size_list=[512, 256],
+                 use_gru=False):
         super().__init__()
         self.rnn_size = rnn_size
         self.feat_size = feat_size  # 161 for linear
         self.dict_size = dict_size
-
-        self.conv = ConvStack(feat_size, num_conv_layers)
-
-        i_size = self.conv.output_height  # H after conv stack
-        self.rnn = RNNStack(
-            i_size=i_size,
-            h_size=rnn_size,
-            num_stacks=num_rnn_layers,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+        self.num_rnn_layers = num_rnn_layers
+        self.num_fc_layers = num_fc_layers
+        self.rnn_direction = rnn_direction
+        self.fc_layers_size_list = fc_layers_size_list
+        self.use_gru = use_gru
+        self.conv = Conv2dSubsampling4Pure(feat_size, 32, dropout_rate=0.0)
+
+        self.output_dim = self.conv.output_dim
+
+        i_size = self.conv.output_dim
self.rnn = nn.LayerList()
self.layernorm_list = nn.LayerList()
self.fc_layers_list = nn.LayerList()
if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
layernorm_size = 2 * rnn_size
elif rnn_direction == 'forward':
layernorm_size = rnn_size
else:
raise Exception("Wrong rnn direction")
for i in range(0, num_rnn_layers):
if i == 0:
rnn_input_size = i_size
else:
rnn_input_size = layernorm_size
if use_gru is True:
self.rnn.append(
nn.GRU(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
else:
self.rnn.append(
nn.LSTM(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
self.layernorm_list.append(nn.LayerNorm(layernorm_size))
self.output_dim = layernorm_size
fc_input_size = layernorm_size
for i in range(self.num_fc_layers):
self.fc_layers_list.append(
nn.Linear(fc_input_size, fc_layers_size_list[i]))
fc_input_size = fc_layers_size_list[i]
self.output_dim = fc_layers_size_list[i]
     @property
     def output_size(self):
-        return self.rnn_size * 2
+        return self.output_dim

-    def forward(self, audio, audio_len):
+    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
         """Compute Encoder outputs

         Args:
-            audio (Tensor): [B, Tmax, D]
-            text (Tensor): [B, Umax]
-            audio_len (Tensor): [B]
-            text_len (Tensor): [B]
-        Returns:
+            x (Tensor): [B, T, D]
+            x_lens (Tensor): [B]
+            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+        Return:
             x (Tensor): encoder outputs, [B, T, D]
             x_lens (Tensor): encoder length, [B]
+            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
         """
-        # [B, T, D] -> [B, D, T]
-        audio = audio.transpose([0, 2, 1])
-        # [B, D, T] -> [B, C=1, D, T]
-        x = audio.unsqueeze(1)
-        x_lens = audio_len
+        if init_state_h_box is not None:
+            init_state_list = None
+            if self.use_gru is True:
+                init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_list = init_state_h_list
else:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_c_list = paddle.split(
init_state_c_box, self.num_rnn_layers, axis=0)
init_state_list = [(init_state_h_list[i], init_state_c_list[i])
for i in range(self.num_rnn_layers)]
else:
init_state_list = [None] * self.num_rnn_layers
# convolution group
        x, x_lens = self.conv(x, x_lens)
final_chunk_state_list = []
for i in range(0, self.num_rnn_layers):
x, final_state = self.rnn[i](x, init_state_list[i],
x_lens) #[B, T, D]
final_chunk_state_list.append(final_state)
x = self.layernorm_list[i](x)
for i in range(self.num_fc_layers):
x = self.fc_layers_list[i](x)
x = F.relu(x)
if self.use_gru is True:
final_chunk_state_h_box = paddle.concat(
final_chunk_state_list, axis=0)
final_chunk_state_c_box = init_state_c_box
else:
final_chunk_state_h_list = [
final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
]
final_chunk_state_c_list = [
final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
]
final_chunk_state_h_box = paddle.concat(
final_chunk_state_h_list, axis=0)
final_chunk_state_c_box = paddle.concat(
final_chunk_state_c_list, axis=0)
return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box
def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
"""Compute Encoder outputs
-        # convert data from convolution feature map to sequence of vectors
-        #B, C, D, T = paddle.shape(x)  # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
-        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
-
-        # remove padding part
-        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
-        return x, x_lens
+        Args:
+            x (Tensor): [B, T, D]
+            x_lens (Tensor): [B]
+            decoder_chunk_size: The chunk size of decoder
+        Returns:
+            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
+            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
+            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
subsampling_rate = self.conv.subsampling_rate
receptive_field_length = self.conv.receptive_field_length
chunk_size = (decoder_chunk_size - 1
) * subsampling_rate + receptive_field_length
chunk_stride = subsampling_rate * decoder_chunk_size
max_len = x.shape[1]
assert (chunk_size <= max_len)
eouts_chunk_list = []
eouts_chunk_lens_list = []
if (max_len - chunk_size) % chunk_stride != 0:
padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
else:
padding_len = 0
padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
padded_x = paddle.concat([x, padding], axis=1)
num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
num_chunk = int(num_chunk)
chunk_state_h_box = None
chunk_state_c_box = None
final_state_h_box = None
final_state_c_box = None
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
x_chunk = padded_x[:, start:end, :]
x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
paddle.zeros_like(x_lens),
x_lens - i * chunk_stride)
x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
x_len_left, x_chunk_len_tmp)
eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
eouts_chunk_list.append(eouts_chunk)
eouts_chunk_lens_list.append(eouts_chunk_lens)
final_state_h_box = chunk_state_h_box
final_state_c_box = chunk_state_c_box
return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
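
forward_chunk_by_chunk slices the padded input into windows sized so that, after the 4x subsampling, each window contributes exactly decoder_chunk_size encoder frames, while the RNN states are carried across windows. The window geometry, assuming subsampling_rate 4 and the receptive field of 7 from Conv2dSubsampling4Pure:

```python
# Chunk geometry used by forward_chunk_by_chunk (pure arithmetic check).
subsampling_rate = 4
receptive_field_length = 7
decoder_chunk_size = 8

chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length
chunk_stride = subsampling_rate * decoder_chunk_size
print(chunk_size, chunk_stride)   # 35 input frames per window, advancing by 32

max_len = 200
pad = (chunk_stride - (max_len - chunk_size) % chunk_stride) % chunk_stride
num_chunk = (max_len + pad - chunk_size) // chunk_stride + 1
print(pad, num_chunk)             # pad to a whole number of strides, then walk the chunks
```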
 class DeepSpeech2Model(nn.Layer):
     """The DeepSpeech2 network structure.

-    :param audio_data: Audio spectrogram data layer.
-    :type audio_data: Variable
-    :param text_data: Transcription text data layer.
-    :type text_data: Variable
+    :param audio: Audio spectrogram data layer.
+    :type audio: Variable
+    :param text: Transcription text data layer.
+    :type text: Variable
     :param audio_len: Valid sequence length data layer.
     :type audio_len: Variable
-    :param masks: Masks data layer to reset padding.
-    :type masks: Variable
+    :param feat_size: feature size for audio.
+    :type feat_size: int
     :param dict_size: Dictionary size for tokenized transcription.
     :type dict_size: int
     :param num_conv_layers: Number of stacking convolution layers.
@@ -106,37 +230,41 @@ class DeepSpeech2Model(nn.Layer):
     :type num_rnn_layers: int
     :param rnn_size: RNN layer size (dimension of RNN cells).
     :type rnn_size: int
+    :param num_fc_layers: Number of stacking FC layers.
+    :type num_fc_layers: int
+    :param fc_layers_size_list: The list of FC layer sizes.
+    :type fc_layers_size_list: [int,]
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
-    :param share_rnn_weights: Whether to share input-hidden weights between
-                              forward and backward direction RNNs.
-                              It is only available when use_gru=False.
-    :type share_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
              before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
     """
-    def __init__(self,
+    def __init__(
+            self,
             feat_size,
             dict_size,
             num_conv_layers=2,
-            num_rnn_layers=3,
+            num_rnn_layers=4,
             rnn_size=1024,
+            rnn_direction='forward',
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
             use_gru=False,
-            share_rnn_weights=True,
             blank_id=0,
-            ctc_grad_norm_type=None):
+            ctc_grad_norm_type=None, ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
             dict_size=dict_size,
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
+            rnn_direction=rnn_direction,
+            num_fc_layers=num_fc_layers,
+            fc_layers_size_list=fc_layers_size_list,
             rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-        assert (self.encoder.output_size == rnn_size * 2)
+            use_gru=use_gru)
self.decoder = CTCDecoder( self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab odim=dict_size, # <blank> is in vocab
@ -151,7 +279,7 @@ class DeepSpeech2Model(nn.Layer):
"""Compute Model loss """Compute Model loss
Args: Args:
audio (Tensors): [B, T, D] audio (Tensor): [B, T, D]
audio_len (Tensor): [B] audio_len (Tensor): [B]
text (Tensor): [B, U] text (Tensor): [B, U]
text_len (Tensor): [B] text_len (Tensor): [B]
@ -159,22 +287,22 @@ class DeepSpeech2Model(nn.Layer):
Returns: Returns:
loss (Tensor): [1] loss (Tensor): [1]
""" """
eouts, eouts_len = self.encoder(audio, audio_len) eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
loss = self.decoder(eouts, eouts_len, text, text_len) loss = self.decoder(eouts, eouts_len, text, text_len)
return loss return loss
@paddle.no_grad() @paddle.no_grad()
def decode(self, audio, audio_len): def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8 # decoders only accept string encoded in utf-8
# Make sure the decoder has been initialized # Make sure the decoder has been initialized
eouts, eouts_len = self.encoder(audio, audio_len) eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
probs = self.decoder.softmax(eouts) probs = self.decoder.softmax(eouts)
batch_size = probs.shape[0] batch_size = probs.shape[0]
self.decoder.reset_decoder(batch_size=batch_size) self.decoder.reset_decoder(batch_size=batch_size)
self.decoder.next(probs, eouts_len) self.decoder.next(probs, eouts_len)
trans_best, trans_beam = self.decoder.decode() trans_best, trans_beam = self.decoder.decode()
return trans_best return trans_best
@classmethod @classmethod
@ -196,13 +324,15 @@ class DeepSpeech2Model(nn.Layer):
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls( model = cls(
feat_size=dataloader.collate_fn.feature_size, feat_size=dataloader.feat_dim,
dict_size=dataloader.collate_fn.vocab_size, dict_size=dataloader.vocab_size,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
return model return model
@ -240,21 +372,37 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
def forward(self, audio, audio_len): def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
"""export model function chunk_state_c_box=None):
if self.encoder.rnn_direction == "forward":
Args: eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
audio (Tensor): [B, T, D] audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
audio_len (Tensor): [B] probs_chunk = self.decoder.softmax(eouts_chunk)
return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
Returns: elif self.encoder.rnn_direction == "bidirect":
probs: probs after softmax eouts, eouts_len, _, _ = self.encoder(audio_chunk, audio_chunk_lens)
"""
eouts, eouts_len = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts) probs = self.decoder.softmax(eouts)
return probs, eouts_len return probs, eouts_len
else:
raise Exception("wrong model type")
def export(self): def export(self):
if self.encoder.rnn_direction == "forward":
static_model = paddle.jit.to_static(
self,
input_spec=[
paddle.static.InputSpec(
shape=[None, None,
self.encoder.feat_size], #[B, chunk_size, feat_dim]
dtype='float32'),
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
elif self.encoder.rnn_direction == "bidirect":
static_model = paddle.jit.to_static( static_model = paddle.jit.to_static(
self, self,
input_spec=[ input_spec=[
@ -264,4 +412,6 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
paddle.static.InputSpec(shape=[None], paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B] dtype='int64'), # audio_length, [B]
]) ])
else:
raise Exception("wrong model type")
return static_model return static_model
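A minimal export sketch for the merged DeepSpeech2InferModel; `model` and the output prefix are hypothetical, and export() picks the forward- or bidirect-specific InputSpec shown above:

import paddle
# model: a trained DeepSpeech2InferModel instance (hypothetical)
model.eval()
static_model = model.export()
paddle.jit.save(static_model, "exp/deepspeech2/export")   # writes .pdmodel / .pdiparams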

@@ -1,315 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['RNNStack']
class RNNCell(nn.RNNCellBase):
r"""
Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
computes the outputs and updates states.
The formula used is as follows:
.. math::
h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
y_{t} & = h_{t}
where :math:`act` is for :attr:`activation`.
"""
def __init__(self,
hidden_size: int,
activation="tanh",
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
bias_hh_attr=None,
name=None):
super().__init__()
std = 1.0 / math.sqrt(hidden_size)
self.weight_hh = self.create_parameter(
(hidden_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std))
self.bias_ih = None
self.bias_hh = self.create_parameter(
(hidden_size, ),
bias_hh_attr,
is_bias=True,
default_initializer=I.Uniform(-std, std))
self.hidden_size = hidden_size
if activation not in ["tanh", "relu", "brelu"]:
raise ValueError(
"activation for SimpleRNNCell should be tanh or relu, "
"but get {}".format(activation))
self.activation = activation
self._activation_fn = paddle.tanh \
if activation == "tanh" \
else F.relu
if activation == 'brelu':
self._activation_fn = brelu
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)
pre_h = states
i2h = inputs
if self.bias_ih is not None:
i2h += self.bias_ih
h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
if self.bias_hh is not None:
h2h += self.bias_hh
h = self._activation_fn(i2h + h2h)
return h, h
@property
def state_shape(self):
return (self.hidden_size, )
class GRUCell(nn.RNNCellBase):
r"""
Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
it computes the outputs and updates states.
The formula for GRU used is as follows:
.. math::
r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
\widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
y_{t} & = h_{t}
where :math:`\sigma` is the sigmoid function, and * is the elementwise
multiplication operator.
"""
def __init__(self,
input_size: int,
hidden_size: int,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
bias_hh_attr=None,
name=None):
super().__init__()
std = 1.0 / math.sqrt(hidden_size)
self.weight_hh = self.create_parameter(
(3 * hidden_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std))
self.bias_ih = None
self.bias_hh = self.create_parameter(
(3 * hidden_size, ),
bias_hh_attr,
is_bias=True,
default_initializer=I.Uniform(-std, std))
self.hidden_size = hidden_size
self.input_size = input_size
self._gate_activation = F.sigmoid
self._activation = paddle.tanh
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)
pre_hidden = states
x_gates = inputs
if self.bias_ih is not None:
x_gates = x_gates + self.bias_ih
h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
if self.bias_hh is not None:
h_gates = h_gates + self.bias_hh
x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
r = self._gate_activation(x_r + h_r)
z = self._gate_activation(x_z + h_z)
c = self._activation(x_c + r * h_c) # apply reset gate after mm
h = (pre_hidden - c) * z + c
# https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
return h, h
@property
def state_shape(self):
r"""
The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
size would be automatically inserted into shape). The shape corresponds
to the shape of :math:`h_{t-1}`.
"""
return (self.hidden_size, )
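A minimal usage sketch for this GRUCell, assuming it is wrapped in paddle.nn.RNN the same way BiGRUWithBN below does; note the cell expects inputs already projected to 3 * hidden_size (the gate pre-activations), which is why the bidirectional wrapper applies a Linear + BatchNorm first:

import paddle
from paddle import nn

hidden = 4
cell = GRUCell(input_size=3 * hidden, hidden_size=hidden)
rnn = nn.RNN(cell, is_reverse=False, time_major=False)
x = paddle.randn([2, 5, 3 * hidden])   # [B, T, 3 * hidden] pre-projected gate inputs
y, final_h = rnn(x)                    # y: [2, 5, 4], final_h: [2, 4]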
class BiRNNWithBN(nn.Layer):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param size: Dimension of RNN cells.
:type size: int
:param share_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
:type share_weights: bool
:return: Bidirectional simple rnn layer.
:rtype: Variable
"""
def __init__(self, i_size: int, h_size: int, share_weights: bool):
super().__init__()
self.share_weights = share_weights
if self.share_weights:
#input-hidden weights shared between bi-directional rnn.
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
# batch norm is only performed on input-state projection
self.fw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.bw_fc = self.fw_fc
self.bw_bn = self.fw_bn
else:
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
self.fw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
self.bw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
self.fw_rnn = nn.RNN(
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
self.bw_rnn = nn.RNN(
self.bw_cell, is_reverse=True, time_major=False) #[B, T, D], backward pass uses the backward cell
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
x = paddle.concat([fw_x, bw_x], axis=-1)
return x, x_len
class BiGRUWithBN(nn.Layer):
"""Bidirectonal gru layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer.
:type name: string
:param input: Input layer.
:type input: Variable
:param size: Dimension of GRU cells.
:type size: int
:param act: Activation type.
:type act: string
:return: Bidirectional GRU layer.
:rtype: Variable
"""
def __init__(self, i_size: int, h_size: int):
super().__init__()
hidden_size = h_size * 3
self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
self.fw_bn = nn.BatchNorm1D(
hidden_size, bias_attr=None, data_format='NLC')
self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
self.bw_bn = nn.BatchNorm1D(
hidden_size, bias_attr=None, data_format='NLC')
self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
self.fw_rnn = nn.RNN(
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
self.bw_rnn = nn.RNN(
self.bw_cell, is_reverse=True, time_major=False) #[B, T, D], backward pass uses the backward cell
def forward(self, x, x_len):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
x = paddle.concat([fw_x, bw_x], axis=-1)
return x, x_len
class RNNStack(nn.Layer):
"""RNN group with stacked bidirectional simple RNN or GRU layers.
:param input: Input layer.
:type input: Variable
:param size: Dimension of RNN cells in each layer.
:type size: int
:param num_stacks: Number of stacked rnn layers.
:type num_stacks: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
It is only available when use_gru=False.
:type share_weights: bool
:return: Output layer of the RNN group.
:rtype: Variable
"""
def __init__(self,
i_size: int,
h_size: int,
num_stacks: int,
use_gru: bool,
share_rnn_weights: bool):
super().__init__()
rnn_stacks = []
for i in range(num_stacks):
if use_gru:
#default:GRU using tanh
rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
else:
rnn_stacks.append(
BiRNNWithBN(
i_size=i_size,
h_size=h_size,
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
x: shape [B, T, D]
x_len: shape [B]
"""
for i, rnn in enumerate(self.rnn_stacks):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
return x, x_len
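A small usage sketch for RNNStack with hypothetical sizes; every stack is bidirectional, so the output feature dimension is 2 * h_size, and padded frames are zeroed by the mask at the end of forward:

import paddle

stack = RNNStack(i_size=1248, h_size=1024, num_stacks=3,
                 use_gru=False, share_rnn_weights=True)
x = paddle.randn([4, 100, 1248])                           # [B, T, D]
x_len = paddle.to_tensor([100, 80, 60, 40], dtype='int64')
y, y_len = stack(x, x_len)                                 # y: [4, 100, 2048]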

@@ -1,31 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
import sys
try:
import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
if sys.platform != "win32":
dynamic_pip_install.install(package_name)
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
The DeepSpeech2 model is not supported for your system")
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']

@@ -1,33 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
class Conv2dSubsampling4Online(Conv2dSubsampling4):
def __init__(self, idim: int, odim: int, dropout_rate: float):
super().__init__(idim, odim, dropout_rate, None)
self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
self.receptive_field_length = 2 * (
3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1
def forward(self, x: paddle.Tensor,
x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
x = x.unsqueeze(1) # (b, c=1, t, f)
x = self.conv(x)
#b, c, t, f = paddle.shape(x) #not work under jit
x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
x_len = ((x_len - 1) // 2 - 1) // 2
return x, x_len
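The subsampling arithmetic above can be checked directly; a quick sketch assuming the 161-dim input used by the DeepSpeech2 configs:

# shape check for Conv2dSubsampling4Online
idim, odim = 161, 32
out_feat_dim = ((idim - 1) // 2 - 1) // 2 * odim   # 39 * 32 = 1248
T = 67                                             # hypothetical input frame count
T_out = ((T - 1) // 2 - 1) // 2                    # 16 frames after two stride-2 convs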

@@ -1,397 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Online Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
class CRNNEncoder(nn.Layer):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
super().__init__()
self.rnn_size = rnn_size
self.feat_size = feat_size # 161 for linear
self.dict_size = dict_size
self.num_rnn_layers = num_rnn_layers
self.num_fc_layers = num_fc_layers
self.rnn_direction = rnn_direction
self.fc_layers_size_list = fc_layers_size_list
self.use_gru = use_gru
self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
self.output_dim = self.conv.output_dim
i_size = self.conv.output_dim
self.rnn = nn.LayerList()
self.layernorm_list = nn.LayerList()
self.fc_layers_list = nn.LayerList()
if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
layernorm_size = 2 * rnn_size
elif rnn_direction == 'forward':
layernorm_size = rnn_size
else:
raise Exception("Wrong rnn direction")
for i in range(0, num_rnn_layers):
if i == 0:
rnn_input_size = i_size
else:
rnn_input_size = layernorm_size
if use_gru is True:
self.rnn.append(
nn.GRU(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
else:
self.rnn.append(
nn.LSTM(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
self.layernorm_list.append(nn.LayerNorm(layernorm_size))
self.output_dim = layernorm_size
fc_input_size = layernorm_size
for i in range(self.num_fc_layers):
self.fc_layers_list.append(
nn.Linear(fc_input_size, fc_layers_size_list[i]))
fc_input_size = fc_layers_size_list[i]
self.output_dim = fc_layers_size_list[i]
@property
def output_size(self):
return self.output_dim
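For the default arguments above (forward direction, fc_layers_size_list=[512, 256]), output_size works out as follows; a quick check, not part of the original code:

conv_out = ((161 - 1) // 2 - 1) // 2 * 32              # 1248, from Conv2dSubsampling4Online
layernorm_size = 1024                                   # rnn_size for 'forward'; 2 * rnn_size for 'bidirect'
fc_layers_size_list = [512, 256]
output_size = fc_layers_size_list[-1] if fc_layers_size_list else layernorm_size   # 256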
def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
"""Compute Encoder outputs
Args:
x (Tensor): [B, T, D]
x_lens (Tensor): [B]
init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
Return:
x (Tensor): encoder outputs, [B, T, D]
x_lens (Tensor): encoder length, [B]
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
if init_state_h_box is not None:
init_state_list = None
if self.use_gru is True:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_list = init_state_h_list
else:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_c_list = paddle.split(
init_state_c_box, self.num_rnn_layers, axis=0)
init_state_list = [(init_state_h_list[i], init_state_c_list[i])
for i in range(self.num_rnn_layers)]
else:
init_state_list = [None] * self.num_rnn_layers
x, x_lens = self.conv(x, x_lens)
final_chunk_state_list = []
for i in range(0, self.num_rnn_layers):
x, final_state = self.rnn[i](x, init_state_list[i],
x_lens) #[B, T, D]
final_chunk_state_list.append(final_state)
x = self.layernorm_list[i](x)
for i in range(self.num_fc_layers):
x = self.fc_layers_list[i](x)
x = F.relu(x)
if self.use_gru is True:
final_chunk_state_h_box = paddle.concat(
final_chunk_state_list, axis=0)
final_chunk_state_c_box = init_state_c_box
else:
final_chunk_state_h_list = [
final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
]
final_chunk_state_c_list = [
final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
]
final_chunk_state_h_box = paddle.concat(
final_chunk_state_h_list, axis=0)
final_chunk_state_c_box = paddle.concat(
final_chunk_state_c_list, axis=0)
return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box
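The state boxes have a fixed layout, which is what makes the encoder streamable between chunks; a sketch with a hypothetical batch size, assuming LSTM layers (use_gru=False) and forward direction:

import paddle

num_rnn_layers, num_directions, rnn_size, B = 4, 1, 1024, 8
h_box = paddle.zeros([num_rnn_layers * num_directions, B, rnn_size])
c_box = paddle.zeros([num_rnn_layers * num_directions, B, rnn_size])
# per-layer initial states are recovered with paddle.split along axis 0
h_list = paddle.split(h_box, num_rnn_layers, axis=0)
c_list = paddle.split(c_box, num_rnn_layers, axis=0)
init_state_list = list(zip(h_list, c_list))   # one (h, c) tuple per LSTM layer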
def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
"""Compute Encoder outputs
Args:
x (Tensor): [B, T, D]
x_lens (Tensor): [B]
decoder_chunk_size: The chunk size of decoder
Returns:
eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
subsampling_rate = self.conv.subsampling_rate
receptive_field_length = self.conv.receptive_field_length
chunk_size = (decoder_chunk_size - 1
) * subsampling_rate + receptive_field_length
chunk_stride = subsampling_rate * decoder_chunk_size
max_len = x.shape[1]
assert (chunk_size <= max_len)
eouts_chunk_list = []
eouts_chunk_lens_list = []
if (max_len - chunk_size) % chunk_stride != 0:
padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
else:
padding_len = 0
padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
padded_x = paddle.concat([x, padding], axis=1)
num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
num_chunk = int(num_chunk)
chunk_state_h_box = None
chunk_state_c_box = None
final_state_h_box = None
final_state_c_box = None
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
x_chunk = padded_x[:, start:end, :]
x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
paddle.zeros_like(x_lens),
x_lens - i * chunk_stride)
x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
x_len_left, x_chunk_len_tmp)
eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
eouts_chunk_list.append(eouts_chunk)
eouts_chunk_lens_list.append(eouts_chunk_lens)
final_state_h_box = chunk_state_h_box
final_state_c_box = chunk_state_c_box
return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
class DeepSpeech2ModelOnline(nn.Layer):
"""The DeepSpeech2 network structure for online.
:param audio: Audio spectrogram data layer.
:type audio: Variable
:param text: Transcription text data layer.
:type text: Variable
:param audio_len: Valid sequence length data layer.
:type audio_len: Variable
:param feat_size: feature size for audio.
:type feat_size: int
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (dimension of RNN cells).
:type rnn_size: int
:param num_fc_layers: Number of stacking FC layers.
:type num_fc_layers: int
:param fc_layers_size_list: The list of FC layer sizes.
:type fc_layers_size_list: [int,]
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
def __init__(
self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0,
ctc_grad_norm_type=None, ):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
rnn_size=rnn_size,
use_gru=use_gru)
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
Args:
audio (Tensor): [B, T, D]
audio_len (Tensor): [B]
text (Tensor): [B, U]
text_len (Tensor): [B]
Returns:
loss (Tensor): [1]
"""
eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
loss = self.decoder(eouts, eouts_len, text, text_len)
return loss
@paddle.no_grad()
def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8
# Make sure the decoder has been initialized
eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
probs = self.decoder.softmax(eouts)
batch_size = probs.shape[0]
self.decoder.reset_decoder(batch_size=batch_size)
self.decoder.next(probs, eouts_len)
trans_best, trans_beam = self.decoder.decode()
return trans_best
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model.
Parameters
----------
dataloader: paddle.io.DataLoader
config: yacs.config.CfgNode
model configs
checkpoint_path: Path or str
the path of pretrained model checkpoint, without extension name
Returns
-------
DeepSpeech2ModelOnline
The model built from pretrained result.
"""
model = cls(
feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
layer_tools.summary(model)
return model
@classmethod
def from_config(cls, config):
"""Build a DeepSpeec2ModelOnline from config
Parameters
config: yacs.config.CfgNode
config
Returns
-------
DeepSpeech2ModelOnline
The model built from config.
"""
model = cls(
feat_size=config.input_dim,
dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
return model
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box):
eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
probs_chunk = self.decoder.softmax(eouts_chunk)
return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
def export(self):
static_model = paddle.jit.to_static(
self,
input_spec=[
paddle.static.InputSpec(
shape=[None, None,
self.encoder.feat_size], #[B, chunk_size, feat_dim]
dtype='float32'),
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
return static_model

@@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.frontend.speech import SpeechSegment
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
@@ -66,10 +65,13 @@ class PaddleASRConnectionHanddler:
        self.text_feature = self.asr_engine.executor.text_feature
        if "deepspeech2" in self.model_type:
-            from paddlespeech.s2t.io.collator import SpeechCollator
            self.am_predictor = self.asr_engine.executor.am_predictor
-            self.collate_fn_test = SpeechCollator.from_config(self.model_config)
+            # extract feat, new only fbank in conformer model
+            self.preprocess_conf = self.model_config.preprocess_config
+            self.preprocess_args = {"train": False}
+            self.preprocessing = Transformation(self.preprocess_conf)
            self.decoder = CTCDecoder(
                odim=self.model_config.output_dim,  # <blank> is in vocab
                enc_n_units=self.model_config.rnn_layer_size * 2,
@@ -89,10 +91,8 @@ class PaddleASRConnectionHanddler:
                cfg.num_proc_bsearch)
            # frame window and frame shift, in samples unit
-            self.win_length = int(self.model_config.window_ms / 1000 *
-                                  self.sample_rate)
-            self.n_shift = int(self.model_config.stride_ms / 1000 *
-                               self.sample_rate)
+            self.win_length = self.preprocess_conf.process[0]['win_length']
+            self.n_shift = self.preprocess_conf.process[0]['n_shift']
        elif "conformer" in self.model_type or "transformer" in self.model_type:
            # acoustic model
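With this change, win_length and n_shift are read in samples from the first entry of the preprocess config rather than recomputed from window_ms / stride_ms; a sketch assuming a typical conf/preprocess.yaml (the field values below are illustrative, not taken from the repo):

import yaml

# hypothetical conf/preprocess.yaml:
# process:
#   - type: fbank_kaldi
#     fs: 16000
#     n_shift: 160      # 10 ms frame shift at 16 kHz
#     win_length: 400   # 25 ms frame window at 16 kHz
with open("conf/preprocess.yaml") as f:
    preprocess_conf = yaml.safe_load(f)
win_length = preprocess_conf["process"][0]["win_length"]   # samples, not milliseconds
n_shift = preprocess_conf["process"][0]["n_shift"]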
@@ -123,11 +123,6 @@ class PaddleASRConnectionHanddler:
        samples = np.frombuffer(samples, dtype=np.int16)
        assert samples.ndim == 1
-        # pcm16 -> pcm 32
-        # pcm2float will change the orignal samples,
-        # so we shoule do pcm2float before concatenate
-        samples = pcm2float(samples)
        if self.remained_wav is None:
            self.remained_wav = samples
        else:
@@ -137,26 +132,11 @@ class PaddleASRConnectionHanddler:
            f"The connection remain the audio samples: {self.remained_wav.shape}"
        )
-        # read audio
-        speech_segment = SpeechSegment.from_pcm(
-            self.remained_wav, self.sample_rate, transcript=" ")
-        # audio augment
-        self.collate_fn_test.augmentation.transform_audio(speech_segment)
-        # extract speech feature
-        spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
-            speech_segment, self.collate_fn_test.keep_transcription_text)
-        # CMVN spectrum
-        if self.collate_fn_test._normalizer:
-            spectrum = self.collate_fn_test._normalizer.apply(spectrum)
-        # spectrum augment
-        feat = self.collate_fn_test.augmentation.transform_feature(spectrum)
-        # audio_len is frame num
-        frame_num = feat.shape[0]
-        feat = paddle.to_tensor(feat, dtype='float32')
-        feat = paddle.unsqueeze(feat, axis=0)
+        # fbank
+        feat = self.preprocessing(self.remained_wav,
+                                  **self.preprocess_args)
+        feat = paddle.to_tensor(
+            feat, dtype="float32").unsqueeze(axis=0)
        if self.cached_feat is None:
            self.cached_feat = feat
@@ -170,8 +150,11 @@ class PaddleASRConnectionHanddler:
        if self.device is None:
            self.device = self.cached_feat.place
-        self.num_frames += frame_num
-        self.remained_wav = self.remained_wav[self.n_shift * frame_num:]
+        # cur frame step
+        num_frames = feat.shape[1]
+        self.num_frames += num_frames
+        self.remained_wav = self.remained_wav[self.n_shift * num_frames:]
        logger.info(
            f"process the audio feature success, the connection feat shape: {self.cached_feat.shape}"
@@ -752,16 +735,19 @@ class ASRServerExecutor(ASRExecutor):
        self.config = CfgNode(new_allowed=True)
        self.config.merge_from_file(self.cfg_path)
+        if self.config.spm_model_prefix:
+            self.config.spm_model_prefix = os.path.join(
+                self.res_path, self.config.spm_model_prefix)
+        self.text_feature = TextFeaturizer(
+            unit_type=self.config.unit_type,
+            vocab=self.config.vocab_filepath,
+            spm_model_prefix=self.config.spm_model_prefix)
+        self.vocab = self.config.vocab_filepath
        with UpdateConfig(self.config):
            if "deepspeech2" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
-                self.vocab = self.config.vocab_filepath
                self.config.decode.lang_model_path = os.path.join(
                    MODEL_HOME, 'language_model',
                    self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)
                lm_url = self.task_resource.res_dict['lm_url']
                lm_md5 = self.task_resource.res_dict['lm_md5']
@@ -772,14 +758,6 @@ class ASRServerExecutor(ASRExecutor):
            elif "conformer" in model_type or "transformer" in model_type:
                logger.info("start to create the stream conformer asr engine")
-                if self.config.spm_model_prefix:
-                    self.config.spm_model_prefix = os.path.join(
-                        self.res_path, self.config.spm_model_prefix)
-                self.vocab = self.config.vocab_filepath
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)
                # update the decoding method
                if decode_method:
                    self.config.decode.decoding_method = decode_method
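With this hunk the tokenizer is constructed once, before the model-type branches; a small sketch with hypothetical paths, assuming TextFeaturizer's featurize/defeaturize helpers behave as usual:

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer

text_feature = TextFeaturizer(
    unit_type='char',                       # from the model yaml (unit_type)
    vocab='data/lang_char/vocab.txt',       # hypothetical local vocab_filepath
    spm_model_prefix='')                    # empty when no sentencepiece model is used
ids = text_feature.featurize("some transcript")   # text -> token ids
back = text_feature.defeaturize(ids)              # token ids -> text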

@@ -54,6 +54,7 @@ class ASRServerExecutor(ASRExecutor):
        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
        tag = model_type + '-' + lang + '-' + sample_rate_str
+        self.max_len = 50
        self.task_resource.set_task_model(model_tag=tag)
        if cfg_path is None or am_model is None or am_params is None:
            self.res_path = self.task_resource.res_dir
@@ -80,22 +81,24 @@ class ASRServerExecutor(ASRExecutor):
        self.config.merge_from_file(self.cfg_path)
        with UpdateConfig(self.config):
-            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
+            if "deepspeech2" in model_type:
                self.vocab = self.config.vocab_filepath
+                if self.config.spm_model_prefix:
+                    self.config.spm_model_prefix = os.path.join(
+                        self.res_path, self.config.spm_model_prefix)
+                self.text_feature = TextFeaturizer(
+                    unit_type=self.config.unit_type, vocab=self.vocab,
+                    spm_model_prefix=self.config.spm_model_prefix)
                self.config.decode.lang_model_path = os.path.join(
                    MODEL_HOME, 'language_model',
                    self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)
                lm_url = self.task_resource.res_dict['lm_url']
                lm_md5 = self.task_resource.res_dict['lm_md5']
                self.download_lm(
                    lm_url,
                    os.path.dirname(self.config.decode.lang_model_path), lm_md5)
-            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
+            elif "conformer" in model_type or "transformer" in model_type:
                raise Exception("wrong type")
            else:
                raise Exception("wrong type")
@@ -125,7 +128,7 @@ class ASRServerExecutor(ASRExecutor):
        cfg = self.config.decode
        audio = self._inputs["audio"]
        audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
            decode_batch_size = audio.shape[0]
            # init once
            self.decoder.init_decoder(
