diff --git a/README.md b/README.md index 931e6331c..71bc63638 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[中文版](README_cn.md) - # PaddlePaddle Speech to Any toolkit ![License](https://img.shields.io/badge/license-Apache%202-red.svg) @@ -11,7 +9,7 @@ ## Features - See [feature list](doc/src/feature_list.md) for more information. + See [feature list](docs/src/feature_list.md) for more information. ## Setup @@ -20,20 +18,20 @@ All tested under: * python>=3.7 * paddlepaddle>=2.2.0rc -Please see [install](doc/src/install.md). +Please see [install](docs/src/install.md). ## Getting Started -Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md). +Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md). ## More Information -* [Data Prepration](doc/src/data_preparation.md) -* [Data Augmentation](doc/src/augmentation.md) -* [Ngram LM](doc/src/ngram_lm.md) -* [Benchmark](doc/src/benchmark.md) -* [Relased Model](doc/src/released_model.md) +* [Data Prepration](docs/src/data_preparation.md) +* [Data Augmentation](docs/src/augmentation.md) +* [Ngram LM](docs/src/ngram_lm.md) +* [Benchmark](docs/src/benchmark.md) +* [Relased Model](docs/src/released_model.md) ## Questions and Help @@ -47,4 +45,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE). ## Acknowledgement -We depends on many open source repos. See [References](doc/src/reference.md) for more information. +We depends on many open source repos. See [References](docs/src/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md deleted file mode 100644 index cc993f8bf..000000000 --- a/README_cn.md +++ /dev/null @@ -1,49 +0,0 @@ -[English](README.md) - -# PaddlePaddle Speech to Any toolkit - -![License](https://img.shields.io/badge/license-Apache%202-red.svg) -![python version](https://img.shields.io/badge/python-3.7+-orange.svg) -![support os](https://img.shields.io/badge/os-linux-yellow.svg) - -*DeepSpeech*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别引擎的开源项目, -我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效、小型化和可扩展的工具,包括训练,推理,以及 部署。 - -## 特性 - - 参看 [特性列表](doc/src/feature_list.md)。 - - -## 安装 - -在以下环境测试验证过: - -* Ubuntu 16.04 -* python>=3.7 -* paddlepaddle>=2.2.0rc - -参看 [安装](doc/src/install.md)。 - -## 开始 - -请查看 [开始](doc/src/getting_started.md) 和 [tiny egs](examples/tiny/s0/README.md)。 - -## 更多信息 - -* [数据处理](doc/src/data_preparation.md) -* [数据增强](doc/src/augmentation.md) -* [语言模型](doc/src/ngram_lm.md) -* [Benchmark](doc/src/benchmark.md) -* [Relased Model](doc/src/released_model.md) - -## 问题和帮助 - -欢迎您在[Github讨论](https://github.com/PaddlePaddle/DeepSpeech/discussions)提交问题,[Github问题](https://github.com/PaddlePaddle/models/issues)中反馈bug。也欢迎您为这个项目做出贡献。 - -## License - -DeepSpeech 遵循[Apache-2.0开源协议](./LICENSE)。 - -## 感谢 - -开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。 diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py deleted file mode 100644 index 94a9b6c47..000000000 --- a/deepspeech/exps/deepspeech2/bin/tune.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Beam search parameters tuning for DeepSpeech2 model.""" -import functools -import sys - -import numpy as np -from paddle.io import DataLoader - -from deepspeech.exps.deepspeech2.config import get_cfg_defaults -from deepspeech.io.collator import SpeechCollator -from deepspeech.io.dataset import ManifestDataset -from deepspeech.models.ds2 import DeepSpeech2Model -from deepspeech.training.cli import default_argument_parser -from deepspeech.utils import error_rate -from deepspeech.utils.utility import add_arguments -from deepspeech.utils.utility import print_arguments - - -def tune(config, args): - """Tune parameters alpha and beta incrementally.""" - if not args.num_alphas >= 0: - raise ValueError("num_alphas must be non-negative!") - if not args.num_betas >= 0: - raise ValueError("num_betas must be non-negative!") - config.defrost() - config.data.manfiest = config.data.dev_manifest - config.data.augmentation_config = "" - config.data.keep_transcription_text = True - dev_dataset = ManifestDataset.from_config(config) - - valid_loader = DataLoader( - dev_dataset, - batch_size=config.data.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator(keep_transcription_text=True)) - - model = DeepSpeech2Model.from_pretrained(valid_loader, config, - args.checkpoint_path) - model.eval() - - # decoders only accept string encoded in utf-8 - vocab_list = valid_loader.dataset.vocab_list - errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors - - # create grid for search - cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) - cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) - params_grid = [(alpha, beta) for alpha in cand_alphas - for beta in cand_betas] - - err_sum = [0.0 for i in range(len(params_grid))] - err_ave = [0.0 for i in range(len(params_grid))] - - num_ins, len_refs, cur_batch = 0, 0, 0 - # initialize external scorer - model.decoder.init_decode(args.alpha_from, args.beta_from, - config.decoding.lang_model_path, vocab_list, - config.decoding.decoding_method) - ## incremental tuning parameters over multiple batches - print("start tuning ...") - for infer_data in valid_loader(): - if (args.num_batches >= 0) and (cur_batch >= args.num_batches): - break - - def ordid2token(texts, texts_len): - """ ord() id to chr() chr """ - trans = [] - for text, n in zip(texts, texts_len): - n = n.numpy().item() - ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) - return trans - - audio, audio_len, text, text_len = infer_data - target_transcripts = ordid2token(text, text_len) - num_ins += audio.shape[0] - - # model infer - eouts, eouts_len = model.encoder(audio, audio_len) - probs = model.decoder.softmax(eouts) - - # grid search - for index, (alpha, beta) in enumerate(params_grid): - print(f"tuneing: alpha={alpha} beta={beta}") - result_transcripts = model.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, - config.decoding.decoding_method, - config.decoding.lang_model_path, alpha, beta, - config.decoding.beam_size, config.decoding.cutoff_prob, - 
config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch) - - for target, result in zip(target_transcripts, result_transcripts): - errors, len_ref = errors_func(target, result) - err_sum[index] += errors - - # accumulate the length of references of every batchπ - # in the first iteration - if args.alpha_from == alpha and args.beta_from == beta: - len_refs += len_ref - - err_ave[index] = err_sum[index] / len_refs - if index % 2 == 0: - sys.stdout.write('.') - sys.stdout.flush() - print("tuneing: one grid done!") - - # output on-line tuning result at the end of current batch - err_ave_min = min(err_ave) - min_index = err_ave.index(err_ave_min) - print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), " - " min [%s] = %f" % - (cur_batch, num_ins, "%.3f" % params_grid[min_index][0], - "%.3f" % params_grid[min_index][1], - config.decoding.error_rate_type, err_ave_min)) - cur_batch += 1 - - # output WER/CER at every (alpha, beta) - print("\nFinal %s:\n" % config.decoding.error_rate_type) - for index in range(len(params_grid)): - print("(alpha, beta) = (%s, %s), [%s] = %f" % - ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1], - config.decoding.error_rate_type, err_ave[index])) - - err_ave_min = min(err_ave) - min_index = err_ave.index(err_ave_min) - print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" % - (cur_batch, "%.3f" % params_grid[min_index][0], - "%.3f" % params_grid[min_index][1])) - - print("finish tuning") - - -def main(config, args): - tune(config, args) - - -if __name__ == "__main__": - parser = default_argument_parser() - add_arg = functools.partial(add_arguments, argparser=parser) - add_arg('num_batches', int, -1, "# of batches tuning on. " - "Default -1, on whole dev set.") - add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.") - add_arg('num_betas', int, 8, "# of beta candidates for tuning.") - add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.") - add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.") - add_arg('beta_from', float, 0.1, "Where beta starts tuning from.") - add_arg('beta_to', float, 0.45, "Where beta ends tuning with.") - - add_arg('batch_size', int, 256, "# of samples per batch.") - add_arg('beam_size', int, 500, "Beam search width.") - add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") - add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") - add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") - - args = parser.parse_args() - print_arguments(args, globals()) - - # https://yaml.org/type/float.html - config = get_cfg_defaults() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - - config.data.batch_size = args.batch_size - config.decoding.beam_size = args.beam_size - config.decoding.num_proc_bsearch = args.num_proc_bsearch - config.decoding.cutoff_prob = args.cutoff_prob - config.decoding.cutoff_top_n = args.cutoff_top_n - - config.freeze() - print(config) - - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) - - main(config, args) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index fbc357ca0..7bf029300 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -35,12 +35,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from 
deepspeech.training.gradclip import ClipGradByGlobalNormWithLog +from deepspeech.training.reporter import report from deepspeech.training.trainer import Trainer from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils.log import Autolog from deepspeech.utils.log import Log +from deepspeech.utils.utility import UpdateConfig logger = Log(__name__).getlog() @@ -66,7 +68,9 @@ class DeepSpeech2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + batch_size = self.config.collator.batch_size + accum_grad = self.config.training.accum_grad + start = time.time() # forward @@ -77,7 +81,7 @@ class DeepSpeech2Trainer(Trainer): } # loss backward - if (batch_index + 1) % train_conf.accum_grad != 0: + if (batch_index + 1) % accum_grad != 0: # Disable gradient synchronizations across DDP processes. # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. @@ -92,19 +96,18 @@ class DeepSpeech2Trainer(Trainer): layer_tools.print_grads(self.model, print_func=None) # optimizer step - if (batch_index + 1) % train_conf.accum_grad == 0: + if (batch_index + 1) % accum_grad == 0: self.optimizer.step() self.optimizer.clear_grad() self.iteration += 1 iteration_time = time.time() - start - msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) - msg += "accum: {}, ".format(train_conf.accum_grad) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_np.items()) - logger.info(msg) + for k, v in losses_np.items(): + report(k, v) + report("batch_size", batch_size) + report("accum", accum_grad) + report("step_cost", iteration_time) if dist.get_rank() == 0 and self.visualizer: for k, v in losses_np.items(): @@ -147,10 +150,9 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() - config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.freeze() + with UpdateConfig(config): + config.model.feat_size = self.train_loader.collate_fn.feature_size + config.model.dict_size = self.train_loader.collate_fn.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index a7f4f14d9..ce3d17cc2 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -17,6 +17,7 @@ import os import sys import time from collections import defaultdict +from collections import OrderedDict from contextlib import nullcontext from pathlib import Path from typing import Optional @@ -33,6 +34,8 @@ from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2 import U2Model from deepspeech.training.optimizer import OptimizerFactory +from deepspeech.training.reporter import ObsScope +from deepspeech.training.reporter import report from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.timer import Timer from deepspeech.training.trainer import Trainer @@ -43,6 +46,7 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log +from deepspeech.utils.utility import UpdateConfig logger = 
Log(__name__).getlog() @@ -100,7 +104,8 @@ class U2Trainer(Trainer): # Disable gradient synchronizations across DDP processes. # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. - context = self.model.no_sync + # When using cpu w/o DDP, model does not have `no_sync` + context = self.model.no_sync if self.parallel else nullcontext else: # Used for single gpu training and DDP gradient synchronization # processes. @@ -119,12 +124,11 @@ class U2Trainer(Trainer): iteration_time = time.time() - start if (batch_index + 1) % train_conf.log_interval == 0: - msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) - msg += "accum: {}, ".format(train_conf.accum_grad) - msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_np.items()) - logger.info(msg) + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.collator.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) if dist.get_rank() == 0 and self.visualizer: losses_np_v = losses_np.copy() @@ -197,15 +201,29 @@ class U2Trainer(Trainer): data_start_time = time.time() for batch_index, batch in enumerate(self.train_loader): dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - self.after_train_batch() + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report('step/total', + (batch_index + 1) / len(self.train_loader)) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips[sent./sec]'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += "," + logger.info(msg) data_start_time = time.time() except Exception as e: logger.error(e) @@ -314,10 +332,11 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config model_conf = config.model - model_conf.defrost() - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size - model_conf.freeze() + + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size + model = U2Model.from_config(model_conf) if self.parallel: @@ -560,7 +579,7 @@ class U2Tester(U2Trainer): # 1. 
Encoder encoder_out, encoder_mask = self.model._forward_encoder( feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) + maxlen = encoder_out.shape[1] ctc_probs = self.model.ctc.log_softmax( encoder_out) # (1, maxlen, vocab_size) diff --git a/deepspeech/exps/u2/trainer.py b/deepspeech/exps/u2/trainer.py index fa3e6d9d7..8e8634ac3 100644 --- a/deepspeech/exps/u2/trainer.py +++ b/deepspeech/exps/u2/trainer.py @@ -32,6 +32,7 @@ from deepspeech.training.trainer import Trainer from deepspeech.training.updaters.trainer import Trainer as NewTrainer from deepspeech.utils import layer_tools from deepspeech.utils.log import Log +from deepspeech.utils.utility import UpdateConfig logger = Log(__name__).getlog() @@ -121,10 +122,10 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config model_conf = config.model - model_conf.defrost() - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size - model_conf.freeze() + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size + model = U2Model.from_config(model_conf) if self.parallel: diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 1dbdfef85..116ab2808 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -41,6 +41,7 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log +from deepspeech.utils.utility import UpdateConfig logger = Log(__name__).getlog() @@ -319,10 +320,10 @@ class U2Trainer(Trainer): # model model_conf = config.model - model_conf.defrost() - model_conf.input_dim = self.train_loader.feat_dim - model_conf.output_dim = self.train_loader.vocab_size - model_conf.freeze() + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + model = U2Model.from_config(model_conf) if self.parallel: model = paddle.DataParallel(model) @@ -555,7 +556,7 @@ class U2Tester(U2Trainer): # 1. Encoder encoder_out, encoder_mask = self.model._forward_encoder( feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) + maxlen = encoder_out.shape[1] ctc_probs = self.model.ctc.log_softmax( encoder_out) # (1, maxlen, vocab_size) diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 364070d23..eb84d6f11 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -47,6 +47,7 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log +from deepspeech.utils.utility import UpdateConfig logger = Log(__name__).getlog() @@ -345,10 +346,10 @@ class U2STTrainer(Trainer): def setup_model(self): config = self.config model_conf = config.model - model_conf.defrost() - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size - model_conf.freeze() + with UpdateConfig(model_conf): + model_conf.input_dim = self.train_loader.collate_fn.feature_size + model_conf.output_dim = self.train_loader.collate_fn.vocab_size + model = U2STModel.from_config(model_conf) if self.parallel: @@ -587,7 +588,7 @@ class U2STTester(U2STTrainer): # 1. 
Encoder encoder_out, encoder_mask = self.model._forward_encoder( feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) + maxlen = encoder_out.shape[1] ctc_probs = self.model.ctc.log_softmax( encoder_out) # (1, maxlen, vocab_size) diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index d1fe04707..e58e03b4e 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -76,19 +76,19 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - max_input_len ([type], optional): maximum output seq length, + max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). - min_input_len (float, optional): minimum input seq length, + min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. - max_output_len (float, optional): maximum input seq length, + max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. - min_output_len (float, optional): minimum input seq length, + min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. - max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. + max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. - + """ super().__init__() diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py index fd8f15471..46bbd102f 100644 --- a/deepspeech/models/u2/u2.py +++ b/deepspeech/models/u2/u2.py @@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos from deepspeech.utils.tensor_utils import pad_sequence from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.utility import log_add +from deepspeech.utils.utility import UpdateConfig __all__ = ["U2Model", "U2InferModel"] @@ -297,8 +298,8 @@ class U2BaseModel(nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) + maxlen = encoder_out.shape[1] + encoder_dim = encoder_out.shape[2] running_size = batch_size * beam_size encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) @@ -403,7 +404,7 @@ class U2BaseModel(nn.Layer): encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) - maxlen = encoder_out.size(1) + maxlen = encoder_out.shape[1] encoder_out_lens = encoder_mask.squeeze(1).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) @@ -454,7 +455,7 @@ class U2BaseModel(nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) + maxlen = encoder_out.shape[1] ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size) ctc_probs = ctc_probs.squeeze(0) @@ -582,7 +583,7 @@ class U2BaseModel(nn.Layer): encoder_out = encoder_out.repeat(beam_size, 1, 1) encoder_mask = paddle.ones( - (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool) + (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool) decoder_out, _ = self.decoder( encoder_out, 
encoder_mask, hyps_pad, hyps_lens) # (beam_size, max_hyps_len, vocab_size) @@ -689,13 +690,13 @@ class U2BaseModel(nn.Layer): Returns: paddle.Tensor: decoder output, (B, L) """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps + assert encoder_out.shape[0] == 1 + num_hyps = hyps.shape[0] + assert hyps_lens.shape[0] == num_hyps encoder_out = encoder_out.repeat(num_hyps, 1, 1) # (B, 1, T) encoder_mask = paddle.ones( - [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) + [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool) # (num_hyps, max_hyps_len, vocab_size) decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, hyps_lens) @@ -750,7 +751,7 @@ class U2BaseModel(nn.Layer): Returns: List[List[int]]: transcripts. """ - batch_size = feats.size(0) + batch_size = feats.shape[0] if decoding_method in ['ctc_prefix_beam_search', 'attention_rescoring'] and batch_size > 1: logger.fatal( @@ -778,7 +779,7 @@ class U2BaseModel(nn.Layer): # result in List[int], change it to List[List[int]] for compatible # with other batch decoding mode elif decoding_method == 'ctc_prefix_beam_search': - assert feats.size(0) == 1 + assert feats.shape[0] == 1 hyp = self.ctc_prefix_beam_search( feats, feats_lengths, @@ -788,7 +789,7 @@ class U2BaseModel(nn.Layer): simulate_streaming=simulate_streaming) hyps = [hyp] elif decoding_method == 'attention_rescoring': - assert feats.size(0) == 1 + assert feats.shape[0] == 1 hyp = self.attention_rescoring( feats, feats_lengths, @@ -903,10 +904,10 @@ class U2Model(U2BaseModel): Returns: DeepSpeech2Model: The model built from pretrained result. """ - config.defrost() - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size - config.freeze() + with UpdateConfig(config): + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size + model = cls.from_config(config) if checkpoint_path: diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index 6737a549d..a3d99942f 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools from deepspeech.utils.log import Log from deepspeech.utils.tensor_utils import add_sos_eos from deepspeech.utils.tensor_utils import th_accuracy +from deepspeech.utils.utility import UpdateConfig __all__ = ["U2STModel", "U2STInferModel"] @@ -339,8 +340,8 @@ class U2STBaseModel(nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.size(1) - encoder_dim = encoder_out.size(2) + maxlen = encoder_out.shape[1] + encoder_dim = encoder_out.shape[2] running_size = batch_size * beam_size encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) @@ -495,13 +496,13 @@ class U2STBaseModel(nn.Layer): Returns: paddle.Tensor: decoder output, (B, L) """ - assert encoder_out.size(0) == 1 - num_hyps = hyps.size(0) - assert hyps_lens.size(0) == num_hyps + assert encoder_out.shape[0] == 1 + num_hyps = hyps.shape[0] + assert hyps_lens.shape[0] == num_hyps encoder_out = encoder_out.repeat(num_hyps, 1, 1) # (B, 1, T) encoder_mask = paddle.ones( - [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) + [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool) # (num_hyps, max_hyps_len, vocab_size) decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, hyps_lens) 
@@ -556,7 +557,7 @@ class U2STBaseModel(nn.Layer): Returns: List[List[int]]: transcripts. """ - batch_size = feats.size(0) + batch_size = feats.shape[0] if decoding_method == 'fullsentence': hyps = self.translate( @@ -686,10 +687,10 @@ class U2STModel(U2STBaseModel): Returns: DeepSpeech2Model: The model built from pretrained result. """ - config.defrost() - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size - config.freeze() + with UpdateConfig(config): + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size + model = cls.from_config(config) if checkpoint_path: diff --git a/deepspeech/modules/attention.py b/deepspeech/modules/attention.py index 1a984dd45..f94797282 100644 --- a/deepspeech/modules/attention.py +++ b/deepspeech/modules/attention.py @@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor: Transformed value tensor, size (#batch, n_head, time2, d_k). """ - n_batch = query.size(0) + n_batch = query.shape[0] q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) @@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor: Transformed value weighted by the attention score, (#batch, time1, d_model). """ - n_batch = value.size(0) + n_batch = value.shape[0] if mask is not None: mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) scores = scores.masked_fill(mask, -float('inf')) @@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): paddle.Tensor: Output tensor. (batch, head, time1, time1) """ zero_pad = paddle.zeros( - (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype) + (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype) x_padded = paddle.cat([zero_pad, x], dim=-1) - x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2)) + x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1, + x.shape[2]) x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1] if zero_triu: - ones = paddle.ones((x.size(2), x.size(3))) - x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + ones = paddle.ones((x.shape[2], x.shape[3])) + x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :] return x @@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): q, k, v = self.forward_qkv(query, key, value) q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) - n_batch_pos = pos_emb.size(0) + n_batch_pos = pos_emb.shape[0] p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index 143f6cc57..8ca72894a 100644 --- a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -122,7 +122,7 @@ class TransformerDecoder(nn.Layer): # tgt_mask: (B, 1, L) tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1)) # m: (1, L, L) - m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0) + m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) # tgt_mask: (B, L, L) tgt_mask = tgt_mask & m diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py index 98b4e1291..fbbda023c 100644 --- a/deepspeech/modules/embedding.py +++ b/deepspeech/modules/embedding.py @@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer): paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) 
""" T = x.shape[1] - assert offset + x.size(1) < self.max_len + assert offset + x.shape[1] < self.max_len #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + T] x = x * self.xscale + pos_emb @@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding): paddle.Tensor: Encoded tensor (batch, time, `*`). paddle.Tensor: Positional embedding tensor (1, time, `*`). """ - assert offset + x.size(1) < self.max_len + assert offset + x.shape[1] < self.max_len x = x * self.xscale #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index fb44fe295..d4a8275c3 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -159,7 +159,7 @@ class BaseEncoder(nn.Layer): if self.global_cmvn is not None: xs = self.global_cmvn(xs) #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0) + xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor masks = masks.astype(paddle.bool) mask_pad = ~masks @@ -206,11 +206,11 @@ class BaseEncoder(nn.Layer): chunk computation List[paddle.Tensor]: conformer cnn cache """ - assert xs.size(0) == 1 # batch size must be one + assert xs.shape[0] == 1 # batch size must be one # tmp_masks is just for interface compatibility # TODO(Hui Zhang): stride_slice not support bool tensor # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) - tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32) + tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] if self.global_cmvn is not None: @@ -220,25 +220,25 @@ class BaseEncoder(nn.Layer): xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) if subsampling_cache is not None: - cache_size = subsampling_cache.size(1) #T + cache_size = subsampling_cache.shape[1] #T xs = paddle.cat((subsampling_cache, xs), dim=1) else: cache_size = 0 # only used when using `RelPositionMultiHeadedAttention` pos_emb = self.embed.position_encoding( - offset=offset - cache_size, size=xs.size(1)) + offset=offset - cache_size, size=xs.shape[1]) if required_cache_size < 0: next_cache_start = 0 elif required_cache_size == 0: - next_cache_start = xs.size(1) + next_cache_start = xs.shape[1] else: - next_cache_start = xs.size(1) - required_cache_size + next_cache_start = xs.shape[1] - required_cache_size r_subsampling_cache = xs[:, next_cache_start:, :] # Real mask for transformer/conformer layers - masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) masks = masks.unsqueeze(1) #[B=1, L'=1, T] r_elayers_output_cache = [] r_conformer_cnn_cache = [] @@ -302,7 +302,7 @@ class BaseEncoder(nn.Layer): stride = subsampling * decoding_chunk_size decoding_window = (decoding_chunk_size - 1) * subsampling + context - num_frames = xs.size(1) + num_frames = xs.shape[1] required_cache_size = decoding_chunk_size * num_decoding_left_chunks subsampling_cache: Optional[paddle.Tensor] = None elayers_output_cache: Optional[List[paddle.Tensor]] = None @@ -318,10 +318,10 @@ class BaseEncoder(nn.Layer): chunk_xs, offset, required_cache_size, subsampling_cache, elayers_output_cache, conformer_cnn_cache) outputs.append(y) - offset += y.size(1) + offset += 
y.shape[1] ys = paddle.cat(outputs, 1) # fake mask, just for jit script and compatibility with `forward` api - masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool) + masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool) masks = masks.unsqueeze(1) return ys, masks diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py index 1477bdfe0..07c213dbc 100644 --- a/deepspeech/training/cli.py +++ b/deepspeech/training/cli.py @@ -43,33 +43,57 @@ def default_argument_parser(): """ parser = argparse.ArgumentParser() - # yapf: disable - # data and output - parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.") - parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.") - parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.") - - # load from saved checkpoint - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") - - # running - parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") - - # overwrite extra config and default config - # parser.add_argument("--opts", nargs=argparse.REMAINDER, - # help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("--opts", type=str, default=[], nargs='+', - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - - # random seed - parser.add_argument("--seed", type=int, default=None, - help="seed to use for paddle, np and random. None or 0 for random, else set seed.") - - # profiler - parser.add_argument('--profiler_options', type=str, default=None, - help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".') - # yapd: enable + train_group = parser.add_argument_group( + title='Train Options', description=None) + train_group.add_argument( + "--seed", + type=int, + default=None, + help="seed to use for paddle, np and random. None or 0 for random, else set seed." + ) + train_group.add_argument( + "--device", + type=str, + default='gpu', + choices=["cpu", "gpu"], + help="device cpu and gpu are supported.") + train_group.add_argument( + "--nprocs", + type=int, + default=1, + help="number of parallel processes. 0 for cpu.") + train_group.add_argument( + "--config", metavar="CONFIG_FILE", help="config file.") + train_group.add_argument( + "--output", metavar="CKPT_DIR", help="path to save checkpoint.") + train_group.add_argument( + "--checkpoint_path", type=str, help="path to load checkpoint") + train_group.add_argument( + "--opts", + type=str, + default=[], + nargs='+', + help="overwrite --config file, passing in LIST[KEY VALUE] pairs") + train_group.add_argument( + "--dump-config", metavar="FILE", help="dump config to `this` file.") + + profile_group = parser.add_argument_group( + title='Benchmark Options', description=None) + profile_group.add_argument( + '--profiler-options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' 
+ ) + profile_group.add_argument( + '--benchmark-batch-size', + type=int, + default=None, + help='batch size for benchmark.') + profile_group.add_argument( + '--benchmark-max-step', + type=int, + default=None, + help='max iteration for benchmark.') return parser diff --git a/deepspeech/training/extensions/evaluator.py b/deepspeech/training/extensions/evaluator.py index d5b359829..1026a4ec3 100644 --- a/deepspeech/training/extensions/evaluator.py +++ b/deepspeech/training/extensions/evaluator.py @@ -20,8 +20,8 @@ from paddle.nn import Layer from . import extension from ..reporter import DictSummary +from ..reporter import ObsScope from ..reporter import report -from ..reporter import scope from ..timer import Timer from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -78,7 +78,7 @@ class StandardEvaluator(extension.Extension): summary = DictSummary() for batch in self.dataloader: observation = {} - with scope(observation): + with ObsScope(observation): # main evaluation computation here. with paddle.no_grad(): self.evaluate_sync(self.evaluate_core(batch)) diff --git a/deepspeech/training/reporter.py b/deepspeech/training/reporter.py index 66a81adef..7afc33f38 100644 --- a/deepspeech/training/reporter.py +++ b/deepspeech/training/reporter.py @@ -19,7 +19,7 @@ OBSERVATIONS = None @contextlib.contextmanager -def scope(observations): +def ObsScope(observations): # make `observation` the target to report to. # it is basically a dictionary that stores temporary observations global OBSERVATIONS diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 6587f1290..a5efdd541 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import sys import time +from collections import OrderedDict from pathlib import Path import paddle from paddle import distributed as dist from tensorboardX import SummaryWriter +from deepspeech.training.reporter import ObsScope +from deepspeech.training.reporter import report from deepspeech.training.timer import Timer from deepspeech.utils import mp_tools from deepspeech.utils import profiler from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log from deepspeech.utils.utility import seed_all +from deepspeech.utils.utility import UpdateConfig __all__ = ["Trainer"] @@ -96,11 +101,21 @@ class Trainer(): self.checkpoint_dir = None self.iteration = 0 self.epoch = 0 + self.rank = dist.get_rank() + + logger.info(f"Rank: {self.rank}/{dist.get_world_size()}") if args.seed: seed_all(args.seed) logger.info(f"Set seed {args.seed}") + if self.args.benchmark_batch_size: + with UpdateConfig(self.config): + self.config.collator.batch_size = self.args.benchmark_batch_size + self.config.training.log_interval = 1 + logger.info( + f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") + def setup(self): """Setup the experiment. 
""" @@ -188,6 +203,12 @@ class Trainer(): if self.args.profiler_options: profiler.add_profiler_step(self.args.profiler_options) + if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: + logger.info( + f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + sys.exit( + f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + def train(self): """The training process control by epoch.""" from_scratch = self.resume_or_scratch() @@ -208,15 +229,29 @@ class Trainer(): data_start_time = time.time() for batch_index, batch in enumerate(self.train_loader): dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - self.after_train_batch() + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report('step/total', + (batch_index + 1) / len(self.train_loader)) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips[sent./sec]'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += "," + logger.info(msg) data_start_time = time.time() except Exception as e: logger.error(e) diff --git a/deepspeech/training/updaters/trainer.py b/deepspeech/training/updaters/trainer.py index a52fb9eb3..077694659 100644 --- a/deepspeech/training/updaters/trainer.py +++ b/deepspeech/training/updaters/trainer.py @@ -24,7 +24,7 @@ import tqdm from deepspeech.training.extensions.extension import Extension from deepspeech.training.extensions.extension import PRIORITY_READER -from deepspeech.training.reporter import scope +from deepspeech.training.reporter import ObsScope from deepspeech.training.triggers import get_trigger from deepspeech.training.triggers.limit_trigger import LimitTrigger from deepspeech.training.updaters.updater import UpdaterBase @@ -144,7 +144,7 @@ class Trainer(): # you can use `report` freely in Updater.update() # updating parameters and state - with scope(self.observation): + with ObsScope(self.observation): update() p.update() diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 9f2271814..fc43a71f0 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, y_insert_blank = insert_blank(y, blank_id) #(2L+1) log_alpha = paddle.zeros( - (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) + (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 state_path = (paddle.zeros( - (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 + (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1 ) # state path, 
Tuple((T, 2L+1)) # init start state log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb - for t in range(1, ctc_probs.size(0)): # T + for t in range(1, ctc_probs.shape[0]): # T for s in range(len(y_insert_blank)): # 2L+1 if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ s] == y_insert_blank[s - 2]: @@ -114,7 +114,7 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, y_insert_blank[s]] state_path[t, s] = prev_state[paddle.argmax(candidates)] # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 - state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32) + state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32) candidates = paddle.to_tensor([ log_alpha[-1, len(y_insert_blank) - 1], # Sb @@ -122,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, ]) prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] state_seq[-1] = prev_state[paddle.argmax(candidates)] - for t in range(ctc_probs.size(0) - 2, -1, -1): + for t in range(ctc_probs.shape[0] - 2, -1, -1): state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] output_alignment = [] - for t in range(0, ctc_probs.size(0)): + for t in range(0, ctc_probs.shape[0]): output_alignment.append(y_insert_blank[state_seq[t, 0]]) return output_alignment diff --git a/deepspeech/utils/tensor_utils.py b/deepspeech/utils/tensor_utils.py index 3519f4fa5..bb7f58ded 100644 --- a/deepspeech/utils/tensor_utils.py +++ b/deepspeech/utils/tensor_utils.py @@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = max_size[1:] if max_size.ndim >= 2 else () - max_len = max([s.size(0) for s in sequences]) + max_len = max([s.shape[0] for s in sequences]) if batch_first: out_dims = (len(sequences), max_len) + trailing_dims else: @@ -91,7 +91,7 @@ def pad_sequence(sequences: List[paddle.Tensor], out_tensor = sequences[0].new_full(out_dims, padding_value) for i, tensor in enumerate(sequences): - length = tensor.size(0) + length = tensor.shape[0] # use index notation to prevent duplicate references to the tensor if batch_first: out_tensor[i, :length, ...] = tensor @@ -139,7 +139,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) - B = ys_pad.size(0) + B = ys_pad.shape[0] _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos ys_in = paddle.cat([_sos, ys_pad], dim=1) @@ -165,8 +165,8 @@ def th_accuracy(pad_outputs: paddle.Tensor, Returns: float: Accuracy value (0.0 - 1.0). 
""" - pad_pred = pad_outputs.view( - pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2) + pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], + pad_outputs.shape[1]).argmax(2) mask = pad_targets != ignore_label numerator = paddle.sum( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py index e18fc1f77..6f84c41be 100644 --- a/deepspeech/utils/utility.py +++ b/deepspeech/utils/utility.py @@ -16,15 +16,27 @@ import distutils.util import math import os import random +from contextlib import contextmanager from typing import List import numpy as np import paddle -__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] +__all__ = [ + "UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add" +] + + +@contextmanager +def UpdateConfig(config): + """Update yacs config""" + config.defrost() + yield + config.freeze() def seed_all(seed: int=210329): + """freeze random generator seed.""" np.random.seed(seed) random.seed(seed) paddle.seed(seed) diff --git a/docs/src/install.md b/docs/src/install.md index 79460737b..8cecba125 100644 --- a/docs/src/install.md +++ b/docs/src/install.md @@ -4,7 +4,7 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin ## Prerequisites - Python >= 3.7 -- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) +- PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) ## Setup (Important) diff --git a/docs/src/reference.md b/docs/src/reference.md index 341e13611..d3676fff2 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -1,5 +1,7 @@ # Reference +We refer these repos to build `model` and `engine`: + * [delta](https://github.com/Delta-ML/delta.git) * [espnet](https://github.com/espnet/espnet.git) * [kaldi](https://github.com/kaldi-asr/kaldi.git) diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/s1/local/train.sh index e065ad6a8..5097d4d03 100755 --- a/examples/aishell/s1/local/train.sh +++ b/examples/aishell/s1/local/train.sh @@ -1,7 +1,8 @@ #!/bin/bash - profiler_options= +benchmark_batch_size=0 +benchmark_max_step=0 # seed may break model convergence seed=0 @@ -32,12 +33,15 @@ ckpt_name=$2 mkdir -p exp python3 -u ${BIN_DIR}/train.py \ +--seed ${seed} \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ ---profiler_options ${profiler_options} \ ---seed ${seed} +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} + if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic @@ -48,4 +52,4 @@ if [ $? 
-ne 0 ]; then exit 1 fi -exit 0 \ No newline at end of file +exit 0 diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/s1/conf/augmentation.json index 8e6e97040..40a5b7900 100644 --- a/examples/librispeech/s1/conf/augmentation.json +++ b/examples/librispeech/s1/conf/augmentation.json @@ -19,17 +19,17 @@ { "type": "specaug", "params": { + "W": 0, + "warp_mode": "PIL", "F": 10, - "T": 50, "n_freq_masks": 2, + "T": 50, "n_time_masks": 2, "p": 1.0, - "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, "max_n_time_masks": 20, - "replace_with_zero": true, - "warp_mode": "PIL" + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 4aa7b9158..fe9cab069 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -33,7 +33,7 @@ collator: keep_transcription_text: False sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 + num_workers: 2 # network architecture @@ -74,7 +74,7 @@ model: training: - n_epoch: 120 + n_epoch: 120 accum_grad: 2 global_grad_clip: 5.0 optim: adam diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index f96508b4f..9a76c7ade 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ ---profiler_options "${profiler_options}" \ +--profiler-options "${profiler_options}" \ --seed ${seed} if [ ${seed} != 0 ]; then diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/s1/local/train.sh index 374608fd1..5097d4d03 100755 --- a/examples/tiny/s1/local/train.sh +++ b/examples/tiny/s1/local/train.sh @@ -1,37 +1,49 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" - exit -1 -fi +profiler_options= +benchmark_batch_size=0 +benchmark_max_step=0 + +# seed may break model convergence +seed=0 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -config_path=$1 -ckpt_name=$2 - device=gpu if [ ${ngpu} == 0 ];then device=cpu fi -mkdir -p exp - -# seed may break model convergence -seed=0 -if [ ${seed} != 0 ]; then +if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True + echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." 
+fi + +if [ $# != 2 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" + exit -1 fi +config_path=$1 +ckpt_name=$2 + +mkdir -p exp + python3 -u ${BIN_DIR}/train.py \ +--seed ${seed} \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ ---seed ${seed} +--profiler-options "${profiler_options}" \ +--benchmark-batch-size ${benchmark_batch_size} \ +--benchmark-max-step ${benchmark_max_step} + -if [ ${seed} != 0 ]; then +if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic fi diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh old mode 100644 new mode 100755 index 7aa11d0f2..6f707cdcb --- a/tests/benchmark/run_all.sh +++ b/tests/benchmark/run_all.sh @@ -1,41 +1,46 @@ #!/bin/bash +CUR_DIR=${PWD} ROOT_DIR=../../ # 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: # collect env info bash ${ROOT_DIR}/utils/pd_env_collect.sh -cat pd_env.txt +#cat pd_env.txt -# 执行目录:需说明 -pushd ${ROOT_DIR}/examples/aishell/s1 # 1 安装该模型需要的依赖 (如需开启优化策略请注明) -pushd ${ROOT_DIR}/tools; make; popd -source ${ROOT_DIR}/tools/venv/bin/activate -pushd ${ROOT_DIR}; bash setup.sh; popd +#pushd ${ROOT_DIR}/tools; make; popd +#source ${ROOT_DIR}/tools/venv/bin/activate +#pushd ${ROOT_DIR}; bash setup.sh; popd # 2 拷贝该模型需要数据、预训练模型 + +# 执行目录:需说明 +#pushd ${ROOT_DIR}/examples/aishell/s1 +pushd ${ROOT_DIR}/examples/tiny/s1 + mkdir -p exp/log -loca/data.sh &> exp/log/data.log +. path.sh +#bash local/data.sh &> exp/log/data.log # 3 批量运行(如不方便批量,1,2需放到单个模型中) -model_mode_list=(conformer) +model_mode_list=(conformer transformer) fp_item_list=(fp32) -bs_item=(32 64 96) +bs_item_list=(32 64 96) for model_mode in ${model_mode_list[@]}; do for fp_item in ${fp_item_list[@]}; do - for bs_item in ${bs_list[@]} + for bs_item in ${bs_item_list[@]} do echo "index is speed, 1gpus, begin, ${model_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) sleep 60 echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" run_mode=mp - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} sleep 60 done done diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh old mode 100644 new mode 100755 index 625d36160..eb1117936 --- a/tests/benchmark/run_benchmark.sh +++ b/tests/benchmark/run_benchmark.sh @@ -23,19 +23,19 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" - train_cmd="--model_name=${model_name} - --batch_size=${batch_size} - --fp=${fp_item} \ - --max_iter=${max_iter} " + train_cmd="--benchmark-batch-size ${batch_size} + --benchmark-max-step ${max_iter} + conf/${model_name}.yaml ${model_name}" + case ${run_mode} in - sp) train_cmd="python -u tools/train.py "${train_cmd}" ;; + sp) train_cmd="bash local/train.sh "${train_cmd}"" ;; mp) - train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}" - log_parse_file="mylog/workerlog.0" ;; + train_cmd="bash local/train.sh "${train_cmd}"" ;; *) echo "choose run_mode(sp or mp)"; exit 1; esac -# 以下不用修改 - timeout 15m ${train_cmd} > ${log_file} 2>&1 + + # 以下不用修改 + 
CUDA_VISIBLE_DEVICES=${device} timeout 15m ${train_cmd} > ${log_file} 2>&1 if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" export job_fail_flag=1 @@ -43,7 +43,8 @@ function _train(){ echo -e "${model_name}, SUCCESS" export job_fail_flag=0 fi - kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + + trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM if [ $run_mode = "mp" -a -d mylog ]; then rm ${log_file} diff --git a/third_party/__init__.py b/third_party/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/paddle_audio/__init__.py b/third_party/paddle_audio/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/paddle_audio/frontend.py b/third_party/paddle_audio/frontend.py deleted file mode 100644 index 1b337732e..000000000 --- a/third_party/paddle_audio/frontend.py +++ /dev/null @@ -1,146 +0,0 @@ -from typing import Tuple -import numpy as np -import paddle -from paddle import Tensor -from paddle import nn -from paddle.nn import functional as F - - -def frame(x: Tensor, - num_samples: Tensor, - win_length: int, - hop_length: int, - clip: bool = True) -> Tuple[Tensor, Tensor]: - """Extract frames from audio. - - Parameters - ---------- - x : Tensor - Shape (N, T), batched waveform. - num_samples : Tensor - Shape (N, ), number of samples of each waveform. - win_length : int - Window length. - hop_length : int - Number of samples shifted between ajancent frames. - clip : bool, optional - Whether to clip audio that does not fit into the last frame, by - default True - - Returns - ------- - frames : Tensor - Shape (N, T', win_length). - num_frames : Tensor - Shape (N, ) number of valid frames - """ - assert hop_length <= win_length - num_frames = (num_samples - win_length) // hop_length - padding = (0, 0) - if not clip: - num_frames += 1 - # NOTE: pad hop_length - 1 to the right to ensure that there is at most - # one frame dangling to the righe edge - padding = (0, hop_length - 1) - - weight = paddle.eye(win_length).unsqueeze(1) - - frames = F.conv1d(x.unsqueeze(1), - weight, - padding=padding, - stride=(hop_length, )) - return frames, num_frames - - -class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. - - Parameters - ------------ - n_fft : int - Number of samples in a frame. - - hop_length : int - Number of samples shifted between adjacent frames. - - win_length : int - Length of the window. - - clip: bool - Whether to clip audio is necesaary. 
- """ - def __init__(self, - n_fft: int, - hop_length: int, - win_length: int, - window_type: str = None, - clip: bool = True): - super().__init__() - - self.hop_length = hop_length - self.n_bin = 1 + n_fft // 2 - self.n_fft = n_fft - self.clip = clip - - # calculate window - if window_type is None: - window = np.ones(win_length) - elif window_type == "hann": - window = np.hanning(win_length) - elif window_type == "hamming": - window = np.hamming(win_length) - else: - raise ValueError("Not supported yet!") - - if win_length < n_fft: - window = F.pad(window, (0, n_fft - win_length)) - elif win_length > n_fft: - window = window[:n_fft] - - # (n_bins, n_fft) complex - kernel_size = min(n_fft, win_length) - weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size] - w_real = weight.real - w_imag = weight.imag - - # (2 * n_bins, kernel_size) - w = np.concatenate([w_real, w_imag], axis=0) - w = w * window - - # (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size) - w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]: - """Compute the stft transform. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - num_samples : Tensor - Number of samples of each waveform. - Returns - ------------ - D : Tensor - Shape(N, T', n_bins, 2) Spectrogram. - - num_frames: Tensor - Shape (N,) number of samples of each spectrogram - """ - num_frames = (num_samples - self.win_length) // self.hop_length - padding = (0, 0) - if not self.clip: - num_frames += 1 - padding = (0, self.hop_length - 1) - - batch_size, _, _ = paddle.shape(x) - x = x.unsqueeze(-1) - D = F.conv1d(self.weight, - x, - stride=(self.hop_length, ), - padding=padding, - data_format="NLC") - D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2]) - return D, num_frames - diff --git a/third_party/paddle_audio/frontend/common.py b/third_party/paddle_audio/frontend/common.py new file mode 100644 index 000000000..7638dae53 --- /dev/null +++ b/third_party/paddle_audio/frontend/common.py @@ -0,0 +1,201 @@ +import paddle +import numpy as np +from typing import Tuple, Optional, Union + + +# https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/src/feat/feature-window.cc#L109 +def povey_window(frame_len:int) -> np.ndarray: + win = np.empty(frame_len) + a = 2 * np.pi / (frame_len -1) + for i in range(frame_len): + win[i] = (0.5 - 0.5 * np.cos(a * i) )**0.85 + return win + +def hann_window(frame_len:int) -> np.ndarray: + win = np.empty(frame_len) + a = 2 * np.pi / (frame_len -1) + for i in range(frame_len): + win[i] = 0.5 - 0.5 * np.cos(a * i) + return win + +def sine_window(frame_len:int) -> np.ndarray: + win = np.empty(frame_len) + a = 2 * np.pi / (frame_len -1) + for i in range(frame_len): + win[i] = np.sin(0.5 * a * i) + return win + +def hamm_window(frame_len:int) -> np.ndarray: + win = np.empty(frame_len) + a = 2 * np.pi / (frame_len -1) + for i in range(frame_len): + win[i] = 0.54 - 0.46 * np.cos(a * i) + return win + +def get_window(wintype:Optional[str], winlen:int) -> np.ndarray: + """get window function + + Args: + wintype (Optional[str]): window type. + winlen (int): window length in samples. + + Raises: + ValueError: not support window. + + Returns: + np.ndarray: window coeffs. 
+ """ + # calculate window + if not wintype or wintype == 'rectangular': + window = np.ones(winlen) + elif wintype == "hann": + window = hann_window(winlen) + elif wintype == "hamm": + window = hamm_window(winlen) + elif wintype == "povey": + window = povey_window(winlen) + else: + msg = f"{wintype} Not supported yet!" + raise ValueError(msg) + return window + + +def dft_matrix(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]: + # https://en.wikipedia.org/wiki/Discrete_Fourier_transform + # (n_bins, n_fft) complex + if n_bin is None: + n_bin = 1 + n_fft // 2 + if winlen is None: + winlen = n_bin + # https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49 + kernel_size = min(n_fft, winlen) + + n = np.arange(0, n_fft, 1.) + wsin = np.empty((n_bin, kernel_size)) #[Cout, kernel_size] + wcos = np.empty((n_bin, kernel_size)) #[Cout, kernel_size] + for k in range(n_bin): # Only half of the bins contain useful info + wsin[k,:] = -np.sin(2*np.pi*k*n/n_fft)[:kernel_size] + wcos[k,:] = np.cos(2*np.pi*k*n/n_fft)[:kernel_size] + w_real = wcos + w_imag = wsin + return w_real, w_imag, kernel_size + + +def dft_matrix_fast(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]: + # (n_bins, n_fft) complex + if n_bin is None: + n_bin = 1 + n_fft // 2 + if winlen is None: + winlen = n_bin + # https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49 + kernel_size = min(n_fft, winlen) + + # https://en.wikipedia.org/wiki/DFT_matrix + # https://ccrma.stanford.edu/~jos/st/Matrix_Formulation_DFT.html + weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size] + w_real = weight.real + w_imag = weight.imag + return w_real, w_imag, kernel_size + + +def bin2hz(bin:Union[List[int], np.ndarray], N:int, sr:int)->List[float]: + """FFT bins to Hz. + + http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/ + + Args: + bins (List[int] or np.ndarray): bin index. + N (int): the number of samples, or FFT points. + sr (int): sampling rate. + + Returns: + List[float]: Hz's. + """ + hz = bin * float(sr) / N + + +def hz2mel(hz): + """Convert a value in Hertz to Mels + + :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Mels. If an array was passed in, an identical sized array is returned. + """ + return 1127 * np.log(1+hz/700.0) + + +def mel2hz(mel): + """Convert a value in Mels to Hertz + + :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. + """ + return 700 * (np.exp(mel/1127.0)-1) + + + +def rms_to_db(rms: float): + """Root Mean Square to dB. + + Args: + rms ([float]): root mean square + + Returns: + float: dB + """ + return 20.0 * math.log10(max(1e-16, rms)) + + +def rms_to_dbfs(rms: float): + """Root Mean Square to dBFS. + https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/ + Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB. + + dB = dBFS + 3.0103 + dBFS = db - 3.0103 + e.g. 0 dB = -3.0103 dBFS + + Args: + rms ([float]): root mean square + + Returns: + float: dBFS + """ + return rms_to_db(rms) - 3.0103 + + +def max_dbfs(sample_data: np.ndarray): + """Peak dBFS based on the maximum energy sample. + + Args: + sample_data ([np.ndarray]): float array, [-1, 1]. 
+def max_dbfs(sample_data: np.ndarray):
+    """Peak dBFS based on the maximum energy sample.
+
+    Args:
+        sample_data (np.ndarray): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
+    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
+
+
+def mean_dbfs(sample_data):
+    """dBFS based on the RMS energy of the whole signal.
+
+    Args:
+        sample_data (np.ndarray): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    return rms_to_dbfs(
+        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
+
+
+def gain_db_to_ratio(gain_db: float):
+    """dB gain to amplitude ratio.
+
+    Args:
+        gain_db (float): gain in dB
+
+    Returns:
+        float: scale in amp
+    """
+    return math.pow(10.0, gain_db / 20.0)
\ No newline at end of file
diff --git a/third_party/paddle_audio/frontend/english.wav b/third_party/paddle_audio/frontend/english.wav
new file mode 100644
index 000000000..bb28291f6
Binary files /dev/null and b/third_party/paddle_audio/frontend/english.wav differ
diff --git a/third_party/paddle_audio/frontend/kaldi.py b/third_party/paddle_audio/frontend/kaldi.py
new file mode 100644
index 000000000..d1c13fe30
--- /dev/null
+++ b/third_party/paddle_audio/frontend/kaldi.py
@@ -0,0 +1,266 @@
+from typing import Tuple
+import numpy as np
+import paddle
+from paddle import Tensor
+from paddle import nn
+from paddle.nn import functional as F
+import soundfile as sf
+
+from .common import get_window
+from .common import dft_matrix
+
+
+def read(wavpath:str, sr:int = None, start=0, stop=None, dtype='int16', always_2d=True)->Tuple[int, np.ndarray]:
+    """Load a wav file.
+
+    Args:
+        wavpath (str): wav path.
+        sr (int, optional): expected sample rate. Defaults to None.
+        dtype (str, optional): wav data bits. Defaults to 'int16'.
+
+    Returns:
+        Tuple[int, np.ndarray]: sr (int), wav (int16) [T, C].
+    """
+    wav, r_sr = sf.read(wavpath, start=start, stop=stop, dtype=dtype, always_2d=always_2d)
+    if sr:
+        assert sr == r_sr
+    return r_sr, wav
+
+
+def write(wavpath:str, wav:np.ndarray, sr:int, dtype='PCM_16'):
+    """Write a wav file.
+
+    Args:
+        wavpath (str): file path to save.
+        wav (np.ndarray): wav data.
+        sr (int): data sample rate.
+        dtype (str, optional): wav bit format. Defaults to 'PCM_16'.
+    """
+    sf.write(wavpath, wav, sr, subtype=dtype)
+
+
+def frames(x: Tensor,
+           num_samples: Tensor,
+           sr: int,
+           win_length: float,
+           stride_length: float,
+           clip: bool = False) -> Tuple[Tensor, Tensor]:
+    """Extract frames from audio.
+
+    Parameters
+    ----------
+    x : Tensor
+        Shape (B, T), batched waveform.
+    num_samples : Tensor
+        Shape (B, ), number of samples of each waveform.
+    sr: int
+        Sampling Rate.
+    win_length : float
+        Window length in seconds.
+    stride_length : float
+        Stride length in seconds.
+    clip : bool, optional
+        Whether to clip audio that does not fit into the last frame, by
+        default False
+
+    Returns
+    -------
+    frames : Tensor
+        Shape (B, T', win_length).
+    num_frames : Tensor
+        Shape (B, ) number of valid frames
+    """
+    assert stride_length <= win_length
+    stride_length = int(stride_length * sr)
+    win_length = int(win_length * sr)
+
+    num_frames = (num_samples - win_length) // stride_length
+    padding = (0, 0)
+    if not clip:
+        num_frames += 1
+        need_samples = num_frames * stride_length + win_length
+        padding = (0, need_samples - num_samples - 1)
+
+    weight = paddle.eye(win_length).unsqueeze(1)  #[win_length, 1, win_length]
+
+    frames = F.conv1d(x.unsqueeze(-1),
+                      weight,
+                      padding=padding,
+                      stride=(stride_length, ),
+                      data_format='NLC')
+    return frames, num_frames
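+
+# NOTE: a minimal usage sketch (assumes 16 kHz mono input; not part of the API):
+#
+#   x = paddle.randn([1, 16000])                 # 1 s of audio
+#   n = paddle.to_tensor([16000])
+#   f, nframe = frames(x, n, sr=16000, win_length=0.025, stride_length=0.01)
+#   # f: [1, T', 400] -- 25 ms frames taken every 10 ms
+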
+def dither(signal:Tensor, dither_value=1.0)->Tensor:
+    """Dither frames (adds small noise so later log compression is well behaved).
+
+    Args:
+        signal (Tensor): [B, T, D]
+        dither_value (float, optional): [scalar]. Defaults to 1.0.
+
+    Returns:
+        Tensor: [B, T, D]
+    """
+    D = paddle.shape(signal)[-1]
+    signal += paddle.normal(shape=[1, 1, D]) * dither_value
+    return signal
+
+
+def remove_dc_offset(signal:Tensor)->Tensor:
+    """Remove the DC offset of each frame.
+
+    Args:
+        signal (Tensor): [B, T, D]
+
+    Returns:
+        Tensor: [B, T, D]
+    """
+    signal -= paddle.mean(signal, axis=-1, keepdim=True)
+    return signal
+
+def preemphasis(signal:Tensor, coeff=0.97)->Tensor:
+    """Perform preemphasis on the input signal.
+
+    Args:
+        signal (Tensor): [B, T, D], The signal to filter.
+        coeff (float, optional): [scalar]. The preemphasis coefficient. 0 is no filter. Defaults to 0.97.
+
+    Returns:
+        Tensor: [B, T, D]
+    """
+    return paddle.concat([
+        (1-coeff)*signal[:, :, 0:1],
+        signal[:, :, 1:] - coeff * signal[:, :, :-1]
+    ], axis=-1)
+
+
+class STFT(nn.Layer):
+    """A module for computing stft transformation in a differentiable way.
+
+    http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
+
+    Parameters
+    ------------
+    n_fft : int
+        Number of samples in a frame.
+
+    sr: int
+        Sampling rate.
+
+    stride_length : float
+        Stride length in seconds (samples shifted between adjacent frames).
+
+    win_length : float
+        Window length in seconds.
+
+    clip: bool
+        Whether to clip audio that does not fit into the last frame.
+    """
+    def __init__(self,
+                 n_fft: int,
+                 sr: int,
+                 win_length: float,
+                 stride_length: float,
+                 dither:float=0.0,
+                 preemph_coeff:float=0.97,
+                 remove_dc_offset:bool=True,
+                 window_type: str = 'povey',
+                 clip: bool = False):
+        super().__init__()
+        self.sr = sr
+        self.win_length = win_length
+        self.stride_length = stride_length
+        self.dither = dither
+        self.preemph_coeff = preemph_coeff
+        self.remove_dc_offset = remove_dc_offset
+        self.window_type = window_type
+        self.clip = clip
+
+        self.n_fft = n_fft
+        self.n_bin = 1 + n_fft // 2
+
+        w_real, w_imag, kernel_size = dft_matrix(
+            self.n_fft, int(self.win_length * self.sr), self.n_bin
+        )
+
+        # calculate window
+        window = get_window(window_type, kernel_size)
+
+        # (2 * n_bins, kernel_size)
+        w = np.concatenate([w_real, w_imag], axis=0)
+        w = w * window
+        # (kernel_size, 2 * n_bins)
+        w = np.transpose(w)
+        weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
+        self.register_buffer("weight", weight)
+
+    def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
+        """Compute the stft transform.
+        Parameters
+        ------------
+        x : Tensor [shape=(B, T)]
+            The input waveform.
+        num_samples : Tensor [shape=(B,)]
+            Number of samples of each waveform.
+        Returns
+        ------------
+        C : Tensor
+            Shape(B, T', n_bins, 2) Spectrogram.
+
+        num_frames: Tensor
+            Shape (B,) number of frames of each spectrogram
+        """
+        batch_size = paddle.shape(num_samples)
+        feat, nframe = frames(x, num_samples, self.sr, self.win_length,
+                              self.stride_length, clip=self.clip)
+        if self.dither:
+            feat = dither(feat, self.dither)
+        if self.remove_dc_offset:
+            feat = remove_dc_offset(feat)
+        if self.preemph_coeff:
+            feat = preemphasis(feat, self.preemph_coeff)
+        C = paddle.matmul(feat, self.weight)  # [B, T, K] x [K, 2 * n_bins]
+        C = paddle.reshape(C, [batch_size, -1, 2, self.n_bin])
+        C = C.transpose([0, 1, 3, 2])
+        return C, nframe
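+
+# NOTE: a minimal usage sketch (shapes only; assumes 16 kHz mono input):
+#
+#   stft = STFT(n_fft=512, sr=16000, win_length=0.025, stride_length=0.01)
+#   x = paddle.randn([2, 16000])
+#   n = paddle.to_tensor([16000, 16000])
+#   C, nframe = stft(x, n)   # C: [B, T', n_fft // 2 + 1, 2], last dim (real, imag)
+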
+def powspec(C:Tensor) -> Tensor:
+    """Compute the power spectrum |X_k|^2.
+
+    Args:
+        C (Tensor): [B, T, C, 2]
+
+    Returns:
+        Tensor: [B, T, C]
+    """
+    real, imag = paddle.chunk(C, 2, axis=-1)
+    return paddle.square(real.squeeze(-1)) + paddle.square(imag.squeeze(-1))
+
+
+def magspec(C: Tensor, eps=1e-10) -> Tensor:
+    """Compute the magnitude spectrum |X_k|.
+
+    Args:
+        C (Tensor): [B, T, C, 2]
+        eps (float): epsilon.
+
+    Returns:
+        Tensor: [B, T, C]
+    """
+    pspec = powspec(C)
+    return paddle.sqrt(pspec + eps)
+
+
+def logspec(C: Tensor, eps=1e-10) -> Tensor:
+    """Compute the log magnitude spectrum 20*log10|X_k|.
+
+    Args:
+        C (Tensor): [B, T, C, 2]
+        eps (float, optional): floor to avoid log(0). Defaults to 1e-10.
+
+    Returns:
+        Tensor: [B, T, C]
+    """
+    spec = magspec(C)
+    return 20 * paddle.log10(spec + eps)
+
diff --git a/third_party/paddle_audio/frontend/kaldi_test.py b/third_party/paddle_audio/frontend/kaldi_test.py
new file mode 100644
index 000000000..34ff413c5
--- /dev/null
+++ b/third_party/paddle_audio/frontend/kaldi_test.py
@@ -0,0 +1,533 @@
+from typing import Tuple
+import numpy as np
+import paddle
+import unittest
+
+import decimal
+import numpy
+import math
+import logging
+from pathlib import Path
+
+from scipy.fftpack import dct
+
+from third_party.paddle_audio.frontend import kaldi
+
+def round_half_up(number):
+    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
+
+def rolling_window(a, window, step=1):
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
+
+
+def do_dither(signal, dither_value=1.0):
+    signal += numpy.random.normal(size=signal.shape) * dither_value
+    return signal
+
+def do_remove_dc_offset(signal):
+    signal -= numpy.mean(signal)
+    return signal
+
+def do_preemphasis(signal, coeff=0.97):
+    """perform preemphasis on the input signal.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
+    :returns: the filtered signal.
+    """
+    return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
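+
+# NOTE: the helpers above are the numpy references the tests compare the paddle
+# frontend against; e.g. do_preemphasis mirrors kaldi.preemphasis (a sketch):
+#
+#   sig = numpy.random.randn(400)
+#   ref = do_preemphasis(sig, 0.97)                 # numpy, 1-D signal
+#   t = paddle.to_tensor(sig[None, None, :])        # [B=1, T=1, D=400]
+#   out = kaldi.preemphasis(t, 0.97)[0, 0].numpy()
+#   assert numpy.allclose(out, ref)
+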
+ """ + slen = len(sig) + frame_len = int(round_half_up(frame_len)) + frame_step = int(round_half_up(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + (( slen - frame_len) // frame_step) + + # check kaldi/src/feat/feature-window.h + padsignal = sig[:(numframes-1)*frame_step+frame_len] + if wintype is 'povey': + win = numpy.empty(frame_len) + for i in range(frame_len): + win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 + else: # the hamming window + win = numpy.hamming(frame_len) + + if stride_trick: + frames = rolling_window(padsignal, window=frame_len, step=frame_step) + else: + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(win, (numframes, 1)) + + frames = frames.astype(numpy.float32) + raw_frames = numpy.zeros(frames.shape) + for frm in range(frames.shape[0]): + frames[frm,:] = do_dither(frames[frm,:], dither) # dither + frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset + raw_frames[frm,:] = frames[frm,:] + frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize + + return frames * win, raw_frames + + +def magspec(frames, NFFT): + """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. + """ + if numpy.shape(frames)[1] > NFFT: + logging.warn( + 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', + numpy.shape(frames)[1], NFFT) + complex_spec = numpy.fft.rfft(frames, NFFT) + return numpy.absolute(complex_spec) + + +def powspec(frames, NFFT): + """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. + """ + return numpy.square(magspec(frames, NFFT)) + + + +def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, + nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97, + ceplifter=22,useEnergy=True,wintype='povey'): + """Compute MFCC features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param numcep: the number of cepstrum to return, default 13 + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
+         ceplifter=22,useEnergy=True,wintype='povey'):
+    """Compute MFCC features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 23.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param wintype: the analysis window type to apply to each frame, e.g. 'povey' or 'hamming'. Default is 'povey'.
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
+    feat = numpy.log(feat)
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
+    feat = lifter(feat,ceplifter)
+    if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
+    return feat
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
+          wintype='hamming'):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param wintype: the analysis window type to apply to each frame, e.g. 'povey' or 'hamming'. Default is 'hamming'.
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame (total energy, unwindowed)
+    """
+    highfreq= highfreq or samplerate/2
+    frames,raw_frames = framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
+    pspec = powspec(frames,nfft) # nearly the same until this part
+    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
+    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
+
+    return feat,energy
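+
+# NOTE: a usage sketch mirroring the commented-out example at the bottom of this
+# file (kaldi-style fbank on the bundled english.wav; dither disabled for
+# reproducibility):
+#
+#   import scipy.io.wavfile
+#   rate, sig = scipy.io.wavfile.read('english.wav')
+#   feat, energy = fbank(sig, samplerate=rate, nfilt=23, lowfreq=20,
+#                        dither=0.0, wintype='povey')
+#   log_feat = numpy.log(feat)   # equals logfbank(...) with the same arguments
+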
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+             nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
+    return numpy.log(feat)
+
+def hz2mel(hz):
+    """Convert a value in Hertz to Mels
+
+    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
+    """
+    return 1127 * numpy.log(1+hz/700.0)
+
+def mel2hz(mel):
+    """Convert a value in Mels to Hertz
+
+    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
+    """
+    return 700 * (numpy.exp(mel/1127.0)-1)
+
+def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # compute points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+
+    # check kaldi/src/feat/Mel-computations.h
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
+    for j in range(0,nfilt):
+        leftmel = lowmel+j*mel_freq_delta
+        centermel = lowmel+(j+1)*mel_freq_delta
+        rightmel = lowmel+(j+2)*mel_freq_delta
+        for i in range(0,nfft//2):
+            mel = hz2mel(i*samplerate/nfft)
+            if mel > leftmel and mel < rightmel:
+                if mel < centermel:
+                    fbank[j,i] = (mel-leftmel)/(centermel-leftmel)
+                else:
+                    fbank[j,i] = (rightmel-mel)/(rightmel-centermel)
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Apply a cepstral lifter to the matrix of cepstra. This has the effect of increasing the
+    magnitude of the high frequency DCT coeffs.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    """
+    if L > 0:
+        nframes,ncoeff = numpy.shape(cepstra)
+        n = numpy.arange(ncoeff)
+        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
+        return lift*cepstra
+    else:
+        # values of L <= 0, do nothing
+        return cepstra
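+
+# NOTE: a shape sanity sketch for the filterbank matrix (not part of the tests):
+#
+#   fb = get_filterbanks(nfilt=23, nfft=512, samplerate=16000, lowfreq=20)
+#   assert fb.shape == (23, 257)   # one triangular mel filter per row
+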
+ """ + if N < 1: + raise ValueError('N must be an integer >= 1') + NUMFRAMES = len(feat) + denominator = 2 * sum([i**2 for i in range(1, N+1)]) + delta_feat = numpy.empty_like(feat) + padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat + for t in range(NUMFRAMES): + delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] + return delta_feat + +##### modify for test ###### + +def framesig_without_dither_dc_preemphasize(sig, frame_len, frame_step, wintype='hamming', stride_trick=True): + """Frame a signal into overlapping frames. + + :param sig: the audio signal to frame. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :param stride_trick: use stride trick to compute the rolling window and window multiplication faster + :returns: an array of frames. Size is NUMFRAMES by frame_len. + """ + slen = len(sig) + frame_len = int(round_half_up(frame_len)) + frame_step = int(round_half_up(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + (( slen - frame_len) // frame_step) + + # check kaldi/src/feat/feature-window.h + padsignal = sig[:(numframes-1)*frame_step+frame_len] + + if wintype is 'povey': + win = numpy.empty(frame_len) + for i in range(frame_len): + win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 + elif wintype == '': + win = numpy.ones(frame_len) + elif wintype == 'hann': + win = numpy.hanning(frame_len) + else: # the hamming window + win = numpy.hamming(frame_len) + + if stride_trick: + frames = rolling_window(padsignal, window=frame_len, step=frame_step) + else: + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(win, (numframes, 1)) + + frames = frames.astype(numpy.float32) + raw_frames = frames + return frames * win, raw_frames + + +def frames(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=40,nfft=512,lowfreq=0,highfreq=None, wintype='hamming'): + frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype) + return frames_with_win, raw_frames + + +def complexspec(frames, NFFT): + """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. + """ + if numpy.shape(frames)[1] > NFFT: + logging.warn( + 'frame length (%d) is greater than FFT size (%d), frame will be truncated. 
+def complexspec(frames, NFFT):
+    """Compute the complex spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the complex spectrum of the corresponding frame.
+    """
+    if numpy.shape(frames)[1] > NFFT:
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            numpy.shape(frames)[1], NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return complex_spec
+
+
+def stft_with_window(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+                     nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
+                     wintype='hamming'):
+    frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype)
+
+    spec = magspec(frames_with_win, nfft) # nearly the same until this part
+    scomplex = complexspec(frames_with_win, nfft)
+
+    rspec = magspec(raw_frames, nfft)
+    rcomplex = complexspec(raw_frames, nfft)
+    return spec, scomplex, rspec, rcomplex
+
+
+class TestKaldiFE(unittest.TestCase):
+    def setUp(self):
+        self.this_dir = Path(__file__).parent
+
+        self.wavpath = str(self.this_dir / 'english.wav')
+        self.winlen = 0.025  # seconds (25 ms)
+        self.winstep = 0.01  # seconds (10 ms)
+        self.nfft = 512
+        self.lowfreq = 0
+        self.highfreq = None
+        self.wintype = 'hamm'
+        self.nfilt = 40
+
+        paddle.set_device('cpu')
+
+
+    def test_read(self):
+        import scipy.io.wavfile as wavfile
+        rate, sig = wavfile.read(self.wavpath)
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+        self.assertTrue(np.all(sig == wav))
+        self.assertEqual(rate, sr)
+
+    def test_frames(self):
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+        _, fs = frames(wav, samplerate=sr,
+                       winlen=self.winlen, winstep=self.winstep,
+                       nfilt=self.nfilt, nfft=self.nfft,
+                       lowfreq=self.lowfreq, highfreq=self.highfreq,
+                       wintype=self.wintype)
+
+        t_wav = paddle.to_tensor([wav], dtype='float32')
+        t_wavlen = paddle.to_tensor([len(wav)])
+        t_fs, t_nframe = kaldi.frames(t_wav, t_wavlen, sr, self.winlen, self.winstep, clip=False)
+        t_fs = t_fs.astype(fs.dtype)[0]
+
+        self.assertEqual(t_nframe.item(), fs.shape[0])
+        self.assertTrue(np.allclose(t_fs.numpy(), fs))
+
+
+    def test_stft(self):
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+
+        for wintype in ['', 'hamm', 'hann', 'povey']:
+            self.wintype = wintype
+            _, stft_c_win, _, _ = stft_with_window(wav, samplerate=sr,
+                                winlen=self.winlen, winstep=self.winstep,
+                                nfilt=self.nfilt, nfft=self.nfft,
+                                lowfreq=self.lowfreq, highfreq=self.highfreq,
+                                wintype=self.wintype)
+
+            t_wav = paddle.to_tensor([wav], dtype='float32')
+            t_wavlen = paddle.to_tensor([len(wav)])
+
+            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
+            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
+            t_stft = t_stft.astype(stft_c_win.real.dtype)[0]
+            t_real = t_stft[:, :, 0]
+            t_imag = t_stft[:, :, 1]
+
+            self.assertEqual(t_nframe.item(), stft_c_win.real.shape[0])
+
+            self.assertLess(np.sum(t_real.numpy()) - np.sum(stft_c_win.real), 1)
+            self.assertTrue(np.allclose(t_real.numpy(), stft_c_win.real, atol=1e-1))
+
+            self.assertLess(np.sum(t_imag.numpy()) - np.sum(stft_c_win.imag), 1)
+            self.assertTrue(np.allclose(t_imag.numpy(), stft_c_win.imag, atol=1e-1))
+
+
+    def test_magspec(self):
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+        for wintype in ['', 'hamm', 'hann', 'povey']:
+            self.wintype = wintype
+            stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
+                                winlen=self.winlen, winstep=self.winstep,
+                                nfilt=self.nfilt, nfft=self.nfft,
+                                lowfreq=self.lowfreq, highfreq=self.highfreq,
+                                wintype=self.wintype)
+
+            t_wav = paddle.to_tensor([wav], dtype='float32')
+            t_wavlen = paddle.to_tensor([len(wav)])
+
+            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep,
+                                    window_type=self.wintype, dither=0.0,
+                                    preemph_coeff=0.0, remove_dc_offset=False, clip=False)
+            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
+            t_stft = t_stft.astype(stft_win.dtype)
+            t_spec = kaldi.magspec(t_stft)[0]
+
+            self.assertEqual(t_nframe.item(), stft_win.shape[0])
+
+            self.assertLess(np.sum(t_spec.numpy()) - np.sum(stft_win), 1)
+            self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e-1))
+
+
+    def test_magspec_winprocess(self):
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+        fs, _ = framesig(wav, self.winlen*sr, self.winstep*sr,
+                         dither=0.0, preemph=0.97, remove_dc_offset=True, wintype='povey', stride_trick=True)
+        spec = magspec(fs, self.nfft) # nearly the same until this part
+
+        t_wav = paddle.to_tensor([wav], dtype='float32')
+        t_wavlen = paddle.to_tensor([len(wav)])
+        stft_class = kaldi.STFT(
+            self.nfft, sr, self.winlen, self.winstep,
+            window_type='povey', dither=0.0, preemph_coeff=0.97, remove_dc_offset=True, clip=False)
+        t_stft, t_nframe = stft_class(t_wav, t_wavlen)
+        t_stft = t_stft.astype(spec.dtype)
+        t_spec = kaldi.magspec(t_stft)[0]
+
+        self.assertEqual(t_nframe.item(), fs.shape[0])
+
+        self.assertLess(np.sum(t_spec.numpy()) - np.sum(spec), 1)
+        self.assertTrue(np.allclose(t_spec.numpy(), spec, atol=1e-1))
+
+
+    def test_powspec(self):
+        sr, wav = kaldi.read(self.wavpath)
+        wav = wav[:, 0]
+        for wintype in ['', 'hamm', 'hann', 'povey']:
+            self.wintype = wintype
+            stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
+                                winlen=self.winlen, winstep=self.winstep,
+                                nfilt=self.nfilt, nfft=self.nfft,
+                                lowfreq=self.lowfreq, highfreq=self.highfreq,
+                                wintype=self.wintype)
+            stft_win = np.square(stft_win)
+
+            t_wav = paddle.to_tensor([wav], dtype='float32')
+            t_wavlen = paddle.to_tensor([len(wav)])
+
+            stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
+            t_stft, t_nframe = stft_class(t_wav, t_wavlen)
+            t_stft = t_stft.astype(stft_win.dtype)
+            t_spec = kaldi.powspec(t_stft)[0]
+
+            self.assertEqual(t_nframe.item(), stft_win.shape[0])
+
+            self.assertLess(np.sum(t_spec.numpy() - stft_win), 5e4)
+            self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e2))
+
+
+# from python_speech_features import mfcc
+# from python_speech_features import delta
+# from python_speech_features import logfbank
+# import scipy.io.wavfile as wav
+
+# (rate,sig) = wav.read("english.wav")
+
+# # note that generally nfilt=40 is used for speech recognition
+# fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
+
+# # the computed fbank coefficients of english.wav with dimension [110,23]
+# # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
+# # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
+# # ...
+# # ...
+# # the same with that using kaldi commands: compute-fbank-feats --dither=0.0
+
+
+# mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
+
+# # the computed mfcc coefficients of english.wav with dimension [110,13]
+# # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
+# # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
+# # ...
+# # ...
+# # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0 + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file