Merge branch 'develop' into fix_varbase

pull/833/head
Hui Zhang 4 years ago committed by GitHub
commit 282914f45b

@@ -1,5 +1,3 @@
-[Chinese version](README_cn.md)
 # PaddlePaddle Speech to Any toolkit

 ![License](https://img.shields.io/badge/license-Apache%202-red.svg)
@@ -11,7 +9,7 @@
 ## Features

-See [feature list](doc/src/feature_list.md) for more information.
+See [feature list](docs/src/feature_list.md) for more information.

 ## Setup
@@ -20,20 +18,20 @@ All tested under:
 * python>=3.7
 * paddlepaddle>=2.2.0rc

-Please see [install](doc/src/install.md).
+Please see [install](docs/src/install.md).

 ## Getting Started

-Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
+Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).

 ## More Information

-* [Data Prepration](doc/src/data_preparation.md)
-* [Data Augmentation](doc/src/augmentation.md)
-* [Ngram LM](doc/src/ngram_lm.md)
-* [Benchmark](doc/src/benchmark.md)
-* [Relased Model](doc/src/released_model.md)
+* [Data Preparation](docs/src/data_preparation.md)
+* [Data Augmentation](docs/src/augmentation.md)
+* [Ngram LM](docs/src/ngram_lm.md)
+* [Benchmark](docs/src/benchmark.md)
+* [Released Model](docs/src/released_model.md)

 ## Questions and Help
@@ -47,4 +45,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
 ## Acknowledgement

-We depends on many open source repos. See [References](doc/src/reference.md) for more information.
+We depend on many open source repos. See [References](docs/src/reference.md) for more information.

@@ -1,49 +0,0 @@ (README_cn.md deleted; translated from Chinese)
[English](README.md)

# PaddlePaddle Speech to Any toolkit

![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)

*DeepSpeech* is an open-source project for an end-to-end automatic speech recognition engine built on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in industrial applications and academic research, covering training, inference, and deployment.

## Features

See the [feature list](doc/src/feature_list.md).

## Installation

Tested and verified under the following environments:

* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.2.0rc

See [install](doc/src/install.md).

## Getting Started

See [Getting Started](doc/src/getting_started.md) and the [tiny egs](examples/tiny/s0/README.md).

## More Information

* [Data Preparation](doc/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md)
* [Language Model](doc/src/ngram_lm.md)
* [Benchmark](doc/src/benchmark.md)
* [Released Model](doc/src/released_model.md)

## Questions and Help

You are welcome to ask questions in [GitHub Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and to report bugs in [GitHub Issues](https://github.com/PaddlePaddle/models/issues). Contributions to this project are also welcome.

## License

DeepSpeech is released under the [Apache-2.0 License](./LICENSE).

## Acknowledgement

We drew on a number of excellent open-source repositories during development; see [References](doc/src/reference.md) for details.

@@ -1,191 +0,0 @@ (tune.py deleted)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam search parameters tuning for DeepSpeech2 model."""
import functools
import sys
import numpy as np
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments
def tune(config, args):
    """Tune parameters alpha and beta incrementally."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    config.defrost()
    config.data.manifest = config.data.dev_manifest
    config.data.augmentation_config = ""
    config.data.keep_transcription_text = True
    dev_dataset = ManifestDataset.from_config(config)

    valid_loader = DataLoader(
        dev_dataset,
        batch_size=config.data.batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=SpeechCollator(keep_transcription_text=True))

    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                             args.checkpoint_path)
    model.eval()

    # decoders only accept string encoded in utf-8
    vocab_list = valid_loader.dataset.vocab_list
    errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]

    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize external scorer
    model.decoder.init_decode(args.alpha_from, args.beta_from,
                              config.decoding.lang_model_path, vocab_list,
                              config.decoding.decoding_method)

    # incrementally tune parameters over multiple batches
    print("start tuning ...")
    for infer_data in valid_loader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break

        def ordid2token(texts, texts_len):
            """Convert ord()-style ids back to chr() characters."""
            trans = []
            for text, n in zip(texts, texts_len):
                n = n.numpy().item()
                ids = text[:n]
                trans.append(''.join([chr(i) for i in ids]))
            return trans

        audio, audio_len, text, text_len = infer_data
        target_transcripts = ordid2token(text, text_len)
        num_ins += audio.shape[0]

        # model infer
        eouts, eouts_len = model.encoder(audio, audio_len)
        probs = model.decoder.softmax(eouts)

        # grid search
        for index, (alpha, beta) in enumerate(params_grid):
            print(f"tuning: alpha={alpha} beta={beta}")
            result_transcripts = model.decoder.decode_probs(
                probs.numpy(), eouts_len, vocab_list,
                config.decoding.decoding_method,
                config.decoding.lang_model_path, alpha, beta,
                config.decoding.beam_size, config.decoding.cutoff_prob,
                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)

            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                # accumulate the length of references of every batch
                # in the first iteration
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref

            err_ave[index] = err_sum[index] / len_refs
            if index % 2 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
        print("tuning: one grid done!")

        # output on-line tuning result at the end of current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" %
              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
               "%.3f" % params_grid[min_index][1],
               config.decoding.error_rate_type, err_ave_min))

        cur_batch += 1

    # output WER/CER at every (alpha, beta)
    print("\nFinal %s:\n" % config.decoding.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f" %
              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
               config.decoding.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
          (cur_batch, "%.3f" % params_grid[min_index][0],
           "%.3f" % params_grid[min_index][1]))
    print("finish tuning")


def main(config, args):
    tune(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    add_arg = functools.partial(add_arguments, argparser=parser)
    add_arg('num_batches', int, -1, "# of batches tuning on. "
            "Default -1, on whole dev set.")
    add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
    add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
    add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
    add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
    add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
    add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
    add_arg('batch_size', int, 256, "# of samples per batch.")
    add_arg('beam_size', int, 500, "Beam search width.")
    add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
    add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
    add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.data.batch_size = args.batch_size
    config.decoding.beam_size = args.beam_size
    config.decoding.num_proc_bsearch = args.num_proc_bsearch
    config.decoding.cutoff_prob = args.cutoff_prob
    config.decoding.cutoff_top_n = args.cutoff_top_n
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
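The deleted script's core is a brute-force sweep over the `(alpha, beta)` grid built with `np.linspace`. A minimal standalone sketch of the grid construction and best-cell selection; the error values here are random placeholders, not real WER/CER:

```python
import numpy as np

# Build the same Cartesian grid tune() uses: num_alphas x num_betas cells.
cand_alphas = np.linspace(1.0, 3.2, 45)   # alpha_from, alpha_to, num_alphas
cand_betas = np.linspace(0.1, 0.45, 8)    # beta_from, beta_to, num_betas
params_grid = [(a, b) for a in cand_alphas for b in cand_betas]

# Pretend we already accumulated an average error per grid cell.
err_ave = np.random.rand(len(params_grid))  # placeholder, stands in for CER/WER

min_index = int(np.argmin(err_ave))
alpha_opt, beta_opt = params_grid[min_index]
print(f"opt (alpha, beta) = ({alpha_opt:.3f}, {beta_opt:.3f})")
```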

@@ -35,12 +35,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
 from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.training.reporter import report
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -66,7 +68,9 @@ class DeepSpeech2Trainer(Trainer):
         super().__init__(config, args)

     def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config.training
+        batch_size = self.config.collator.batch_size
+        accum_grad = self.config.training.accum_grad
         start = time.time()

         # forward
@@ -77,7 +81,7 @@ class DeepSpeech2Trainer(Trainer):
         }

         # loss backward
-        if (batch_index + 1) % train_conf.accum_grad != 0:
+        if (batch_index + 1) % accum_grad != 0:
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
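The `accum_grad` logic defers the optimizer step and, under DDP, wraps the accumulation-only backward passes in `no_sync()` so gradients are all-reduced only on the updating batch. A runnable sketch of the control flow with a stand-in model; `FakeDDPModel` is hypothetical, real code would use `paddle.DataParallel`:

```python
from contextlib import nullcontext

class FakeDDPModel:
    """Stand-in for a DDP-wrapped model; a real no_sync() suppresses
    gradient all-reduce so grads only accumulate locally."""
    def no_sync(self):
        return nullcontext()

accum_grad = 4  # mirror train_conf.accum_grad
model, parallel = FakeDDPModel(), True

for batch_index in range(8):
    updating = (batch_index + 1) % accum_grad == 0
    # Accumulation-only batches run under no_sync (when DDP is active);
    # single-process models lack `no_sync`, hence the nullcontext fallback.
    context = model.no_sync if (parallel and not updating) else nullcontext
    with context():
        pass  # forward + loss.backward() would go here
    if updating:
        pass  # optimizer.step(); optimizer.clear_grad(); iteration += 1
```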
@@ -92,19 +96,18 @@ class DeepSpeech2Trainer(Trainer):
             layer_tools.print_grads(self.model, print_func=None)

         # optimizer step
-        if (batch_index + 1) % train_conf.accum_grad == 0:
+        if (batch_index + 1) % accum_grad == 0:
             self.optimizer.step()
             self.optimizer.clear_grad()
             self.iteration += 1

         iteration_time = time.time() - start

-        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
-        msg += "accum: {}, ".format(train_conf.accum_grad)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", batch_size)
+        report("accum", accum_grad)
+        report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             for k, v in losses_np.items():
@@ -147,10 +150,9 @@ class DeepSpeech2Trainer(Trainer):
     def setup_model(self):
         config = self.config.clone()
-        config.defrost()
-        config.model.feat_size = self.train_loader.collate_fn.feature_size
-        config.model.dict_size = self.train_loader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.model.feat_size = self.train_loader.collate_fn.feature_size
+            config.model.dict_size = self.train_loader.collate_fn.vocab_size

         if self.args.model_type == 'offline':
             model = DeepSpeech2Model.from_config(config.model)

@@ -17,6 +17,7 @@ import os
 import sys
 import time
 from collections import defaultdict
+from collections import OrderedDict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
@@ -33,6 +34,8 @@ from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
 from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
@@ -43,6 +46,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -100,7 +104,8 @@ class U2Trainer(Trainer):
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
-            context = self.model.no_sync
+            # When using cpu w/o DDP, model does not have `no_sync`
+            context = self.model.no_sync if self.parallel else nullcontext
         else:
             # Used for single gpu training and DDP gradient synchronization
             # processes.
@@ -119,12 +124,11 @@ class U2Trainer(Trainer):
         iteration_time = time.time() - start

         if (batch_index + 1) % train_conf.log_interval == 0:
-            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.collator.batch_size)
-            msg += "accum: {}, ".format(train_conf.accum_grad)
-            msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                             for k, v in losses_np.items())
-            logger.info(msg)
+            for k, v in losses_np.items():
+                report(k, v)
+            report("batch_size", self.config.collator.batch_size)
+            report("accum", train_conf.accum_grad)
+            report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             losses_np_v = losses_np.copy()
@@ -197,15 +201,29 @@ class U2Trainer(Trainer):
             data_start_time = time.time()
             for batch_index, batch in enumerate(self.train_loader):
                 dataload_time = time.time() - data_start_time
-                msg = "Train: Rank: {}, ".format(dist.get_rank())
-                msg += "epoch: {}, ".format(self.epoch)
-                msg += "step: {}, ".format(self.iteration)
-                msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                len(self.train_loader))
-                msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                msg += "data time: {:>.3f}s, ".format(dataload_time)
-                self.train_batch(batch_index, batch, msg)
-                self.after_train_batch()
+                msg = "Train:"
+                observation = OrderedDict()
+                with ObsScope(observation):
+                    report("Rank", dist.get_rank())
+                    report("epoch", self.epoch)
+                    report('step', self.iteration)
+                    report('step/total',
+                           (batch_index + 1) / len(self.train_loader))
+                    report("lr", self.lr_scheduler())
+                    self.train_batch(batch_index, batch, msg)
+                    self.after_train_batch()
+                    report('reader_cost', dataload_time)
+                    observation['batch_cost'] = observation[
+                        'reader_cost'] + observation['step_cost']
+                    observation['samples'] = observation['batch_size']
+                    observation['ips[sent./sec]'] = observation[
+                        'batch_size'] / observation['batch_cost']
+                for k, v in observation.items():
+                    msg += f" {k}: "
+                    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                    msg += ","
+                logger.info(msg)
                 data_start_time = time.time()
         except Exception as e:
             logger.error(e)
@@ -314,10 +332,11 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
         model = U2Model.from_config(model_conf)

         if self.parallel:
@@ -560,7 +579,7 @@ class U2Tester(U2Trainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)
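These `.size(i)` to `.shape[i]` hunks (here and throughout the files below) migrate torch-style dimension reads to the paddle idiom, where `Tensor.shape` is a plain Python list. A tiny illustration, assuming paddle is installed:

```python
import paddle

encoder_out = paddle.zeros([2, 7, 256])  # (B, maxlen, encoder_dim)
maxlen = encoder_out.shape[1]            # list indexing, the paddle idiom
encoder_dim = encoder_out.shape[2]       # torch would write encoder_out.size(2)
assert (maxlen, encoder_dim) == (7, 256)
```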

@@ -32,6 +32,7 @@ from deepspeech.training.trainer import Trainer
 from deepspeech.training.updaters.trainer import Trainer as NewTrainer
 from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -121,10 +122,10 @@ class U2Trainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
         model = U2Model.from_config(model_conf)

         if self.parallel:

@@ -41,6 +41,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -319,10 +320,10 @@ class U2Trainer(Trainer):
         # model
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.feat_dim
-        model_conf.output_dim = self.train_loader.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.feat_dim
+            model_conf.output_dim = self.train_loader.vocab_size
         model = U2Model.from_config(model_conf)

         if self.parallel:
             model = paddle.DataParallel(model)
@@ -555,7 +556,7 @@ class U2Tester(U2Trainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)

@@ -47,6 +47,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()
@@ -345,10 +346,10 @@ class U2STTrainer(Trainer):
     def setup_model(self):
         config = self.config
         model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
         model = U2STModel.from_config(model_conf)

         if self.parallel:
@@ -587,7 +588,7 @@ class U2STTester(U2STTrainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)

@@ -76,19 +76,19 @@ class ManifestDataset(Dataset):
         Args:
             manifest_path (str): manifest json file path
             max_input_len ([type], optional): maximum input seq length,
                 in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
             min_input_len (float, optional): minimum input seq length,
                 in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
             max_output_len (float, optional): maximum output seq length,
                 in modeling units. Defaults to 500.0.
             min_output_len (float, optional): minimum output seq length,
                 in modeling units. Defaults to 0.0.
             max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio.
                 Defaults to 10.0.
             min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio.
                 Defaults to 0.05.
         """
         super().__init__()

@@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2Model", "U2InferModel"]
@@ -297,8 +298,8 @@ class U2BaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@@ -403,7 +404,7 @@ class U2BaseModel(nn.Layer):
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         encoder_out_lens = encoder_mask.squeeze(1).sum(1)
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)
@@ -454,7 +455,7 @@ class U2BaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (1, maxlen, vocab_size)
         ctc_probs = ctc_probs.squeeze(0)
@@ -582,7 +583,7 @@ class U2BaseModel(nn.Layer):
         encoder_out = encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
-            (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool)
+            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
         decoder_out, _ = self.decoder(
             encoder_out, encoder_mask, hyps_pad,
             hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
@@ -689,13 +690,13 @@ class U2BaseModel(nn.Layer):
         Returns:
             paddle.Tensor: decoder output, (B, L)
         """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
         encoder_out = encoder_out.repeat(num_hyps, 1, 1)
         # (B, 1, T)
         encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                       hyps_lens)
@@ -750,7 +751,7 @@ class U2BaseModel(nn.Layer):
         Returns:
             List[List[int]]: transcripts.
         """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]
         if decoding_method in ['ctc_prefix_beam_search',
                                'attention_rescoring'] and batch_size > 1:
             logger.fatal(
@@ -778,7 +779,7 @@ class U2BaseModel(nn.Layer):
         # result in List[int], change it to List[List[int]] for compatible
         # with other batch decoding mode
         elif decoding_method == 'ctc_prefix_beam_search':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
             hyp = self.ctc_prefix_beam_search(
                 feats,
                 feats_lengths,
@@ -788,7 +789,7 @@ class U2BaseModel(nn.Layer):
                 simulate_streaming=simulate_streaming)
             hyps = [hyp]
         elif decoding_method == 'attention_rescoring':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
             hyp = self.attention_rescoring(
                 feats,
                 feats_lengths,
@@ -903,10 +904,10 @@ class U2Model(U2BaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
         model = cls.from_config(config)

         if checkpoint_path:

@@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
 from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import th_accuracy
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2STModel", "U2STInferModel"]
@@ -339,8 +340,8 @@ class U2STBaseModel(nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@@ -495,13 +496,13 @@ class U2STBaseModel(nn.Layer):
         Returns:
             paddle.Tensor: decoder output, (B, L)
         """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
         encoder_out = encoder_out.repeat(num_hyps, 1, 1)
         # (B, 1, T)
         encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                       hyps_lens)
@@ -556,7 +557,7 @@ class U2STBaseModel(nn.Layer):
         Returns:
             List[List[int]]: transcripts.
         """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]

         if decoding_method == 'fullsentence':
             hyps = self.translate(
@@ -686,10 +687,10 @@ class U2STModel(U2STBaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
         model = cls.from_config(config)

         if checkpoint_path:

@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
             paddle.Tensor: Transformed value tensor, size
                 (#batch, n_head, time2, d_k).
         """
-        n_batch = query.size(0)
+        n_batch = query.shape[0]
         q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
         k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
         v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
             paddle.Tensor: Transformed value weighted
                 by the attention score, (#batch, time1, d_model).
         """
-        n_batch = value.size(0)
+        n_batch = value.shape[0]
         if mask is not None:
             mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
             scores = scores.masked_fill(mask, -float('inf'))
@@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             paddle.Tensor: Output tensor. (batch, head, time1, time1)
         """
         zero_pad = paddle.zeros(
-            (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype)
+            (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
         x_padded = paddle.cat([zero_pad, x], dim=-1)

-        x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
+        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
+                                 x.shape[2])
         x = x_padded[:, :, 1:].view_as(x)  # [B, H, T1, T1]

         if zero_triu:
-            ones = paddle.ones((x.size(2), x.size(3)))
-            x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+            ones = paddle.ones((x.shape[2], x.shape[3]))
+            x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]

         return x
@@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         q, k, v = self.forward_qkv(query, key, value)
         q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

-        n_batch_pos = pos_emb.size(0)
+        n_batch_pos = pos_emb.shape[0]
         p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
         p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
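The `rel_shift` touched above is the Transformer-XL relative-position trick: zero-pad the score matrix on the last axis, reshape, and drop the first row to realign the columns. A numpy re-enactment of the same pad-reshape-slice; shapes follow the `(batch, head, time1, time1)` comment in the hunk, values are arbitrary:

```python
import numpy as np

B, H, T = 1, 1, 3
x = np.arange(B * H * T * T, dtype=np.float32).reshape(B, H, T, T)

zero_pad = np.zeros((B, H, T, 1), dtype=x.dtype)
x_padded = np.concatenate([zero_pad, x], axis=-1)   # (B, H, T, T+1)
x_padded = x_padded.reshape(B, H, T + 1, T)         # realign columns
x_shifted = x_padded[:, :, 1:].reshape(B, H, T, T)  # drop first row, restore shape

print(x[0, 0])          # original score matrix
print(x_shifted[0, 0])  # each row shifted one position further left
```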

@@ -122,7 +122,7 @@ class TransformerDecoder(nn.Layer):
         # tgt_mask: (B, 1, L)
         tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
         # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
         # tgt_mask: (B, L, L)
         tgt_mask = tgt_mask & m
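`subsequent_mask` supplies the causal half of `tgt_mask`. A numpy analogue, assuming it is the usual lower-triangular allow-matrix that gets broadcast as `(1, L, L)` against the padding mask:

```python
import numpy as np

def subsequent_mask_np(size: int) -> np.ndarray:
    """Lower-triangular boolean mask: position i may attend to j <= i."""
    return np.tril(np.ones((size, size), dtype=bool))

print(subsequent_mask_np(4).astype(int))
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]
```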

@@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
             paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
         """
         T = x.shape[1]
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
         #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + T]
         x = x * self.xscale + pos_emb
@@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
             paddle.Tensor: Encoded tensor (batch, time, `*`).
             paddle.Tensor: Positional embedding tensor (1, time, `*`).
         """
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
         x = x * self.xscale
         #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + x.shape[1]]

@@ -159,7 +159,7 @@ class BaseEncoder(nn.Layer):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
         mask_pad = ~masks
@@ -206,11 +206,11 @@ class BaseEncoder(nn.Layer):
                 chunk computation
             List[paddle.Tensor]: conformer cnn cache
         """
-        assert xs.size(0) == 1  # batch size must be one
+        assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         # TODO(Hui Zhang): stride_slice not support bool tensor
         # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
-        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32)
+        tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
         tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]

         if self.global_cmvn is not None:
@@ -220,25 +220,25 @@ class BaseEncoder(nn.Layer):
             xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)

         if subsampling_cache is not None:
-            cache_size = subsampling_cache.size(1)  #T
+            cache_size = subsampling_cache.shape[1]  #T
             xs = paddle.cat((subsampling_cache, xs), dim=1)
         else:
             cache_size = 0

         # only used when using `RelPositionMultiHeadedAttention`
         pos_emb = self.embed.position_encoding(
-            offset=offset - cache_size, size=xs.size(1))
+            offset=offset - cache_size, size=xs.shape[1])

         if required_cache_size < 0:
             next_cache_start = 0
         elif required_cache_size == 0:
-            next_cache_start = xs.size(1)
+            next_cache_start = xs.shape[1]
         else:
-            next_cache_start = xs.size(1) - required_cache_size
+            next_cache_start = xs.shape[1] - required_cache_size
         r_subsampling_cache = xs[:, next_cache_start:, :]

         # Real mask for transformer/conformer layers
-        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
         masks = masks.unsqueeze(1)  #[B=1, L'=1, T]
         r_elayers_output_cache = []
         r_conformer_cnn_cache = []
@@ -302,7 +302,7 @@ class BaseEncoder(nn.Layer):
         stride = subsampling * decoding_chunk_size
         decoding_window = (decoding_chunk_size - 1) * subsampling + context
-        num_frames = xs.size(1)
+        num_frames = xs.shape[1]
         required_cache_size = decoding_chunk_size * num_decoding_left_chunks
         subsampling_cache: Optional[paddle.Tensor] = None
         elayers_output_cache: Optional[List[paddle.Tensor]] = None
@@ -318,10 +318,10 @@ class BaseEncoder(nn.Layer):
                 chunk_xs, offset, required_cache_size, subsampling_cache,
                 elayers_output_cache, conformer_cnn_cache)
             outputs.append(y)
-            offset += y.size(1)
+            offset += y.shape[1]
         ys = paddle.cat(outputs, 1)

         # fake mask, just for jit script and compatibility with `forward` api
-        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool)
         masks = masks.unsqueeze(1)
         return ys, masks
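The chunk-by-chunk forward above slides a window of `decoding_window` input frames with stride `subsampling * decoding_chunk_size`. A small arithmetic sketch of the resulting chunk boundaries; the values are illustrative and the loop bound mirrors the typical wenet-style chunk iteration rather than being copied from this hunk:

```python
subsampling, context = 4, 7   # assumed conv subsampling rate / receptive field
decoding_chunk_size, num_frames = 16, 200

stride = subsampling * decoding_chunk_size                           # 64
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67

for cur in range(0, num_frames - context + 1, stride):
    end = min(cur + decoding_window, num_frames)
    print(f"feed frames [{cur}, {end})")
# -> [0, 67) [64, 131) [128, 195) [192, 200)
```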

@@ -43,33 +43,57 @@ def default_argument_parser():
     """
     parser = argparse.ArgumentParser()
-    # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    #                     help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-
-    # random seed
-    parser.add_argument("--seed", type=int, default=None,
-                        help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-
-    # profiler
-    parser.add_argument('--profiler_options', type=str, default=None,
-                        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".')
-    # yapf: enable
+    train_group = parser.add_argument_group(
+        title='Train Options', description=None)
+    train_group.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="seed to use for paddle, np and random. None or 0 for random, else set seed."
+    )
+    train_group.add_argument(
+        "--device",
+        type=str,
+        default='gpu',
+        choices=["cpu", "gpu"],
+        help="device cpu and gpu are supported.")
+    train_group.add_argument(
+        "--nprocs",
+        type=int,
+        default=1,
+        help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument(
+        "--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument(
+        "--opts",
+        type=str,
+        default=[],
+        nargs='+',
+        help="overwrite --config file, passing in LIST[KEY VALUE] pairs")
+    train_group.add_argument(
+        "--dump-config", metavar="FILE", help="dump config to `this` file.")
+
+    profile_group = parser.add_argument_group(
+        title='Benchmark Options', description=None)
+    profile_group.add_argument(
+        '--profiler-options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    profile_group.add_argument(
+        '--benchmark-batch-size',
+        type=int,
+        default=None,
+        help='batch size for benchmark.')
+    profile_group.add_argument(
+        '--benchmark-max-step',
+        type=int,
+        default=None,
+        help='max iteration for benchmark.')

     return parser
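A usage sketch of the regrouped parser; it assumes the `deepspeech` package is importable, the config path is a made-up example, and argparse exposes the dashed flags as underscored attributes:

```python
from deepspeech.training.cli import default_argument_parser

parser = default_argument_parser()
args = parser.parse_args([
    "--config", "conf/deepspeech2.yaml",   # hypothetical example path
    "--nprocs", "1",
    "--benchmark-batch-size", "32",
    "--benchmark-max-step", "100",
])
# argparse maps --benchmark-batch-size -> args.benchmark_batch_size, etc.
print(args.benchmark_batch_size, args.benchmark_max_step)  # 32 100
```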

@@ -20,8 +20,8 @@ from paddle.nn import Layer
 from . import extension
 from ..reporter import DictSummary
+from ..reporter import ObsScope
 from ..reporter import report
-from ..reporter import scope
 from ..timer import Timer
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -78,7 +78,7 @@ class StandardEvaluator(extension.Extension):
         summary = DictSummary()
         for batch in self.dataloader:
             observation = {}
-            with scope(observation):
+            with ObsScope(observation):
                 # main evaluation computation here.
                 with paddle.no_grad():
                     self.evaluate_sync(self.evaluate_core(batch))

@@ -19,7 +19,7 @@ OBSERVATIONS = None

 @contextlib.contextmanager
-def scope(observations):
+def ObsScope(observations):
     # make `observation` the target to report to.
     # it is basically a dictionary that stores temporary observations
     global OBSERVATIONS
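`report()` writes into whichever dict the innermost `ObsScope` installed as the module-global `OBSERVATIONS` target. A self-contained re-implementation of the pattern for illustration, not the project's actual module:

```python
import contextlib

OBSERVATIONS = None

@contextlib.contextmanager
def ObsScope(observations):
    # Make `observations` the dict that report() writes into, then restore.
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield
    finally:
        OBSERVATIONS = old

def report(name, value):
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value

obs = {}
with ObsScope(obs):
    report("lr", 1e-3)
    report("step_cost", 0.25)
print(obs)  # {'lr': 0.001, 'step_cost': 0.25}
```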

@@ -11,19 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 import time
+from collections import OrderedDict
 from pathlib import Path

 import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter

+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
 from deepspeech.training.timer import Timer
 from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["Trainer"]
@@ -96,11 +101,21 @@ class Trainer():
         self.checkpoint_dir = None
         self.iteration = 0
         self.epoch = 0
+        self.rank = dist.get_rank()
+
+        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")

         if args.seed:
             seed_all(args.seed)
             logger.info(f"Set seed {args.seed}")

+        if self.args.benchmark_batch_size:
+            with UpdateConfig(self.config):
+                self.config.collator.batch_size = self.args.benchmark_batch_size
+                self.config.training.log_interval = 1
+            logger.info(
+                f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
+
     def setup(self):
         """Setup the experiment.
         """
@@ -188,6 +203,12 @@ class Trainer():
         if self.args.profiler_options:
             profiler.add_profiler_step(self.args.profiler_options)

+        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
+            logger.info(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+            sys.exit(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+
     def train(self):
         """The training process control by epoch."""
         from_scratch = self.resume_or_scratch()
@@ -208,15 +229,29 @@ class Trainer():
             data_start_time = time.time()
             for batch_index, batch in enumerate(self.train_loader):
                 dataload_time = time.time() - data_start_time
-                msg = "Train: Rank: {}, ".format(dist.get_rank())
-                msg += "epoch: {}, ".format(self.epoch)
-                msg += "step: {}, ".format(self.iteration)
-                msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                len(self.train_loader))
-                msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                msg += "data time: {:>.3f}s, ".format(dataload_time)
-                self.train_batch(batch_index, batch, msg)
-                self.after_train_batch()
+                msg = "Train:"
+                observation = OrderedDict()
+                with ObsScope(observation):
+                    report("Rank", dist.get_rank())
+                    report("epoch", self.epoch)
+                    report('step', self.iteration)
+                    report('step/total',
+                           (batch_index + 1) / len(self.train_loader))
+                    report("lr", self.lr_scheduler())
+                    self.train_batch(batch_index, batch, msg)
+                    self.after_train_batch()
+                    report('reader_cost', dataload_time)
+                    observation['batch_cost'] = observation[
+                        'reader_cost'] + observation['step_cost']
+                    observation['samples'] = observation['batch_size']
+                    observation['ips[sent./sec]'] = observation[
+                        'batch_size'] / observation['batch_cost']
+                for k, v in observation.items():
+                    msg += f" {k}: "
+                    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                    msg += ","
+                logger.info(msg)
                 data_start_time = time.time()
         except Exception as e:
             logger.error(e)

@@ -24,7 +24,7 @@ import tqdm
 from deepspeech.training.extensions.extension import Extension
 from deepspeech.training.extensions.extension import PRIORITY_READER
-from deepspeech.training.reporter import scope
+from deepspeech.training.reporter import ObsScope
 from deepspeech.training.triggers import get_trigger
 from deepspeech.training.triggers.limit_trigger import LimitTrigger
 from deepspeech.training.updaters.updater import UpdaterBase
@@ -144,7 +144,7 @@ class Trainer():
         # you can use `report` freely in Updater.update()

         # updating parameters and state
-        with scope(self.observation):
+        with ObsScope(self.observation):
             update()
             p.update()

@@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     y_insert_blank = insert_blank(y, blank_id)  #(2L+1)

     log_alpha = paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
+        (ctc_probs.shape[0], len(y_insert_blank)))  #(T, 2L+1)
     log_alpha = log_alpha - float('inf')  # log of zero

     # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
     state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
+        (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
                   )  # state path, Tuple((T, 2L+1))

     # init start state
     log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
     log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb

-    for t in range(1, ctc_probs.size(0)):  # T
+    for t in range(1, ctc_probs.shape[0]):  # T
         for s in range(len(y_insert_blank)):  # 2L+1
             if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
                     s] == y_insert_blank[s - 2]:
@@ -114,7 +114,7 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                     y_insert_blank[s]]
             state_path[t, s] = prev_state[paddle.argmax(candidates)]

     # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)
+    state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)

     candidates = paddle.to_tensor([
         log_alpha[-1, len(y_insert_blank) - 1],  # Sb
@@ -122,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
     ])
     prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
     state_seq[-1] = prev_state[paddle.argmax(candidates)]
-    for t in range(ctc_probs.size(0) - 2, -1, -1):
+    for t in range(ctc_probs.shape[0] - 2, -1, -1):
         state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]

     output_alignment = []
-    for t in range(0, ctc_probs.size(0)):
+    for t in range(0, ctc_probs.shape[0]):
         output_alignment.append(y_insert_blank[state_seq[t, 0]])

     return output_alignment
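`forced_align` runs a Viterbi pass over CTC log-posteriors with blanks interleaved between labels (`insert_blank` builds the `2L+1` state chain). A toy call under assumed shapes; the invocation itself is commented out because only part of the signature is visible in this hunk:

```python
import paddle
import paddle.nn.functional as F

T, V = 6, 4                      # frames, vocab size; blank id assumed to be 0
ctc_probs = F.log_softmax(paddle.randn([T, V]), axis=-1)  # (T, V) log posteriors
y = paddle.to_tensor([1, 2], dtype=paddle.int32)          # target label sequence

# alignment = forced_align(ctc_probs, y, blank_id=0)
# -> a length-T label-per-frame list such as [0, 1, 1, 0, 2, 0]
```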

@@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
-    max_len = max([s.size(0) for s in sequences])
+    max_len = max([s.shape[0] for s in sequences])
     if batch_first:
         out_dims = (len(sequences), max_len) + trailing_dims
     else:
@@ -91,7 +91,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     out_tensor = sequences[0].new_full(out_dims, padding_value)
     for i, tensor in enumerate(sequences):
-        length = tensor.size(0)
+        length = tensor.shape[0]
         # use index notation to prevent duplicate references to the tensor
         if batch_first:
             out_tensor[i, :length, ...] = tensor
@@ -139,7 +139,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
     #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
     #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
     #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
-    B = ys_pad.size(0)
+    B = ys_pad.shape[0]
     _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
     _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
     ys_in = paddle.cat([_sos, ys_pad], dim=1)
@@ -165,8 +165,8 @@ def th_accuracy(pad_outputs: paddle.Tensor,
     Returns:
         float: Accuracy value (0.0 - 1.0).
     """
-    pad_pred = pad_outputs.view(
-        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2)
+    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
+                                pad_outputs.shape[1]).argmax(2)
     mask = pad_targets != ignore_label
     numerator = paddle.sum(
         pad_pred.masked_select(mask) == pad_targets.masked_select(mask))

@ -16,15 +16,27 @@ import distutils.util
import math import math
import os import os
import random import random
from contextlib import contextmanager
from typing import List from typing import List
import numpy as np import numpy as np
import paddle import paddle
__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] __all__ = [
"UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
]
@contextmanager
def UpdateConfig(config):
"""Update yacs config"""
config.defrost()
yield
config.freeze()
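# Example usage (a minimal sketch, assuming `config` is a yacs CfgNode):
#
#     with UpdateConfig(config):
#         config.data.batch_size = 32
#
# The config is defrosted on entry and frozen again on exit, so temporary
# edits cannot leave it mutable afterwards.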
def seed_all(seed: int=210329): def seed_all(seed: int=210329):
"""freeze random generator seed."""
np.random.seed(seed) np.random.seed(seed)
random.seed(seed) random.seed(seed)
paddle.seed(seed) paddle.seed(seed)

@ -4,7 +4,7 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Prerequisites ## Prerequisites
- Python >= 3.7 - Python >= 3.7
- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
## Setup (Important) ## Setup (Important)

@ -1,5 +1,7 @@
# Reference # Reference
We refer to these repos when building `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git) * [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git) * [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git) * [kaldi](https://github.com/kaldi-asr/kaldi.git)

@ -1,7 +1,8 @@
#!/bin/bash #!/bin/bash
profiler_options= profiler_options=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence # seed may break model convergence
seed=0 seed=0
@ -32,12 +33,15 @@ ckpt_name=$2
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--profiler_options ${profiler_options} \ --profiler-options "${profiler_options}" \
--seed ${seed} --benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic
@ -48,4 +52,4 @@ if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
exit 0 exit 0

@ -19,17 +19,17 @@
{ {
"type": "specaug", "type": "specaug",
"params": { "params": {
"W": 0,
"warp_mode": "PIL",
"F": 10, "F": 10,
"T": 50,
"n_freq_masks": 2, "n_freq_masks": 2,
"T": 50,
"n_time_masks": 2, "n_time_masks": 2,
"p": 1.0, "p": 1.0,
"W": 80,
"adaptive_number_ratio": 0, "adaptive_number_ratio": 0,
"adaptive_size_ratio": 0, "adaptive_size_ratio": 0,
"max_n_time_masks": 20, "max_n_time_masks": 20,
"replace_with_zero": true, "replace_with_zero": true
"warp_mode": "PIL"
}, },
"prob": 1.0 "prob": 1.0
} }

@ -33,7 +33,7 @@ collator:
keep_transcription_text: False keep_transcription_text: False
sortagrad: True sortagrad: True
shuffle_method: batch_shuffle shuffle_method: batch_shuffle
num_workers: 0 num_workers: 2
# network architecture # network architecture
@ -74,7 +74,7 @@ model:
training: training:
n_epoch: 120 n_epoch: 120
accum_grad: 2 accum_grad: 2
global_grad_clip: 5.0 global_grad_clip: 5.0
optim: adam optim: adam

@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--model_type ${model_type} \ --model_type ${model_type} \
--profiler_options "${profiler_options}" \ --profiler-options "${profiler_options}" \
--seed ${seed} --seed ${seed}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then

@ -1,37 +1,49 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then profiler_options=
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" benchmark_batch_size=0
exit -1 benchmark_max_step=0
fi
# seed may break model convergence
seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu device=gpu
if [ ${ngpu} == 0 ];then if [ ${ngpu} == 0 ];then
device=cpu device=cpu
fi fi
mkdir -p exp if [ ${seed} != 0 ]; then
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1
fi fi
config_path=$1
ckpt_name=$2
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --profiler-options "${profiler_options}" \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic
fi fi

@ -1,41 +1,46 @@
#!/bin/bash #!/bin/bash
CUR_DIR=${PWD}
ROOT_DIR=../../ ROOT_DIR=../../
# Script that provides stable, reproducible benchmark results; by default it runs in the standard docker environment with py37 # Script that provides stable, reproducible benchmark results; by default it runs in the standard docker environment with py37
# collect env info # collect env info
bash ${ROOT_DIR}/utils/pd_env_collect.sh bash ${ROOT_DIR}/utils/pd_env_collect.sh
cat pd_env.txt #cat pd_env.txt
# Working directory: needs to be documented
pushd ${ROOT_DIR}/examples/aishell/s1
# 1. Install the dependencies this model needs (note it here if optimization strategies must be enabled) # 1. Install the dependencies this model needs (note it here if optimization strategies must be enabled)
pushd ${ROOT_DIR}/tools; make; popd #pushd ${ROOT_DIR}/tools; make; popd
source ${ROOT_DIR}/tools/venv/bin/activate #source ${ROOT_DIR}/tools/venv/bin/activate
pushd ${ROOT_DIR}; bash setup.sh; popd #pushd ${ROOT_DIR}; bash setup.sh; popd
# 2. Copy the data and pretrained models this model needs # 2. Copy the data and pretrained models this model needs
# Working directory: needs to be documented
#pushd ${ROOT_DIR}/examples/aishell/s1
pushd ${ROOT_DIR}/examples/tiny/s1
mkdir -p exp/log mkdir -p exp/log
loca/data.sh &> exp/log/data.log . path.sh
#bash local/data.sh &> exp/log/data.log
# 3. Run in batch (if batch running is inconvenient, steps 1-2 need to go into each individual model) # 3. Run in batch (if batch running is inconvenient, steps 1-2 need to go into each individual model)
model_mode_list=(conformer) model_mode_list=(conformer transformer)
fp_item_list=(fp32) fp_item_list=(fp32)
bs_item=(32 64 96) bs_item_list=(32 64 96)
for model_mode in ${model_mode_list[@]}; do for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do for fp_item in ${fp_item_list[@]}; do
for bs_item in ${bs_list[@]} for bs_item in ${bs_item_list[@]}
do do
echo "index is speed, 1gpus, begin, ${model_name}" echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min)
sleep 60 sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp run_mode=mp
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
sleep 60 sleep 60
done done
done done

@ -23,19 +23,19 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs" echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="--model_name=${model_name} train_cmd="--benchmark-batch-size ${batch_size}
--batch_size=${batch_size} --benchmark-max-step ${max_iter}
--fp=${fp_item} \ conf/${model_name}.yaml ${model_name}"
--max_iter=${max_iter} "
case ${run_mode} in case ${run_mode} in
sp) train_cmd="python -u tools/train.py "${train_cmd}" ;; sp) train_cmd="bash local/train.sh "${train_cmd}"" ;;
mp) mp)
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}" train_cmd="bash local/train.sh "${train_cmd}"" ;;
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1; *) echo "choose run_mode(sp or mp)"; exit 1;
esac esac
# Do not modify anything below this line
timeout 15m ${train_cmd} > ${log_file} 2>&1 # Do not modify anything below this line
CUDA_VISIBLE_DEVICES=${device} timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL" echo -e "${model_name}, FAIL"
export job_fail_flag=1 export job_fail_flag=1
@ -43,7 +43,8 @@ function _train(){
echo -e "${model_name}, SUCCESS" echo -e "${model_name}, SUCCESS"
export job_fail_flag=0 export job_fail_flag=0
fi fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
if [ $run_mode = "mp" -a -d mylog ]; then if [ $run_mode = "mp" -a -d mylog ]; then
rm ${log_file} rm ${log_file}

@ -1,146 +0,0 @@
from typing import Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
def frame(x: Tensor,
num_samples: Tensor,
win_length: int,
hop_length: int,
clip: bool = True) -> Tuple[Tensor, Tensor]:
"""Extract frames from audio.
Parameters
----------
x : Tensor
Shape (N, T), batched waveform.
num_samples : Tensor
Shape (N, ), number of samples of each waveform.
win_length : int
Window length.
hop_length : int
Number of samples shifted between adjacent frames.
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
default True
Returns
-------
frames : Tensor
Shape (N, T', win_length).
num_frames : Tensor
Shape (N, ) number of valid frames
"""
assert hop_length <= win_length
num_frames = (num_samples - win_length) // hop_length
padding = (0, 0)
if not clip:
num_frames += 1
# NOTE: pad hop_length - 1 to the right to ensure that there is at most
# one frame dangling past the right edge
padding = (0, hop_length - 1)
weight = paddle.eye(win_length).unsqueeze(1)
frames = F.conv1d(x.unsqueeze(1),
weight,
padding=padding,
stride=(hop_length, ))
return frames, num_frames
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
Parameters
------------
n_fft : int
Number of samples in a frame.
hop_length : int
Number of samples shifted between adjacent frames.
win_length : int
Length of the window.
clip: bool
Whether to clip audio that does not fit into the last frame.
"""
def __init__(self,
n_fft: int,
hop_length: int,
win_length: int,
window_type: str = None,
clip: bool = True):
super().__init__()
self.hop_length = hop_length
self.win_length = win_length
self.n_bin = 1 + n_fft // 2
self.n_fft = n_fft
self.clip = clip
# calculate window
if window_type is None:
window = np.ones(win_length)
elif window_type == "hann":
window = np.hanning(win_length)
elif window_type == "hamming":
window = np.hamming(win_length)
else:
raise ValueError("Not supported yet!")
if win_length < n_fft:
window = F.pad(window, (0, n_fft - win_length))
elif win_length > n_fft:
window = window[:n_fft]
# (n_bins, n_fft) complex
kernel_size = min(n_fft, win_length)
weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
w_real = weight.real
w_imag = weight.imag
# (2 * n_bins, kernel_size)
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
w = np.expand_dims(w, 1)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
num_samples : Tensor
Number of samples of each waveform.
Returns
------------
D : Tensor
Shape(N, T', n_bins, 2) Spectrogram.
num_frames: Tensor
Shape (N,) number of frames of each spectrogram
"""
num_frames = (num_samples - self.win_length) // self.hop_length
padding = (0, 0)
if not self.clip:
num_frames += 1
padding = (0, self.hop_length - 1)
batch_size, _ = paddle.shape(x)
x = x.unsqueeze(-1)
D = F.conv1d(x,
self.weight,
stride=(self.hop_length, ),
padding=padding,
data_format="NLC")
D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
return D, num_frames

@ -0,0 +1,201 @@
import math
import paddle
import numpy as np
from typing import List, Tuple, Optional, Union
# https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/src/feat/feature-window.cc#L109
def povey_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = (0.5 - 0.5 * np.cos(a * i) )**0.85
return win
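# Note: this is Kaldi's default analysis window, equal to the Hann window
# raised to the power 0.85 (see the feature-window.cc reference above).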
def hann_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = 0.5 - 0.5 * np.cos(a * i)
return win
def sine_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = np.sin(0.5 * a * i)
return win
def hamm_window(frame_len:int) -> np.ndarray:
win = np.empty(frame_len)
a = 2 * np.pi / (frame_len -1)
for i in range(frame_len):
win[i] = 0.54 - 0.46 * np.cos(a * i)
return win
def get_window(wintype:Optional[str], winlen:int) -> np.ndarray:
"""get window function
Args:
wintype (Optional[str]): window type.
winlen (int): window length in samples.
Raises:
ValueError: not support window.
Returns:
np.ndarray: window coeffs.
"""
# calculate window
if not wintype or wintype == 'rectangular':
window = np.ones(winlen)
elif wintype == "hann":
window = hann_window(winlen)
elif wintype == "hamm":
window = hamm_window(winlen)
elif wintype == "povey":
window = povey_window(winlen)
else:
msg = f"{wintype} Not supported yet!"
raise ValueError(msg)
return window
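# Example (assumed values): a 25 ms window at 16 kHz spans 400 samples, so
# get_window("povey", 400) yields the 400-point povey window, while
# get_window(None, 400) falls back to a rectangular window of ones.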
def dft_matrix(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
# https://en.wikipedia.org/wiki/Discrete_Fourier_transform
# (n_bins, n_fft) complex
if n_bin is None:
n_bin = 1 + n_fft // 2
if winlen is None:
winlen = n_bin
# https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
kernel_size = min(n_fft, winlen)
n = np.arange(0, n_fft, 1.)
wsin = np.empty((n_bin, kernel_size)) #[Cout, kernel_size]
wcos = np.empty((n_bin, kernel_size)) #[Cout, kernel_size]
for k in range(n_bin): # Only half of the bins contain useful info
wsin[k,:] = -np.sin(2*np.pi*k*n/n_fft)[:kernel_size]
wcos[k,:] = np.cos(2*np.pi*k*n/n_fft)[:kernel_size]
w_real = wcos
w_imag = wsin
return w_real, w_imag, kernel_size
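# Sanity-check sketch (hypothetical sizes): the rows of w_real + 1j * w_imag
# are the first n_bin rows of the DFT matrix, so for n_fft == winlen the
# matrix products reproduce np.fft.rfft:
#
#     w_real, w_imag, k = dft_matrix(512, 512)
#     x = np.random.randn(512)
#     ref = np.fft.rfft(x)
#     assert np.allclose(w_real @ x, ref.real)  # up to float tolerance
#     assert np.allclose(w_imag @ x, ref.imag)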
def dft_matrix_fast(n_fft:int, winlen:int=None, n_bin:int=None) -> Tuple[np.ndarray, np.ndarray, int]:
# (n_bins, n_fft) complex
if n_bin is None:
n_bin = 1 + n_fft // 2
if winlen is None:
winlen = n_bin
# https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
kernel_size = min(n_fft, winlen)
# https://en.wikipedia.org/wiki/DFT_matrix
# https://ccrma.stanford.edu/~jos/st/Matrix_Formulation_DFT.html
weight = np.fft.fft(np.eye(n_fft))[:n_bin, :kernel_size]
w_real = weight.real
w_imag = weight.imag
return w_real, w_imag, kernel_size
def bin2hz(bin:Union[List[int], np.ndarray], N:int, sr:int)->List[float]:
"""FFT bins to Hz.
http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
Args:
bins (List[int] or np.ndarray): bin index.
N (int): the number of samples, or FFT points.
sr (int): sampling rate.
Returns:
List[float]: Hz's.
"""
hz = bin * float(sr) / N
return hz
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * np.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (np.exp(mel/1127.0)-1)
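# Quick check: the two maps are inverses, e.g. mel2hz(hz2mel(1000.0))
# returns 1000.0 up to float rounding, and hz2mel(700.0) = 1127 * ln(2)
# ≈ 781.2 mel.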
def rms_to_db(rms: float):
"""Root Mean Square to dB.
Args:
rms ([float]): root mean square
Returns:
float: dB
"""
return 20.0 * math.log10(max(1e-16, rms))
def rms_to_dbfs(rms: float):
"""Root Mean Square to dBFS.
https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB.
dB = dBFS + 3.0103
dBFS = db - 3.0103
e.g. 0 dB = -3.0103 dBFS
Args:
rms ([float]): root mean square
Returns:
float: dBFS
"""
return rms_to_db(rms) - 3.0103
def max_dbfs(sample_data: np.ndarray):
"""Peak dBFS based on the maximum energy sample.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
# Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
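# Example: for sample_data peaking at 1.0 (full scale), max_dbfs returns
# rms_to_dbfs(1.0) = 20 * log10(1.0) - 3.0103 = -3.0103 dBFS.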
def mean_dbfs(sample_data):
"""Peak dBFS based on the RMS energy.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
return rms_to_dbfs(
math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
def gain_db_to_ratio(gain_db: float):
"""dB to ratio
Args:
gain_db (float): gain in dB
Returns:
float: scale in amp
"""
return math.pow(10.0, gain_db / 20.0)
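# Example: gain_db_to_ratio(6.0) ≈ 1.995 (roughly doubling the amplitude),
# and gain_db_to_ratio(-20.0) == 0.1.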

Binary file not shown.

@ -0,0 +1,266 @@
from typing import Tuple
import numpy as np
import paddle
from paddle import Tensor
from paddle import nn
from paddle.nn import functional as F
import soundfile as sf
from .common import get_window
from .common import dft_matrix
def read(wavpath:str, sr:int = None, start=0, stop=None, dtype='int16', always_2d=True)->Tuple[int, np.ndarray]:
"""load wav file.
Args:
wavpath (str): wav path.
sr (int, optional): expect sample rate. Defaults to None.
dtype (str, optional): wav data bits. Defaults to 'int16'.
Returns:
Tuple[int, np.ndarray]: sr (int), wav (int16) [T, C].
"""
wav, r_sr = sf.read(wavpath, start=start, stop=stop, dtype=dtype, always_2d=always_2d)
if sr:
assert sr == r_sr
return r_sr, wav
def write(wavpath:str, wav:np.ndarray, sr:int, dtype='PCM_16'):
"""write wav file.
Args:
wavpath (str): file path to save.
wav (np.ndarray): wav data.
sr (int): data samplerate.
dtype (str, optional): wav bit format. Defaults to 'PCM_16'.
"""
sf.write(wavpath, wav, sr, subtype=dtype)
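# Usage sketch (hypothetical paths): round-trip a 16 kHz file unchanged
#
#     sr, wav = read("in.wav", sr=16000, dtype='int16')
#     write("out.wav", wav, sr, dtype='PCM_16')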
def frames(x: Tensor,
num_samples: Tensor,
sr: int,
win_length: float,
stride_length: float,
clip: bool = False) -> Tuple[Tensor, Tensor]:
"""Extract frames from audio.
Parameters
----------
x : Tensor
Shape (B, T), batched waveform.
num_samples : Tensor
Shape (B, ), number of samples of each waveform.
sr: int
Sampling Rate.
win_length : float
Window length in ms.
stride_length : float
Stride length in ms.
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
default False
Returns
-------
frames : Tensor
Shape (B, T', win_length).
num_frames : Tensor
Shape (B, ) number of valid frames
"""
assert stride_length <= win_length
stride_length = int(stride_length * sr)
win_length = int(win_length * sr)
num_frames = (num_samples - win_length) // stride_length
padding = (0, 0)
if not clip:
num_frames += 1
need_samples = num_frames * stride_length + win_length
padding = (0, need_samples - num_samples - 1)
weight = paddle.eye(win_length).unsqueeze(1) #[win_length, 1, win_length]
frames = F.conv1d(x.unsqueeze(-1),
weight,
padding=padding,
stride=(stride_length, ),
data_format='NLC')
return frames, num_frames
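# Usage sketch (assumed 16 kHz input): with win_length=0.025 and
# stride_length=0.01 the window spans 400 samples and the hop is 160, so a
# batched waveform x of shape (B, T) yields frames of shape (B, T', 400)
# plus the per-utterance frame counts.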
def dither(signal:Tensor, dither_value=1.0)->Tensor:
"""dither frames for log compute.
Args:
signal (Tensor): [B, T, D]
dither_value (float, optional): [scalar]. Defaults to 1.0.
Returns:
Tensor: [B, T, D]
"""
D = paddle.shape(signal)[-1]
signal += paddle.normal(shape=[1, 1, D]) * dither_value
return signal
def remove_dc_offset(signal:Tensor)->Tensor:
"""remove dc.
Args:
signal (Tensor): [B, T, D]
Returns:
Tensor: [B, T, D]
"""
signal -= paddle.mean(signal, axis=-1, keepdim=True)
return signal
def preemphasis(signal:Tensor, coeff=0.97)->Tensor:
"""perform preemphasis on the input signal.
Args:
signal (Tensor): [B, T, D], The signal to filter.
coeff (float, optional): [scalar].The preemphasis coefficient. 0 is no filter, Defaults to 0.97.
Returns:
Tensor: [B, T, D]
"""
return paddle.concat([
(1-coeff)*signal[:, :, 0:1],
signal[:, :, 1:] - coeff * signal[:, :, :-1]
], axis=-1)
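# Example: with coeff=0.97 each sample within a frame becomes
# y[n] = x[n] - 0.97 * x[n-1], and the first sample is scaled by
# (1 - coeff) instead, mirroring Kaldi's per-frame preemphasis.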
class STFT(nn.Layer):
"""A module for computing stft transformation in a differentiable way.
http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
Parameters
------------
n_fft : int
Number of samples in a frame.
sr: int
Sampling rate.
stride_length : float
Number of samples shifted between adjacent frames.
win_length : float
Length of the window.
clip: bool
Whether to clip audio that does not fit into the last frame.
"""
def __init__(self,
n_fft: int,
sr: int,
win_length: float,
stride_length: float,
dither:float=0.0,
preemph_coeff:float=0.97,
remove_dc_offset:bool=True,
window_type: str = 'povey',
clip: bool = False):
super().__init__()
self.sr = sr
self.win_length = win_length
self.stride_length = stride_length
self.dither = dither
self.preemph_coeff = preemph_coeff
self.remove_dc_offset = remove_dc_offset
self.window_type = window_type
self.clip = clip
self.n_fft = n_fft
self.n_bin = 1 + n_fft // 2
w_real, w_imag, kernel_size = dft_matrix(
self.n_fft, int(self.win_length * self.sr), self.n_bin
)
# calculate window
window = get_window(window_type, kernel_size)
# (2 * n_bins, kernel_size)
w = np.concatenate([w_real, w_imag], axis=0)
w = w * window
# (kernel_size, 2 * n_bins)
w = np.transpose(w)
weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
self.register_buffer("weight", weight)
def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]:
"""Compute the stft transform.
Parameters
------------
x : Tensor [shape=(B, T)]
The input waveform.
num_samples : Tensor [shape=(B,)]
Number of samples of each waveform.
Returns
------------
C : Tensor
Shape(B, T', n_bins, 2) Spectrogram.
num_frames: Tensor
Shape (B,) number of frames of each spectrogram
"""
batch_size = paddle.shape(num_samples)
F, nframe = frames(x, num_samples, self.sr, self.win_length, self.stride_length, clip=self.clip)
if self.dither:
F = dither(F, self.dither)
if self.remove_dc_offset:
F = remove_dc_offset(F)
if self.preemph_coeff:
F = preemphasis(F)
C = paddle.matmul(F, self.weight) # [B, T, K] [K, 2 * n_bins]
C = paddle.reshape(C, [batch_size, -1, 2, self.n_bin])
C = C.transpose([0, 1, 3, 2])
return C, nframe
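# Usage sketch (assumed parameters): a 512-point FFT over 16 kHz audio
#
#     stft = STFT(n_fft=512, sr=16000, win_length=0.025, stride_length=0.01)
#     C, num_frames = stft(x, num_samples)  # C: (B, T', 257, 2)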
def powspec(C:Tensor) -> Tensor:
"""Compute the power spectrum |X_k|^2.
Args:
C (Tensor): [B, T, C, 2]
Returns:
Tensor: [B, T, C]
"""
real, imag = paddle.chunk(C, 2, axis=-1)
return paddle.square(real.squeeze(-1)) + paddle.square(imag.squeeze(-1))
def magspec(C: Tensor, eps=1e-10) -> Tensor:
"""Compute the magnitude spectrum |X_k|.
Args:
C (Tensor): [B, T, C, 2]
eps (float): epsilon.
Returns:
Tensor: [B, T, C]
"""
pspec = powspec(C)
return paddle.sqrt(pspec + eps)
def logspec(C: Tensor, eps=1e-10) -> Tensor:
"""Compute log-spectrum 20log10X_k.
Args:
C (Tensor): [description]
eps ([type], optional): [description]. Defaults to 1e-10.
Returns:
Tensor: [description]
"""
spec = magspec(C)
return 20 * paddle.log10(spec + eps)
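# End-to-end sketch: waveform -> log magnitude spectrogram, chaining the
# helpers above (shapes follow the 512-point example):
#
#     C, num_frames = stft(x, num_samples)
#     lspec = logspec(C)  # (B, T', 257), roughly 20 * log10(|X_k|)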

@ -0,0 +1,533 @@
from typing import Tuple
import numpy as np
import paddle
import unittest
import decimal
import numpy
import math
import logging
from pathlib import Path
from scipy.fftpack import dct
from third_party.paddle_audio.frontend import kaldi
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def do_dither(signal, dither_value=1.0):
signal += numpy.random.normal(size=signal.shape) * dither_value
return signal
def do_remove_dc_offset(signal):
signal -= numpy.mean(signal)
return signal
def do_preemphasis(signal, coeff=0.97):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype == 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = numpy.zeros(frames.shape)
for frm in range(frames.shape[0]):
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
raw_frames[frm,:] = frames[frm,:]
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
return frames * win, raw_frames
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warning(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return numpy.square(magspec(frames, NFFT))
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
ceplifter=22,useEnergy=True,wintype='povey'):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
winfunc=lambda x:numpy.ones((x,))
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
frames,raw_frames = framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
pspec = powspec(frames,nfft) # nearly the same until this part
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
return numpy.log(feat)
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * numpy.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (numpy.exp(mel/1127.0)-1)
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
# check kaldi/src/feat/Mel-computations.h
fbank = numpy.zeros([nfilt,nfft//2+1])
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
for j in range(0,nfilt):
leftmel = lowmel+j*mel_freq_delta
centermel = lowmel+(j+1)*mel_freq_delta
rightmel = lowmel+(j+2)*mel_freq_delta
for i in range(0,nfft//2):
mel=hz2mel(i*samplerate/nfft)
if mel>leftmel and mel<rightmel:
if mel<centermel:
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
else:
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
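# Example: with L=22 the lifter weights are 1 + 11 * sin(pi * n / 22) for
# coefficient index n, boosting the higher-order cepstral coefficients.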
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
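# Example: delta(feat, 2) applies the standard regression formula
# d_t = sum_{n=1..2} n * (c_{t+n} - c_{t-n}) / (2 * (1^2 + 2^2)),
# with edge frames padded by repetition.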
##### modified for testing ######
def framesig_without_dither_dc_preemphasize(sig, frame_len, frame_step, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype == 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
elif wintype == '':
win = numpy.ones(frame_len)
elif wintype == 'hann':
win = numpy.hanning(frame_len)
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = frames
return frames * win, raw_frames
def frames(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None, wintype='hamming'):
frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype)
return frames_with_win, raw_frames
def complexspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warning(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return complex_spec
def stft_with_window(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
frames_with_win, raw_frames = framesig_without_dither_dc_preemphasize(signal, winlen*samplerate, winstep*samplerate, wintype)
spec = magspec(frames_with_win, nfft) # nearly the same until this part
scomplex = complexspec(frames_with_win, nfft)
rspec = magspec(raw_frames, nfft)
rcomplex = complexspec(raw_frames, nfft)
return spec, scomplex, rspec, rcomplex
class TestKaldiFE(unittest.TestCase):
def setUp(self):
self.this_dir = Path(__file__).parent
self.wavpath = str(self.this_dir / 'english.wav')
self.winlen=0.025 # ms
self.winstep=0.01 # ms
self.nfft=512
self.lowfreq = 0
self.highfreq = None
self.wintype='hamm'
self.nfilt=40
paddle.set_device('cpu')
def test_read(self):
import scipy.io.wavfile as wav
rate, sig = wav.read(self.wavpath)
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
self.assertTrue(np.all(sig == wav))
self.assertEqual(rate, sr)
def test_frames(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
_, fs = frames(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
t_fs, t_nframe = kaldi.frames(t_wav, t_wavlen, sr, self.winlen, self.winstep, clip=False)
t_fs = t_fs.astype(fs.dtype)[0]
self.assertEqual(t_nframe.item(), fs.shape[0])
self.assertTrue(np.allclose(t_fs.numpy(), fs))
def test_stft(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
_, stft_c_win, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_c_win.real.dtype)[0]
t_real = t_stft[:, :, 0]
t_imag = t_stft[:, :, 1]
self.assertEqual(t_nframe.item(), stft_c_win.real.shape[0])
self.assertLess(np.sum(t_real.numpy()) - np.sum(stft_c_win.real), 1)
self.assertTrue(np.allclose(t_real.numpy(), stft_c_win.real, atol=1e-1))
self.assertLess(np.sum(t_imag.numpy()) - np.sum(stft_c_win.imag), 1)
self.assertTrue(np.allclose(t_imag.numpy(), stft_c_win.imag, atol=1e-1))
def test_magspec(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_win.dtype)
t_spec = kaldi.magspec(t_stft)[0]
self.assertEqual(t_nframe.item(), stft_win.shape[0])
self.assertLess(np.sum(t_spec.numpy()) - np.sum(stft_win), 1)
self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e-1))
def test_magsepc_winprocess(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
fs, _= framesig(wav, self.winlen*sr, self.winstep*sr,
dither=0.0, preemph=0.97, remove_dc_offset=True, wintype='povey', stride_trick=True)
spec = magspec(fs, self.nfft) # nearly the same until this part
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(
self.nfft, sr, self.winlen, self.winstep,
window_type='povey', dither=0.0, preemph_coeff=0.97, remove_dc_offset=True, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(spec.dtype)
t_spec = kaldi.magspec(t_stft)[0]
self.assertEqual(t_nframe.item(), fs.shape[0])
self.assertLess(np.sum(t_spec.numpy()) - np.sum(spec), 1)
self.assertTrue(np.allclose(t_spec.numpy(), spec, atol=1e-1))
def test_powspec(self):
sr, wav = kaldi.read(self.wavpath)
wav = wav[:, 0]
for wintype in ['', 'hamm', 'hann', 'povey']:
self.wintype=wintype
stft_win, _, _, _ = stft_with_window(wav, samplerate=sr,
winlen=self.winlen, winstep=self.winstep,
nfilt=self.nfilt, nfft=self.nfft,
lowfreq=self.lowfreq, highfreq=self.highfreq,
wintype=self.wintype)
stft_win = np.square(stft_win)
t_wav = paddle.to_tensor([wav], dtype='float32')
t_wavlen = paddle.to_tensor([len(wav)])
stft_class = kaldi.STFT(self.nfft, sr, self.winlen, self.winstep, window_type=self.wintype, dither=0.0, preemph_coeff=0.0, remove_dc_offset=False, clip=False)
t_stft, t_nframe = stft_class(t_wav, t_wavlen)
t_stft = t_stft.astype(stft_win.dtype)
t_spec = kaldi.powspec(t_stft)[0]
self.assertEqual(t_nframe.item(), stft_win.shape[0])
self.assertLess(np.sum(t_spec.numpy() - stft_win), 5e4)
self.assertTrue(np.allclose(t_spec.numpy(), stft_win, atol=1e2))
# from python_speech_features import mfcc
# from python_speech_features import delta
# from python_speech_features import logfbank
# import scipy.io.wavfile as wav
# (rate,sig) = wav.read("english.wav")
# # note that generally nfilt=40 is used for speech recognition
# fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
# # the computed fbank coefficents of english.wav with dimension [110,23]
# # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
# # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
# # ...
# # ...
# # the same with that using kaldi commands: compute-fbank-feats --dither=0.0
# mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
# # the computed mfcc coefficents of english.wav with dimension [110,13]
# # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
# # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
# # ...
# # ...
# # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
if __name__ == '__main__':
unittest.main()