Merge pull request #879 from PaddlePaddle/debug

fix ctc loss not convergence
4 years ago · ad6d09d622
parent 2a671e03a7 366e34c925
commit ad6d09d622
53 changed files with 521 additions and 1100 deletions
--- a/README.md
+++ b/README.md
@ -16,7 +16,7 @@
 All tested under:  
 * Ubuntu 16.04
 * python>=3.7
-* paddlepaddle>=2.2.0rc
+* paddlepaddle==2.1.2
 Please see [install](docs/src/install.md).
--- a/deepspeech/init.py
+++ b/deepspeech/init.py
@ -353,3 +353,31 @@ if not hasattr(paddle.Tensor, 'tolist'):
    logger.debug(
        "register user tolist to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'tolist', tolist)
 # hack loss
 def ctc_loss(logits,
             labels,
             input_lengths,
             label_lengths,
             blank=0,
             reduction='mean',
             norm_by_times=True):
    #logger.info("my ctc loss with norm by times")
    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
                                           input_lengths, label_lengths)
    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
    assert reduction in ['mean', 'sum', 'none']
    if reduction == 'mean':
        loss_out = paddle.mean(loss_out / label_lengths)
    elif reduction == 'sum':
        loss_out = paddle.sum(loss_out)
    return loss_out
 logger.debug(
    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
 )
 F.ctc_loss = ctc_loss
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@ -28,7 +28,6 @@
 #include "path_trie.h"
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
 const std::string kSPACE = "<space>";
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    const std::vector<std::vector<double>> &probs_seq,
--- a/deepspeech/decoders/swig/decoder_utils.h
+++ b/deepspeech/decoders/swig/decoder_utils.h
@ -15,10 +15,12 @@
 #ifndef DECODER_UTILS_H_
 #define DECODER_UTILS_H_
 #include <string>
 #include <utility>
 #include "fst/log.h"
 #include "path_trie.h"
 const std::string kSPACE = "<space>";
 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();
--- a/deepspeech/decoders/swig/scorer.cpp
+++ b/deepspeech/decoders/swig/scorer.cpp
@ -26,7 +26,6 @@
 #include "decoder_utils.h"
 using namespace lm::ngram;
 const std::string kSPACE = "<space>";
 Scorer::Scorer(double alpha,
               double beta,
--- a/deepspeech/exps/deepspeech2/bin/export.py
+++ b/deepspeech/exps/deepspeech2/bin/export.py
@ -33,10 +33,9 @@ if __name__ == "__main__":
    # save jit model to 
    parser.add_argument(
        "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument("--model_type")
+    parser.add_argument(
        "--model_type", type=str, default='offline', help="offline/online")
    args = parser.parse_args()
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))
    print_arguments(args)
--- a/deepspeech/exps/deepspeech2/bin/test.py
+++ b/deepspeech/exps/deepspeech2/bin/test.py
@ -30,14 +30,13 @@ def main(config, args):
 if __name__ == "__main__":
    parser = default_argument_parser()
-    parser.add_argument("--model_type")
+    parser.add_argument(
        "--model_type", type=str, default='offline', help='offline/online')
    # save asr result to 
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))
    # https://yaml.org/type/float.html
--- a/deepspeech/exps/deepspeech2/bin/test_export.py
+++ b/deepspeech/exps/deepspeech2/bin/test_export.py
@ -36,11 +36,10 @@ if __name__ == "__main__":
    #load jit model from
    parser.add_argument(
        "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument("--model_type")
+    parser.add_argument(
        "--model_type", type=str, default='offline', help='offline/online')
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))
    # https://yaml.org/type/float.html
--- a/deepspeech/exps/deepspeech2/bin/test_hub.py
+++ b/deepspeech/exps/deepspeech2/bin/test_hub.py
@ -177,15 +177,14 @@ def main(config, args):
 if __name__ == "__main__":
    parser = default_argument_parser()
-    parser.add_argument("--model_type")
+    parser.add_argument(
-    parser.add_argument("--audio_file")
+        "--model_type", type=str, default='offline', help='offline/online')
    parser.add_argument("--audio_file", type=str, help='audio file path')
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    if not os.path.isfile(args.audio_file):
        print("Please input the audio file path")
        sys.exit(-1)
--- a/deepspeech/exps/deepspeech2/bin/train.py
+++ b/deepspeech/exps/deepspeech2/bin/train.py
@ -35,10 +35,9 @@ def main(config, args):
 if __name__ == "__main__":
    parser = default_argument_parser()
-    parser.add_argument("--model_type")
+    parser.add_argument(
        "--model_type", type=str, default='offline', help='offline/online')
    args = parser.parse_args()
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))
    print_arguments(args, globals())
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -16,7 +16,6 @@ import os
 import time
 from collections import defaultdict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
 import jsonlines
@ -87,7 +86,8 @@ class DeepSpeech2Trainer(Trainer):
            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
-            context = self.model.no_sync
+            context = self.model.no_sync if (hasattr(self.model, "no_sync") and
                                             self.parallel) else nullcontext
        else:
            # Used for single gpu training and DDP gradient synchronization
            # processes.
@ -385,13 +385,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        logger.info(msg)
        self.autolog.report()
-    def run_test(self):
+    @paddle.no_grad()
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            exit(-1)
    def export(self):
        if self.args.model_type == 'offline':
            infer_model = DeepSpeech2InferModel.from_pretrained(
@ -408,40 +402,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
    def run_export(self):
        try:
            self.export()
        except KeyboardInterrupt:
            exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
 class DeepSpeech2ExportTester(DeepSpeech2Tester):
    def __init__(self, config, args):
@ -645,38 +605,6 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
        output_lens = output_lens_handle.copy_to_cpu()
        return output_probs, output_lens
    def run_test(self):
        try:
            self.test()
        except KeyboardInterrupt:
            exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(self.args.export_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
    def setup_model(self):
        super().setup_model()
        speedyspeech_config = inference.Config(
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@ -14,12 +14,10 @@
 """Contains U2 model."""
 import json
 import os
 import sys
 import time
 from collections import defaultdict
 from collections import OrderedDict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
 import jsonlines
@ -44,8 +42,6 @@ from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import UpdateConfig
@ -106,7 +102,8 @@ class U2Trainer(Trainer):
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
            # When using cpu w/o DDP, model does not have `no_sync`
-            context = self.model.no_sync if self.parallel else nullcontext
+            context = self.model.no_sync if (hasattr(self.model, "no_sync") and
                                             self.parallel) else nullcontext
        else:
            # Used for single gpu training and DDP gradient synchronization
            # processes.
@ -550,78 +547,12 @@ class U2Tester(U2Trainer):
            })
            f.write(data + '\n')
    def run_test(self):
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)
    @paddle.no_grad()
    def align(self):
-        if self.config.decoding.batch_size > 1:
+        ctc_utils.ctc_align(
-            logger.fatal('alignment mode must be running with batch_size == 1')
+            self.model, self.align_loader, self.config.decoding.batch_size,
-            sys.exit(1)
+            self.align_loader.collate_fn.stride_ms,
-
+            self.align_loader.collate_fn.vocab_list, self.args.result_file)
        # xxx.align
        assert self.args.result_file and self.args.result_file.endswith(
            '.align')
        self.model.eval()
        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
        stride_ms = self.align_loader.collate_fn.stride_ms
        token_dict = self.align_loader.collate_fn.vocab_list
        with open(self.args.result_file, 'w') as fout:
            # one example in batch
            for i, batch in enumerate(self.align_loader):
                key, feat, feats_length, target, target_length = batch
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)
                # 2. alignment
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
                align_output_path = Path(self.args.result_file).parent / "align"
                align_output_path.mkdir(parents=True, exist_ok=True)
                tier_path = align_output_path / (key[0] + ".tier")
                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
                    len(alignment) + 1) * subsample * second_per_frame
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()
        try:
            self.align()
        except KeyboardInterrupt:
            sys.exit(-1)
    def load_inferspec(self):
        """infer model and input spec.
@ -643,6 +574,7 @@ class U2Tester(U2Trainer):
        ]
        return infer_model, input_spec
    @paddle.no_grad()
    def export(self):
        infer_model, input_spec = self.load_inferspec()
        assert isinstance(input_spec, list), type(input_spec)
@ -650,37 +582,3 @@ class U2Tester(U2Trainer):
        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
    def run_export(self):
        try:
            self.export()
        except KeyboardInterrupt:
            sys.exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@ -14,11 +14,9 @@
 """Contains U2 model."""
 import json
 import os
 import sys
 import time
 from collections import defaultdict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
 import jsonlines
@ -39,8 +37,6 @@ from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import UpdateConfig
@ -105,7 +101,8 @@ class U2Trainer(Trainer):
            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
-            context = self.model.no_sync
+            context = self.model.no_sync if (hasattr(self.model, "no_sync") and
                                             self.parallel) else nullcontext
        else:
            # Used for single gpu training and DDP gradient synchronization
            # processes.
@ -524,78 +521,12 @@ class U2Tester(U2Trainer):
            })
            f.write(data + '\n')
    def run_test(self):
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)
    @paddle.no_grad()
    def align(self):
-        if self.config.decoding.batch_size > 1:
+        ctc_utils.ctc_align(
-            logger.fatal('alignment mode must be running with batch_size == 1')
+            self.model, self.align_loader, self.config.decoding.batch_size,
-            sys.exit(1)
+            self.align_loader.collate_fn.stride_ms,
-
+            self.align_loader.collate_fn.vocab_list, self.args.result_file)
        # xxx.align
        assert self.args.result_file and self.args.result_file.endswith(
            '.align')
        self.model.eval()
        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
        stride_ms = self.align_loader.collate_fn.stride_ms
        token_dict = self.align_loader.collate_fn.vocab_list
        with open(self.args.result_file, 'w') as fout:
            # one example in batch
            for i, batch in enumerate(self.align_loader):
                key, feat, feats_length, target, target_length = batch
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)
                # 2. alignment
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
                align_output_path = Path(self.args.result_file).parent / "align"
                align_output_path.mkdir(parents=True, exist_ok=True)
                tier_path = align_output_path / (key[0] + ".tier")
                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
                    len(alignment) + 1) * subsample * second_per_frame
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()
        try:
            self.align()
        except KeyboardInterrupt:
            sys.exit(-1)
    def load_inferspec(self):
        """infer model and input spec.
@ -617,6 +548,7 @@ class U2Tester(U2Trainer):
        ]
        return infer_model, input_spec
    @paddle.no_grad()
    def export(self):
        infer_model, input_spec = self.load_inferspec()
        assert isinstance(input_spec, list), type(input_spec)
@ -625,43 +557,11 @@ class U2Tester(U2Trainer):
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
    def run_export(self):
        try:
            self.export()
        except KeyboardInterrupt:
            sys.exit(-1)
    def setup_dict(self):
        # load dictionary for debug log
        self.args.char_list = load_dict(self.args.dict_path,
                                        "maskctc" in self.args.model_name)
    def setup(self):
-        """Setup the experiment.
+        super().setup()
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
        self.setup_dict()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@ -14,11 +14,9 @@
 """Contains U2 model."""
 import json
 import os
 import sys
 import time
 from collections import defaultdict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
 import jsonlines
@ -42,8 +40,6 @@ from deepspeech.utils import bleu_score
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import UpdateConfig
@ -110,7 +106,8 @@ class U2STTrainer(Trainer):
            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
-            context = self.model.no_sync
+            context = self.model.no_sync if (hasattr(self.model, "no_sync") and
                                             self.parallel) else nullcontext
        else:
            # Used for single gpu training and DDP gradient synchronization
            # processes.
@ -544,78 +541,12 @@ class U2STTester(U2STTrainer):
            })
            f.write(data + '\n')
    def run_test(self):
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)
    @paddle.no_grad()
    def align(self):
-        if self.config.decoding.batch_size > 1:
+        ctc_utils.ctc_align(
-            logger.fatal('alignment mode must be running with batch_size == 1')
+            self.model, self.align_loader, self.config.decoding.batch_size,
-            sys.exit(1)
+            self.align_loader.collate_fn.stride_ms,
-
+            self.align_loader.collate_fn.vocab_list, self.args.result_file)
        # xxx.align
        assert self.args.result_file and self.args.result_file.endswith(
            '.align')
        self.model.eval()
        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
        stride_ms = self.align_loader.collate_fn.stride_ms
        token_dict = self.align_loader.collate_fn.vocab_list
        with open(self.args.result_file, 'w') as fout:
            # one example in batch
            for i, batch in enumerate(self.align_loader):
                key, feat, feats_length, target, target_length = batch
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)
                # 2. alignment
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
                align_output_path = Path(self.args.result_file).parent / "align"
                align_output_path.mkdir(parents=True, exist_ok=True)
                tier_path = align_output_path / (key[0] + ".tier")
                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
                    len(alignment) + 1) * subsample * second_per_frame
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()
        try:
            self.align()
        except KeyboardInterrupt:
            sys.exit(-1)
    def load_inferspec(self):
        """infer model and input spec.
@ -637,6 +568,7 @@ class U2STTester(U2STTrainer):
        ]
        return infer_model, input_spec
    @paddle.no_grad()
    def export(self):
        infer_model, input_spec = self.load_inferspec()
        assert isinstance(input_spec, list), type(input_spec)
@ -644,37 +576,3 @@ class U2STTester(U2STTrainer):
        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
    def run_export(self):
        try:
            self.export()
        except KeyboardInterrupt:
            sys.exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@ -15,6 +15,7 @@
 import json
 from collections.abc import Sequence
 from inspect import signature
 from pprint import pformat
 import numpy as np
@ -22,10 +23,10 @@ from deepspeech.frontend.augmentor.base import AugmentorBase
 from deepspeech.utils.dynamic_import import dynamic_import
 from deepspeech.utils.log import Log
 __all__ = ["AugmentationPipeline"]
 logger = Log(__name__).getlog()
 __all__ = ["AugmentationPipeline"]
 import_alias = dict(
    volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor",
    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
@ -111,6 +112,8 @@ class AugmentationPipeline():
            'audio')
        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
            'feature')
        logger.info(
            f"Augmentation: {pformat(list(zip(self._augmentors, self._rates)))}")
    def __call__(self, xs, uttid_list=None, **kwargs):
        if not isinstance(xs, Sequence):
@ -197,8 +200,10 @@ class AugmentationPipeline():
            aug_confs = audio_confs
        elif aug_type == 'feature':
            aug_confs = feature_confs
-        else:
+        elif aug_type == 'all':
            aug_confs = all_confs
        else:
            raise ValueError(f"Not support: {aug_type}")
        augmentors = [
            self._get_augmentor(config["type"], config["params"])
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
        return self._time_mask
    def __repr__(self):
-        return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}"
+        return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
    def time_warp(self, x, mode='PIL'):
        """time warp for spec augment
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@ -51,12 +51,14 @@ class SpeechFeaturizer():
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB,
            dither=dither)
        self.feature_size = self.audio_feature.feature_size
        self.text_feature = TextFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
            spm_model_prefix=spm_model_prefix,
            maskctc=maskctc)
        self.vocab_size = self.text_feature.vocab_size
    def featurize(self, speech_segment, keep_transcription_text):
        """Extract features for speech segment.
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -12,12 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the text featurizer class."""
 from pprint import pformat
 import sentencepiece as spm
 from ..utility import BLANK
 from ..utility import EOS
 from ..utility import load_dict
 from ..utility import MASKCTC
 from ..utility import SOS
 from ..utility import SPACE
 from ..utility import UNK
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = ["TextFeaturizer"]
@ -76,7 +84,7 @@ class TextFeaturizer():
        """Convert text string to a list of token indices.
        Args:
-            text (str): Text.
+            text (str): Text to process.
        Returns:
            List[int]: List of token indices.
@ -199,13 +207,24 @@ class TextFeaturizer():
        """Load vocabulary from file."""
        vocab_list = load_dict(vocab_filepath, maskctc)
        assert vocab_list is not None
        logger.info(f"Vocab: {pformat(vocab_list)}")
        id2token = dict(
            [(idx, token) for (idx, token) in enumerate(vocab_list)])
        token2id = dict(
            [(token, idx) for (idx, token) in enumerate(vocab_list)])
        blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
        maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
-
+        sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
        space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
        logger.info(f"BLANK id: {blank_id}")
        logger.info(f"UNK id: {unk_id}")
        logger.info(f"EOS id: {eos_id}")
        logger.info(f"SOS id: {sos_id}")
        logger.info(f"SPACE id: {space_id}")
        logger.info(f"MASKCTC id: {maskctc_id}")
        return token2id, id2token, vocab_list, unk_id, eos_id
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@ -49,7 +49,11 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
    with open(dict_path, "r") as f:
        dictionary = f.readlines()
-    char_list = [entry.strip().split(" ")[0] for entry in dictionary]
+    # first token is `<blank>`
    # multi line: `<blank> 0\n`
    # one line: `<blank>`
    # space is relpace with <space>
    char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
@ -41,6 +41,13 @@ def conv_output_size(I, F, P, S):
    return (I - F + 2 * P - S) // S
 # receptive field calculator
 # https://fomoro.com/research/article/receptive-field-calculator
 # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
 # https://distill.pub/2019/computing-receptive-fields/
 # Rl-1 = Sl * Rl + (Kl - Sl) 
 class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.
@ -106,9 +113,10 @@ class ConvBn(nn.Layer):
        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # https://github.com/PaddlePaddle/Paddle/pull/29265
+        # TODO(Hui Zhang): not support bool multiply
-        # rhs will type promote to lhs
+        # masks = masks.type_as(x)
-        x = x * masks
+        masks = masks.astype(x.dtype)
        x = x.multiply(masks)
        return x, x_len
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@ -218,14 +218,18 @@ class DeepSpeech2Model(nn.Layer):
        DeepSpeech2Model
            The model built from pretrained result.
        """
-        model = cls(feat_size=dataloader.collate_fn.feature_size,
+        model = cls(
-                    dict_size=dataloader.collate_fn.vocab_size,
+            #feat_size=dataloader.collate_fn.feature_size,
            feat_size=dataloader.dataset.feature_size,
            #dict_size=dataloader.collate_fn.vocab_size,
            dict_size=dataloader.dataset.vocab_size,
            num_conv_layers=config.model.num_conv_layers,
            num_rnn_layers=config.model.num_rnn_layers,
            rnn_size=config.model.rnn_layer_size,
            use_gru=config.model.use_gru,
            share_rnn_weights=config.model.share_rnn_weights,
-                    blank_id=config.model.blank_id)
+            blank_id=config.model.blank_id,
            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
@ -244,36 +248,22 @@ class DeepSpeech2Model(nn.Layer):
        DeepSpeech2Model
            The model built from config.
        """
-        model = cls(feat_size=config.feat_size,
+        model = cls(
            feat_size=config.feat_size,
            dict_size=config.dict_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            use_gru=config.use_gru,
            share_rnn_weights=config.share_rnn_weights,
-                    blank_id=config.blank_id)
+            blank_id=config.blank_id,
            ctc_grad_norm_type=config.ctc_grad_norm_type, )
        return model
 class DeepSpeech2InferModel(DeepSpeech2Model):
-    def __init__(self,
+    def __init__(self, *args, **kwargs):
-                 feat_size,
+        super().__init__(*args, **kwargs)
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True,
                 blank_id=0):
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights,
            blank_id=blank_id)
    def forward(self, audio, audio_len):
        """export model function
--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
@ -308,8 +308,8 @@ class RNNStack(nn.Layer):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # TODO(Hui Zhang): not support bool multiply
-            # rhs will type promote to lhs
+            masks = masks.astype(x.dtype)
-            x = x * masks
+            x = x.multiply(masks)
        return x, x_len
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@ -255,12 +255,13 @@ class DeepSpeech2ModelOnline(nn.Layer):
                fc_layers_size_list=[512, 256],
                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                blank_id=0,  # index of blank in vocob.txt
-            ))
+                ctc_grad_norm_type='instance', ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default
-    def __init__(self,
+    def __init__(
            self,
            feat_size,
            dict_size,
            num_conv_layers=2,
@ -270,7 +271,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
            num_fc_layers=2,
            fc_layers_size_list=[512, 256],
            use_gru=False,
-                 blank_id=0):
+            blank_id=0,
            ctc_grad_norm_type='instance', ):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
-            grad_norm_type='instance')
+            grad_norm_type=ctc_grad_norm_type)
    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss
@ -348,7 +350,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
-        model = cls(feat_size=dataloader.collate_fn.feature_size,
+        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            num_conv_layers=config.model.num_conv_layers,
            num_rnn_layers=config.model.num_rnn_layers,
@ -357,7 +360,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
            num_fc_layers=config.model.num_fc_layers,
            fc_layers_size_list=config.model.fc_layers_size_list,
            use_gru=config.model.use_gru,
-                    blank_id=config.model.blank_id)
+            blank_id=config.model.blank_id,
            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
@ -376,7 +380,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
        DeepSpeech2ModelOnline
            The model built from config.
        """
-        model = cls(feat_size=config.feat_size,
+        model = cls(
            feat_size=config.feat_size,
            dict_size=config.dict_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
@ -385,33 +390,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
-                    blank_id=config.blank_id)
+            blank_id=config.blank_id,
            ctc_grad_norm_type=config.ctc_grad_norm_type, )
        return model
 class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
-    def __init__(self,
+    def __init__(self, *args, **kwargs):
-                 feat_size,
+        super().__init__(*args, **kwargs)
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False,
                 blank_id=0):
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            use_gru=use_gru,
            blank_id=blank_id)
    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@ -164,7 +164,10 @@ class U2BaseModel(nn.Layer):
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        encoder_time = time.time() - start
        #logger.debug(f"encoder time: {encoder_time}")
-        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
+        #TODO(Hui Zhang): sum not support bool type
        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
            1)  #[B, 1, T] -> [B]
        # 2a. Attention-decoder branch
        loss_att = None
@ -319,7 +322,8 @@ class U2BaseModel(nn.Layer):
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
-            if end_flag.sum() == running_size:
+            # TODO(Hui Zhang): if end_flag.sum() == running_size:
            if end_flag.cast(paddle.int64).sum() == running_size:
                break
            # 2.1 Forward decoder step
@ -405,7 +409,9 @@ class U2BaseModel(nn.Layer):
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks, simulate_streaming)
        maxlen = encoder_out.shape[1]
-        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
+        # (TODO Hui Zhang): bool no support reduce_sum
        # encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
        ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)
        topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@ -165,7 +165,10 @@ class U2STBaseModel(nn.Layer):
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        encoder_time = time.time() - start
        #logger.debug(f"encoder time: {encoder_time}")
-        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
+        #TODO(Hui Zhang): sum not support bool type
        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
            1)  #[B, 1, T] -> [B]
        # 2a. ST-decoder branch
        start = time.time()
@ -362,7 +365,8 @@ class U2STBaseModel(nn.Layer):
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
-            if end_flag.sum() == running_size:
+            # TODO(Hui Zhang): if end_flag.sum() == running_size:
            if end_flag.cast(paddle.int64).sum() == running_size:
                break
            # 2.1 Forward decoder step
--- a/deepspeech/modules/conv.py
+++ b/deepspeech/modules/conv.py
@ -1,170 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from paddle import nn
 from paddle.nn import functional as F
 from deepspeech.modules.activation import brelu
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = ['ConvStack', "conv_output_size"]
 def conv_output_size(I, F, P, S):
    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
    # Output size after Conv:
    #   By noting I the length of the input volume size, 
    #   F the length of the filter, 
    #   P the amount of zero padding, 
    #   S the stride,
    #   then the output size O of the feature map along that dimension is given by:
    #       O = (I - F + Pstart + Pend) // S + 1
    #   When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
    #   When Pstart == Pend == 0
    #       O = (I - F - S) // S
    # https://iq.opengenus.org/output-size-of-convolution/
    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
    return (I - F + 2 * P - S) // S
 # receptive field calculator
 # https://fomoro.com/research/article/receptive-field-calculator
 # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
 # https://distill.pub/2019/computing-receptive-fields/
 # Rl-1 = Sl * Rl + (Kl - Sl) 
 class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.
    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two 
                image dimension. 
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu
    :type act: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable
    """
    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        super().__init__()
        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')
        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        self.act = F.relu if act == 'relu' else brelu
    def forward(self, x, x_len):
        """
        x(Tensor): audio, shape [B, C, D, T]
        """
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)
        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1
        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        # https://github.com/PaddlePaddle/Paddle/pull/29265
        # rhs will type promote to lhs
        x = x * masks
        return x, x_len
 class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.
    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """
    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks
        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  #[D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')
        out_channel = 32
        convs = [
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ]
        self.conv_stack = nn.LayerList(convs)
        # conv output feat_dim
        output_height = (feat_size - 1) // 2 + 1
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height
    def forward(self, x, x_len):
        """
        x: shape [B, C, D, T]
        x_len : shape [B]
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        return x, x_len
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@ -124,7 +124,9 @@ class TransformerDecoder(nn.Layer):
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
        # tgt_mask: (B, L, L)
-        tgt_mask = tgt_mask & m
+        # TODO(Hui Zhang): not support & for tensor
        # tgt_mask = tgt_mask & m
        tgt_mask = tgt_mask.logical_and(m)
        x, _ = self.embed(tgt)
        for layer in self.decoders:
@ -135,7 +137,9 @@ class TransformerDecoder(nn.Layer):
        if self.use_output_layer:
            x = self.output_layer(x)
-        olens = tgt_mask.sum(1)
+        # TODO(Hui Zhang): reduce_sum not support bool type
        # olens = tgt_mask.sum(1)
        olens = tgt_mask.astype(paddle.int).sum(1)
        return x, olens
    def forward_one_step(
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@ -162,7 +162,8 @@ class BaseEncoder(nn.Layer):
        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
        masks = masks.astype(paddle.bool)
-        mask_pad = ~masks
+        #TODO(Hui Zhang): mask_pad = ~masks
        mask_pad = masks.logical_not()
        chunk_masks = add_optional_chunk_mask(
            xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
            decoding_chunk_size, self.static_chunk_size,
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
@ -32,18 +34,19 @@ class CTCLoss(nn.Layer):
        # last token id as blank id
        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
        self.batch_average = batch_average
        logger.info(
            f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
        # instance for norm_by_times
        # batch for norm_by_batchsize
        # frame for norm_by_total_logits_len
        assert grad_norm_type in ('instance', 'batch', 'frame', None)
        self.norm_by_times = False
        self.norm_by_batchsize = False
        self.norm_by_total_logits_len = False
-        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
+        if grad_norm_type is None:
-        if grad_norm_type == 'instance':
+            # no grad norm
            pass
        elif grad_norm_type == 'instance':
            self.norm_by_times = True
        elif grad_norm_type == 'batch':
            self.norm_by_batchsize = True
@ -51,6 +54,22 @@ class CTCLoss(nn.Layer):
            self.norm_by_total_logits_len = True
        else:
            raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}")
        self.kwargs = {
            "norm_by_times": self.norm_by_times,
            "norm_by_batchsize": self.norm_by_batchsize,
            "norm_by_total_logits_len": self.norm_by_total_logits_len,
        }
        # Derive only the args which the func has
        try:
            param = inspect.signature(self.loss.forward).parameters
        except ValueError:
            # Some function, e.g. built-in function, are failed
            param = {}
        self._kwargs = {k: v for k, v in self.kwargs.items() if k in param}
        _notin = {k: v for k, v in self.kwargs.items() if k not in param}
        logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}")
        #self.loss_fn = partial(self.loss.forward, **_kwargs)
    def forward(self, logits, ys_pad, hlens, ys_lens):
        """Compute CTC loss.
@ -70,14 +89,8 @@ class CTCLoss(nn.Layer):
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
        ys_pad = ys_pad.astype(paddle.int32)
-        loss = self.loss(
+        #loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
-            logits,
+        loss = self.loss(logits, ys_pad, hlens, ys_lens)
            ys_pad,
            hlens,
            ys_lens,
            norm_by_times=self.norm_by_times,
            norm_by_batchsize=self.norm_by_batchsize,
            norm_by_total_logits_len=self.norm_by_total_logits_len)
        if self.batch_average:
            # Batch-size average
            loss = loss / B
@ -152,7 +165,7 @@ class LabelSmoothingLoss(nn.Layer):
        # use zeros_like instead of torch.no_grad() for true_dist,
        # since no_grad() can not be exported by JIT
        true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
-        ignore = (target == self.padding_idx)  # (B,)
+        ignore = target == self.padding_idx  # (B,)
        #TODO(Hui Zhang): target = target * (1 - ignore)  # avoid -1 index
        target = target.masked_fill(ignore, 0)  # avoid -1 index
@ -163,8 +176,10 @@ class LabelSmoothingLoss(nn.Layer):
        kl = self.criterion(F.log_softmax(x, axis=1), true_dist)
-        total = len(target) - int(ignore.sum())
+        #TODO(Hui Zhang): sum not support bool type
        #total = len(target) - int(ignore.sum())
        total = len(target) - int(ignore.type_as(target).sum())
        denom = total if self.normalize_length else B
-        #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum()
+        #numer = (kl * (1 - ignore)).sum()
        numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()
        return numer / denom
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -69,7 +69,8 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
    """
-    return ~make_pad_mask(lengths)
+    #return ~make_pad_mask(lengths)
    return make_pad_mask(lengths).logical_not()
 def subsequent_mask(size: int) -> paddle.Tensor:
@ -91,7 +92,12 @@ def subsequent_mask(size: int) -> paddle.Tensor:
         [1, 1, 1]]
    """
    ret = paddle.ones([size, size], dtype=paddle.bool)
-    return paddle.tril(ret)
+    #TODO(Hui Zhang): tril not support bool
    #return paddle.tril(ret)
    ret = ret.astype(paddle.float)
    ret = paddle.tril(ret)
    ret = ret.astype(paddle.bool)
    return ret
 def subsequent_chunk_mask(
@ -180,13 +186,15 @@ def add_optional_chunk_mask(xs: paddle.Tensor,
        chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        chunk_masks = masks & chunk_masks  # (B, L, L)
+        # chunk_masks = masks & chunk_masks  # (B, L, L)
        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
    elif static_chunk_size > 0:
        num_left_chunks = num_decoding_left_chunks
        chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        chunk_masks = masks & chunk_masks  # (B, L, L)
+        # chunk_masks = masks & chunk_masks  # (B, L, L)
        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
    else:
        chunk_masks = masks
    return chunk_masks
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@ -1,314 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
 from deepspeech.modules.activation import brelu
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = ['RNNStack']
 class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it 
    computes the outputs and updates states.
    The formula used is as follows:
    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}
    where :math:`act` is for :attr:`activation`.
    """
    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh or relu, "
                "but get {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu
    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h
    @property
    def state_shape(self):
        return (self.hidden_size, )
 class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, 
    it computes the outputs and updates states.
    The formula for GRU used is as follows:
    ..  math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}
    where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise 
    multiplication operator.
    """
    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh
    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh
        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
        return h, h
    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )
 class BiRNNWithBN(nn.Layer):
    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.
    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """
    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]
    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len
 class BiGRUWithBN(nn.Layer):
    """Bidirectonal gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.
    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """
    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        hidden_size = h_size * 3
        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]
    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len
 class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """
    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        rnn_stacks = []
        for i in range(num_stacks):
            if use_gru:
                #default:GRU using tanh
                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
            else:
                rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2
        self.rnn_stacks = nn.LayerList(rnn_stacks)
    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """
        x: shape [B, T, D]
        x_len: shpae [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # https://github.com/PaddlePaddle/Paddle/pull/29265
            # rhs will type promote to lhs
            x = x * masks
        return x, x_len
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@ -14,6 +14,7 @@
 import sys
 import time
 from collections import OrderedDict
 from contextlib import contextmanager
 from pathlib import Path
 import paddle
@ -27,6 +28,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import all_version
 from deepspeech.utils.utility import seed_all
 from deepspeech.utils.utility import UpdateConfig
@ -102,13 +104,28 @@ class Trainer():
        self.iteration = 0
        self.epoch = 0
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        self._train = True
-        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
+        # print deps version
        all_version()
        logger.info(f"Rank: {self.rank}/{self.world_size}")
        # set device
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        if self.parallel:
            self.init_parallel()
        self.checkpoint = Checkpoint(
            kbest_n=self.config.training.checkpoint.kbest_n,
            latest_n=self.config.training.checkpoint.latest_n)
        # set random seed if needed
        if args.seed:
            seed_all(args.seed)
            logger.info(f"Set seed {args.seed}")
        # profiler and benchmark options
        if self.args.benchmark_batch_size:
            with UpdateConfig(self.config):
                self.config.collator.batch_size = self.args.benchmark_batch_size
@ -116,17 +133,18 @@ class Trainer():
            logger.info(
                f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
    @contextmanager
    def eval(self):
        self._train = False
        yield
        self._train = True
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        if self.parallel:
            self.init_parallel()
        self.setup_output_dir()
        self.dump_config()
        self.setup_visualizer()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
@ -161,7 +179,7 @@ class Trainer():
            "epoch": self.epoch,
            "lr": self.optimizer.get_lr()
        })
-        self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration
+        self.checkpoint.save_parameters(self.checkpoint_dir, self.iteration
                                        if tag is None else tag, self.model,
                                        self.optimizer, infos)
@ -181,8 +199,8 @@ class Trainer():
        if infos:
            # just restore ckpt
            # lr will resotre from optimizer ckpt
-            self.iteration = infos["step"]
+            self.iteration = infos["step"] + 1
-            self.epoch = infos["epoch"]
+            self.epoch = infos["epoch"] + 1
            scratch = False
            logger.info(
                f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
@ -300,37 +318,74 @@ class Trainer():
        """The routine of the experiment after setup. This method is intended
        to be used by the user.
        """
        with Timer("Training Done: {}"):
        try:
            with Timer("Training Done: {}"):
                self.train()
        except KeyboardInterrupt:
            exit(-1)
        finally:
            self.destory()
    def run_test(self):
        """Do Test/Decode"""
        try:
            with Timer("Test/Decode Done: {}"):
                with self.eval():
                    self.resume_or_scratch()
                    self.test()
        except KeyboardInterrupt:
            exit(-1)
    def run_export(self):
        """Do Model Export"""
        try:
            with Timer("Export Done: {}"):
                with self.eval():
                    self.export()
        except KeyboardInterrupt:
            exit(-1)
    def run_align(self):
        """Do CTC alignment"""
        try:
            with Timer("Align Done: {}"):
                with self.eval():
                    self.resume_or_scratch()
                    self.align()
        except KeyboardInterrupt:
            sys.exit(-1)
    def setup_output_dir(self):
        """Create a directory used for output.
        """
-        # output dir
+        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
-        output_dir.mkdir(parents=True, exist_ok=True)
+        elif self.args.checkpoint_path:
-
+            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)
-    def setup_checkpointer(self):
+        self.checkpoint_dir = self.output_dir / "checkpoints"
-        """Create a directory used to save checkpoints into.
+        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
-        It is "checkpoints" inside the output directory.
+        self.log_dir = output_dir / "log"
-        """
+        self.log_dir.mkdir(parents=True, exist_ok=True)
        # checkpoint dir
        checkpoint_dir = self.output_dir / "checkpoints"
        checkpoint_dir.mkdir(exist_ok=True)
-        self.checkpoint_dir = checkpoint_dir
+        self.test_dir = output_dir / "test"
        self.test_dir.mkdir(parents=True, exist_ok=True)
-        self.checkpoint = Checkpoint(
+        self.decode_dir = output_dir / "decode"
-            kbest_n=self.config.training.checkpoint.kbest_n,
+        self.decode_dir.mkdir(parents=True, exist_ok=True)
-            latest_n=self.config.training.checkpoint.latest_n)
+
        self.export_dir = output_dir / "export"
        self.export_dir.mkdir(parents=True, exist_ok=True)
        self.visual_dir = output_dir / "visual"
        self.visual_dir.mkdir(parents=True, exist_ok=True)
        self.config_dir = output_dir / "conf"
        self.config_dir.mkdir(parents=True, exist_ok=True)
    @mp_tools.rank_zero_only
    def destory(self):
@ -352,7 +407,7 @@ class Trainer():
        unexpected behaviors.
        """
        # visualizer
-        visualizer = SummaryWriter(logdir=str(self.output_dir))
+        visualizer = SummaryWriter(logdir=str(self.visual_dir))
        self.visualizer = visualizer
    @mp_tools.rank_zero_only
@ -362,7 +417,14 @@ class Trainer():
        It is saved in to ``config.yaml`` in the output directory at the
        beginning of the experiment.
        """
-        with open(self.output_dir / "config.yaml", 'wt') as f:
+        config_file = self.config_dir / "config.yaml"
        if self._train and config_file.exists():
            time_stamp = time.strftime("%Y_%m_%d_%H_%M_%s", time.gmtime())
            target_path = self.config_dir / ".".join(
                [time_stamp, "config.yaml"])
            config_file.rename(target_path)
        with open(config_file, 'wt') as f:
            print(self.config, file=f)
    def train_batch(self):
@ -376,6 +438,24 @@ class Trainer():
        """
        raise NotImplementedError("valid should be implemented.")
    @paddle.no_grad()
    def test(self):
        """The test. A subclass should implement this method in Tester.
        """
        raise NotImplementedError("test should be implemented.")
    @paddle.no_grad()
    def export(self):
        """The test. A subclass should implement this method in Tester.
        """
        raise NotImplementedError("export should be implemented.")
    @paddle.no_grad()
    def align(self):
        """The align. A subclass should implement this method in Tester.
        """
        raise NotImplementedError("align should be implemented.")
    def setup_model(self):
        """Setup model, criterion and optimizer, etc. A subclass should
        implement this method.
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
@ -39,7 +39,7 @@ class Checkpoint():
        self.latest_n = latest_n
        self._save_all = (kbest_n == -1)
-    def add_checkpoint(self,
+    def save_parameters(self,
                        checkpoint_dir,
                        tag_or_iteration: Union[int, Text],
                        model: paddle.nn.Layer,
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@ -16,6 +16,8 @@ from typing import List
 import numpy as np
 import paddle
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
@ -87,14 +89,16 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
        (ctc_probs.shape[0], len(y_insert_blank)))  #(T, 2L+1)
    log_alpha = log_alpha - float('inf')  # log of zero
    # TODO(Hui Zhang): zeros not support paddle.int16
    # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
    state_path = (paddle.zeros(
        (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
                  )  # state path, Tuple((T, 2L+1))
    # init start state
-    log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
+    # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
-    log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb
+    log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])]  # State-b, Sb
    log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])]  # State-nb, Snb
    for t in range(1, ctc_probs.shape[0]):  # T
        for s in range(len(y_insert_blank)):  # 2L+1
@ -110,9 +114,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                    log_alpha[t - 1, s - 2],
                ])
                prev_state = [s, s - 1, s - 2]
-            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
+            # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
-                y_insert_blank[s]]
+            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int(
                y_insert_blank[s])]
            state_path[t, s] = prev_state[paddle.argmax(candidates)]
    # TODO(Hui Zhang): zeros not support paddle.int16
    # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
    state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)
@ -130,3 +136,85 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
        output_alignment.append(y_insert_blank[state_seq[t, 0]])
    return output_alignment
 # ctc_align(
 # self.model,
 # self.align_loader,
 # self.config.decoding.batch_size,
 # self.align_loader.collate_fn.stride_ms,
 # self.align_loader.collate_fn.vocab_list,
 # self.args.result_file,
 # )
 def ctc_align(model, dataloader, batch_size, stride_ms, token_dict,
              result_file):
    """ctc alignment.
    Args:
        model (nn.Layer): U2 Model.
        dataloader (io.DataLoader): dataloader.
        batch_size (int): decoding batchsize.
        stride_ms (int): audio feature stride in ms unit.
        token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '<eos>'].
        result_file (str): alignment output file, e.g. xxx.align.
    """
    if batch_size > 1:
        logger.fatal('alignment mode must be running with batch_size == 1')
        sys.exit(1)
    assert result_file and result_file.endswith('.align')
    model.eval()
    logger.info(f"Align Total Examples: {len(dataloader.dataset)}")
    with open(result_file, 'w') as fout:
        # one example in batch
        for i, batch in enumerate(dataloader):
            key, feat, feats_length, target, target_length = batch
            # 1. Encoder
            encoder_out, encoder_mask = model._forward_encoder(
                feat, feats_length)  # (B, maxlen, encoder_dim)
            maxlen = encoder_out.shape[1]
            ctc_probs = model.ctc.log_softmax(
                encoder_out)  # (1, maxlen, vocab_size)
            # 2. alignment
            ctc_probs = ctc_probs.squeeze(0)
            target = target.squeeze(0)
            alignment = forced_align(ctc_probs, target)
            logger.info(f"align ids: {key[0]} {alignment}")
            fout.write('{} {}\n'.format(key[0], alignment))
            # 3. gen praat
            # segment alignment
            align_segs = text_grid.segment_alignment(alignment)
            logger.info(f"align tokens: {key[0]}, {align_segs}")
            # IntervalTier, List["start end token\n"]
            subsample = utility.get_subsample(self.config)
            tierformat = text_grid.align_to_tierformat(align_segs, subsample,
                                                       token_dict)
            # write tier
            align_output_path = Path(self.args.result_file).parent / "align"
            align_output_path.mkdir(parents=True, exist_ok=True)
            tier_path = align_output_path / (key[0] + ".tier")
            with tier_path.open('w') as f:
                f.writelines(tierformat)
            # write textgrid
            textgrid_path = align_output_path / (key[0] + ".TextGrid")
            second_per_frame = 1. / (1000. /
                                     stride_ms)  # 25ms window, 10ms stride
            second_per_example = (
                len(alignment) + 1) * subsample * second_per_frame
            text_grid.generate_textgrid(
                maxtime=second_per_example,
                intervals=tierformat,
                output=str(textgrid_path))
--- a/deepspeech/utils/error_rate.py
+++ b/deepspeech/utils/error_rate.py
@ -19,6 +19,8 @@ import numpy as np
 __all__ = ['word_errors', 'char_errors', 'wer', 'cer']
 editdistance.eval("a", "b")
 def _levenshtein_distance(ref, hyp):
    """Levenshtein distance is a string metric for measuring the difference
@ -90,6 +92,7 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    hyp_words = list(filter(None, hypothesis.split(delimiter)))
    edit_distance = _levenshtein_distance(ref_words, hyp_words)
    # `editdistance.eavl precision` less than `_levenshtein_distance`
    # edit_distance = editdistance.eval(ref_words, hyp_words)
    return float(edit_distance), len(ref_words)
@ -121,6 +124,7 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))
    edit_distance = _levenshtein_distance(reference, hypothesis)
    # `editdistance.eavl precision` less than `_levenshtein_distance`
    # edit_distance = editdistance.eval(reference, hypothesis)
    return float(edit_distance), len(reference)
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@ -120,14 +120,15 @@ class Autolog:
                 model_precision="fp32"):
        import auto_log
        pid = os.getpid()
-        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+        if os.environ.get('CUDA_VISIBLE_DEVICES', None):
            gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
            infer_config = inference.Config()
            infer_config.enable_use_gpu(100, gpu_id)
        else:
            gpu_id = None
            infer_config = inference.Config()
-        autolog = auto_log.AutoLogger(
+
        self.autolog = auto_log.AutoLogger(
            model_name=model_name,
            model_precision=model_precision,
            batch_size=batch_size,
@ -139,7 +140,6 @@ class Autolog:
            gpu_ids=gpu_id,
            time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
            warmup=0)
        self.autolog = autolog
    def getlog(self):
        return self.autolog
--- a/deepspeech/utils/tensor_utils.py
+++ b/deepspeech/utils/tensor_utils.py
@ -183,7 +183,13 @@ def th_accuracy(pad_outputs: paddle.Tensor,
    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
                                pad_outputs.shape[1]).argmax(2)
    mask = pad_targets != ignore_label
-    numerator = paddle.sum(
+    #TODO(Hui Zhang): sum not support bool type
    # numerator = paddle.sum(
    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = (
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
-    denominator = paddle.sum(mask)
+    numerator = paddle.sum(numerator.type_as(pad_targets))
    #TODO(Hui Zhang): sum not support bool type
    # denominator = paddle.sum(mask)
    denominator = paddle.sum(mask.type_as(pad_targets))
    return float(numerator) / float(denominator)
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@ -16,17 +16,35 @@ import distutils.util
 import math
 import os
 import random
 import sys
 from contextlib import contextmanager
 from pprint import pformat
 from typing import List
 import numpy as np
 import paddle
 import soundfile
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = [
-    "UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
+    "all_version", "UpdateConfig", "seed_all", 'print_arguments',
    'add_arguments', "log_add"
 ]
 def all_version():
    vers = {
        "python": sys.version,
        "paddle": paddle.__version__,
        "paddle_commit": paddle.version.commit,
        "soundfile": soundfile.__version__,
    }
    logger.info(f"Deps Module Version:{pformat(vers.items())}")
@contextmanager
 def UpdateConfig(config):
    """Update yacs config"""
@ -35,7 +53,7 @@ def UpdateConfig(config):
    config.freeze()
-def seed_all(seed: int=210329):
+def seed_all(seed: int=20210329):
    """freeze random generator seed."""
    np.random.seed(seed)
    random.seed(seed)
@ -61,7 +79,7 @@ def print_arguments(args, info=None):
    if info:
        filename = info["__file__"]
    filename = os.path.basename(filename)
-    print(f"----------- {filename} Configuration Arguments -----------")
+    print(f"----------- {filename} Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("-----------------------------------------------------------")
--- a/env.sh
+++ b/env.sh
@ -1,6 +1,6 @@
 export MAIN_ROOT=${PWD}
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:/usr/local/bin:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}:/usr/local/bin/
 export LC_ALL=C
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
--- a/examples/1xt2x/aishell/path.sh
+++ b/examples/1xt2x/aishell/path.sh
@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/1xt2x/baidu_en8k/path.sh
+++ b/examples/1xt2x/baidu_en8k/path.sh
@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/1xt2x/librispeech/path.sh
+++ b/examples/1xt2x/librispeech/path.sh
@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/1xt2x/src_deepspeech2x/bin/test.py
+++ b/examples/1xt2x/src_deepspeech2x/bin/test.py
@ -31,14 +31,13 @@ def main(config, args):
 if __name__ == "__main__":
    parser = default_argument_parser()
-    parser.add_argument("--model_type")
+    parser.add_argument(
        "--model_type", type=str, default='offline', help='offline/online')
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))
    # https://yaml.org/type/float.html
--- a/examples/aishell/s0/path.sh
+++ b/examples/aishell/s0/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/callcenter/s1/path.sh
+++ b/examples/callcenter/s1/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/librispeech/s0/path.sh
+++ b/examples/librispeech/s0/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/ted_en_zh/t0/path.sh
+++ b/examples/ted_en_zh/t0/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/examples/timit/s1/path.sh
+++ b/examples/timit/s1/path.sh
@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,5 @@
 coverage
 editdistance
 gpustat
 jsonlines
 kaldiio
@ -19,4 +20,3 @@ tqdm
 typeguard
 visualdl==2.2.0
 yacs
 editdistance
--- a/tests/mask_test.py
+++ b/tests/mask_test.py
@ -37,13 +37,13 @@ class TestU2Model(unittest.TestCase):
    def test_make_non_pad_mask(self):
        res = make_non_pad_mask(self.lengths)
-        res2 = ~make_pad_mask(self.lengths)
+        res2 = make_pad_mask(self.lengths).logical_not()
        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
        self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())
    def test_make_pad_mask(self):
        res = make_pad_mask(self.lengths)
-        res1 = ~make_non_pad_mask(self.lengths)
+        res1 = make_non_pad_mask(self.lengths).logical_not()
        self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
        self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())