fix some mistakes in doc

pull/827/head
huangyuxin 4 years ago
commit 84020a0471

@ -43,7 +43,7 @@ You are welcome to submit questions in [Github Discussions](https://github.com/P
## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE).
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement

@ -42,7 +42,7 @@
## License
DeepASR follows the [Apache-2.0 License](./LICENSE).
DeepSpeech follows the [Apache-2.0 License](./LICENSE).
## Acknowledgement

@ -80,23 +80,23 @@ def convert_dtype_to_string(tensor_dtype):
if not hasattr(paddle, 'softmax'):
logger.warn("register user softmax to paddle, remove this when fixed!")
logger.debug("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'log_softmax'):
logger.warn("register user log_softmax to paddle, remove this when fixed!")
logger.debug("register user log_softmax to paddle, remove this when fixed!")
setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
if not hasattr(paddle, 'sigmoid'):
logger.warn("register user sigmoid to paddle, remove this when fixed!")
logger.debug("register user sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle, 'log_sigmoid'):
logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
logger.debug("register user log_sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
if not hasattr(paddle, 'relu'):
logger.warn("register user relu to paddle, remove this when fixed!")
logger.debug("register user relu to paddle, remove this when fixed!")
setattr(paddle, 'relu', paddle.nn.functional.relu)
@ -105,7 +105,7 @@ def cat(xs, dim=0):
if not hasattr(paddle, 'cat'):
logger.warn(
logger.debug(
"override cat of paddle if exists or register, remove this when fixed!")
paddle.cat = cat
@ -116,7 +116,7 @@ def item(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'item'):
logger.warn(
logger.debug(
"override item of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.item = item
@ -127,13 +127,13 @@ def func_long(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'long'):
logger.warn(
logger.debug(
"override long of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.long = func_long
if not hasattr(paddle.Tensor, 'numel'):
logger.warn(
logger.debug(
"override numel of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.numel = paddle.numel
@ -147,7 +147,7 @@ def new_full(x: paddle.Tensor,
if not hasattr(paddle.Tensor, 'new_full'):
logger.warn(
logger.debug(
"override new_full of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.new_full = new_full
@ -162,13 +162,13 @@ def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'eq'):
logger.warn(
logger.debug(
"override eq of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.eq = eq
if not hasattr(paddle, 'eq'):
logger.warn(
logger.debug(
"override eq of paddle if exists or register, remove this when fixed!")
paddle.eq = eq
@ -178,7 +178,7 @@ def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'contiguous'):
logger.warn(
logger.debug(
"override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
)
paddle.Tensor.contiguous = contiguous
@ -195,7 +195,7 @@ def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
# `to_static` does not process the `size` property; some `paddle` api may depend on it.
logger.warn(
logger.debug(
"override size of paddle.Tensor "
"(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
)
@ -207,7 +207,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view'):
logger.warn("register user view to paddle.Tensor, remove this when fixed!")
logger.debug("register user view to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view = view
@ -216,7 +216,7 @@ def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view_as'):
logger.warn(
logger.debug(
"register user view_as to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view_as = view_as
@ -242,7 +242,7 @@ def masked_fill(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill'):
logger.warn(
logger.debug(
"register user masked_fill to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill = masked_fill
@ -260,7 +260,7 @@ def masked_fill_(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill_'):
logger.warn(
logger.debug(
"register user masked_fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill_ = masked_fill_
@ -272,7 +272,8 @@ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'fill_'):
logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
logger.debug(
"register user fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.fill_ = fill_
@ -281,22 +282,22 @@ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'repeat'):
logger.warn(
logger.debug(
"register user repeat to paddle.Tensor, remove this when fixed!")
paddle.Tensor.repeat = repeat
if not hasattr(paddle.Tensor, 'softmax'):
logger.warn(
logger.debug(
"register user softmax to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle.Tensor, 'sigmoid'):
logger.warn(
logger.debug(
"register user sigmoid to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle.Tensor, 'relu'):
logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
logger.debug("register user relu to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
@ -305,7 +306,7 @@ def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'type_as'):
logger.warn(
logger.debug(
"register user type_as to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'type_as', type_as)
@ -321,7 +322,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
logger.warn("register user to to paddle.Tensor, remove this when fixed!")
logger.debug("register user to to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
@ -330,7 +331,8 @@ def func_float(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'float'):
logger.warn("register user float to paddle.Tensor, remove this when fixed!")
logger.debug(
"register user float to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'float', func_float)
@ -339,7 +341,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'int'):
logger.warn("register user int to paddle.Tensor, remove this when fixed!")
logger.debug("register user int to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'int', func_int)
@ -348,23 +350,6 @@ def tolist(x: paddle.Tensor) -> List[Any]:
if not hasattr(paddle.Tensor, 'tolist'):
logger.warn(
logger.debug(
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
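Once the shim above runs, `paddle.nn.GLU` behaves like the functional `F.glu` it wraps. A minimal sketch, assuming a Paddle build that still lacks a native `GLU` (shapes are illustrative):

import paddle

glu = paddle.nn.GLU(dim=-1)    # reachable via the setattr above
xs = paddle.randn([4, 8])      # the gated dimension must be even
ys = glu(xs)                   # split into value/gate halves -> shape [4, 4]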

@ -35,7 +35,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
Scorer *ext_scorer,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
@ -48,7 +49,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
// assign blank id
// size_t blank_id = vocabulary.size();
size_t blank_id = 0;
// size_t blank_id = 0;
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
@ -57,7 +58,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// init prefixes' root
PathTrie root;
root.score = root.log_prob_b_prev = 0.0;
@ -218,7 +218,8 @@ ctc_beam_search_decoder_batch(
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer) {
Scorer *ext_scorer,
size_t blank_id) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be positive!");
// thread pool
ThreadPool pool(num_processes);
@ -234,7 +235,8 @@ ctc_beam_search_decoder_batch(
beam_size,
cutoff_prob,
cutoff_top_n,
ext_scorer));
ext_scorer,
blank_id));
}
// get decoding results

@ -43,7 +43,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/* CTC Beam Search Decoder for batch data
@ -70,6 +71,7 @@ ctc_beam_search_decoder_batch(
size_t num_processes,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr);
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
#endif // CTC_BEAM_SEARCH_DECODER_H_

@ -17,17 +17,18 @@
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) {
const std::vector<std::string> &vocabulary,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
vocabulary.size(),
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
size_t blank_id = vocabulary.size();
// size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec;

@ -29,6 +29,7 @@
*/
std::string ctc_greedy_decoder(
const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary);
const std::vector<std::string>& vocabulary,
size_t blank_id);
#endif // CTC_GREEDY_DECODER_H

@ -85,9 +85,8 @@ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# yapf: disable
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
]
# yapf: enable

@ -32,7 +32,7 @@ class Scorer(swig_decoders.Scorer):
swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
def ctc_greedy_decoder(probs_seq, vocabulary):
def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
"""Wrapper for ctc best path decoder in swig.
:param probs_seq: 2-D list of probability distributions over each time
@ -44,7 +44,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
:return: Decoding result string.
:rtype: str
"""
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary)
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
blank_id)
return result
@ -53,7 +54,8 @@ def ctc_beam_search_decoder(probs_seq,
beam_size,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
ext_scoring_func=None,
blank_id=0):
"""Wrapper for the CTC Beam Search Decoder.
:param probs_seq: 2-D list of probability distributions over each time
@ -81,7 +83,7 @@ def ctc_beam_search_decoder(probs_seq,
"""
beam_results = swig_decoders.ctc_beam_search_decoder(
probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
ext_scoring_func)
ext_scoring_func, blank_id)
beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
return beam_results
@ -92,7 +94,8 @@ def ctc_beam_search_decoder_batch(probs_split,
num_processes,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
ext_scoring_func=None,
blank_id=0):
"""Wrapper for the batched CTC beam search decoder.
:param probs_seq: 3-D list with each element as an instance of 2-D list
@ -125,7 +128,7 @@ def ctc_beam_search_decoder_batch(probs_split,
batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func)
cutoff_top_n, ext_scoring_func, blank_id)
batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results]
return batch_beam_results
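A hypothetical end-to-end call of the updated wrappers, assuming the compiled `swig_decoders` module is importable; `probs` and `vocab` below are toy values, with the blank token at index 0 as in the new default:

import numpy as np

probs = np.array([[0.6, 0.3, 0.1],
                  [0.1, 0.2, 0.7]])   # [T, V] posteriors, V == len(vocab)
vocab = ["<blank>", "a", "b"]         # blank sits at index 0 here
text = ctc_greedy_decoder(probs, vocab, blank_id=0)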

@ -15,6 +15,7 @@
import os
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -65,29 +66,51 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
loss = self.model(audio, audio_len, text, text_len)
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
self.optimizer.step()
self.optimizer.clear_grad()
iteration_time = time.time() - start
losses_np = {
'train_loss': float(loss),
}
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
self.iteration += 1
iteration_time = time.time() - start
msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
if dist.get_rank() == 0 and self.visualizer:
for k, v in losses_np.items():
# `step -1` since we update `step` after optimizer.step().
self.visualizer.add_scalar("train/{}".format(k), v,
self.iteration)
self.iteration += 1
self.iteration - 1)
@paddle.no_grad()
def valid(self):
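The accumulation logic added above, condensed into a standalone sketch; `model` is assumed to be a `paddle.DataParallel` instance, which is what provides `no_sync`:

from contextlib import nullcontext

def backward_with_accum(model, loss, batch_index, accum_grad):
    """Accumulate grads locally; only sync across DDP ranks on boundary steps."""
    boundary = (batch_index + 1) % accum_grad == 0
    context = nullcontext if boundary else model.no_sync
    with context():
        loss.backward()
    return boundary  # caller runs optimizer.step()/clear_grad() when True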

@ -21,6 +21,7 @@ from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.exps.u2.model import U2Trainer as Trainer
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
def main_sp(config, args):

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -33,6 +34,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate
@ -79,21 +81,35 @@ class U2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
utt, audio, audio_len, text, text_len = batch_data
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -169,40 +185,42 @@ class U2Trainer(Trainer):
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -0,0 +1,219 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains U2 model."""
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Evaluator
from deepspeech.models.u2 import U2Model
from deepspeech.models.u2 import U2Updater
from deepspeech.training.extensions.snapshot import Snapshot
from deepspeech.training.extensions.visualizer import VisualDL
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.training.updaters.trainer import Trainer as NewTrainer
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Trainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev)
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples; this yields fewer examples but no mismatch with training,
# and allows a large batch size to save time, so filter the test examples for now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
logger.info("Setup train/valid/test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
model_conf.defrost()
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
model.train()
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.encoder_conf.output_size,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
def optimizer_args(
config,
parameters,
lr_scheduler=None, ):
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
"grad_clip": train_config.global_grad_clip,
"weight_decay": optim_conf.weight_decay,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"parameters": parameters,
"epsilon": 1e-9 if optim_type == 'noam' else None,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
optimizer_kwargs = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimizer_kwargs)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
def setup_updater(self):
output_dir = self.output_dir
config = self.config.training
updater = U2Updater(
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
dataloader=self.train_loader,
output_dir=output_dir,
accum_grad=config.accum_grad)
trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)
evaluator = U2Evaluator(self.model, self.valid_loader)
trainer.extend(evaluator, trigger=(1, "epoch"))
if dist.get_rank() == 0:
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
num_snapshots = config.checkpoint.kbest_n
trainer.extend(
Snapshot(
mode='kbest',
max_size=num_snapshots,
indicator='VALID/LOSS',
less_better=True),
trigger=(1, 'epoch'))
# print(trainer.extensions)
# trainer.run()
self.trainer = trainer
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.setup_updater()
with Timer("Training Done: {}"):
self.trainer.run()

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -31,6 +32,7 @@ from deepspeech.io.dataloader import BatchDataLoader
from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate
@ -83,20 +85,34 @@ class U2Trainer(Trainer):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -175,35 +191,37 @@ class U2Trainer(Trainer):
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@ -37,6 +38,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.scheduler import WarmupLR
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.utils import bleu_score
from deepspeech.utils import ctc_utils
@ -83,6 +85,7 @@ class U2STTrainer(Trainer):
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
if isinstance(text, list) and isinstance(text_len, list):
# joint training with ASR. Two decoding texts [translation, transcription]
@ -94,18 +97,30 @@ class U2STTrainer(Trainer):
else:
loss, st_loss, attention_loss, ctc_loss = self.model(
audio, audio_len, text, text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad}
losses_np['st_loss'] = float(st_loss)
if attention_loss:
losses_np['att_loss'] = float(attention_loss)
if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
@ -193,35 +208,37 @@ class U2STTrainer(Trainer):
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))

@ -44,7 +44,7 @@ def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
def batch_collate(x):
"""de-tuple.
"""de-minibatch, since user compose batch.
Args:
x (List[Tuple]): [(utts, xs, ilens, ys, olens)]

@ -106,11 +106,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len
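The `x * masks` form relies on the right operand being promoted to the left operand's dtype, per the Paddle PR linked above. A tiny check of that assumption:

import paddle

x = paddle.randn([2, 4])                   # float32 activations
masks = paddle.ones([2, 4], dtype='bool')  # stand-in for the non-pad mask
y = x * masks                              # bool rhs promotes to x.dtype
assert y.dtype == x.dtype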

@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
))
share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
ctc_grad_norm_type='instance', ))
if config is not None:
config.merge_from_other_cfg(default)
return default
@ -141,7 +141,9 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
share_rnn_weights=True,
blank_id=0,
ctc_grad_norm_type='instance'):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
@ -156,10 +158,11 @@ class DeepSpeech2Model(nn.Layer):
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=0, # first token is <blank>
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
@ -221,7 +224,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights)
share_rnn_weights=config.model.share_rnn_weights,
blank_id=config.model.blank_id)
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -246,7 +250,8 @@ class DeepSpeech2Model(nn.Layer):
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights)
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id)
return model
@ -258,7 +263,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
share_rnn_weights=True,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
@ -266,7 +272,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
share_rnn_weights=share_rnn_weights,
blank_id=blank_id)
def forward(self, audio, audio_len):
"""export model function

@ -308,7 +308,8 @@ class RNNStack(nn.Layer):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -254,6 +254,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=True, #Use gru if set True. Use simple rnn if set False.
blank_id=0, # index of blank in vocob.txt
))
if config is not None:
config.merge_from_other_cfg(default)
@ -268,7 +269,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
use_gru=False,
blank_id=0):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
@ -284,10 +286,11 @@ class DeepSpeech2ModelOnline(nn.Layer):
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=0, # first token is <blank>
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type='instance')
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
@ -353,7 +356,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction=config.model.rnn_direction,
num_fc_layers=config.model.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list,
use_gru=config.model.use_gru)
use_gru=config.model.use_gru,
blank_id=config.model.blank_id)
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
@ -380,7 +384,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru)
use_gru=config.use_gru,
blank_id=config.blank_id)
return model
@ -394,7 +399,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
use_gru=False,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
@ -404,7 +410,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
use_gru=use_gru)
use_gru=use_gru,
blank_id=blank_id)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box):

@ -0,0 +1,19 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .u2 import U2InferModel
from .u2 import U2Model
from .updater import U2Evaluator
from .updater import U2Updater
__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]

@ -115,7 +115,8 @@ class U2BaseModel(nn.Layer):
ctc_weight: float=0.5,
ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0,
length_normalized_loss: bool=False):
length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__()
@ -661,9 +662,7 @@ class U2BaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
# @jit.to_static([
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
@ -830,6 +829,7 @@ class U2Model(U2BaseModel):
Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
@ -839,11 +839,13 @@ class U2Model(U2BaseModel):
else:
global_cmvn = None
# input & output dim
input_dim = configs['input_dim']
vocab_size = configs['output_dim']
assert input_dim != 0, input_dim
assert vocab_size != 0, vocab_size
# encoder
encoder_type = configs.get('encoder', 'transformer')
logger.info(f"U2 Encoder type: {encoder_type}")
if encoder_type == 'transformer':
@ -855,16 +857,21 @@ class U2Model(U2BaseModel):
else:
raise ValueError(f"not support encoder type:{encoder_type}")
# decoder
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
# ctc decoder and ctc loss
model_conf = configs['model_conf']
ctc = CTCDecoder(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=0.0,
dropout_rate=model_conf['ctc_dropoutrate'],
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
return vocab_size, encoder, decoder, ctc

@ -0,0 +1,149 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import paddle
from paddle import distributed as dist
from deepspeech.training.extensions.evaluator import StandardEvaluator
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.standard_updater import StandardUpdater
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Evaluator(StandardEvaluator):
def __init__(self, model, dataloader):
super().__init__(model, dataloader)
self.msg = ""
self.num_seen_utts = 0
self.total_loss = 0.0
def evaluate_core(self, batch):
self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
losses_dict = {}
loss, attention_loss, ctc_loss = self.model(*batch[1:])
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
self.num_seen_utts += num_utts
self.total_loss += float(loss) * num_utts
losses_dict['loss'] = float(loss)
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
for k, v in losses_dict.items():
report("eval/" + k, v)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
logger.info(self.msg)
return self.total_loss, self.num_seen_utts
class U2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
scheduler,
dataloader,
init_state=None,
accum_grad=1,
**kwargs):
super().__init__(
model, optimizer, scheduler, dataloader, init_state=init_state)
self.accum_grad = accum_grad
self.forward_count = 0
self.msg = ""
def update_core(self, batch):
"""One Step
Args:
batch (List[Object]): utts, xs, xlens, ys, ylens
"""
losses_dict = {}
self.msg = "Rank: {}, ".format(dist.get_rank())
# forward
batch_size = batch[1].shape[0]
loss, attention_loss, ctc_loss = self.model(*batch[1:])
# loss div by `batch_size * accum_grad`
loss /= self.accum_grad
# loss backward
if (self.forward_count + 1) != self.accum_grad:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# loss info
losses_dict['loss'] = float(loss) * self.accum_grad
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
# report loss
for k, v in losses_dict.items():
report("train/" + k, v)
# loss msg
self.msg += "batch size: {}, ".format(batch_size)
self.msg += "accum: {}, ".format(self.accum_grad)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
# Truncate the graph
loss.detach()
# update parameters
self.forward_count += 1
if self.forward_count != self.accum_grad:
return
self.forward_count = 0
self.optimizer.step()
self.optimizer.clear_grad()
self.scheduler.step()
def update(self):
# model is default in train mode
# training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch)
# #iterations with accum_grad > 1
# Ref.: https://github.com/espnet/espnet/issues/777
if self.forward_count == 0:
self.state.iteration += 1
if self.updates_per_epoch is not None:
if self.state.iteration % self.updates_per_epoch == 0:
self.state.epoch += 1

@ -413,26 +413,26 @@ class U2STBaseModel(nn.Layer):
best_hyps = best_hyps[:, 1:]
return best_hyps
@jit.to_static
# @jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
@jit.to_static
# @jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
@jit.to_static
# @jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
@jit.to_static
# @jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
@ -468,7 +468,7 @@ class U2STBaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
@jit.to_static
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
@ -643,13 +643,16 @@ class U2STModel(U2STBaseModel):
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
# ctc decoder and ctc loss
model_conf = configs['model_conf']
ctc = CTCDecoder(
odim=vocab_size,
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=0.0,
dropout_rate=model_conf['ctc_dropout_rate'],
reduction=True, # sum
batch_average=True) # sum / batch_size
batch_average=True, # sum / batch_size
grad_norm_type=model_conf['ctc_grad_norm_type'])
return vocab_size, encoder, (st_decoder, decoder, ctc)
else:

@ -15,12 +15,13 @@ from collections import OrderedDict
import paddle
from paddle import nn
from paddle.nn import functional as F
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"]
def brelu(x, t_min=0.0, t_max=24.0, name=None):
@ -30,6 +31,17 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
return x.maximum(t_min).minimum(t_max)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
class LinearGLUBlock(nn.Layer):
"""A linear Gated Linear Units (GLU) block."""
@ -133,13 +145,18 @@ def get_activation(act):
"""Return activation function."""
# Lazy load to avoid unused import
activation_funcs = {
"hardshrink": paddle.nn.Hardshrink,
"hardswish": paddle.nn.Hardswish,
"hardtanh": paddle.nn.Hardtanh,
"tanh": paddle.nn.Tanh,
"relu": paddle.nn.ReLU,
"relu6": paddle.nn.ReLU6,
"leakyrelu": paddle.nn.LeakyReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"gelu": paddle.nn.GELU,
"brelu": brelu,
"glu": GLU,
"elu": paddle.nn.ELU,
}
return activation_funcs[act]()
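The newly registered entries can then be pulled from the table by name; for example:

act = get_activation("glu")      # instantiates the GLU layer added above
ys = act(paddle.randn([4, 8]))   # GLU halves the gated dimension -> [4, 4]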

@ -113,11 +113,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -39,7 +39,8 @@ class CTCDecoder(nn.Layer):
blank_id=0,
dropout_rate: float=0.0,
reduction: bool=True,
batch_average: bool=True):
batch_average: bool=True,
grad_norm_type: str="instance"):
"""CTC decoder
Args:
@ -48,6 +49,7 @@ class CTCDecoder(nn.Layer):
dropout_rate (float): dropout rate (0.0 ~ 1.0)
reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
batch_average (bool): do batch dim wise average.
grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
"""
assert check_argument_types()
super().__init__()
@ -60,7 +62,8 @@ class CTCDecoder(nn.Layer):
self.criterion = CTCLoss(
blank=self.blank_id,
reduction=reduction_type,
batch_average=batch_average)
batch_average=batch_average,
grad_norm_type=grad_norm_type)
# CTCDecoder LM Score handle
self._ext_scorer = None
@ -136,7 +139,7 @@ class CTCDecoder(nn.Layer):
results = []
for i, probs in enumerate(probs_split):
output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list)
probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
results.append(output_transcription)
return results
@ -216,7 +219,8 @@ class CTCDecoder(nn.Layer):
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n)
cutoff_top_n=cutoff_top_n,
blank_id=self.blank_id)
results = [result[0][1] for result in beam_search_results]
return results

@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]
class CTCLoss(nn.Layer):
def __init__(self, blank=0, reduction='sum', batch_average=False):
def __init__(self,
blank=0,
reduction='sum',
batch_average=False,
grad_norm_type=None):
super().__init__()
# blank id for CTC; defaults to 0 (the first token)
self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
self.batch_average = batch_average
logger.info(
f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
# instance for norm_by_times
# batch for norm_by_batchsize
# frame for norm_by_total_logits_len
assert grad_norm_type in ('instance', 'batch', 'frame', None)
self.norm_by_times = False
self.norm_by_batchsize = False
self.norm_by_total_logits_len = False
logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
if grad_norm_type == 'instance':
self.norm_by_times = True
if grad_norm_type == 'batch':
self.norm_by_batchsize = True
if grad_norm_type == 'frame':
self.norm_by_total_logits_len = True
def forward(self, logits, ys_pad, hlens, ys_lens):
"""Compute CTC loss.
@ -46,10 +67,15 @@ class CTCLoss(nn.Layer):
# warp-ctc need activation with shape [T, B, V + 1]
# logits: (B, L, D) -> (L, B, D)
logits = logits.transpose([1, 0, 2])
# (TODO:Hui Zhang) ctc loss does not support int64 labels
ys_pad = ys_pad.astype(paddle.int32)
loss = self.loss(
logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
logits,
ys_pad,
hlens,
ys_lens,
norm_by_times=self.norm_by_times,
norm_by_batchsize=self.norm_by_batchsize,
norm_by_total_logits_len=self.norm_by_total_logits_len)
if self.batch_average:
# Batch-size average
loss = loss / B
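A sketch of how the new `grad_norm_type` knob maps onto the flags above (any value outside the four allowed ones trips the assert):

# 'instance' -> norm_by_times, 'batch' -> norm_by_batchsize,
# 'frame' -> norm_by_total_logits_len, None -> no normalization.
ctc_loss = CTCLoss(blank=0, reduction='sum', batch_average=True,
                   grad_norm_type='instance')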

@ -308,7 +308,7 @@ class RNNStack(nn.Layer):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
# https://github.com/PaddlePaddle/Paddle/pull/29265
# rhs will type promote to lhs
x = x * masks
return x, x_len

@ -13,14 +13,18 @@
# limitations under the License.
from typing import Dict
import extension
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from . import extension
from ..reporter import DictSummary
from ..reporter import report
from ..reporter import scope
from ..timer import Timer
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class StandardEvaluator(extension.Extension):
@ -43,6 +47,27 @@ class StandardEvaluator(extension.Extension):
def evaluate_core(self, batch):
# compute
self.model(batch) # you may report here
return
def evaluate_sync(self, data):
# dist sync `evaluate_core` outputs
if data is None:
return
numerator, denominator = data
if dist.get_world_size() > 1:
numerator = paddle.to_tensor(numerator)
denominator = paddle.to_tensor(denominator)
# the default operator in all_reduce function is sum.
dist.all_reduce(numerator)
dist.all_reduce(denominator)
value = numerator / denominator
value = float(value)
else:
value = numerator / denominator
# used for `snapshot` to do kbest save.
report("VALID/LOSS", value)
logger.info(f"Valid: all-reduce loss {value}")
def evaluate(self):
# switch to eval mode
@ -56,9 +81,13 @@ class StandardEvaluator(extension.Extension):
with scope(observation):
# main evaluation computation here.
with paddle.no_grad():
self.evaluate_core(batch)
self.evaluate_sync(self.evaluate_core(batch))
summary.add(observation)
summary = summary.compute_mean()
# switch to train mode
for model in self.models.values():
model.train()
return summary
def __call__(self, trainer=None):
@ -66,6 +95,7 @@ class StandardEvaluator(extension.Extension):
# if it is used to extend a trainer, the metrics is reported to
# to observation of the trainer
# or otherwise, you can use your own observation
summary = self.evaluate()
with Timer("Eval Time Cost: {}"):
summary = self.evaluate()
for k, v in summary.items():
report(k, v)
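The same all-reduce averaging in isolation, assuming an initialized `paddle.distributed` process group (`all_reduce` defaults to sum):

import paddle
from paddle import distributed as dist

def all_reduce_mean(numerator, denominator):
    if dist.get_world_size() > 1:
        num = paddle.to_tensor(numerator)
        den = paddle.to_tensor(denominator)
        dist.all_reduce(num)   # sum numerators over ranks
        dist.all_reduce(den)   # sum denominators over ranks
        return float(num / den)
    return numerator / denominator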

@ -20,8 +20,9 @@ from typing import List
import jsonlines
from deepspeech.training.extensions import extension
from deepspeech.training.updaters.trainer import Trainer
from . import extension
from ..reporter import get_observations
from ..updaters.trainer import Trainer
from deepspeech.utils.log import Log
from deepspeech.utils.mp_tools import rank_zero_only
@ -52,8 +53,19 @@ class Snapshot(extension.Extension):
priority = -100
default_name = "snapshot"
def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
def __init__(self,
mode='latest',
max_size: int=5,
indicator=None,
less_better=True,
snapshot_on_error: bool=False):
self.records: List[Dict[str, Any]] = []
assert mode in ('latest', 'kbest'), mode
if mode == 'kbest':
assert indicator is not None
self.mode = mode
self.indicator = indicator
self.less_is_better = less_better
self.max_size = max_size
self._snapshot_on_error = snapshot_on_error
self._save_all = (max_size == -1)
@ -66,16 +78,17 @@ class Snapshot(extension.Extension):
# load existing records
record_path: Path = self.checkpoint_dir / "records.jsonl"
if record_path.exists():
logger.debug("Loading from an existing checkpoint dir")
self.records = load_records(record_path)
trainer.updater.load(self.records[-1]['path'])
ckpt_path = self.records[-1]['path']
logger.info(f"Loading from an existing checkpoint {ckpt_path}")
trainer.updater.load(ckpt_path)
def on_error(self, trainer, exc, tb):
if self._snapshot_on_error:
self.save_checkpoint_and_update(trainer)
self.save_checkpoint_and_update(trainer, 'latest')
def __call__(self, trainer: Trainer):
self.save_checkpoint_and_update(trainer)
self.save_checkpoint_and_update(trainer, self.mode)
def full(self):
"""Whether the number of snapshots it keeps track of is greater
@ -83,7 +96,7 @@ class Snapshot(extension.Extension):
return (not self._save_all) and len(self.records) > self.max_size
@rank_zero_only
def save_checkpoint_and_update(self, trainer: Trainer):
def save_checkpoint_and_update(self, trainer: Trainer, mode: str):
"""Saving new snapshot and remove the oldest snapshot if needed."""
iteration = trainer.updater.state.iteration
epoch = trainer.updater.state.epoch
@ -97,11 +110,17 @@ class Snapshot(extension.Extension):
'path': str(path.resolve()), # use absolute path
'iteration': iteration,
'epoch': epoch,
'indicator': get_observations()[self.indicator]
}
self.records.append(record)
# remove the earliest
if self.full():
if mode == 'kbest':
self.records = sorted(
self.records,
key=lambda record: record['indicator'],
reverse=not self.less_is_better)
earliest_record = self.records[0]
os.remove(earliest_record["path"])
self.records.pop(0)

@ -11,8 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from deepspeech.training.extensions import extension
from deepspeech.training.updaters.trainer import Trainer
from visualdl import LogWriter
from . import extension
from ..updaters.trainer import Trainer
class VisualDL(extension.Extension):
@ -26,8 +28,8 @@ class VisualDL(extension.Extension):
default_name = 'visualdl'
priority = extension.PRIORITY_READER
def __init__(self, writer):
self.writer = writer
def __init__(self, output_dir):
self.writer = LogWriter(str(output_dir))
def __call__(self, trainer: Trainer):
for k, v in trainer.observation.items():

@ -47,7 +47,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square)
# debug log
# debug log, not dump all since slow down train process
if i < 10:
logger.debug(
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
@ -76,7 +76,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad))
# debug log
# debug log, not dump all since slow down train process
if i < 10:
logger.debug(
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"

@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import time
from deepspeech.utils.log import Log
__all__ = ["Timer"]
logger = Log(__name__).getlog()
class Timer():
"""To be used like this:
with Timer("Message") as value:
do something
"""
def __init__(self, message=None):
self.message = message
def duration(self) -> str:
elapsed_time = time.time() - self.start
time_str = str(datetime.timedelta(seconds=elapsed_time))
return time_str
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, type, value, traceback):
if self.message:
logger.info(self.message.format(self.duration()))
def __call__(self) -> float:
return time.time() - self.start
def __str__(self):
return self.duration()

@ -18,6 +18,7 @@ import paddle
from paddle import distributed as dist
from tensorboardX import SummaryWriter
from deepspeech.training.timer import Timer
from deepspeech.utils import mp_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
@ -170,7 +171,7 @@ class Trainer():
self.iteration = 0
self.epoch = 0
scratch = True
logger.info("Restore/Init checkpoint!")
return scratch
def new_epoch(self):
@ -194,35 +195,37 @@ class Trainer():
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch:
self.model.train()
try:
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
try:
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts)
# the default operator in all_reduce function is sum.
dist.all_reduce(num_seen_utts)
total_loss = paddle.to_tensor(total_loss)
dist.all_reduce(total_loss)
cv_loss = total_loss / num_seen_utts
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -240,14 +243,14 @@ class Trainer():
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
try:
self.train()
except KeyboardInterrupt:
self.save()
exit(-1)
finally:
self.destory()
logger.info("Training Done.")
with Timer("Training Done: {}"):
try:
self.train()
except KeyboardInterrupt:
self.save()
exit(-1)
finally:
self.destory()
def setup_output_dir(self):
"""Create a directory used for output.

@ -14,12 +14,12 @@
from typing import Dict
from typing import Optional
from paddle import Tensor
import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from timer import timer
from paddle.optimizer.lr import LRScheduler
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.updater import UpdaterBase
@ -39,8 +39,10 @@ class StandardUpdater(UpdaterBase):
def __init__(self,
model: Layer,
optimizer: Optimizer,
scheduler: LRScheduler,
dataloader: DataLoader,
init_state: Optional[UpdaterState]=None):
super().__init__(init_state)
# it is designed to hold multiple models
models = {"main": model}
self.models: Dict[str, Layer] = models
@ -51,15 +53,14 @@ class StandardUpdater(UpdaterBase):
self.optimizer = optimizer
self.optimizers: Dict[str, Optimizer] = optimizers
# it is designed to hold multiple scheduler
schedulers = {"main": scheduler}
self.scheduler = scheduler
self.schedulers: Dict[str, LRScheduler] = schedulers
# dataloaders
self.dataloader = dataloader
# init state
if init_state is None:
self.state = UpdaterState()
else:
self.state = init_state
self.train_iterator = iter(dataloader)
def update(self):
@ -103,8 +104,10 @@ class StandardUpdater(UpdaterBase):
model.train()
# training for a step is implemented here
batch = self.read_batch()
self.update_core(batch)
with Timier("data time cost:{}"):
batch = self.read_batch()
with Timier("step time cost:{}"):
self.update_core(batch)
self.state.iteration += 1
if self.updates_per_epoch is not None:
@ -115,13 +118,14 @@ class StandardUpdater(UpdaterBase):
"""A simple case for a training step. Basic assumptions are:
Single model;
Single optimizer;
Single scheduler, and update learning rate each step;
A batch from the dataloader is just the input of the model;
The model returns a single loss, or a dict containing several losses.
Parameters updates at every batch, no gradient accumulation.
"""
loss = self.model(*batch)
if isinstance(loss, Tensor):
if isinstance(loss, paddle.Tensor):
loss_dict = {"main": loss}
else:
# Dict[str, Tensor]
@ -135,14 +139,15 @@ class StandardUpdater(UpdaterBase):
for name, loss_item in loss_dict.items():
report(name, float(loss_item))
self.optimizer.clear_gradient()
self.optimizer.clear_grad()
loss_dict["main"].backward()
self.optimizer.update()
self.optimizer.step()
self.scheduler.step()
@property
def updates_per_epoch(self):
"""Number of updater per epoch, determined by the length of the
dataloader."""
"""Number of steps per epoch,
determined by the length of the dataloader."""
length_of_dataloader = None
try:
length_of_dataloader = len(self.dataloader)
@ -163,18 +168,16 @@ class StandardUpdater(UpdaterBase):
def read_batch(self):
"""Read a batch from the data loader, auto renew when data is exhausted."""
with timer() as t:
try:
batch = next(self.train_iterator)
except StopIteration:
self.new_epoch()
batch = next(self.train_iterator)
logger.debug(
f"Read a batch takes {t.elapse}s.") # replace it with logger
try:
batch = next(self.train_iterator)
except StopIteration:
self.new_epoch()
batch = next(self.train_iterator)
return batch
def state_dict(self):
"""State dict of a Updater, model, optimizer and updater state are included."""
"""State dict of a Updater, model, optimizers/schedulers
and updater state are included."""
state_dict = super().state_dict()
for name, model in self.models.items():
state_dict[f"{name}_params"] = model.state_dict()
@ -184,7 +187,7 @@ class StandardUpdater(UpdaterBase):
def set_state_dict(self, state_dict):
"""Set state dict for a Updater. Parameters of models, states for
optimizers and UpdaterState are restored."""
optimizers/schedulers and UpdaterState are restored."""
for name, model in self.models.items():
model.set_state_dict(state_dict[f"{name}_params"])
for name, optim in self.optimizers.items():

@ -140,8 +140,8 @@ class Trainer():
try:
while not stop_trigger(self):
self.observation = {}
# set observation as the report target
# you can use report freely in Updater.update()
# set observation as the `report` target
# you can use `report` freely in Updater.update()
# updating parameters and state
with scope(self.observation):

@ -52,6 +52,7 @@ class UpdaterBase():
"""
def __init__(self, init_state=None):
# init state
if init_state is None:
self.state = UpdaterState()
else:

@ -114,13 +114,13 @@ class Checkpoint():
params_path = checkpoint_path + ".pdparams"
model_dict = paddle.load(params_path)
model.set_state_dict(model_dict)
logger.info("Rank {}: loaded model from {}".format(rank, params_path))
logger.info("Rank {}: Restore model from {}".format(rank, params_path))
optimizer_path = checkpoint_path + ".pdopt"
if optimizer and os.path.isfile(optimizer_path):
optimizer_dict = paddle.load(optimizer_path)
optimizer.set_state_dict(optimizer_dict)
logger.info("Rank {}: loaded optimizer state from {}".format(
logger.info("Rank {}: Restore optimizer state from {}".format(
rank, optimizer_path))
info_path = re.sub('.pdparams$', '.json', params_path)

@ -12,19 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
import logging
import os
import socket
import sys
from loguru import logger
from paddle import inference
FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
logging.basicConfig(
level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
def find_log_dir(log_dir=None):
"""Returns the most suitable directory to put log files into.
@ -98,59 +92,28 @@ def find_log_dir_and_names(program_name=None, log_dir=None):
class Log():
log_name = None
def __init__(self, logger=None):
self.logger = logging.getLogger(logger)
self.logger.setLevel(logging.DEBUG)
file_dir = os.getcwd() + '/log'
if not os.path.exists(file_dir):
os.mkdir(file_dir)
self.log_dir = file_dir
actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names(
program_name=None, log_dir=self.log_dir)
basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
filename = os.path.join(actual_log_dir, basename)
if Log.log_name is None:
Log.log_name = filename
# Create a symlink to the log file with a canonical name.
symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
try:
if os.path.islink(symlink):
os.unlink(symlink)
os.symlink(os.path.basename(Log.log_name), symlink)
except EnvironmentError:
# If it fails, we're sad but it's no error. Commonly, this
# fails because the symlink was created by another user and so
# we can't modify it
pass
if not self.logger.hasHandlers():
formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
fh = logging.FileHandler(Log.log_name)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
self.logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
self.logger.addHandler(ch)
# stop propagate for propagating may print
# log multiple times
self.logger.propagate = False
"""Default Logger for all."""
logger.remove()
logger.add(
sys.stdout,
level='INFO',
enqueue=True,
filter=lambda record: record['level'].no >= 20)
_, file_prefix, _ = find_log_dir_and_names()
sink_prefix = os.path.join("exp/log", file_prefix)
sink_path = sink_prefix[:-3] + "{time}.log"
logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB")
def __init__(self, name=None):
pass
def getlog(self):
return self.logger
return logger
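With this rewrite, callers keep the old `Log(__name__).getlog()` interface but receive the shared loguru logger configured above (INFO and above to stdout, everything from DEBUG up to the rotating file sink). A quick sketch of the intended use:

```python
from deepspeech.utils.log import Log

logger = Log(__name__).getlog()
logger.info("shown on stdout and written to the exp/log file sink")
logger.debug("only written to the exp/log file sink (DEBUG level)")
```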
class Autolog:
"""Just used by fullchain project"""
def __init__(self,
batch_size,
model_name="DeepSpeech",

Binary file not shown.

Binary file not shown.

@ -1,16 +0,0 @@
# Benchmarks
## Acceleration with Multi-GPUs
We compare the training time on 1, 2, 4 and 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). The results show a **near-linear** speedup with multiple GPUs. In the following figure, the training time (in seconds) is printed on the blue bars.
<img src="../images/multi_gpu_speedup.png" width=450>
| # of GPU | Acceleration Rate |
| -------- | --------------: |
| 1 | 1.00 X |
| 2 | 1.98 X |
| 4 | 3.73 X |
| 8 | 6.95 X |
`utils/profile.sh` provides such a demo profiling tool; you can adapt it as needed.


@ -1,8 +1,8 @@
# Deepspeech2
## Streaming
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction rnn layers.
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction rnn layers.
To illustrate the model implementation clearly, 3 parts are described in detail.
- Data Preparation
@ -11,10 +11,10 @@ To illustrate the model implementation clearly, 3 parts are described in detail.
In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The Arcitecture of deepspeech2 online model
</p>
@ -28,17 +28,17 @@ For English data, the vocabulary dictionary is composed of 26 English characters
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
# vocabulary for librispeech dataset (English)
vi examples/librispeech/s0/data/vocab.txt
```
#### CMVN
For CMVN, a subset of or the full training set is selected and used to compute the feature mean and std.
For CMVN, a subset of or the full training set is selected and used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
@ -52,16 +52,16 @@ python3 ../../../utils/compute_mean_std.py \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
--output_path="data/mean_std.json"
```
#### Feature Extraction
For feature extraction, three methods are implemented: linear (FFT without a filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
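As a rough illustration of the linear method, here is a minimal numpy sketch of an STFT magnitude spectrogram without any filter bank; the parameter defaults are illustrative, not the exact values used in `audio_featurizer.py`:

```python
import numpy as np

def linear_spectrogram(samples, sample_rate=16000,
                       stride_ms=10.0, window_ms=20.0):
    """Magnitude spectrogram via short-time FFT, no filter bank applied."""
    stride = int(sample_rate * stride_ms / 1000)
    window = int(sample_rate * window_ms / 1000)
    weighting = np.hanning(window)
    frames = [samples[i:i + window] * weighting
              for i in range(0, len(samples) - window + 1, stride)]
    # rows: frequency bins, columns: frames
    return np.abs(np.fft.rfft(np.stack(frames), axis=1)).T
```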
### Encoder
@ -70,7 +70,7 @@ The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
```
### Decoder
To get the per-frame character probabilities, the frame-level feature representations output by the encoder are fed into a projection layer, implemented as a dense layer, whose output dimension equals the vocabulary size. After the projection layer, the softmax function transforms the frame-level representations into character probabilities. At inference time, the per-frame character probabilities are fed into the CTC decoder to obtain the final speech recognition results.
The code of Decoder is in:
@ -80,7 +80,7 @@ vi deepspeech/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
```
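The projection-plus-softmax step described above can be sketched in a few lines of paddle; this is a simplified illustration, not the repository's decoder code, and the sizes are placeholders:

```python
import paddle.nn as nn
import paddle.nn.functional as F

encoder_dim = 1024   # e.g. rnn_layer_size in the configs
vocab_size = 4233    # placeholder; the real value is len(data/vocab.txt)

proj = nn.Linear(encoder_dim, vocab_size)

def frame_probs(encoder_out):          # [batch, time, encoder_dim]
    logits = proj(encoder_out)         # [batch, time, vocab_size]
    # per-frame character probabilities, later consumed by the CTC decoder
    return F.softmax(logits, axis=-1)
```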
## Training Process
Using the command below, you can train the deepspeech2 online model.
```
@ -121,8 +121,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
```
By using the command above, the training process can be started. There are 5 stages in "run.sh", and the first 3 are used for training. Stage 0 is for data preparation: the dataset is downloaded, and the manifest files, vocabulary dictionary and CMVN file are generated in "./data/". Stage 1 trains the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 generates the final model for prediction by averaging the top-k checkpoints ranked by validation loss.
## Testing Process
Using the command below, you can test the deepspeech2 online model.
```
@ -131,7 +132,7 @@ Using the command below, you can test the deepspeech2 online model.
The detail commands are:
```
conf_path=conf/deepspeech2_online.yaml
avg_num=1
avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
@ -152,29 +153,29 @@ fi
```
After training, stages 3, 4 and 5 are used for testing. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph using the "paddle.jit" library. Stage 5 tests the static-graph model.
## Non-Streaming
The deepspeech2 offline model is similar to the deepspeech2 online model. The main difference is that the offline model uses stacked bi-directional rnn layers, while the online model uses single-direction rnn layers and no fc layer. For the stacked bi-directional rnn layers in the offline model, both rnn cells and gru cells are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The Arcitecture of deepspeech2 offline model
</p>
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
The code of encoder and decoder for deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
```
The training and testing processes of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.
Only a few differences should be noted.
For training and testing, the "model_type" and the "conf_path" must be set.
For training and testing, the "model_type" and the "conf_path" must be set.
```
# Training offline
cd examples/aishell/s0
@ -185,5 +186,3 @@ bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deeps
cd examples/aishell/s0
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
```

@ -40,9 +40,12 @@ model:
rnn_layer_size: 1024
use_gru: True
share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 80
accum_grad: 1
lr: 2e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -36,17 +36,20 @@ collator:
model:
num_conv_layers: 2
num_rnn_layers: 3
num_rnn_layers: 5
rnn_layer_size: 1024
rnn_direction: forward # [forward, bidirect]
num_fc_layers: 1
fc_layers_size_list: 512,
num_fc_layers: 0
fc_layers_size_list: -1,
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 1
lr: 2e-3
lr_decay: 0.91 # 0.83
lr_decay: 0.9 # 0.83
weight_decay: 1e-06
global_grad_clip: 3.0
log_interval: 100
@ -59,7 +62,7 @@ decoding:
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
alpha: 2.2 #1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
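For context, `alpha` and `beta` here are the usual DeepSpeech2-style beam-search weights (a general note, not quoted from this repo): the decoder ranks hypotheses by `log P_acoustic(y|x) + alpha * log P_lm(y) + beta * word_count(y)`, so raising `alpha` from 1.9 to 2.2 gives the ngram language model more influence.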

@ -1,20 +0,0 @@
#!/bin/bash
source path.sh
# run on MacOS
# brew install portaudio
# pip install pyaudio
# pip install keyboard
# start demo client
python3 -u ${BIN_DIR}/deploy/client.py \
--host_ip="localhost" \
--host_port=8086 \
if [ $? -ne 0 ]; then
echo "Failed in starting demo client!"
exit 1
fi
exit 0

@ -1,40 +0,0 @@
#!/bin/bash
# TODO: replace the model with a mandarin model
if [[ $# != 1 ]];then
echo "usage: $1 checkpoint_path"
exit -1
fi
source path.sh
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
# download well-trained model
#bash local/download_model.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
# start demo server
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/deploy/server.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--host_ip="localhost" \
--host_port=8086 \
--speech_save_dir="demo_cache" \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in starting demo server!"
exit 1
fi
exit 0

@ -20,7 +20,7 @@ fi
mkdir -p exp
seed=10086
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -32,7 +32,7 @@ python3 -u ${BIN_DIR}/train.py \
--model_type ${model_type} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -1,28 +0,0 @@
#!/bin/bash
# grid-search for hyper-parameters in language model
python3 -u ${BIN_DIR}/tune.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--num_batches=10 \
--batch_size=128 \
--beam_size=300 \
--num_proc_bsearch=8 \
--num_alphas=10 \
--num_betas=10 \
--alpha_from=0.0 \
--alpha_to=5.0 \
--beta_from=-6 \
--beta_to=6 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0

@ -27,7 +27,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -71,6 +71,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -0,0 +1,58 @@
# [CC-CEDICT](https://cc-cedict.org/wiki/)
What is CC-CEDICT?
CC-CEDICT is a continuation of the CEDICT project.
The objective of the CEDICT project was to create an online, downloadable (as opposed to searchable-only) public-domain Chinese-English dictionary.
CEDICT was started by Paul Andrew Denisowski in October 1997.
For the most part, the project is modeled on Jim Breen's highly successful EDICT (Japanese-English dictionary) project and is intended to be a collaborative effort,
with users providing entries and corrections to the main file.
## Parse CC-CEDICT to JSON format
1. Parse to JSON
```
run.sh
```
2. Result
```
exp/
|-- cedict
`-- cedict.json
0 directories, 2 files
```
```
4c4bffc84e24467fe1b2ea9ba37ed6b6 exp/cedict
3adf504dacd13886f88cc9fe3b37c75d exp/cedict.json
```
```
==> exp/cedict <==
# CC-CEDICT
# Community maintained free Chinese-English dictionary.
#
# Published by MDBG
#
# License:
# Creative Commons Attribution-ShareAlike 4.0 International License
# https://creativecommons.org/licenses/by-sa/4.0/
#
# Referenced works:
==> exp/cedict.json <==
{"traditional": "2019\u51a0\u72c0\u75c5\u6bd2\u75c5", "simplified": "2019\u51a0\u72b6\u75c5\u6bd2\u75c5", "pinyin": "er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4", "english": "COVID-19, the coronavirus disease identified in 2019"}
{"traditional": "21\u4e09\u9ad4\u7d9c\u5408\u75c7", "simplified": "21\u4e09\u4f53\u7efc\u5408\u75c7", "pinyin": "er4 shi2 yi1 san1 ti3 zong1 he2 zheng4", "english": "trisomy"}
{"traditional": "3C", "simplified": "3C", "pinyin": "san1 C", "english": "abbr. for computers, communications, and consumer electronics"}
{"traditional": "3P", "simplified": "3P", "pinyin": "san1 P", "english": "(slang) threesome"}
{"traditional": "3Q", "simplified": "3Q", "pinyin": "san1 Q", "english": "(Internet slang) thank you (loanword)"}
{"traditional": "421", "simplified": "421", "pinyin": "si4 er4 yi1", "english": "four grandparents, two parents and an only child"}
{"traditional": "502\u81a0", "simplified": "502\u80f6", "pinyin": "wu3 ling2 er4 jiao1", "english": "cyanoacrylate glue"}
{"traditional": "88", "simplified": "88", "pinyin": "ba1 ba1", "english": "(Internet slang) bye-bye (alternative for \u62dc\u62dc[bai2 bai2])"}
{"traditional": "996", "simplified": "996", "pinyin": "jiu3 jiu3 liu4", "english": "9am-9pm, six days a week (work schedule)"}
{"traditional": "A", "simplified": "A", "pinyin": "A", "english": "(slang) (Tw) to steal"}
```
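Each CC-CEDICT entry has the form `traditional simplified [pinyin] /gloss/.../`. A minimal parsing sketch that produces JSON lines like those above (gloss handling simplified; the actual `run.sh` pipeline may differ):

```python
import json
import re

# entry format: `traditional simplified [pinyin] /gloss/gloss/.../`
LINE_RE = re.compile(r'^(\S+) (\S+) \[([^\]]+)\] /(.+)/$')

def parse_line(line):
    m = LINE_RE.match(line.strip())
    if m is None:
        return None  # comment or malformed entry
    traditional, simplified, pinyin, english = m.groups()
    return {"traditional": traditional, "simplified": simplified,
            "pinyin": pinyin, "english": english.replace("/", "; ")}

with open("exp/cedict", encoding="utf-8") as fin, \
        open("exp/cedict.json", "w", encoding="utf-8") as fout:
    for raw in fin:
        if raw.startswith("#"):
            continue
        entry = parse_line(raw)
        if entry is not None:
            fout.write(json.dumps(entry) + "\n")
```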

@ -1,5 +0,0 @@
# Download Baker dataset
The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.
Download URL https://test.data-baker.com/#/data/index/source.

@ -0,0 +1,3 @@
# G2P
* zh - Chinese G2P

@ -0,0 +1,93 @@
# G2P
* WS
jieba
* G2P
pypinyin
* Tone sandhi
simple
We recommend using the [Parakeet](https://github.com/PaddlePaddle/Parakeet) [TextFrontEnd](https://github.com/PaddlePaddle/Parakeet/blob/develop/parakeet/frontend/__init__.py) to do G2P.
The phoneme set should be changed; you can refer to `examples/thchs30/a0/data/dict/syllable.lexicon`.
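A minimal sketch of the jieba + pypinyin pipeline listed above, using tone-number style to match `trans.jieba.pinyin`; tone sandhi is omitted, so the output can differ from `ref.pinyin`:

```python
import jieba
from pypinyin import Style, lazy_pinyin

def g2p(text):
    """Segment with jieba, then convert each word to tone-number pinyin."""
    syllables = []
    for word in jieba.cut(text):
        syllables.extend(
            lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True))
    return " ".join(syllables)

print(g2p("卡尔普陪外孙玩滑梯"))
# expected (approximately): ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
```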
## Download Baker dataset
The [Baker](https://test.data-baker.com/#/data/index/source) dataset has to be downloaded manually and moved to './data',
because you have to pass a `CAPTCHA` in a browser to download it.
## RUN
```
. path.sh
./run.sh
```
## Result
```
exp/
|-- 000001-010000.txt
|-- ref.pinyin
|-- trans.jieba.pinyin
`-- trans.pinyin
0 directories, 4 files
```
```
4f5a368441eb16aaf43dc1972f8b63dd exp/000001-010000.txt
01707896391c2de9b6fc4a39654be942 exp/ref.pinyin
43380ef160f65a23a3a0544700aa49b8 exp/trans.jieba.pinyin
8e6ff1fc22d8e8584082e804e8bcdeb7 exp/trans.pinyin
```
```
==> exp/000001-010000.txt <==
000001 卡尔普#2陪外孙#1玩滑梯#4。
ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 假语村言#2别再#1拥抱我#4。
jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 宝马#1配挂#1跛骡鞍#3貂蝉#1怨枕#2董翁榻#4。
bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 邓小平#2与#1撒切尔#2会晤#4。
deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 老虎#1幼崽#2与#1宠物犬#1玩耍#4。
lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
==> exp/ref.pinyin <==
000001 ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu2 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan2 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi2 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.jieba.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
```

@ -1,4 +1,4 @@
export MAIN_ROOT=`realpath ${PWD}/../../`
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

@ -6,16 +6,19 @@ stage=-1
stop_stage=100
exp_dir=exp
data_dir=data
data=data
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
mkdir -p ${exp_dir}
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
test -e ${data}/BZNSYP.rar || { echo "Please download BZNSYP.rar and put it in ${data}"; exit -1; }
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "stage 0: Extracting Prosody Labeling"
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi
# convert Chinese transcriptions into pinyin with pypinyin or jieba+pypinyin

@ -1,10 +1,17 @@
# LibriSpeech
## Data
| Data Subset | Duration in Seconds |
| --- | --- |
| data/manifest.train | 0.83s ~ 29.735s |
| data/manifest.dev | 1.065s ~ 35.155s |
| data/manifest.test-clean | 1.285s ~ 34.955s |
## Deepspeech2
| Model | Params | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | 14.49190807 | test-clean | 0.067283 |
| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
| DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 |
| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | test-clean | 15.184467315673828 | 0.072154 |
| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | test-clean | - | 0.073973 |
| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 |

@ -4,14 +4,14 @@ data:
dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean
min_input_len: 0.0
max_input_len: 27.0 # second
max_input_len: 30.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator:
batch_size: 20
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
@ -40,9 +40,12 @@ model:
rnn_layer_size: 2048
use_gru: False
share_rnn_weights: True
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 4
lr: 1e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -4,14 +4,14 @@ data:
dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean
min_input_len: 0.0
max_input_len: 27.0 # second
max_input_len: 30.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator:
batch_size: 20
batch_size: 15
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
@ -42,9 +42,12 @@ model:
num_fc_layers: 2
fc_layers_size_list: 512, 256
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 50
accum_grad: 4
lr: 1e-3
lr_decay: 0.83
weight_decay: 1e-06

@ -20,8 +20,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -33,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
--model_type ${model_type} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -1,33 +0,0 @@
#!/bin/bash
if [ $# != 1 ];then
echo "usage: tune ckpt_path"
exit 1
fi
# grid-search for hyper-parameters in language model
python3 -u ${BIN_DIR}/tune.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--num_batches=-1 \
--batch_size=128 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_alphas=45 \
--num_betas=8 \
--alpha_from=1.0 \
--alpha_to=3.2 \
--beta_from=0.1 \
--beta_to=0.45 \
--cutoff_prob=1.0 \
--cutoff_top_n=40 \
--checkpoint_path ${1}
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -69,6 +69,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,6 +72,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -33,7 +33,7 @@ collator:
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
num_workers: 0
# network architecture
@ -67,6 +67,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -31,7 +31,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -24,7 +24,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -76,6 +76,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -69,6 +69,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,6 +72,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -22,7 +22,7 @@ collator:
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/augmentation.json
num_workers: 2
num_workers: 0
subsampling_factor: 1
num_encs: 1
@ -58,6 +58,8 @@ model:
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -19,8 +19,8 @@ echo "using ${device}..."
mkdir -p exp
seed=1024
if [ ${seed} ]; then
seed=10086
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
@ -32,7 +32,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} ]; then
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi

@ -25,7 +25,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

@ -0,0 +1,3 @@
# Ngram LM
* s0 - kenlm ngram lm

@ -2,6 +2,95 @@
Train a Chinese character ngram LM with [kenlm](https://github.com/kpu/kenlm).
## Run
```
. path.sh
bash run.sh
```
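Once `run.sh` has produced the arpa and binary models, they can be loaded with the kenlm Python module for scoring; a small sketch (the file name is taken from the Results listing below):

```python
import kenlm

# binary char-level model produced by run.sh (see Results)
model = kenlm.Model("exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin")

# kenlm scores space-separated tokens; the char LM expects single characters
sentence = " ".join("少先队员因该为老人让坐")
print(model.score(sentence, bos=True, eos=True))  # total log10 probability
```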
## Results
```
exp/
|-- text
|-- text.char.tn
|-- text.word.tn
|-- text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa
|-- text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin
|-- text_zh_word_o3_p0_0_0_a22_q8_b8.arpa
`-- text_zh_word_o3_p0_0_0_a22_q8_b8.arpa.klm.bin
0 directories, 7 files
```
```
3ae083627b9b6cef1a82d574d8483f97 exp/text
d97da252d2a63a662af22f98af30cb8c exp/text.char.tn
c18b03005bd094dbfd9b46442be361fd exp/text.word.tn
73dbf50097896eda33985e11e1ba9a3a exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa
01334e2044c474b99c4f2ffbed790626 exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa.klm.bin
36a42de548045b54662411ae7982c77f exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa
332422803ffd73dd7ffd16cd2b0abcd5 exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa.klm.bin
```
```
==> exp/text <==
少先队员因该为老人让坐
祛痘印可以吗?有效果吗?
不知这款牛奶口感怎样? 小孩子喝行吗!
是转基因油?
我家宝宝13斤用多大码的
会起坨吗?
请问给送上楼吗?
亲是送赁上门吗
送货时候有外包装没有还是直接发货过来
会不会有坏的?
==> exp/text.char.tn <==
少 先 队 员 因 该 为 老 人 让 坐
祛 痘 印 可 以 吗 有 效 果 吗
不 知 这 款 牛 奶 口 感 怎 样 小 孩 子 喝 行 吗
是 转 基 因 油
我 家 宝 宝 十 三 斤 用 多 大 码 的
会 起 坨 吗
请 问 给 送 上 楼 吗
亲 是 送 赁 上 门 吗
送 货 时 候 有 外 包 装 没 有 还 是 直 接 发 货 过 来
会 不 会 有 坏 的
==> exp/text.word.tn <==
少先队员 因该 为 老人 让 坐
祛痘 印 可以 吗 有 效果 吗
不知 这 款 牛奶 口感 怎样 小孩子 喝行 吗
是 转基因 油
我家 宝宝 十三斤 用多大码 的
会起 坨 吗
请问 给 送 上楼 吗
亲是 送赁 上门 吗
送货 时候 有 外包装 没有 还是 直接 发货 过来
会 不会 有坏 的
==> exp/text_zh_char_o5_p0_1_2_4_4_a22_q8_b8.arpa <==
\data\
ngram 1=587
ngram 2=395
ngram 3=100
ngram 4=2
ngram 5=0
\1-grams:
-3.272324 <unk> 0
0 <s> -0.36706257
==> exp/text_zh_word_o3_p0_0_0_a22_q8_b8.arpa <==
\data\
ngram 1=689
ngram 2=1398
ngram 3=1506
\1-grams:
-3.1755018 <unk> 0
0 <s> -0.23069073
-1.2318869 </s> 0
-3.067262 少先队员 -0.051341705
```
