Refactor CTC module, add embedding and fix log (#549)

* add acts, refactor ctc, add pos embed * fix export, dataloader time log * fix egs * fix libri readme
4 years ago · 1539f3e0a3
parent 00889bfaf2
commit 1539f3e0a3
23 changed files with 518 additions and 275 deletions
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -305,11 +305,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            exit(-1)
    def export(self):
-        self.infer_model.eval()
+        infer_model = DeepSpeech2InferModel.from_pretrained(
            self.test_loader.dataset, self.config, self.args.checkpoint_path)
        infer_model.eval()
        feat_dim = self.test_loader.dataset.feature_size
-        paddle.jit.save(
+        static_model = paddle.jit.to_static(
-            self.infer_model,
+            infer_model,
            self.args.export_path,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, feat_dim, None],
@ -317,6 +318,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
            ])
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
    def run_export(self):
        try:
@ -349,12 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            rnn_size=config.model.rnn_layer_size,
            use_gru=config.model.use_gru,
            share_rnn_weights=config.model.share_rnn_weights)
        infer_model = DeepSpeech2InferModel.from_pretrained(
            self.test_loader.dataset, config, self.args.checkpoint_path)
        self.model = model
        self.infer_model = infer_model
        self.logger.info("Setup model!")
    def setup_dataloader(self):
--- a/deepspeech/models/deepspeech2.py
+++ b/deepspeech/models/deepspeech2.py
@ -24,17 +24,14 @@ from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
 from deepspeech.modules.conv import ConvStack
 from deepspeech.modules.rnn import RNNStack
 from deepspeech.modules.mask import sequence_mask
 from deepspeech.modules.activation import brelu
 from deepspeech.modules.conv import ConvStack
 from deepspeech.modules.rnn import RNNStack
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import checkpoint
 from deepspeech.utils import layer_tools
 from deepspeech.decoders.swig_wrapper import Scorer
 from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
 from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
 from deepspeech.modules.loss import CTCLoss
 logger = logging.getLogger(__name__)
@ -105,178 +102,6 @@ class CRNNEncoder(nn.Layer):
        return x, x_lens
 class CTCDecoder(nn.Layer):
    """CTC head: linear projection, CTC loss and batch decoding helpers.

    The projection has ``vocab_size + 1`` outputs; the extra, last index
    is used as the CTC blank id.
    """

    def __init__(self, enc_n_units, vocab_size):
        """Create the output projection and the CTC criterion.

        Args:
            enc_n_units (int): size of the encoder output fed to the
                linear projection.
            vocab_size (int): number of vocabulary tokens; the blank id
                is set to this value.
        """
        super().__init__()
        self.blank_id = vocab_size
        self.output = nn.Linear(enc_n_units,
                                vocab_size + 1)  # blank id is last id
        self.criterion = CTCLoss(self.blank_id)
        # external language-model scorer, created lazily by init_decode
        self._ext_scorer = None

    def forward(self, eout, eout_lens, texts, texts_len):
        """Compute CTC Loss

        Args:
            eout (Tensor): encoder output passed to the projection.
            eout_lens (Tensor): lengths forwarded to the criterion for
                ``eout``.
            texts (Tensor): target labels forwarded to the criterion.
            texts_len (Tensor): lengths forwarded to the criterion for
                ``texts``.
        Returns:
            loss (Tensor): value returned by the CTC criterion.
        """
        logits = self.output(eout)
        return self.criterion(logits, texts, eout_lens, texts_len)

    def probs(self, eouts, temperature=1.):
        """Get CTC probabilities.

        Args:
            eouts (FloatTensor): `[B, T, enc_units]`
            temperature (float): divisor applied to the logits before
                the softmax.
        Returns:
            probs (FloatTensor): `[B, T, vocab]`
        """
        return F.softmax(self.output(eouts) / temperature, axis=-1)

    def scores(self, eouts, temperature=1.):
        """Get log-scale CTC probabilities.

        Args:
            eouts (FloatTensor): `[B, T, enc_units]`
            temperature (float): divisor applied to the logits before
                the log-softmax.
        Returns:
            log_probs (FloatTensor): `[B, T, vocab]`
        """
        return F.log_softmax(self.output(eouts) / temperature, axis=-1)

    def _decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of str
        """
        return [
            ctc_greedy_decoder(probs_seq=probs, vocabulary=vocab_list)
            for probs in probs_split
        ]

    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                         vocab_list):
        """Initialize the external scorer.

        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty or None, the external scorer will
                                    be set to None, and the decoding method
                                    will be pure beam search without scorer.
        :type language_model_path: str|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        # init once
        if self._ext_scorer is not None:
            return

        if not language_model_path:
            self._ext_scorer = None
            logger.info("no language model provided, "
                        "decoding by pure beam search without scorer.")
            return

        logger.info("begin to initialize the external scorer "
                    "for decoding")
        self._ext_scorer = Scorer(beam_alpha, beam_beta, language_model_path,
                                  vocab_list)
        logger.info(
            "language model: is_character_based = %d,"
            " max_order = %d, dict_size = %d",
            self._ext_scorer.is_character_based(),
            self._ext_scorer.get_max_order(),
            self._ext_scorer.get_dict_size())
        logger.info("end initializing scorer")

    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                  beam_size, cutoff_prob, cutoff_top_n,
                                  vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of str
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)

        # beam search decode; never ask for more workers than utterances
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)
        # keep element [1] of the first candidate of every utterance
        return [result[0][1] for result in beam_search_results]

    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                    decoding_method):
        """Prepare decoding state.

        The external scorer is only set up when ``decoding_method`` is
        "ctc_beam_search"; other methods need no preparation here.
        """
        if decoding_method == "ctc_beam_search":
            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                  vocab_list)

    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                     lang_model_path, beam_alpha, beam_beta, beam_size,
                     cutoff_prob, cutoff_top_n, num_processes):
        """Decode a batch of probabilities into transcripts.

        probs: activation after softmax
        logits_lens: audio output lens, used to trim each utterance
        decoding_method: "ctc_greedy" or "ctc_beam_search"
        lang_model_path: accepted for interface symmetry; not read here

        Raises:
            ValueError: if ``decoding_method`` is not supported.
        """
        # drop the padded frames of every utterance
        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]

        if decoding_method == "ctc_greedy":
            return self._decode_batch_greedy(
                probs_split=probs_split, vocab_list=vocab_list)
        if decoding_method == "ctc_beam_search":
            return self._decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=beam_alpha,
                beam_beta=beam_beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=num_processes)
        raise ValueError(f"Not support: {decoding_method}")
 class DeepSpeech2Model(nn.Layer):
    """The DeepSpeech2 network structure.
@ -339,8 +164,13 @@ class DeepSpeech2Model(nn.Layer):
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        assert (self.encoder.output_size == rnn_size * 2)
        self.decoder = CTCDecoder(
-            enc_n_units=self.encoder.output_size, vocab_size=dict_size)
+            enc_n_units=self.encoder.output_size,
            odim=dict_size + 1,  # <blank> is append after vocab
            blank_id=dict_size,  # last token is <blank>
            dropout_rate=0.0,
            reduction=True)
    def forward(self, audio, text, audio_len, text_len):
        """Compute Model loss
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
@ -14,6 +14,7 @@
 import logging
 import numpy as np
 import math
 import paddle
 from paddle import nn
@ -22,7 +23,7 @@ from paddle.nn import initializer as I
 logger = logging.getLogger(__name__)
-__all__ = ['brelu']
+__all__ = ['brelu', "softplus", "gelu_accurate", "gelu", 'Swish']
 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@ -30,3 +31,38 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
    t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32')
    t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32')
    return x.maximum(t_min).minimum(t_max)
 def softplus(x):
    """Apply softplus through ``paddle.nn.functional.softplus``.

    Raises:
        NotImplementedError: when the installed paddle exposes no
            functional ``softplus``.
    """
    functional = paddle.nn.functional
    if not hasattr(functional, 'softplus'):
        raise NotImplementedError
    return functional.softplus(x)
 def gelu_accurate(x):
    """Tanh-based GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).

    The sqrt(2/pi) factor is computed once and cached on the function
    object as ``gelu_accurate._a``.
    """
    # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
    scale = getattr(gelu_accurate, "_a", None)
    if scale is None:
        scale = gelu_accurate._a = math.sqrt(2 / math.pi)
    inner = x + 0.044715 * paddle.pow(x, 3)
    return 0.5 * x * (1 + paddle.tanh(scale * inner))
 def gelu(x):
    """Gaussian Error Linear Units (GELU) activation.

    Uses ``paddle.nn.functional.gelu`` when the installed paddle provides
    it, and otherwise falls back to the exact erf formulation
    ``x * 0.5 * (1 + erf(x / sqrt(2)))``.
    """
    # This module is paddle-based; the framework's own functional API is
    # probed here rather than torch's.
    if hasattr(paddle.nn.functional, 'gelu'):
        return paddle.nn.functional.gelu(x)
    return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
 class Swish(nn.Layer):
    """Swish activation layer."""

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        """Scale the input elementwise by its own sigmoid."""
        gate = F.sigmoid(x)
        return x * gate
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@ -0,0 +1,238 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from typeguard import check_argument_types
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
 from deepspeech.decoders.swig_wrapper import Scorer
 from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
 from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
 from deepspeech.modules.loss import CTCLoss
 logger = logging.getLogger(__name__)
 __all__ = ['CTCDecoder']
 class CTCDecoder(nn.Layer):
    """CTC head: dropout + linear projection, CTC loss and batch decoding."""

    def __init__(self,
                 enc_n_units,
                 odim,
                 blank_id=0,
                 dropout_rate: float=0.0,
                 reduction: bool=True):
        """CTC decoder

        Args:
            enc_n_units (int): encoder output dimension
            odim (int): output dimension of the linear projection
            blank_id (int): label id handed to the CTC loss as blank.
                Defaults to 0.
            dropout_rate (float): dropout rate (0.0 ~ 1.0)
            reduction (bool): True selects "sum" reduction for the CTC
                loss, False selects "none"
        """
        assert check_argument_types()
        super().__init__()

        self.blank_id = blank_id
        self.odim = odim
        self.dropout_rate = dropout_rate
        self.ctc_lo = nn.Linear(enc_n_units, self.odim)
        reduction_type = "sum" if reduction else "none"
        self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type)

        # CTCDecoder LM Score handle
        self._ext_scorer = None

    def forward(self, hs_pad, hlens, ys_pad, ys_lens):
        """Calculate CTC loss.

        Args:
            hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D)
            hlens (Tensor): batch of lengths of hidden state sequences (B)
            ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax)
            ys_lens (Tensor): batch of lengths of character sequence (B)
        Returns:
            loss (Tensor): value returned by the CTC criterion.
        """
        # F.dropout defaults to training=True; tie it to the layer mode so
        # evaluation runs are not randomly perturbed.
        hs_pad = F.dropout(
            hs_pad, p=self.dropout_rate, training=self.training)
        logits = self.ctc_lo(hs_pad)
        return self.criterion(logits, ys_pad, hlens, ys_lens)

    def probs(self, eouts: paddle.Tensor, temperature: float=1.0):
        """Get CTC probabilities.

        Args:
            eouts (FloatTensor): `[B, T, enc_units]`
            temperature (float): divisor applied to the logits before
                the softmax.
        Returns:
            probs (FloatTensor): `[B, T, odim]`
        """
        return F.softmax(self.ctc_lo(eouts) / temperature, axis=-1)

    def scores(self, eouts: paddle.Tensor, temperature: float=1.0):
        """Get log-scale CTC probabilities.

        Args:
            eouts (FloatTensor): `[B, T, enc_units]`
            temperature (float): divisor applied to the logits before
                the log-softmax.
        Returns:
            log_probs (FloatTensor): `[B, T, odim]`
        """
        return F.log_softmax(self.ctc_lo(eouts) / temperature, axis=-1)

    def log_softmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
        """log_softmax of frame activations

        Args:
            Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
        Returns:
            paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
        """
        return self.scores(hs_pad)

    def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
        """argmax of frame activations

        Args:
            paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
        Returns:
            paddle.Tensor: argmax applied 2d tensor (B, Tmax)
        """
        # paddle.argmax names the reduced dimension `axis` (not `dim`)
        return paddle.argmax(self.ctc_lo(hs_pad), axis=2)

    def _decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of str
        """
        return [
            ctc_greedy_decoder(probs_seq=probs, vocabulary=vocab_list)
            for probs in probs_split
        ]

    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                         vocab_list):
        """Initialize the external scorer.

        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty or None, the external scorer will
                                    be set to None, and the decoding method
                                    will be pure beam search without scorer.
        :type language_model_path: str|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        """
        # init once
        if self._ext_scorer is not None:
            return

        if not language_model_path:
            self._ext_scorer = None
            logger.info("no language model provided, "
                        "decoding by pure beam search without scorer.")
            return

        logger.info("begin to initialize the external scorer "
                    "for decoding")
        self._ext_scorer = Scorer(beam_alpha, beam_beta, language_model_path,
                                  vocab_list)
        logger.info(
            "language model: is_character_based = %d,"
            " max_order = %d, dict_size = %d",
            self._ext_scorer.is_character_based(),
            self._ext_scorer.get_max_order(),
            self._ext_scorer.get_dict_size())
        logger.info("end initializing scorer")

    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                  beam_size, cutoff_prob, cutoff_top_n,
                                  vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.

        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of str
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)

        # beam search decode; never ask for more workers than utterances
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)
        # keep element [1] of the first candidate of every utterance
        return [result[0][1] for result in beam_search_results]

    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                    decoding_method):
        """Prepare decoding state.

        The external scorer is only set up when ``decoding_method`` is
        "ctc_beam_search"; other methods need no preparation here.
        """
        if decoding_method == "ctc_beam_search":
            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                  vocab_list)

    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                     lang_model_path, beam_alpha, beam_beta, beam_size,
                     cutoff_prob, cutoff_top_n, num_processes):
        """Decode a batch of probabilities into transcripts.

        probs: activation after softmax
        logits_lens: audio output lens, used to trim each utterance
        decoding_method: "ctc_greedy" or "ctc_beam_search"
        lang_model_path: accepted for interface symmetry; not read here

        Raises:
            ValueError: if ``decoding_method`` is not supported.
        """
        # drop the padded frames of every utterance
        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]

        if decoding_method == "ctc_greedy":
            return self._decode_batch_greedy(
                probs_split=probs_split, vocab_list=vocab_list)
        if decoding_method == "ctc_beam_search":
            return self._decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=beam_alpha,
                beam_beta=beam_beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=num_processes)
        raise ValueError(f"Not support: {decoding_method}")
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
@ -0,0 +1,132 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Positional Encoding Module."""
 import math
 import logging
 import numpy as np
 from typing import Tuple
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
 logger = logging.getLogger(__name__)
 __all__ = ["PositionalEncoding", "RelPositionalEncoding"]
 # TODO(Hui Zhang): remove this hack
 # Rebinds the `paddle.float32` attribute to the plain string 'float32', so
 # every later `dtype=paddle.float32` in this module passes that string.
 paddle.float32 = 'float32'
 class PositionalEncoding(nn.Layer):
    """Sinusoidal positional encoding added to a scaled input."""

    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int=5000,
                 reverse: bool=False):
        """Positional encoding.
            PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
            PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))

        Args:
            d_model (int): embedding dim.
            dropout_rate (float): dropout rate.
            max_len (int, optional): maximum input length. Defaults to 5000.
            reverse (bool, optional): Not used. Defaults to False.
        """
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len
        # inputs are multiplied by sqrt(d_model) before the encoding is added
        self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
        self.dropout = nn.Dropout(p=dropout_rate)

        # paddle.zeros takes the whole shape as ONE list argument; a second
        # positional argument would be read as the dtype.
        self.pe = paddle.zeros([self.max_len, self.d_model])  #[T,D]
        position = paddle.arange(
            0, self.max_len, dtype=paddle.float32).unsqueeze(1)
        div_term = paddle.exp(
            paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
            -(math.log(10000.0) / self.d_model))

        # even feature columns carry the sine, odd columns the cosine
        self.pe[:, 0::2] = paddle.sin(position * div_term)
        self.pe[:, 1::2] = paddle.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)  #[1, T, D]

    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Add positional encoding.

        Args:
            x (paddle.Tensor): Input. Its shape is (batch, time, ...)
            offset (int): position offset
        Returns:
            paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            paddle.Tensor: for compatibility to RelPositionalEncoding
        """
        T = paddle.shape(x)[1]
        assert offset + T < self.max_len
        pos_emb = self.pe[:, offset:offset + T]
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int): start offset
            size (int): required size of position encoding
        Returns:
            paddle.Tensor: Corresponding encoding
        """
        assert offset + size < self.max_len
        return self.dropout(self.pe[:, offset:offset + size])
 class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Unlike the parent class, ``forward`` only scales the input and returns
    the positional embedding separately instead of adding it.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
        """
        Args:
            d_model (int): Embedding dimension.
            dropout_rate (float): Dropout rate.
            max_len (int, optional): [Maximum input length.]. Defaults to 5000.
        """
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute positional encoding.

        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).
            offset (int): position offset
        Returns:
            paddle.Tensor: Encoded tensor (batch, time, `*`).
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
        """
        # the time length must be read from the input tensor itself
        T = paddle.shape(x)[1]
        assert offset + T < self.max_len
        x = x * self.xscale
        pos_emb = self.pe[:, offset:offset + T]
        return self.dropout(x), self.dropout(pos_emb)
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
 __all__ = ['CTCLoss']
 # TODO(Hui Zhang): remove this hack, when `norm_by_times=True` is added
 def ctc_loss(logits,
             labels,
             input_lengths,
@ -47,19 +48,35 @@ def ctc_loss(logits,
    return loss_out
 # TODO(Hui Zhang): remove this hack
 F.ctc_loss = ctc_loss
 class CTCLoss(nn.Layer):
-    def __init__(self, blank_id):
+    def __init__(self, blank=0, reduction='sum'):
        super().__init__()
        # last token id as blank id
-        self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')
+        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
-    def forward(self, logits, text, logits_len, text_len):
+    def forward(self, logits, ys_pad, hlens, ys_lens):
-        # warp-ctc do softmax on activations
+        """Compute CTC loss.
        Args:
            logits ([paddle.Tensor]): [description]
            ys_pad ([paddle.Tensor]): [description]
            hlens ([paddle.Tensor]): [description]
            ys_lens ([paddle.Tensor]): [description]
        Returns:
            [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}.
        """
        # warp-ctc need logits, and do softmax on logits by itself
        # warp-ctc need activation with shape [T, B, V + 1]
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
        loss = self.loss(logits, ys_pad, hlens, ys_lens)
-        ctc_loss = self.loss(logits, text, logits_len, text_len)
+        # wenet do batch-size average, deepspeech2 not do this
-        return ctc_loss
+        # Batch-size average
        # loss = loss / paddle.shape(logits)[1]
        return loss
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -28,6 +28,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
    max_len = max_len or x_len.max()
    x_len = paddle.unsqueeze(x_len, -1)
    row_vector = paddle.arange(max_len)
    # TODO(Hui Zhang): fix this bug
    #mask = row_vector < x_len
    mask = row_vector > x_len  # a bug, broadcast 的时候出错了
    mask = paddle.cast(mask, dtype)
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@ -167,9 +167,17 @@ class Trainer():
        self.new_epoch()
        while self.epoch <= self.config.training.n_epoch:
            try:
                data_start_time = time.time()
                for batch in self.train_loader:
                    dataload_time = time.time() - data_start_time
                    msg = "Train: Rank: {}, ".format(dist.get_rank())
                    msg += "epoch: {}, ".format(self.epoch)
                    msg += "step: {}, ".format(self.iteration)
                    msg += "dataloader time: {:>.3f}s, ".format(dataload_time)
                    self.logger.info(msg)
                    self.iteration += 1
                    self.train_batch(batch)
                    data_start_time = time.time()
            except Exception as e:
                self.logger.error(e)
                pass
--- a/examples/aishell/local/infer.sh
+++ b/examples/aishell/local/infer.sh
@ -1,5 +1,9 @@
 #! /usr/bin/env bash
 # Require exactly one argument (the checkpoint path); otherwise print usage.
 if [[ $# != 1 ]]; then
    echo "usage: $0 ckpt-path"
    exit -1
 fi
 # download language model
 bash local/download_lm_ch.sh
--- a/examples/aishell/local/infer_golden.sh
+++ b/examples/aishell/local/infer_golden.sh
@ -1,31 +0,0 @@
 #! /usr/bin/env bash
 # download language model
 bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
    exit 1
 fi
 # download well-trained model
 bash local/download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
 fi
 # infer
 CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/infer.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
 --checkpoint_path data/pretrain/params.pdparams  \
 --opts data.mean_std_filepath data/pretrain/mean_std.npz  \
 --opts data.vocab_filepath data/pretrain/vocab.txt
 if [ $? -ne 0 ]; then
    echo "Failed in inference!"
    exit 1
 fi
 exit 0
--- a/examples/aishell/local/server.sh
+++ b/examples/aishell/local/server.sh
@ -2,7 +2,7 @@
 # TODO: replace the model with a mandarin model
 # Usage guard: print the script name ($0); $1 is unset or wrong whenever this branch runs.
 if [[ $# != 1 ]];then
-   echo "usage: server.sh checkpoint_path"
+   echo "usage: $0 checkpoint_path"
   exit -1
 fi
--- a/examples/aishell/local/test.sh
+++ b/examples/aishell/local/test.sh
@ -10,7 +10,7 @@ python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
--checkpoint_path ${1} 
+--output ckpt
 if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
--- a/examples/aishell/local/test_golden.sh
+++ b/examples/aishell/local/test_golden.sh
@ -1,31 +0,0 @@
 #! /usr/bin/env bash
 # download language model
 bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
    exit 1
 fi
 # download well-trained model
 bash local/download_model.sh
 if [ $? -ne 0 ]; then
    exit 1
 fi
 # evaluate model
 CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
 --checkpoint_path data/pretrain/params.pdparams  \
 --opts data.mean_std_filepath data/pretrain/mean_std.npz  \
 --opts data.vocab_filepath data/pretrain/vocab.txt
 if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
 fi
 exit 0
--- a/examples/aishell/local/train.sh
+++ b/examples/aishell/local/train.sh
@ -11,7 +11,7 @@ python3 -u ${BIN_DIR}/train.py \
 --device 'gpu' \
 --nproc ${ngpu} \
 --config conf/deepspeech2.yaml \
--output ckpt
+--output ckpt-${1}
 if [ $? -ne 0 ]; then
--- a/examples/aishell/run.sh
+++ b/examples/aishell/run.sh
@ -10,7 +10,10 @@ bash ./local/data.sh
 CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
 # test model
-CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284
+CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
 # infer model
 CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
 # export model
 bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -1,7 +1,7 @@
 # LibriSpeech
 ## CTC
-| Model | Config | Test set |  CER |
+| Model | Config | Test set |  WER |
 | --- | --- | --- | --- |
 | DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
 | DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |
--- a/examples/librispeech/local/export.sh
+++ b/examples/librispeech/local/export.sh
@ -0,0 +1,20 @@
 #! /usr/bin/env bash
 # Export a checkpoint via export.py.
 #   $1: checkpoint path passed as --checkpoint_path
 #   $2: output path passed as --export_path
 # NOTE(review): BIN_DIR is not defined in this script; presumably exported by path.sh, confirm.
 if [ $# != 2 ];then
    echo "usage: export ckpt_path jit_model_path"
    exit -1
 fi
 # Quote the positional arguments so paths containing spaces survive word splitting.
 python3 -u ${BIN_DIR}/export.py \
 --config conf/deepspeech2.yaml \
 --checkpoint_path "${1}" \
 --export_path "${2}"
 if [ $? -ne 0 ]; then
    echo "Failed in export!"
    exit 1
 fi
 exit 0
--- a/examples/librispeech/local/infer.sh
+++ b/examples/librispeech/local/infer.sh
@ -1,5 +1,10 @@
 #! /usr/bin/env bash
 # Require exactly one argument (the checkpoint path); otherwise print usage and abort.
 if [[ $# != 1 ]]; then
    echo "usage: $0 ckpt-path"
    exit -1
 fi
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
--- a/examples/librispeech/run.sh
+++ b/examples/librispeech/run.sh
@ -7,10 +7,13 @@ source path.sh
 bash ./local/data.sh
 # train model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./local/train.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
 # test model
 CUDA_VISIBLE_DEVICES=0  bash ./local/test.sh
 # infer model
-CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh
+CUDA_VISIBLE_DEVICES=0  bash ./local/infer.sh ckpt/checkpoints/step-3284
 # export model
 bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
--- a/examples/tiny/README.md
+++ b/examples/tiny/README.md
@ -1,7 +1,8 @@
 # Tiny Example
 1. `source path.sh`
-2. `bash run.sh`
+2. set `CUDA_VISIBLE_DEVICES` as you need.
 3. demo script is `bash run.sh`. You can run the commands separately as needed.
 ## Steps
 - Prepare the data
@ -26,11 +27,7 @@
    bash local/infer.sh
    ```
-    `infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference:
+    `infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference.
    ```bash
    bash local/infer_golden.sh
    ```
 - Evaluate an existing model
@ -40,6 +37,15 @@
    `test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. Similarly, you can also download a well-trained model and test its performance:
 - Export jit model
    ```bash
    bash local/export.sh ckpt_path saved_jit_model_path
    ```
 - Tune hyperparameters
    ```bash
-    bash local/test_golden.sh
+    bash local/tune.sh
    ```
--- a/examples/tiny/local/infer.sh
+++ b/examples/tiny/local/infer.sh
@ -1,17 +1,21 @@
 #! /usr/bin/env bash
 # Require exactly one argument (the checkpoint path); otherwise print usage and abort.
 if [[ $# != 1 ]]; then
    echo "usage: $0 ckpt-path"
    exit -1
 fi
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
    exit 1
 fi
 CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/infer.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
--output ckpt
+--checkpoint_path ${1} 
 if [ $? -ne 0 ]; then
--- a/examples/tiny/local/test.sh
+++ b/examples/tiny/local/test.sh
@ -13,7 +13,6 @@ python3 -u ${BIN_DIR}/test.py \
 --config conf/deepspeech2.yaml \
 --output ckpt
 if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
--- a/requirements.txt
+++ b/requirements.txt
@ -4,3 +4,4 @@ SoundFile==0.9.0.post1
 python_speech_features
 tensorboardX
 yacs
 typeguard