PaddleSpeech/deepspeech/models/deepspeech2.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import collections
import numpy as np
import logging
from typing import Optional
from yacs.config import CfgNode

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.activation import brelu
from deepspeech.modules.conv import ConvStack
from deepspeech.modules.rnn import RNNStack
from deepspeech.modules.ctc import CTCDecoder

from deepspeech.utils import checkpoint
from deepspeech.utils import layer_tools

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names exported by a star-import of this module. Only DeepSpeech2Model is
# listed; CRNNEncoder and DeepSpeech2InferModel have to be imported by name.
__all__ = ['DeepSpeech2Model']


class CRNNEncoder(nn.Layer):
    """Convolution + recurrent encoder.

    The input feature map is passed through a ``ConvStack``; the result is
    flattened to one vector per time step and fed to an ``RNNStack``.

    Args:
        feat_size (int): size of the feature axis of the input
            (e.g. 161 for linear features).
        dict_size (int): vocabulary size. It is only stored on the instance;
            the encoder itself does not use it.
        num_conv_layers (int): forwarded to ``ConvStack``.
        num_rnn_layers (int): forwarded to ``RNNStack`` as ``num_stacks``.
        rnn_size (int): forwarded to ``RNNStack`` as ``h_size``.
        use_gru (bool): forwarded to ``RNNStack``.
        share_rnn_weights (bool): forwarded to ``RNNStack``.
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size

        self.conv = ConvStack(feat_size, num_conv_layers)

        # The RNN input size is whatever height the conv stack reports.
        i_size = self.conv.output_height  # H after conv stack
        self.rnn = RNNStack(
            i_size=i_size,
            h_size=rnn_size,
            num_stacks=num_rnn_layers,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)

    @property
    def output_size(self):
        """Size of the last axis of the encoder output.

        Always ``rnn_size * 2`` -- presumably because ``RNNStack``
        concatenates a forward and a backward direction; confirm against
        ``RNNStack``.
        """
        return self.rnn_size * 2

    def forward(self, audio, audio_len):
        """Compute Encoder outputs

        Args:
            audio (Tensor): [B, D, T]
            audio_len (Tensor): [B]
        Returns:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
        """
        # [B, D, T] -> [B, C=1, D, T]
        x = audio.unsqueeze(1)

        # convolution group
        x, x_lens = self.conv(x, audio_len)

        # Convert the convolution feature map to a sequence of vectors.
        # Unpacking paddle.shape(x) and reshaping with explicit B/T/C*D did
        # not work under jit, so reshape's 0 (keep dim) and -1 (infer dim)
        # placeholders are used instead.
        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
        x = x.reshape([0, 0, -1])  #[B, T, C*D]

        # Recurrent stack; the lengths are handed over as well, presumably
        # so that RNNStack can deal with the padded part -- confirm there.
        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
        return x, x_lens


class DeepSpeech2Model(nn.Layer):
    """The DeepSpeech2 network: a ``CRNNEncoder`` followed by a ``CTCDecoder``.

    Calling the model (``forward``) yields the decoder loss; ``decode``
    yields decoding results for a batch of audio.

    Args:
        feat_size (int): feature size of the input, handed to the encoder.
        dict_size (int): Dictionary size for tokenized transcription. The
            decoder gets ``dict_size + 1`` outputs; the extra, last index is
            ``<blank>``.
        num_conv_layers (int): Number of stacking convolution layers.
        num_rnn_layers (int): Number of stacking RNN layers.
        rnn_size (int): RNN layer size (dimension of RNN cells).
        use_gru (bool): Use gru if set True. Use simple rnn if set False.
        share_rnn_weights (bool): Whether to share input-hidden weights
            between forward and backward direction RNNs. It is only
            available when use_gru=False.
    """

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return the default model hyper-parameters.

        Args:
            config (Optional[CfgNode]): when given, the defaults are merged
                into this node in place via ``merge_from_other_cfg``.

        Returns:
            CfgNode: the freshly built default node (never ``config``).

        NOTE(review): the merge direction means ``config`` receives the
        default values; also ``use_gru`` defaults to True here but to False
        in ``__init__`` -- confirm both are intended.
        """
        defaults = {
            # Number of stacking convolution layers.
            'num_conv_layers': 2,
            # Number of stacking RNN layers.
            'num_rnn_layers': 3,
            # RNN layer size (number of RNN cells).
            'rnn_layer_size': 1024,
            # Use gru if set True. Use simple rnn if set False.
            'use_gru': True,
            # Whether to share input-hidden weights between forward and
            # backward directional RNNs. Notice that for GRU, weight sharing
            # is not supported.
            'share_rnn_weights': True,
        }
        default_cfg = CfgNode(defaults)
        if config is None:
            return default_cfg
        config.merge_from_other_cfg(default_cfg)
        return default_cfg

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
        super().__init__()

        # The encoder is created (and registered) before the decoder.
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        enc_out_dim = self.encoder.output_size
        assert enc_out_dim == rnn_size * 2

        # <blank> is appended after the vocab, hence it is the last token.
        blank_id = dict_size
        self.decoder = CTCDecoder(
            enc_n_units=enc_out_dim,
            odim=blank_id + 1,
            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True)  # sum / batch_size

    def forward(self, audio, text, audio_len, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, D, T]
            text (Tensor): [B, T]
            audio_len (Tensor): [B]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        enc_out, enc_len = self.encoder(audio, audio_len)
        return self.decoder(enc_out, enc_len, text, text_len)

    @paddle.no_grad()
    def decode(self, audio, audio_len, vocab_list, decoding_method,
               lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
               cutoff_top_n, num_processes):
        """Encode a batch of audio and decode it with the CTC decoder.

        Runs without gradient tracking. All decoding options are handed
        through unchanged to ``CTCDecoder.init_decode`` and
        ``CTCDecoder.decode_probs``.

        Args:
            audio (Tensor): [B, D, T]
            audio_len (Tensor): [B]

        Returns:
            Whatever ``CTCDecoder.decode_probs`` returns.
        """
        ctc = self.decoder

        # decoders only accept string encoded in utf-8
        # NOTE(review): the original comment says "init once", yet this is
        # invoked on every call; presumably init_decode guards against
        # re-initialisation itself -- confirm in CTCDecoder.
        ctc.init_decode(
            beam_alpha=beam_alpha,
            beam_beta=beam_beta,
            lang_model_path=lang_model_path,
            vocab_list=vocab_list,
            decoding_method=decoding_method)

        enc_out, enc_len = self.encoder(audio, audio_len)
        probs = ctc.probs(enc_out).numpy()
        return ctc.decode_probs(probs, enc_len, vocab_list, decoding_method,
                                lang_model_path, beam_alpha, beam_beta,
                                beam_size, cutoff_prob, cutoff_top_n,
                                num_processes)

    @classmethod
    def from_pretrained(cls, dataset, config, checkpoint_path):
        """Build a DeepSpeech2Model model from a pretrained model.

        Args:
            dataset (paddle.io.Dataset): supplies ``feature_size`` and
                ``vocab_size``.
            config (yacs.config.CfgNode): model configs, read from
                ``config.model``.
            checkpoint_path (Path or str): the path of pretrained model
                checkpoint, without extension name.

        Returns:
            DeepSpeech2Model: The model built from pretrained result.
        """
        # Collected in the same order the values were originally read.
        hparams = dict(
            feat_size=dataset.feature_size, dict_size=dataset.vocab_size)
        model_conf = config.model
        hparams.update(
            num_conv_layers=model_conf.num_conv_layers,
            num_rnn_layers=model_conf.num_rnn_layers,
            rnn_size=model_conf.rnn_layer_size,
            use_gru=model_conf.use_gru,
            share_rnn_weights=model_conf.share_rnn_weights)

        model = cls(**hparams)
        checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
        layer_tools.summary(model)
        return model


class DeepSpeech2InferModel(DeepSpeech2Model):
    """Export/inference variant of ``DeepSpeech2Model``.

    It is constructed exactly like the parent class; only ``forward``
    differs: it takes the audio inputs alone and returns the decoder
    probabilities instead of a loss.
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
        # Every argument is handed to the parent constructor unchanged.
        hparams = dict(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        super().__init__(**hparams)

    def forward(self, audio, audio_len):
        """export model function

        Args:
            audio (Tensor): [B, D, T]
            audio_len (Tensor): [B]

        Returns:
            probs: probs after softmax
        """
        # The encoder output lengths are not needed here.
        enc_out, _ = self.encoder(audio, audio_len)
        return self.decoder.probs(enc_out)
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import math`
			`import collections`
			`import numpy as np`
			`import logging`
			`from typing import Optional`
			`from yacs.config import CfgNode`

			`import paddle`
			`from paddle import nn`
			`from paddle.nn import functional as F`
			`from paddle.nn import initializer as I`

			`from deepspeech.modules.mask import sequence_mask`
			`from deepspeech.modules.activation import brelu`
Refactor CTC module, add embedding and fix log (#549) * add acts, refactor ctc, add pos embed * fix export, dataloader time log * fix egs * fix libri readme 4 years ago			`from deepspeech.modules.conv import ConvStack`
			`from deepspeech.modules.rnn import RNNStack`
			`from deepspeech.modules.ctc import CTCDecoder`

Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`from deepspeech.utils import checkpoint`
			`from deepspeech.utils import layer_tools`

			`logger = logging.getLogger(__name__)`

			`__all__ = ['DeepSpeech2Model']`


			`class CRNNEncoder(nn.Layer):`
			`def __init__(self,`
			`feat_size,`
			`dict_size,`
			`num_conv_layers=2,`
			`num_rnn_layers=3,`
			`rnn_size=1024,`
			`use_gru=False,`
			`share_rnn_weights=True):`
			`super().__init__()`
			`self.rnn_size = rnn_size`
			`self.feat_size = feat_size # 161 for linear`
			`self.dict_size = dict_size`

			`self.conv = ConvStack(feat_size, num_conv_layers)`

			`i_size = self.conv.output_height # H after conv stack`
			`self.rnn = RNNStack(`
			`i_size=i_size,`
			`h_size=rnn_size,`
			`num_stacks=num_rnn_layers,`
			`use_gru=use_gru,`
			`share_rnn_weights=share_rnn_weights)`

			`@property`
			`def output_size(self):`
			`return self.rnn_size * 2`

			`def forward(self, audio, audio_len):`
			`"""`
			`audio: shape [B, D, T]`
			`text: shape [B, T]`
			`audio_len: shape [B]`
			`text_len: shape [B]`
			`"""`
			`"""Compute Encoder outputs`

			`Args:`
			`audio (Tensor): [B, D, T]`
			`text (Tensor): [B, T]`
			`audio_len (Tensor): [B]`
			`text_len (Tensor): [B]`
			`Returns:`
			`x (Tensor): encoder outputs, [B, T, D]`
			`x_lens (Tensor): encoder length, [B]`
			`"""`
			`# [B, D, T] -> [B, C=1, D, T]`
			`x = audio.unsqueeze(1)`
			`x_lens = audio_len`

			`# convolution group`
			`x, x_lens = self.conv(x, x_lens)`

			`# convert data from convolution feature map to sequence of vectors`
			`#B, C, D, T = paddle.shape(x) # not work under jit`
			`x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]`
			`#x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit`
			`x = x.reshape([0, 0, -1]) #[B, T, C*D]`

			`# remove padding part`
			`x, x_lens = self.rnn(x, x_lens) #[B, T, D]`
			`return x, x_lens`


			`class DeepSpeech2Model(nn.Layer):`
			`"""The DeepSpeech2 network structure.`

			`:param audio_data: Audio spectrogram data layer.`
			`:type audio_data: Variable`
			`:param text_data: Transcription text data layer.`
			`:type text_data: Variable`
			`:param audio_len: Valid sequence length data layer.`
			`:type audio_len: Variable`
			`:param masks: Masks data layer to reset padding.`
			`:type masks: Variable`
			`:param dict_size: Dictionary size for tokenized transcription.`
			`:type dict_size: int`
			`:param num_conv_layers: Number of stacking convolution layers.`
			`:type num_conv_layers: int`
			`:param num_rnn_layers: Number of stacking RNN layers.`
			`:type num_rnn_layers: int`
			`:param rnn_size: RNN layer size (dimension of RNN cells).`
			`:type rnn_size: int`
			`:param use_gru: Use gru if set True. Use simple rnn if set False.`
			`:type use_gru: bool`
			`:param share_rnn_weights: Whether to share input-hidden weights between`
			`forward and backward direction RNNs.`
			`It is only available when use_gru=False.`
			`:type share_weights: bool`
			`:return: A tuple of an output unnormalized log probability layer (`
			`before softmax) and a ctc cost layer.`
			`:rtype: tuple of LayerOutput`
			`"""`

			`@classmethod`
			`def params(cls, config: Optional[CfgNode]=None) -> CfgNode:`
			`default = CfgNode(`
			`dict(`
			`num_conv_layers=2, #Number of stacking convolution layers.`
			`num_rnn_layers=3, #Number of stacking RNN layers.`
			`rnn_layer_size=1024, #RNN layer size (number of RNN cells).`
			`use_gru=True, #Use gru if set True. Use simple rnn if set False.`
			`share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.`
			`))`
			`if config is not None:`
			`config.merge_from_other_cfg(default)`
			`return default`

			`def __init__(self,`
			`feat_size,`
			`dict_size,`
			`num_conv_layers=2,`
			`num_rnn_layers=3,`
			`rnn_size=1024,`
			`use_gru=False,`
			`share_rnn_weights=True):`
			`super().__init__()`
			`self.encoder = CRNNEncoder(`
			`feat_size=feat_size,`
			`dict_size=dict_size,`
			`num_conv_layers=num_conv_layers,`
			`num_rnn_layers=num_rnn_layers,`
			`rnn_size=rnn_size,`
			`use_gru=use_gru,`
			`share_rnn_weights=share_rnn_weights)`
			`assert (self.encoder.output_size == rnn_size * 2)`
Refactor CTC module, add embedding and fix log (#549) * add acts, refactor ctc, add pos embed * fix export, dataloader time log * fix egs * fix libri readme 4 years ago
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`self.decoder = CTCDecoder(`
Refactor CTC module, add embedding and fix log (#549) * add acts, refactor ctc, add pos embed * fix export, dataloader time log * fix egs * fix libri readme 4 years ago			`enc_n_units=self.encoder.output_size,`
			`odim=dict_size + 1, # <blank> is append after vocab`
			`blank_id=dict_size, # last token is <blank>`
			`dropout_rate=0.0,`
batch average ctc loss (#567) * when loss div batchsize, change lr, more epoch, loss can reduce more and cer lower than before * since loss reduce more when loss div batchsize, less lm alpha can be better. * less lm alpha, more cer reduce * alpha 2.2, cer 0.077478 * alpha 1.9, cer 0.077249 * large librispeech lr for batch_average ctc loss * since loss reduce and model more confidence, then less lm alpha 4 years ago			`reduction=True, # sum`
			`batch_average=True) # sum / batch_size`
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago
			`def forward(self, audio, text, audio_len, text_len):`
			`"""Compute Model loss`

			`Args:`
			`audio (Tenosr): [B, D, T]`
			`text (Tensor): [B, T]`
			`audio_len (Tensor): [B]`
			`text_len (Tensor): [B]`

			`Returns:`
			`loss (Tenosr): [1]`
			`"""`

			`eouts, eouts_len = self.encoder(audio, audio_len)`
			`loss = self.decoder(eouts, eouts_len, text, text_len)`
			`return loss`

			`@paddle.no_grad()`
			`def decode(self, audio, audio_len, vocab_list, decoding_method,`
			`lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,`
			`cutoff_top_n, num_processes):`
			`# init once`
			`# decoders only accept string encoded in utf-8`
			`self.decoder.init_decode(`
			`beam_alpha=beam_alpha,`
			`beam_beta=beam_beta,`
			`lang_model_path=lang_model_path,`
			`vocab_list=vocab_list,`
			`decoding_method=decoding_method)`

			`eouts, eouts_len = self.encoder(audio, audio_len)`
			`probs = self.decoder.probs(eouts)`
			`return self.decoder.decode_probs(`
			`probs.numpy(), eouts_len, vocab_list, decoding_method,`
			`lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,`
			`cutoff_top_n, num_processes)`

			`@classmethod`
			`def from_pretrained(cls, dataset, config, checkpoint_path):`
			`"""Build a DeepSpeech2Model model from a pretrained model.`
			`Parameters`
			`----------`
			`dataset: paddle.io.Dataset`

			`config: yacs.config.CfgNode`
			`model configs`

			`checkpoint_path: Path or str`
			`the path of pretrained model checkpoint, without extension name`

			`Returns`
			`-------`
			`DeepSpeech2Model`
			`The model built from pretrained result.`
			`"""`
			`model = cls(feat_size=dataset.feature_size,`
			`dict_size=dataset.vocab_size,`
			`num_conv_layers=config.model.num_conv_layers,`
			`num_rnn_layers=config.model.num_rnn_layers,`
			`rnn_size=config.model.rnn_layer_size,`
			`use_gru=config.model.use_gru,`
			`share_rnn_weights=config.model.share_rnn_weights)`
			`checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)`
			`layer_tools.summary(model)`
			`return model`


			`class DeepSpeech2InferModel(DeepSpeech2Model):`
			`def __init__(self,`
			`feat_size,`
			`dict_size,`
			`num_conv_layers=2,`
			`num_rnn_layers=3,`
			`rnn_size=1024,`
			`use_gru=False,`
			`share_rnn_weights=True):`
			`super().__init__(`
			`feat_size=feat_size,`
			`dict_size=dict_size,`
			`num_conv_layers=num_conv_layers,`
			`num_rnn_layers=num_rnn_layers,`
			`rnn_size=rnn_size,`
			`use_gru=use_gru,`
			`share_rnn_weights=share_rnn_weights)`

			`def forward(self, audio, audio_len):`
			`"""export model function`

			`Args:`
			`audio (Tensor): [B, D, T]`
			`audio_len (Tensor): [B]`

			`Returns:`
			`probs: probs after softmax`
			`"""`
			`eouts, eouts_len = self.encoder(audio, audio_len)`
			`probs = self.decoder.probs(eouts)`
			`return probs`