Refactor CTC module, add embedding and fix log (#549)

* add acts, refactor ctc, add pos embed

* fix export, dataloader time log

* fix egs

* fix libri readme
Hui Zhang 3 years ago committed by GitHub
parent 00889bfaf2
commit 1539f3e0a3

@@ -305,11 +305,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
exit(-1)
def export(self):
self.infer_model.eval()
infer_model = DeepSpeech2InferModel.from_pretrained(
self.test_loader.dataset, self.config, self.args.checkpoint_path)
infer_model.eval()
feat_dim = self.test_loader.dataset.feature_size
paddle.jit.save(
self.infer_model,
self.args.export_path,
static_model = paddle.jit.to_static(
infer_model,
input_spec=[
paddle.static.InputSpec(
shape=[None, feat_dim, None],
@@ -317,6 +318,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
])
logger.info(f"Export code: {static_model.forward.code}")
paddle.jit.save(static_model, self.args.export_path)
def run_export(self):
try:
@@ -349,12 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights)
infer_model = DeepSpeech2InferModel.from_pretrained(
self.test_loader.dataset, config, self.args.checkpoint_path)
self.model = model
self.infer_model = infer_model
self.logger.info("Setup model!")
def setup_dataloader(self):

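For context, here is a minimal, self-contained sketch of the new export path, assuming Paddle 2.x; `TinyNet` is a hypothetical stand-in for `DeepSpeech2InferModel`, and the `InputSpec` shapes mirror the ones in the diff above:

```python
import paddle
from paddle import nn

class TinyNet(nn.Layer):
    def __init__(self, feat_dim=161, odim=8):
        super().__init__()
        self.fc = nn.Linear(feat_dim, odim)

    def forward(self, audio, audio_len):
        # audio: [B, D, T] -> [B, T, D] -> [B, T, odim]
        return self.fc(audio.transpose([0, 2, 1]))

net = TinyNet()
net.eval()
static_net = paddle.jit.to_static(
    net,
    input_spec=[
        paddle.static.InputSpec(shape=[None, 161, None], dtype='float32'),  # audio [B, D, T]
        paddle.static.InputSpec(shape=[None], dtype='int64'),               # audio_length [B]
    ])
print(static_net.forward.code)            # the traced program, as logged above
paddle.jit.save(static_net, 'exported/tiny')
```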
@@ -24,17 +24,14 @@ from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from deepspeech.modules.conv import ConvStack
from deepspeech.modules.rnn import RNNStack
from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.activation import brelu
from deepspeech.modules.conv import ConvStack
from deepspeech.modules.rnn import RNNStack
from deepspeech.modules.ctc import CTCDecoder
from deepspeech.utils import checkpoint
from deepspeech.utils import layer_tools
from deepspeech.decoders.swig_wrapper import Scorer
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
from deepspeech.modules.loss import CTCLoss
logger = logging.getLogger(__name__)
@@ -105,178 +102,6 @@ class CRNNEncoder(nn.Layer):
return x, x_lens
class CTCDecoder(nn.Layer):
def __init__(self, enc_n_units, vocab_size):
super().__init__()
self.blank_id = vocab_size
self.output = nn.Linear(enc_n_units,
vocab_size + 1) # blank id is last id
self.criterion = CTCLoss(self.blank_id)
self._ext_scorer = None
def forward(self, eout, eout_lens, texts, texts_len):
"""Compute CTC Loss
Args:
eout (Tensor):
eout_lens (Tensor):
texts (Tensor):
texts_len (Tensor):
Returns:
loss (Tensor): [1]
"""
logits = self.output(eout)
loss = self.criterion(logits, texts, eout_lens, texts_len)
return loss
def probs(self, eouts, temperature=1.):
"""Get CTC probabilities.
Args:
eouts (FloatTensor): `[B, T, enc_units]`
Returns:
probs (FloatTensor): `[B, T, vocab]`
"""
return F.softmax(self.output(eouts) / temperature, axis=-1)
def scores(self, eouts, temperature=1.):
"""Get log-scale CTC probabilities.
Args:
eouts (FloatTensor): `[B, T, enc_units]`
Returns:
log_probs (FloatTensor): `[B, T, vocab]`
"""
return F.log_softmax(self.output(eouts) / temperature, axis=-1)
def _decode_batch_greedy(self, probs_split, vocab_list):
"""Decode by best path for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrices, each consisting
of prob vectors for one speech utterance.
:type probs_split: list of matrix
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:return: List of transcription texts.
:rtype: List of str
"""
results = []
for i, probs in enumerate(probs_split):
output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list)
results.append(output_transcription)
return results
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
vocab_list):
"""Initialize the external scorer.
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param language_model_path: Filepath for language model. If it is
empty, the external scorer will be set to
None, and the decoding method will be pure
beam search without scorer.
:type language_model_path: str|None
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
"""
# init once
if self._ext_scorer is not None:
return
if language_model_path != '':
logger.info("begin to initialize the external scorer "
"for decoding")
self._ext_scorer = Scorer(beam_alpha, beam_beta,
language_model_path, vocab_list)
lm_char_based = self._ext_scorer.is_character_based()
lm_max_order = self._ext_scorer.get_max_order()
lm_dict_size = self._ext_scorer.get_dict_size()
logger.info("language model: "
"is_character_based = %d," % lm_char_based +
" max_order = %d," % lm_max_order + " dict_size = %d" %
lm_dict_size)
logger.info("end initializing scorer")
else:
self._ext_scorer = None
logger.info("no language model provided, "
"decoding by pure beam search without scorer.")
def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n,
vocab_list, num_processes):
"""Decode by beam search for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrices, each consisting
of prob vectors for one speech utterance.
:type probs_split: list of matrix
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param beam_size: Width for Beam search.
:type beam_size: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:return: List of transcription texts.
:rtype: List of str
"""
if self._ext_scorer is not None:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
# beam search decode
num_processes = min(num_processes, len(probs_split))
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n)
results = [result[0][1] for result in beam_search_results]
return results
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
decoding_method):
if decoding_method == "ctc_beam_search":
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size,
cutoff_prob, cutoff_top_n, num_processes):
""" probs: activation after softmax
logits_len: audio output lens
"""
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
if decoding_method == "ctc_greedy":
result_transcripts = self._decode_batch_greedy(
probs_split=probs_split, vocab_list=vocab_list)
elif decoding_method == "ctc_beam_search":
result_transcripts = self._decode_batch_beam_search(
probs_split=probs_split,
beam_alpha=beam_alpha,
beam_beta=beam_beta,
beam_size=beam_size,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n,
vocab_list=vocab_list,
num_processes=num_processes)
else:
raise ValueError(f"Not supported: {decoding_method}")
return result_transcripts
class DeepSpeech2Model(nn.Layer):
"""The DeepSpeech2 network structure.
@@ -339,8 +164,13 @@ class DeepSpeech2Model(nn.Layer):
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
assert (self.encoder.output_size == rnn_size * 2)
self.decoder = CTCDecoder(
enc_n_units=self.encoder.output_size, vocab_size=dict_size)
enc_n_units=self.encoder.output_size,
odim=dict_size + 1, # <blank> is appended after the vocab
blank_id=dict_size, # last token is <blank>
dropout_rate=0.0,
reduction=True)
def forward(self, audio, text, audio_len, text_len):
"""Compute Model loss

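The blank-id convention above matters downstream: a sketch, assuming a hypothetical 28-token character vocabulary, of how `odim` and `blank_id` relate to the CTC projection:

```python
import paddle
from paddle import nn

dict_size = 28                          # hypothetical character vocab size
blank_id = dict_size                    # <blank> takes the last id
ctc_lo = nn.Linear(256, dict_size + 1)  # enc_n_units -> vocab + blank

eouts = paddle.randn([4, 100, 256])     # [B, T, enc_units]
logits = ctc_lo(eouts)                  # [B, T, dict_size + 1]
print(logits.shape)
```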
@@ -14,6 +14,7 @@
import logging
import numpy as np
import math
import paddle
from paddle import nn
@@ -22,7 +23,7 @@ from paddle.nn import initializer as I
logger = logging.getLogger(__name__)
__all__ = ['brelu']
__all__ = ['brelu', 'softplus', 'gelu_accurate', 'gelu', 'Swish']
def brelu(x, t_min=0.0, t_max=24.0, name=None):
@@ -30,3 +31,38 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32')
t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32')
return x.maximum(t_min).minimum(t_max)
def softplus(x):
"""Softplus function."""
if hasattr(paddle.nn.functional, 'softplus'):
#return paddle.nn.functional.softplus(x.float()).type_as(x)
return paddle.nn.functional.softplus(x)
else:
raise NotImplementedError
def gelu_accurate(x):
"""Gaussian Error Linear Units (GELU) activation."""
# [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
if not hasattr(gelu_accurate, "_a"):
gelu_accurate._a = math.sqrt(2 / math.pi)
return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
(x + 0.044715 * paddle.pow(x, 3))))
def gelu(x):
"""Gaussian Error Linear Units (GELU) activation."""
if hasattr(paddle.nn.functional, 'gelu'):
#return paddle.nn.functional.gelu(x.float()).type_as(x)
return paddle.nn.functional.gelu(x)
else:
return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
class Swish(nn.Layer):
"""Construct an Swish object."""
def forward(self, x: paddle.Tensor) -> paddle.Tensor:
"""Return Swish activation function."""
return x * F.sigmoid(x)

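A quick numeric sanity check (a sketch; sizes and tolerance are illustrative) that the tanh approximation in `gelu_accurate` tracks the erf-based `gelu` fallback closely:

```python
import math
import paddle

x = paddle.linspace(-3.0, 3.0, 7)
a = math.sqrt(2 / math.pi)
gelu_tanh = 0.5 * x * (1 + paddle.tanh(a * (x + 0.044715 * paddle.pow(x, 3))))
gelu_erf = x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
print(float(paddle.abs(gelu_tanh - gelu_erf).max()))  # small, ~1e-3 or less
```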
@@ -0,0 +1,238 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typeguard import check_argument_types
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from deepspeech.decoders.swig_wrapper import Scorer
from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
from deepspeech.modules.loss import CTCLoss
logger = logging.getLogger(__name__)
__all__ = ['CTCDecoder']
class CTCDecoder(nn.Layer):
def __init__(self,
enc_n_units,
odim,
blank_id=0,
dropout_rate: float=0.0,
reduction: bool=True):
"""CTC decoder
Args:
enc_n_units ([int]): encoder output dimension
odim ([int]): output dimension, i.e. vocabulary size plus one for <blank>
blank_id ([int]): id of the <blank> token
dropout_rate (float): dropout rate (0.0 ~ 1.0)
reduction (bool): reduce the CTC loss into a scalar
"""
assert check_argument_types()
super().__init__()
self.blank_id = blank_id
self.odim = odim
self.dropout_rate = dropout_rate
self.ctc_lo = nn.Linear(enc_n_units, self.odim)
reduction_type = "sum" if reduction else "none"
self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type)
# CTCDecoder LM Score handle
self._ext_scorer = None
def forward(self, hs_pad, hlens, ys_pad, ys_lens):
"""Calculate CTC loss.
Args:
hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D)
hlens (Tensor): batch of lengths of hidden state sequences (B)
ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax)
ys_lens (Tensor): batch of lengths of character sequence (B)
Returns:
loss (Tensor): scalar.
"""
logits = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate))
loss = self.criterion(logits, ys_pad, hlens, ys_lens)
return loss
def probs(self, eouts: paddle.Tensor, temperature: float=1.0):
"""Get CTC probabilities.
Args:
eouts (FloatTensor): `[B, T, enc_units]`
Returns:
probs (FloatTensor): `[B, T, odim]`
"""
return F.softmax(self.ctc_lo(eouts) / temperature, axis=-1)
def scores(self, eouts: paddle.Tensor, temperature: float=1.0):
"""Get log-scale CTC probabilities.
Args:
eouts (FloatTensor): `[B, T, enc_units]`
Returns:
log_probs (FloatTensor): `[B, T, odim]`
"""
return F.log_softmax(self.ctc_lo(eouts) / temperature, axis=-1)
def log_softmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
"""log_softmax of frame activations
Args:
Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
Returns:
paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
"""
return self.scores(hs_pad)
def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
"""argmax of frame activations
Args:
paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
Returns:
paddle.Tensor: argmax applied 2d tensor (B, Tmax)
"""
return paddle.argmax(self.ctc_lo(hs_pad), axis=2)
def _decode_batch_greedy(self, probs_split, vocab_list):
"""Decode by best path for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrices, each consisting
of prob vectors for one speech utterance.
:type probs_split: list of matrix
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:return: List of transcription texts.
:rtype: List of str
"""
results = []
for i, probs in enumerate(probs_split):
output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list)
results.append(output_transcription)
return results
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
vocab_list):
"""Initialize the external scorer.
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param language_model_path: Filepath for language model. If it is
empty, the external scorer will be set to
None, and the decoding method will be pure
beam search without scorer.
:type language_model_path: str|None
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
"""
# init once
if self._ext_scorer is not None:
return
if language_model_path != '':
logger.info("begin to initialize the external scorer "
"for decoding")
self._ext_scorer = Scorer(beam_alpha, beam_beta,
language_model_path, vocab_list)
lm_char_based = self._ext_scorer.is_character_based()
lm_max_order = self._ext_scorer.get_max_order()
lm_dict_size = self._ext_scorer.get_dict_size()
logger.info("language model: "
"is_character_based = %d," % lm_char_based +
" max_order = %d," % lm_max_order + " dict_size = %d" %
lm_dict_size)
logger.info("end initializing scorer")
else:
self._ext_scorer = None
logger.info("no language model provided, "
"decoding by pure beam search without scorer.")
def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n,
vocab_list, num_processes):
"""Decode by beam search for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrices, each consisting
of prob vectors for one speech utterance.
:type probs_split: list of matrix
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param beam_size: Width for Beam search.
:type beam_size: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:return: List of transcription texts.
:rtype: List of str
"""
if self._ext_scorer is not None:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
# beam search decode
num_processes = min(num_processes, len(probs_split))
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n)
results = [result[0][1] for result in beam_search_results]
return results
def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
decoding_method):
if decoding_method == "ctc_beam_search":
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size,
cutoff_prob, cutoff_top_n, num_processes):
""" probs: activation after softmax
logits_len: audio output lens
"""
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
if decoding_method == "ctc_greedy":
result_transcripts = self._decode_batch_greedy(
probs_split=probs_split, vocab_list=vocab_list)
elif decoding_method == "ctc_beam_search":
result_transcripts = self._decode_batch_beam_search(
probs_split=probs_split,
beam_alpha=beam_alpha,
beam_beta=beam_beta,
beam_size=beam_size,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n,
vocab_list=vocab_list,
num_processes=num_processes)
else:
raise ValueError(f"Not supported: {decoding_method}")
return result_transcripts

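For intuition on what `_decode_batch_greedy` delegates to the swig `ctc_greedy_decoder`, here is a self-contained numpy sketch of best-path decoding (collapse repeats, then drop blanks); the vocab and probabilities are made up:

```python
import numpy as np

def greedy_ctc(probs, vocab, blank_id):
    """probs: [T, V+1] softmax output; collapse repeats, then drop blanks."""
    best = probs.argmax(axis=-1)                        # best token per frame
    collapsed = [t for i, t in enumerate(best) if i == 0 or t != best[i - 1]]
    return ''.join(vocab[t] for t in collapsed if t != blank_id)

vocab = ['a', 'b', 'c']                                 # <blank> appended last
T, V = 6, len(vocab)
probs = np.random.dirichlet(np.ones(V + 1), size=T)     # [T, V+1], rows sum to 1
print(greedy_ctc(probs, vocab, blank_id=V))
```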
@@ -0,0 +1,132 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Positonal Encoding Module."""
import math
import logging
import numpy as np
from typing import Tuple
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
logger = logging.getLogger(__name__)
__all__ = ["PositionalEncoding", "RelPositionalEncoding"]
# TODO(Hui Zhang): remove this hack
paddle.float32 = 'float32'
class PositionalEncoding(nn.Layer):
def __init__(self,
d_model: int,
dropout_rate: float,
max_len: int=5000,
reverse: bool=False):
"""Positional encoding.
PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
Args:
d_model (int): embedding dim.
dropout_rate (float): dropout rate.
max_len (int, optional): maximum input length. Defaults to 5000.
reverse (bool, optional): Not used. Defaults to False.
"""
super().__init__()
self.d_model = d_model
self.max_len = max_len
self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
self.dropout = nn.Dropout(p=dropout_rate)
self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
-(math.log(10000.0) / self.d_model))
self.pe[:, 0::2] = paddle.sin(position * div_term)
self.pe[:, 1::2] = paddle.cos(position * div_term)
self.pe = self.pe.unsqueeze(0) #[1, T, D]
def forward(self, x: paddle.Tensor,
offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Add positional encoding.
Args:
x (paddle.Tensor): Input. Its shape is (batch, time, ...)
offset (int): position offset
Returns:
paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
paddle.Tensor: for compatibility to RelPositionalEncoding
"""
T = paddle.shape(x)[1]
assert offset + T < self.max_len
#assert offset + x.size(1) < self.max_len
#self.pe = self.pe.to(x.device)
#pos_emb = self.pe[:, offset:offset + x.size(1)]
pos_emb = self.pe[:, offset:offset + T]
x = x * self.xscale + pos_emb
return self.dropout(x), self.dropout(pos_emb)
def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
""" For getting encoding in a streaming fashion
Note: dropout is applied only once, at the whole-utterance level, in the
non-streaming case; but this function will be called several times with
increasing input size in a streaming scenario, so dropout ends up being
applied several times.
Args:
offset (int): start offset
size (int): required size of position encoding
Returns:
paddle.Tensor: Corresponding encoding
"""
assert offset + size < self.max_len
return self.dropout(self.pe[:, offset:offset + size])
class RelPositionalEncoding(PositionalEncoding):
"""Relative positional encoding module.
See : Appendix B in https://arxiv.org/abs/1901.02860
"""
def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
"""
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int, optional): Maximum input length. Defaults to 5000.
"""
super().__init__(d_model, dropout_rate, max_len, reverse=True)
def forward(self, x: paddle.Tensor,
offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute positional encoding.
Args:
x (paddle.Tensor): Input tensor (batch, time, `*`).
Returns:
paddle.Tensor: Encoded tensor (batch, time, `*`).
paddle.Tensor: Positional embedding tensor (1, time, `*`).
"""
T = paddle.shape(x)[1]
assert offset + T < self.max_len
#assert offset + x.size(1) < self.max_len
#self.pe = self.pe.to(x.device)
x = x * self.xscale
#pos_emb = self.pe[:, offset:offset + x.size(1)]
pos_emb = self.pe[:, offset:offset + T]
return self.dropout(x), self.dropout(pos_emb)

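The sinusoid table above is easy to verify outside paddle; a numpy sketch of the same PE(pos, 2i)/PE(pos, 2i+1) construction with toy sizes:

```python
import math
import numpy as np

d_model, max_len = 8, 16
position = np.arange(max_len, dtype=np.float32)[:, None]   # [T, 1]
div_term = np.exp(np.arange(0, d_model, 2, dtype=np.float32)
                  * -(math.log(10000.0) / d_model))        # [D/2]
pe = np.zeros((max_len, d_model), dtype=np.float32)
pe[:, 0::2] = np.sin(position * div_term)
pe[:, 1::2] = np.cos(position * div_term)
print(pe.shape)  # (16, 8); the layer stores this as [1, T, D] and slices by offset
```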
@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
__all__ = ['CTCLoss']
# TODO(Hui Zhang): remove this hack, when `norm_by_times=True` is added
def ctc_loss(logits,
labels,
input_lengths,
@@ -47,19 +48,35 @@ def ctc_loss(logits,
return loss_out
# TODO(Hui Zhang): remove this hack
F.ctc_loss = ctc_loss
class CTCLoss(nn.Layer):
def __init__(self, blank_id):
def __init__(self, blank=0, reduction='sum'):
super().__init__()
# last token id as blank id
self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')
self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
def forward(self, logits, text, logits_len, text_len):
# warp-ctc does softmax on activations
def forward(self, logits, ys_pad, hlens, ys_lens):
"""Compute CTC loss.
Args:
logits ([paddle.Tensor]): activations before softmax, (B, Lmax, V + 1)
ys_pad ([paddle.Tensor]): batch of padded label id sequences, (B, Lmax)
hlens ([paddle.Tensor]): lengths of the logit sequences, (B)
ys_lens ([paddle.Tensor]): lengths of the label sequences, (B)
Returns:
[paddle.Tensor]: scalar. If reduction is 'none', then (N), where N is the batch size.
"""
# warp-ctc needs raw logits and does softmax on them itself
# warp-ctc needs activations with shape [T, B, V + 1]
# logits: (B, L, D) -> (L, B, D)
logits = logits.transpose([1, 0, 2])
loss = self.loss(logits, ys_pad, hlens, ys_lens)
ctc_loss = self.loss(logits, text, logits_len, text_len)
return ctc_loss
# wenet does batch-size averaging; deepspeech2 does not
# Batch-size average
# loss = loss / paddle.shape(logits)[1]
return loss

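A runnable sketch (toy shapes; labels are random) of the transpose contract the new forward enforces before calling paddle's `nn.CTCLoss`:

```python
import paddle
from paddle import nn

B, T, V = 2, 20, 10                        # V real tokens + 1 blank
logits = paddle.randn([B, T, V + 1])       # batch-major model output
labels = paddle.randint(0, V, [B, 5], dtype='int32')
input_lens = paddle.full([B], T, dtype='int64')
label_lens = paddle.full([B], 5, dtype='int64')

loss_fn = nn.CTCLoss(blank=V, reduction='sum')
# nn.CTCLoss wants time-major input: [B, T, V+1] -> [T, B, V+1]
loss = loss_fn(logits.transpose([1, 0, 2]), labels, input_lens, label_lens)
print(float(loss))
```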
@@ -28,6 +28,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
max_len = max_len or x_len.max()
x_len = paddle.unsqueeze(x_len, -1)
row_vector = paddle.arange(max_len)
# TODO(Hui Zhang): fix this bug
#mask = row_vector < x_len
mask = row_vector > x_len # a bug: broadcasting goes wrong here
mask = paddle.cast(mask, dtype)

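What `sequence_mask` is meant to produce once the TODO above is resolved, as a sketch: compare row indices against lengths with broadcasting:

```python
import paddle

x_len = paddle.to_tensor([2, 4])
max_len = 4
row_vector = paddle.arange(max_len)                       # [0, 1, 2, 3]
mask = paddle.cast(row_vector < x_len.unsqueeze(-1), 'float32')
print(mask)  # [[1, 1, 0, 0], [1, 1, 1, 1]]
```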
@@ -167,9 +167,17 @@ class Trainer():
self.new_epoch()
while self.epoch <= self.config.training.n_epoch:
try:
data_start_time = time.time()
for batch in self.train_loader:
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "dataloader time: {:>.3f}s, ".format(dataload_time)
self.logger.info(msg)
self.iteration += 1
self.train_batch(batch)
data_start_time = time.time()
except Exception as e:
self.logger.error(e)
pass

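The timing pattern added to the training loop, isolated as a tiny sketch (`batches()` is a hypothetical stand-in for `self.train_loader`):

```python
import time

def batches():                      # stand-in for self.train_loader
    for i in range(3):
        time.sleep(0.01)            # pretend data-loading I/O
        yield i

data_start_time = time.time()
for batch in batches():
    dataload_time = time.time() - data_start_time
    print(f"dataloader time: {dataload_time:>.3f}s")
    # ... self.train_batch(batch) would run here ...
    data_start_time = time.time()   # restart the clock after the train step
```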
@@ -1,5 +1,9 @@
#! /usr/bin/env bash
if [[ $# != 1 ]]; then
echo "usage: $0 ckpt-path"
exit -1
fi
# download language model
bash local/download_lm_ch.sh

@@ -1,31 +0,0 @@
#! /usr/bin/env bash
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
# download well-trained model
bash local/download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
# infer
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/infer.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--checkpoint_path data/pretrain/params.pdparams \
--opts data.mean_std_filepath data/pretrain/mean_std.npz \
--opts data.vocab_filepath data/pretrain/vocab.txt
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0

@@ -2,7 +2,7 @@
# TODO: replace the model with a mandarin model
if [[ $# != 1 ]];then
echo "usage: server.sh checkpoint_path"
echo "usage: $1 checkpoint_path"
exit -1
fi

@@ -10,7 +10,7 @@ python3 -u ${BIN_DIR}/test.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--checkpoint_path ${1}
--output ckpt
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"

@@ -1,31 +0,0 @@
#! /usr/bin/env bash
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
# download well-trained model
bash local/download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
# evaluate model
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/test.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--checkpoint_path data/pretrain/params.pdparams \
--opts data.mean_std_filepath data/pretrain/mean_std.npz \
--opts data.vocab_filepath data/pretrain/vocab.txt
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0

@@ -11,7 +11,7 @@ python3 -u ${BIN_DIR}/train.py \
--device 'gpu' \
--nproc ${ngpu} \
--config conf/deepspeech2.yaml \
--output ckpt
--output ckpt-${1}
if [ $? -ne 0 ]; then

@@ -10,7 +10,10 @@ bash ./local/data.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
# test model
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
# infer model
CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
# export model
bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model

@@ -1,7 +1,7 @@
# LibriSpeech
## CTC
| Model | Config | Test set | CER |
| Model | Config | Test set | WER |
| --- | --- | --- | --- |
| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |

@@ -0,0 +1,20 @@
#! /usr/bin/env bash
if [ $# != 2 ];then
echo "usage: export ckpt_path jit_model_path"
exit -1
fi
python3 -u ${BIN_DIR}/export.py \
--config conf/deepspeech2.yaml \
--checkpoint_path ${1} \
--export_path ${2}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0

@@ -1,5 +1,10 @@
#! /usr/bin/env bash
if [[ $# != 1 ]]; then
echo "usage: $0 ckpt-path"
exit -1
fi
# download language model
bash local/download_lm_en.sh
if [ $? -ne 0 ]; then

@@ -7,10 +7,13 @@ source path.sh
bash ./local/data.sh
# train model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./local/train.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
# test model
CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
# infer model
CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh
CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
# export model
bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model

@@ -1,7 +1,8 @@
# Tiny Example
1. `source path.sh`
2. `bash run.sh`
3. set `CUDA_VISIBLE_DEVICES` as you need.
2. The demo script is `bash run.sh`. You can also run the commands separately as needed.
## Steps
- Prepare the data
@@ -26,11 +27,7 @@
bash local/infer.sh
```
`infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference:
```bash
bash local/infer_golden.sh
```
`infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference.
- Evaluate an existing model
@@ -40,6 +37,15 @@
`test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. Similarly, you can also download a well-trained model and test its performance:
- Export jit model
```bash
bash local/export.sh ckpt_path saved_jit_model_path
```
- Tune hyperparameters
```bash
bash local/test_golden.sh
bash local/tune.sh
```

@@ -1,17 +1,21 @@
#! /usr/bin/env bash
if [[ $# != 1 ]]; then
echo "usage: $0 ckpt-path"
exit -1
fi
# download language model
bash local/download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
CUDA_VISIBLE_DEVICES=0 \
python3 -u ${BIN_DIR}/infer.py \
--device 'gpu' \
--nproc 1 \
--config conf/deepspeech2.yaml \
--output ckpt
--checkpoint_path ${1}
if [ $? -ne 0 ]; then

@@ -13,7 +13,6 @@ python3 -u ${BIN_DIR}/test.py \
--config conf/deepspeech2.yaml \
--output ckpt
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1

@@ -4,3 +4,4 @@ SoundFile==0.9.0.post1
python_speech_features
tensorboardX
yacs
typeguard
