From c8e96d732ba4a941da9c72cf23d01d058564615c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 2 Oct 2021 09:44:14 +0000 Subject: [PATCH 01/17] bool logical, sum and multiply op; ctc grad norm; support old and new pd api --- deepspeech/models/ds2/conv.py | 14 +++++++-- deepspeech/models/ds2/rnn.py | 6 ++-- deepspeech/models/u2/u2.py | 12 ++++++-- deepspeech/models/u2_st.py | 8 +++-- deepspeech/modules/decoder.py | 8 +++-- deepspeech/modules/encoder.py | 3 +- deepspeech/modules/loss.py | 53 ++++++++++++++++++++------------ deepspeech/modules/mask.py | 16 +++++++--- deepspeech/utils/tensor_utils.py | 10 ++++-- tests/mask_test.py | 4 +-- 10 files changed, 93 insertions(+), 41 deletions(-) diff --git a/deepspeech/models/ds2/conv.py b/deepspeech/models/ds2/conv.py index 9548af0a2..069b7dd4b 100644 --- a/deepspeech/models/ds2/conv.py +++ b/deepspeech/models/ds2/conv.py @@ -41,6 +41,13 @@ def conv_output_size(I, F, P, S): return (I - F + 2 * P - S) // S +# receptive field calculator +# https://fomoro.com/research/article/receptive-field-calculator +# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters +# https://distill.pub/2019/computing-receptive-fields/ +# Rl-1 = Sl * Rl + (Kl - Sl) + + class ConvBn(nn.Layer): """Convolution layer with batch normalization. @@ -106,9 +113,10 @@ class ConvBn(nn.Layer): # reset padding part to 0 masks = make_non_pad_mask(x_len) #[B, T] masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - # https://github.com/PaddlePaddle/Paddle/pull/29265 - # rhs will type promote to lhs - x = x * masks + # TODO(Hui Zhang): not support bool multiply + # masks = masks.type_as(x) + masks = masks.astype(x.dtype) + x = x.multiply(masks) return x, x_len diff --git a/deepspeech/models/ds2/rnn.py b/deepspeech/models/ds2/rnn.py index 3fc52a378..68a3e6e72 100644 --- a/deepspeech/models/ds2/rnn.py +++ b/deepspeech/models/ds2/rnn.py @@ -308,8 +308,8 @@ class RNNStack(nn.Layer): x, x_len = rnn(x, x_len) masks = make_non_pad_mask(x_len) #[B, T] masks = masks.unsqueeze(-1) # [B, T, 1] - # https://github.com/PaddlePaddle/Paddle/pull/29265 - # rhs will type promote to lhs - x = x * masks + # TODO(Hui Zhang): not support bool multiply + masks = masks.astype(x.dtype) + x = x.multiply(masks) return x, x_len diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py index 46bbd102f..e6cd7b5c8 100644 --- a/deepspeech/models/u2/u2.py +++ b/deepspeech/models/u2/u2.py @@ -164,7 +164,10 @@ class U2BaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + #TODO(Hui Zhang): sum not support bool type + #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( + 1) #[B, 1, T] -> [B] # 2a. Attention-decoder branch loss_att = None @@ -319,7 +322,8 @@ class U2BaseModel(nn.Layer): # 2. 
Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: + # TODO(Hui Zhang): if end_flag.sum() == running_size: + if end_flag.cast(paddle.int64).sum() == running_size: break # 2.1 Forward decoder step @@ -405,7 +409,9 @@ class U2BaseModel(nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) maxlen = encoder_out.shape[1] - encoder_out_lens = encoder_mask.squeeze(1).sum(1) + # (TODO Hui Zhang): bool no support reduce_sum + # encoder_out_lens = encoder_mask.squeeze(1).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index 8f87f6daa..bf98423d4 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -165,7 +165,10 @@ class U2STBaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + #TODO(Hui Zhang): sum not support bool type + #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( + 1) #[B, 1, T] -> [B] # 2a. ST-decoder branch start = time.time() @@ -362,7 +365,8 @@ class U2STBaseModel(nn.Layer): # 2. Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - if end_flag.sum() == running_size: + # TODO(Hui Zhang): if end_flag.sum() == running_size: + if end_flag.cast(paddle.int64).sum() == running_size: break # 2.1 Forward decoder step diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index 8ca72894a..1ae3ce371 100644 --- a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -124,7 +124,9 @@ class TransformerDecoder(nn.Layer): # m: (1, L, L) m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) # tgt_mask: (B, L, L) - tgt_mask = tgt_mask & m + # TODO(Hui Zhang): not support & for tensor + # tgt_mask = tgt_mask & m + tgt_mask = tgt_mask.logical_and(m) x, _ = self.embed(tgt) for layer in self.decoders: @@ -135,7 +137,9 @@ class TransformerDecoder(nn.Layer): if self.use_output_layer: x = self.output_layer(x) - olens = tgt_mask.sum(1) + # TODO(Hui Zhang): reduce_sum not support bool type + # olens = tgt_mask.sum(1) + olens = tgt_mask.astype(paddle.int).sum(1) return x, olens def forward_one_step( diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index d4a8275c3..6ffb6465c 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -162,7 +162,8 @@ class BaseEncoder(nn.Layer): xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor masks = masks.astype(paddle.bool) - mask_pad = ~masks + #TODO(Hui Zhang): mask_pad = ~masks + mask_pad = masks.logical_not() chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, decoding_chunk_size, self.static_chunk_size, diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 1f33e5125..df5298ea3 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import inspect +from functools import partial + import paddle from paddle import nn from paddle.nn import functional as F @@ -32,18 +35,19 @@ class CTCLoss(nn.Layer): # last token id as blank id self.loss = nn.CTCLoss(blank=blank, reduction=reduction) self.batch_average = batch_average + logger.info( f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}") + logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}") - # instance for norm_by_times - # batch for norm_by_batchsize - # frame for norm_by_total_logits_len assert grad_norm_type in ('instance', 'batch', 'frame', None) self.norm_by_times = False self.norm_by_batchsize = False self.norm_by_total_logits_len = False - logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}") - if grad_norm_type == 'instance': + if grad_norm_type is None: + # no grad norm + pass + elif grad_norm_type == 'instance': self.norm_by_times = True elif grad_norm_type == 'batch': self.norm_by_batchsize = True @@ -51,6 +55,22 @@ class CTCLoss(nn.Layer): self.norm_by_total_logits_len = True else: raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}") + self.kwargs = { + "norm_by_times": self.norm_by_times, + "norm_by_batchsize": self.norm_by_batchsize, + "norm_by_total_logits_len": self.norm_by_total_logits_len, + } + + # Derive only the args which the func has + try: + param = inspect.signature(self.loss.forward).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + _kwargs = {k: v for k, v in self.kwargs.items() if k in param} + _notin = {k: v for k, v in self.kwargs.items() if k not in param} + logger.info(f"{self.loss} kwargs:{_kwargs}, not support: {_notin}") + self.loss_fn = partial(self.loss.forward, **_kwargs) def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. @@ -70,14 +90,7 @@ class CTCLoss(nn.Layer): # logits: (B, L, D) -> (L, B, D) logits = logits.transpose([1, 0, 2]) ys_pad = ys_pad.astype(paddle.int32) - loss = self.loss( - logits, - ys_pad, - hlens, - ys_lens, - norm_by_times=self.norm_by_times, - norm_by_batchsize=self.norm_by_batchsize, - norm_by_total_logits_len=self.norm_by_total_logits_len) + loss = self.loss_fn(logits, ys_pad, hlens, ys_lens) if self.batch_average: # Batch-size average loss = loss / B @@ -118,8 +131,8 @@ class LabelSmoothingLoss(nn.Layer): size (int): the number of class padding_idx (int): padding class id which will be ignored for loss smoothing (float): smoothing rate (0.0 means the conventional CE) - normalize_length (bool): - True, normalize loss by sequence length; + normalize_length (bool): + True, normalize loss by sequence length; False, normalize loss by batch size. Defaults to False. """ @@ -136,7 +149,7 @@ class LabelSmoothingLoss(nn.Layer): The model outputs and data labels tensors are flatten to (batch*seqlen, class) shape and a mask is applied to the padding part which should not be calculated for loss. 
- + Args: x (paddle.Tensor): prediction (batch, seqlen, class) target (paddle.Tensor): @@ -152,7 +165,7 @@ class LabelSmoothingLoss(nn.Layer): # use zeros_like instead of torch.no_grad() for true_dist, # since no_grad() can not be exported by JIT true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) - ignore = (target == self.padding_idx) # (B,) + ignore = target == self.padding_idx # (B,) #TODO(Hui Zhang): target = target * (1 - ignore) # avoid -1 index target = target.masked_fill(ignore, 0) # avoid -1 index @@ -163,8 +176,10 @@ class LabelSmoothingLoss(nn.Layer): kl = self.criterion(F.log_softmax(x, axis=1), true_dist) - total = len(target) - int(ignore.sum()) + #TODO(Hui Zhang): sum not support bool type + #total = len(target) - int(ignore.sum()) + total = len(target) - int(ignore.type_as(target).sum()) denom = total if self.normalize_length else B - #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum() + #numer = (kl * (1 - ignore)).sum() numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum() return numer / denom diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index 6d46f5ba0..00f228a2b 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -69,7 +69,8 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: [1, 1, 1, 0, 0], [1, 1, 0, 0, 0]] """ - return ~make_pad_mask(lengths) + #return ~make_pad_mask(lengths) + return make_pad_mask(lengths).logical_not() def subsequent_mask(size: int) -> paddle.Tensor: @@ -91,7 +92,12 @@ def subsequent_mask(size: int) -> paddle.Tensor: [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=paddle.bool) - return paddle.tril(ret) + #TODO(Hui Zhang): tril not support bool + #return paddle.tril(ret) + ret = ret.astype(paddle.float) + ret = paddle.tril(ret) + ret = ret.astype(paddle.bool) + return ret def subsequent_chunk_mask( @@ -180,13 +186,15 @@ def add_optional_chunk_mask(xs: paddle.Tensor, chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size, num_left_chunks) # (L, L) chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) + # chunk_masks = masks & chunk_masks # (B, L, L) + chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) elif static_chunk_size > 0: num_left_chunks = num_decoding_left_chunks chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size, num_left_chunks) # (L, L) chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - chunk_masks = masks & chunk_masks # (B, L, L) + # chunk_masks = masks & chunk_masks # (B, L, L) + chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) else: chunk_masks = masks return chunk_masks diff --git a/deepspeech/utils/tensor_utils.py b/deepspeech/utils/tensor_utils.py index 0050794c7..0cc03b193 100644 --- a/deepspeech/utils/tensor_utils.py +++ b/deepspeech/utils/tensor_utils.py @@ -183,7 +183,13 @@ def th_accuracy(pad_outputs: paddle.Tensor, pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]).argmax(2) mask = pad_targets != ignore_label - numerator = paddle.sum( + #TODO(Hui Zhang): sum not support bool type + # numerator = paddle.sum( + # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) + numerator = ( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - denominator = paddle.sum(mask) + numerator = paddle.sum(numerator.type_as(pad_targets)) + #TODO(Hui Zhang): sum not support bool type + # denominator = paddle.sum(mask) + denominator = paddle.sum(mask.type_as(pad_targets)) return float(numerator) / float(denominator) diff 
--git a/tests/mask_test.py b/tests/mask_test.py
index dbe8c4b09..f44aca8fc 100644
--- a/tests/mask_test.py
+++ b/tests/mask_test.py
@@ -37,13 +37,13 @@ class TestU2Model(unittest.TestCase):
 
     def test_make_non_pad_mask(self):
         res = make_non_pad_mask(self.lengths)
-        res2 = ~make_pad_mask(self.lengths)
+        res2 = make_pad_mask(self.lengths).logical_not()
         self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
         self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())
 
     def test_make_pad_mask(self):
         res = make_pad_mask(self.lengths)
-        res1 = ~make_non_pad_mask(self.lengths)
+        res1 = make_non_pad_mask(self.lengths).logical_not()
         self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())
         self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())
 

From f4f2d6f07e225a254659939d12b1f9cb745e260c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 2 Oct 2021 10:22:18 +0000
Subject: [PATCH 02/17] print deps module version

---
 deepspeech/training/trainer.py |  2 ++
 deepspeech/utils/utility.py    | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 35b1690b8..75652ead6 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -27,6 +27,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import all_version
 from deepspeech.utils.utility import seed_all
 from deepspeech.utils.utility import UpdateConfig
 
@@ -103,6 +104,7 @@ class Trainer():
         self.epoch = 0
         self.rank = dist.get_rank()
 
+        all_version()
         logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
 
         if args.seed:
diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py
index 6f84c41be..159b686e0 100644
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@@ -16,17 +16,36 @@ import distutils.util
 import math
 import os
 import random
+import sys
 from contextlib import contextmanager
 from typing import List
 
 import numpy as np
 import paddle
+import soundfile
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
 
 __all__ = [
-    "UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
+    "all_version", "UpdateConfig", "seed_all", 'print_arguments',
+    'add_arguments', "log_add"
 ]
 
 
+def all_version():
+    vers = {
+        "python": sys.version,
+        "paddle": paddle.__version__,
+        "paddle_commit": paddle.version.commit,
+        "soundfile": soundfile.__version__,
+    }
+    logger.info("Deps Module Version:")
+    for k, v in vers.items():
+        logger.info(f"{k}: {v}")
+
+
 @contextmanager
 def UpdateConfig(config):
    """Update yacs config"""

From 913ea33f29377e8f600c5f49df83974e33a319cc Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 2 Oct 2021 10:22:34 +0000
Subject: [PATCH 03/17] remove useless conv and rnn modules

---
 deepspeech/modules/conv.py | 170 --------------------
 deepspeech/modules/rnn.py  | 314 ------------------------------------
 2 files changed, 484 deletions(-)
 delete mode 100644 deepspeech/modules/conv.py
 delete mode 100644 deepspeech/modules/rnn.py

diff --git a/deepspeech/modules/conv.py b/deepspeech/modules/conv.py
deleted file mode 100644
index 22a168800..000000000
--- a/deepspeech/modules/conv.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['ConvStack', "conv_output_size"] - - -def conv_output_size(I, F, P, S): - # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters - # Output size after Conv: - # By noting I the length of the input volume size, - # F the length of the filter, - # P the amount of zero padding, - # S the stride, - # then the output size O of the feature map along that dimension is given by: - # O = (I - F + Pstart + Pend) // S + 1 - # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. - # When Pstart == Pend == 0 - # O = (I - F - S) // S - # https://iq.opengenus.org/output-size-of-convolution/ - # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 - # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 - return (I - F + 2 * P - S) // S - - -# receptive field calculator -# https://fomoro.com/research/article/receptive-field-calculator -# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters -# https://distill.pub/2019/computing-receptive-fields/ -# Rl-1 = Sl * Rl + (Kl - Sl) - - -class ConvBn(nn.Layer): - """Convolution layer with batch normalization. - - :param kernel_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type kernel_size: int|tuple|list - :param num_channels_in: Number of input channels. - :type num_channels_in: int - :param num_channels_out: Number of output channels. - :type num_channels_out: int - :param stride: The x dimension of the stride. Or input a tuple for two - image dimension. - :type stride: int|tuple|list - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension. - :type padding: int|tuple|list - :param act: Activation type, relu|brelu - :type act: string - :return: Batch norm layer after convolution layer. 
- :rtype: Variable - - """ - - def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, - padding, act): - - super().__init__() - assert len(kernel_size) == 2 - assert len(stride) == 2 - assert len(padding) == 2 - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - - self.conv = nn.Conv2D( - num_channels_in, - num_channels_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - weight_attr=None, - bias_attr=False, - data_format='NCHW') - - self.bn = nn.BatchNorm2D( - num_channels_out, - weight_attr=None, - bias_attr=None, - data_format='NCHW') - self.act = F.relu if act == 'relu' else brelu - - def forward(self, x, x_len): - """ - x(Tensor): audio, shape [B, C, D, T] - """ - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - - x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] - ) // self.stride[1] + 1 - - # reset padding part to 0 - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - # https://github.com/PaddlePaddle/Paddle/pull/29265 - # rhs will type promote to lhs - x = x * masks - return x, x_len - - -class ConvStack(nn.Layer): - """Convolution group with stacked convolution layers. - - :param feat_size: audio feature dim. - :type feat_size: int - :param num_stacks: Number of stacked convolution layers. - :type num_stacks: int - """ - - def __init__(self, feat_size, num_stacks): - super().__init__() - self.feat_size = feat_size # D - self.num_stacks = num_stacks - - self.conv_in = ConvBn( - num_channels_in=1, - num_channels_out=32, - kernel_size=(41, 11), #[D, T] - stride=(2, 3), - padding=(20, 5), - act='brelu') - - out_channel = 32 - convs = [ - ConvBn( - num_channels_in=32, - num_channels_out=out_channel, - kernel_size=(21, 11), - stride=(2, 1), - padding=(10, 5), - act='brelu') for i in range(num_stacks - 1) - ] - self.conv_stack = nn.LayerList(convs) - - # conv output feat_dim - output_height = (feat_size - 1) // 2 + 1 - for i in range(self.num_stacks - 1): - output_height = (output_height - 1) // 2 + 1 - self.output_height = out_channel * output_height - - def forward(self, x, x_len): - """ - x: shape [B, C, D, T] - x_len : shape [B] - """ - x, x_len = self.conv_in(x, x_len) - for i, conv in enumerate(self.conv_stack): - x, x_len = conv(x, x_len) - return x, x_len diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py deleted file mode 100644 index 8f8b2a18d..000000000 --- a/deepspeech/modules/rnn.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import paddle -from paddle import nn -from paddle.nn import functional as F -from paddle.nn import initializer as I - -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['RNNStack'] - - -class RNNCell(nn.RNNCellBase): - r""" - Elman RNN (SimpleRNN) cell. 
Given the inputs and previous states, it - computes the outputs and updates states. - The formula used is as follows: - .. math:: - h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) - y_{t} & = h_{t} - - where :math:`act` is for :attr:`activation`. - """ - - def __init__(self, - hidden_size: int, - activation="tanh", - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - if activation not in ["tanh", "relu", "brelu"]: - raise ValueError( - "activation for SimpleRNNCell should be tanh or relu, " - "but get {}".format(activation)) - self.activation = activation - self._activation_fn = paddle.tanh \ - if activation == "tanh" \ - else F.relu - if activation == 'brelu': - self._activation_fn = brelu - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - pre_h = states - i2h = inputs - if self.bias_ih is not None: - i2h += self.bias_ih - h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h2h += self.bias_hh - h = self._activation_fn(i2h + h2h) - return h, h - - @property - def state_shape(self): - return (self.hidden_size, ) - - -class GRUCell(nn.RNNCellBase): - r""" - Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, - it computes the outputs and updates states. - The formula for GRU used is as follows: - .. math:: - r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) - z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) - \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) - h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} - y_{t} & = h_{t} - - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise - multiplication operator. 
- """ - - def __init__(self, - input_size: int, - hidden_size: int, - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (3 * hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (3 * hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = F.sigmoid - self._activation = paddle.tanh - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - - pre_hidden = states - x_gates = inputs - if self.bias_ih is not None: - x_gates = x_gates + self.bias_ih - h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h_gates = h_gates + self.bias_hh - - x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) - h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) - - r = self._gate_activation(x_r + h_r) - z = self._gate_activation(x_z + h_z) - c = self._activation(x_c + r * h_c) # apply reset gate after mm - h = (pre_hidden - c) * z + c - # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru - - return h, h - - @property - def state_shape(self): - r""" - The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). The shape corresponds - to the shape of :math:`h_{t-1}`. - """ - return (self.hidden_size, ) - - -class BiRNNWithBN(nn.Layer): - """Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param size: Dimension of RNN cells. - :type size: int - :param share_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - :type share_weights: bool - :return: Bidirectional simple rnn layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int, share_weights: bool): - super().__init__() - self.share_weights = share_weights - if self.share_weights: - #input-hidden weights shared between bi-directional rnn. 
- self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - # batch norm is only performed on input-state projection - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = self.fw_fc - self.bw_bn = self.fw_bn - else: - self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - - self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class BiGRUWithBN(nn.Layer): - """Bidirectonal gru layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer. - :type name: string - :param input: Input layer. - :type input: Variable - :param size: Dimension of GRU cells. - :type size: int - :param act: Activation type. - :type act: string - :return: Bidirectional GRU layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int): - super().__init__() - hidden_size = h_size * 3 - - self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - - self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x, x_len): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class RNNStack(nn.Layer): - """RNN group with stacked bidirectional simple RNN or GRU layers. - - :param input: Input layer. - :type input: Variable - :param size: Dimension of RNN cells in each layer. - :type size: int - :param num_stacks: Number of stacked rnn layers. - :type num_stacks: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: Output layer of the RNN group. 
- :rtype: Variable - """ - - def __init__(self, - i_size: int, - h_size: int, - num_stacks: int, - use_gru: bool, - share_rnn_weights: bool): - super().__init__() - rnn_stacks = [] - for i in range(num_stacks): - if use_gru: - #default:GRU using tanh - rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) - else: - rnn_stacks.append( - BiRNNWithBN( - i_size=i_size, - h_size=h_size, - share_weights=share_rnn_weights)) - i_size = h_size * 2 - - self.rnn_stacks = nn.LayerList(rnn_stacks) - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - """ - x: shape [B, T, D] - x_len: shpae [B] - """ - for i, rnn in enumerate(self.rnn_stacks): - x, x_len = rnn(x, x_len) - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(-1) # [B, T, 1] - # https://github.com/PaddlePaddle/Paddle/pull/29265 - # rhs will type promote to lhs - x = x * masks - return x, x_len From b4e16eb815c16e33a5398ae362d59800b5250e97 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 2 Oct 2021 12:39:33 +0000 Subject: [PATCH 04/17] revert ctc_utils that can work with paddle 2.1.2 and new --- deepspeech/utils/ctc_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index fc43a71f0..70d99e6c2 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py @@ -87,14 +87,16 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1) log_alpha = log_alpha - float('inf') # log of zero + # TODO(Hui Zhang): zeros not support paddle.int16 # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 state_path = (paddle.zeros( (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1 ) # state path, Tuple((T, 2L+1)) # init start state - log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb - log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb for t in range(1, ctc_probs.shape[0]): # T for s in range(len(y_insert_blank)): # 2L+1 @@ -110,9 +112,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, log_alpha[t - 1, s - 2], ]) prev_state = [s, s - 1, s - 2] - log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][ - y_insert_blank[s]] + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( + y_insert_blank[s])] state_path[t, s] = prev_state[paddle.argmax(candidates)] + # TODO(Hui Zhang): zeros not support paddle.int16 # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16 state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32) From 466672e1de6c77c1b18109332d1e1b72f341fa6a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 2 Oct 2021 12:44:27 +0000 Subject: [PATCH 05/17] no_sync if paddle support else nullcontext --- deepspeech/exps/deepspeech2/model.py | 3 ++- deepspeech/exps/u2/model.py | 3 ++- deepspeech/exps/u2_kaldi/model.py | 3 ++- deepspeech/exps/u2_st/model.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index e84de6157..3dc8286d2 100644 --- 
a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -87,7 +87,8 @@ class DeepSpeech2Trainer(Trainer): # Disable gradient synchronizations across DDP processes. # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. - context = self.model.no_sync + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext else: # Used for single gpu training and DDP gradient synchronization # processes. diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 9cb3fa3cf..65ec5174f 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -106,7 +106,8 @@ class U2Trainer(Trainer): # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. # When using cpu w/o DDP, model does not have `no_sync` - context = self.model.no_sync if self.parallel else nullcontext + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext else: # Used for single gpu training and DDP gradient synchronization # processes. diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index d38afe25c..5a72e44d8 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -105,7 +105,8 @@ class U2Trainer(Trainer): # Disable gradient synchronizations across DDP processes. # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. - context = self.model.no_sync + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext else: # Used for single gpu training and DDP gradient synchronization # processes. diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index c480499c7..08060d975 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -110,7 +110,8 @@ class U2STTrainer(Trainer): # Disable gradient synchronizations across DDP processes. # Within this context, gradients will be accumulated on module # variables, which will later be synchronized. - context = self.model.no_sync + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext else: # Used for single gpu training and DDP gradient synchronization # processes. 
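A side note on the `no_sync` guard introduced in the patch above: `no_sync` only exists on DataParallel-wrapped models, so single-process (single GPU or CPU) runs fall back to `nullcontext`. A minimal standalone sketch of the same pattern, in Python; the helper name `grad_sync_context` is illustrative and not part of the repo:

    from contextlib import nullcontext

    def grad_sync_context(model, parallel: bool):
        # During gradient-accumulation steps, `no_sync` defers the DDP
        # all-reduce; `nullcontext` is a no-op stand-in when the model
        # is not wrapped (single GPU or CPU).
        return model.no_sync if (hasattr(model, "no_sync") and
                                 parallel) else nullcontext

    # usage inside a training step:
    #   with grad_sync_context(self.model, self.parallel)():
    #       loss.backward()
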
From 8977f1850e8a0d8088884ad20ec2466efae6c86f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 2 Oct 2021 13:13:15 +0000
Subject: [PATCH 06/17] close editdistance package format warning

---
 deepspeech/utils/error_rate.py | 4 ++++
 requirements.txt               | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/deepspeech/utils/error_rate.py b/deepspeech/utils/error_rate.py
index 6fd593ebb..81f458b6e 100644
--- a/deepspeech/utils/error_rate.py
+++ b/deepspeech/utils/error_rate.py
@@ -19,6 +19,8 @@ import numpy as np
 
 __all__ = ['word_errors', 'char_errors', 'wer', 'cer']
 
+editdistance.eval("a", "b")
+
 
 def _levenshtein_distance(ref, hyp):
     """Levenshtein distance is a string metric for measuring the difference
@@ -90,6 +92,7 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
     hyp_words = list(filter(None, hypothesis.split(delimiter)))
 
     edit_distance = _levenshtein_distance(ref_words, hyp_words)
+    # `editdistance.eval` precision is less than `_levenshtein_distance`
     # edit_distance = editdistance.eval(ref_words, hyp_words)
     return float(edit_distance), len(ref_words)
 
@@ -121,6 +124,7 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
     hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))
 
     edit_distance = _levenshtein_distance(reference, hypothesis)
+    # `editdistance.eval` precision is less than `_levenshtein_distance`
    # edit_distance = editdistance.eval(reference, hypothesis)
     return float(edit_distance), len(reference)
 
diff --git a/requirements.txt b/requirements.txt
index 9ecf6bbd8..332b52388 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 coverage
+editdistance
 gpustat
 jsonlines
 kaldiio
@@ -19,4 +20,3 @@ tqdm
 typeguard
 visualdl==2.2.0
 yacs
-editdistance
\ No newline at end of file

From 37563d975e1063bd42899651aa3fbaea3165a04b Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 5 Oct 2021 09:40:00 +0000
Subject: [PATCH 07/17] ds2 model_type more info

---
 deepspeech/exps/deepspeech2/bin/export.py      | 5 ++---
 deepspeech/exps/deepspeech2/bin/test.py        | 5 ++---
 deepspeech/exps/deepspeech2/bin/test_export.py | 5 ++---
 deepspeech/exps/deepspeech2/bin/test_hub.py    | 7 +++----
 deepspeech/exps/deepspeech2/bin/train.py       | 5 ++---
 examples/1xt2x/src_deepspeech2x/bin/test.py    | 5 ++---
 6 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py
index 7962d4fc0..d92ed4def 100644
--- a/deepspeech/exps/deepspeech2/bin/export.py
+++ b/deepspeech/exps/deepspeech2/bin/export.py
@@ -33,10 +33,9 @@ if __name__ == "__main__":
     # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument("--model_type")
+    parser.add_argument(
+        "--model_type", type=str, default='offline', help='offline/online')
     args = parser.parse_args()
-    if args.model_type is None:
-        args.model_type = 'offline'
     print("model_type:{}".format(args.model_type))
     print_arguments(args)
 
diff --git a/deepspeech/exps/deepspeech2/bin/test.py b/deepspeech/exps/deepspeech2/bin/test.py
index f2fd3a394..7fbdab451 100644
--- a/deepspeech/exps/deepspeech2/bin/test.py
+++ b/deepspeech/exps/deepspeech2/bin/test.py
@@ -30,14 +30,13 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument("--model_type")
+    parser.add_argument(
+        "--model_type", type=str, default='offline', help='offline/online')
     # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of 
save the asr result") args = parser.parse_args() print_arguments(args, globals()) - if args.model_type is None: - args.model_type = 'offline' print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/bin/test_export.py b/deepspeech/exps/deepspeech2/bin/test_export.py index 7a012144d..be1a8479a 100644 --- a/deepspeech/exps/deepspeech2/bin/test_export.py +++ b/deepspeech/exps/deepspeech2/bin/test_export.py @@ -36,11 +36,10 @@ if __name__ == "__main__": #load jit model from parser.add_argument( "--export_path", type=str, help="path of the jit model to save") - parser.add_argument("--model_type") + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') args = parser.parse_args() print_arguments(args, globals()) - if args.model_type is None: - args.model_type = 'offline' print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py index 892679600..181e4ac31 100644 --- a/deepspeech/exps/deepspeech2/bin/test_hub.py +++ b/deepspeech/exps/deepspeech2/bin/test_hub.py @@ -177,15 +177,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument("--model_type") - parser.add_argument("--audio_file") + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') + parser.add_argument("--audio_file", type=str, help='audio file path.') # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) - if args.model_type is None: - args.model_type = 'offline' if not os.path.isfile(args.audio_file): print("Please input the audio file path") sys.exit(-1) diff --git a/deepspeech/exps/deepspeech2/bin/train.py b/deepspeech/exps/deepspeech2/bin/train.py index 6740f288f..02aefe3df 100644 --- a/deepspeech/exps/deepspeech2/bin/train.py +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -35,10 +35,9 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument("--model_type") + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') args = parser.parse_args() - if args.model_type is None: - args.model_type = 'offline' print("model_type:{}".format(args.model_type)) print_arguments(args, globals()) diff --git a/examples/1xt2x/src_deepspeech2x/bin/test.py b/examples/1xt2x/src_deepspeech2x/bin/test.py index 59e1b38dd..be3f9822f 100644 --- a/examples/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/1xt2x/src_deepspeech2x/bin/test.py @@ -31,14 +31,13 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument("--model_type") + parser.add_argument( + "--model_type", type=str, default='offline', help='offline/online') # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) - if args.model_type is None: - args.model_type = 'offline' print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html From b291c693868b65473f032699ff1f381131a1acfc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 5 Oct 2021 09:41:38 +0000 Subject: [PATCH 08/17] add checkpoint to save parameters --- deepspeech/training/trainer.py | 6 +++--- deepspeech/utils/checkpoint.py | 14 +++++++------- 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 75652ead6..c3e1bec88 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -163,9 +163,9 @@ class Trainer():
                 "epoch": self.epoch,
                 "lr": self.optimizer.get_lr()
             })
-        self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration
-                                       if tag is None else tag, self.model,
-                                       self.optimizer, infos)
+        self.checkpoint.save_parameters(self.checkpoint_dir, self.iteration
+                                        if tag is None else tag, self.model,
+                                        self.optimizer, infos)
 
     def resume_or_scratch(self):
         """Resume from latest checkpoint at checkpoints in the output
diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py
index 8e31edfae..796cafe04 100644
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
@@ -39,13 +39,13 @@ class Checkpoint():
         self.latest_n = latest_n
         self._save_all = (kbest_n == -1)
 
-    def add_checkpoint(self,
-                       checkpoint_dir,
-                       tag_or_iteration: Union[int, Text],
-                       model: paddle.nn.Layer,
-                       optimizer: Optimizer=None,
-                       infos: dict=None,
-                       metric_type="val_loss"):
+    def save_parameters(self,
+                        checkpoint_dir,
+                        tag_or_iteration: Union[int, Text],
+                        model: paddle.nn.Layer,
+                        optimizer: Optimizer=None,
+                        infos: dict=None,
+                        metric_type="val_loss"):
         """Save checkpoint in best_n and latest_n.
 
         Args:

From 1b67bd09e99a53fd8e8c2f43508448d1711bd948 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 5 Oct 2021 09:42:06 +0000
Subject: [PATCH 09/17] fix env.sh PATH position

---
 env.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/env.sh b/env.sh
index 461586e7d..c7754d65f 100644
--- a/env.sh
+++ b/env.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=${PWD}
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:/usr/local/bin:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}:/usr/local/bin/
 export LC_ALL=C
 
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C

From 86a221d58f8f67473f19d833a984273e25550b0a Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 5 Oct 2021 09:45:53 +0000
Subject: [PATCH 10/17] fix autolog: add cuda device env when it is not set

---
 deepspeech/utils/log.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py
index 7e8de600a..0b5088544 100644
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@@ -120,14 +120,14 @@ class Autolog:
                  model_precision="fp32"):
         import auto_log
         pid = os.getpid()
-        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+        if os.environ.get('CUDA_VISIBLE_DEVICES', None):
             gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
             infer_config = inference.Config()
             infer_config.enable_use_gpu(100, gpu_id)
         else:
             gpu_id = None
             infer_config = inference.Config()
-        autolog = auto_log.AutoLogger(
+        self.autolog = auto_log.AutoLogger(
             model_name=model_name,
             model_precision=model_precision,
             batch_size=batch_size,
@@ -139,7 +139,6 @@ class Autolog:
             gpu_ids=gpu_id,
             time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
             warmup=0)
-        self.autolog = autolog
 
     def getlog(self):
         return self.autolog

From 269101323233c681a58c331baa16c2f15fca2d6e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 8 Oct 2021 03:24:51 +0000
Subject: [PATCH 11/17] ctcloss can work w/ paddle 2.1.2, but loss is larger than before

---
 deepspeech/__init__.py     | 28 ++++++++++++++++++++++++++++
 deepspeech/modules/loss.py |  9 +++++----
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/deepspeech/__init__.py 
b/deepspeech/__init__.py index 5505ecbf0..493f10a6f 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -353,3 +353,31 @@ if not hasattr(paddle.Tensor, 'tolist'): logger.debug( "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) + + +# hack loss +def ctc_loss(logits, + labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean', + norm_by_times=True): + #logger.info("my ctc loss with norm by times") + ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403 + loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times, + input_lengths, label_lengths) + + loss_out = paddle.fluid.layers.squeeze(loss_out, [-1]) + assert reduction in ['mean', 'sum', 'none'] + if reduction == 'mean': + loss_out = paddle.mean(loss_out / label_lengths) + elif reduction == 'sum': + loss_out = paddle.sum(loss_out) + return loss_out + + +logger.debug( + "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!" +) +F.ctc_loss = ctc_loss diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index df5298ea3..71ecd2662 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -67,10 +67,10 @@ class CTCLoss(nn.Layer): except ValueError: # Some function, e.g. built-in function, are failed param = {} - _kwargs = {k: v for k, v in self.kwargs.items() if k in param} + self._kwargs = {k: v for k, v in self.kwargs.items() if k in param} _notin = {k: v for k, v in self.kwargs.items() if k not in param} - logger.info(f"{self.loss} kwargs:{_kwargs}, not support: {_notin}") - self.loss_fn = partial(self.loss.forward, **_kwargs) + logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}") + #self.loss_fn = partial(self.loss.forward, **_kwargs) def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. 
@@ -90,7 +90,8 @@ class CTCLoss(nn.Layer):
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         ys_pad = ys_pad.astype(paddle.int32)
-        loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
+        #loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(logits, ys_pad, hlens, ys_lens)
         if self.batch_average:
             # Batch-size average
             loss = loss / B

From eb4b38926201fa1c3b23959ef9b4cbf22150abb9 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 9 Oct 2021 10:11:51 +0000
Subject: [PATCH 12/17] more log; refactor ctc decoders; rm useless code

---
 .../decoders/swig/ctc_beam_search_decoder.cpp |  1 -
 deepspeech/decoders/swig/decoder_utils.h      |  2 +
 deepspeech/decoders/swig/scorer.cpp           |  1 -
 deepspeech/exps/deepspeech2/bin/export.py     |  2 +-
 deepspeech/exps/deepspeech2/bin/test_hub.py   |  2 +-
 deepspeech/frontend/augmentor/augmentation.py | 11 ++-
 deepspeech/frontend/augmentor/spec_augment.py |  8 +-
 .../frontend/featurizer/speech_featurizer.py  |  2 +
 .../frontend/featurizer/text_featurizer.py    | 23 ++++-
 deepspeech/frontend/utility.py                |  6 +-
 deepspeech/models/ds2/deepspeech2.py          | 58 +++++------
 deepspeech/models/ds2_online/deepspeech2.py   | 96 ++++++++-----------
 deepspeech/utils/log.py                       |  1 +
 13 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
index 1a37dd1ce..8469a194d 100644
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@@ -28,7 +28,6 @@
 #include "path_trie.h"
 
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
-const std::string kSPACE = "<space>";
 
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
diff --git a/deepspeech/decoders/swig/decoder_utils.h b/deepspeech/decoders/swig/decoder_utils.h
index a874e439f..96399c778 100644
--- a/deepspeech/decoders/swig/decoder_utils.h
+++ b/deepspeech/decoders/swig/decoder_utils.h
@@ -15,10 +15,12 @@
 #ifndef DECODER_UTILS_H_
 #define DECODER_UTILS_H_
 
+#include <string>
 #include <utility>
 
 #include "fst/log.h"
 #include "path_trie.h"
 
+const std::string kSPACE = "<space>";
 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();
diff --git a/deepspeech/decoders/swig/scorer.cpp b/deepspeech/decoders/swig/scorer.cpp
index ebb9e448d..7bd6542df 100644
--- a/deepspeech/decoders/swig/scorer.cpp
+++ b/deepspeech/decoders/swig/scorer.cpp
@@ -26,7 +26,6 @@
 #include "decoder_utils.h"
 
 using namespace lm::ngram;
-const std::string kSPACE = "<space>";
 
 Scorer::Scorer(double alpha,
                double beta,
diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py
index d92ed4def..ab5251d55 100644
--- a/deepspeech/exps/deepspeech2/bin/export.py
+++ b/deepspeech/exps/deepspeech2/bin/export.py
@@ -34,7 +34,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
     parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
+        "--model_type", type=str, default='offline', help="offline/online")
     args = parser.parse_args()
     print("model_type:{}".format(args.model_type))
     print_arguments(args)
diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py
index 181e4ac31..1cf24bb03 100644
--- a/deepspeech/exps/deepspeech2/bin/test_hub.py
+++ b/deepspeech/exps/deepspeech2/bin/test_hub.py
@@ -179,7 +179,7 @@ if __name__ == "__main__":
     parser = default_argument_parser()
     parser.add_argument(
         "--model_type", type=str, 
default='offline', help='offline/online') - parser.add_argument("--audio_file", type=str, help='audio file path.') + parser.add_argument("--audio_file", type=str, help='audio file path') # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index 17abcf605..0de81333e 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -15,6 +15,7 @@ import json from collections.abc import Sequence from inspect import signature +from pprint import pformat import numpy as np @@ -22,10 +23,10 @@ from deepspeech.frontend.augmentor.base import AugmentorBase from deepspeech.utils.dynamic_import import dynamic_import from deepspeech.utils.log import Log -__all__ = ["AugmentationPipeline"] - logger = Log(__name__).getlog() +__all__ = ["AugmentationPipeline"] + import_alias = dict( volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor", shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor", @@ -111,6 +112,8 @@ class AugmentationPipeline(): 'audio') self._spec_augmentors, self._spec_rates = self._parse_pipeline_from( 'feature') + logger.info( + f"Augmentation: {pformat(list(zip(self._augmentors, self._rates)))}") def __call__(self, xs, uttid_list=None, **kwargs): if not isinstance(xs, Sequence): @@ -197,8 +200,10 @@ class AugmentationPipeline(): aug_confs = audio_confs elif aug_type == 'feature': aug_confs = feature_confs - else: + elif aug_type == 'all': aug_confs = all_confs + else: + raise ValueError(f"Not support: {aug_type}") augmentors = [ self._get_augmentor(config["type"], config["params"]) diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 26c94d416..e78f6f6ad 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -29,10 +29,10 @@ class SpecAugmentor(AugmentorBase): SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition https://arxiv.org/abs/1904.08779 - + SpecAugment on Large Scale Datasets https://arxiv.org/abs/1912.05533 - + """ def __init__(self, @@ -61,7 +61,7 @@ class SpecAugmentor(AugmentorBase): adaptive_size_ratio (float): adaptive size ratio for time masking max_n_time_masks (int): maximum number of time masking replace_with_zero (bool): pad zero on mask if true else use mean - warp_mode (str): "PIL" (default, fast, not differentiable) + warp_mode (str): "PIL" (default, fast, not differentiable) or "sparse_image_warp" (slow, differentiable) """ super().__init__() @@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase): return self._time_mask def __repr__(self): - return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}" + return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}" def time_warp(self, x, mode='PIL'): """time warp for spec augment diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 7471d164a..256871408 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -51,12 +51,14 @@ class SpeechFeaturizer(): use_dB_normalization=use_dB_normalization, target_dB=target_dB, dither=dither) + self.feature_size = self.audio_feature.feature_size self.text_feature = TextFeaturizer( unit_type=unit_type, 
             vocab_filepath=vocab_filepath,
             spm_model_prefix=spm_model_prefix,
             maskctc=maskctc)
+        self.vocab_size = self.text_feature.vocab_size
 
     def featurize(self, speech_segment, keep_transcription_text):
         """Extract features for speech segment.
diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py
index 10ea69244..ac129b0f7 100644
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -12,12 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains the text featurizer class."""
+from pprint import pformat
+
 import sentencepiece as spm
 
+from ..utility import BLANK
 from ..utility import EOS
 from ..utility import load_dict
+from ..utility import MASKCTC
+from ..utility import SOS
 from ..utility import SPACE
 from ..utility import UNK
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
 
 __all__ = ["TextFeaturizer"]
 
@@ -76,7 +84,7 @@ class TextFeaturizer():
         """Convert text string to a list of token indices.
 
         Args:
-            text (str): Text.
+            text (str): Text to process.
 
         Returns:
             List[int]: List of token indices.
@@ -199,13 +207,24 @@ class TextFeaturizer():
         """Load vocabulary from file."""
         vocab_list = load_dict(vocab_filepath, maskctc)
         assert vocab_list is not None
+        logger.info(f"Vocab: {pformat(vocab_list)}")
 
         id2token = dict(
             [(idx, token) for (idx, token) in enumerate(vocab_list)])
         token2id = dict(
             [(token, idx) for (idx, token) in enumerate(vocab_list)])
 
+        blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
+        maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
         unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
         eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
-
+        sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
+        space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
+
+        logger.info(f"BLANK id: {blank_id}")
+        logger.info(f"UNK id: {unk_id}")
+        logger.info(f"EOS id: {eos_id}")
+        logger.info(f"SOS id: {sos_id}")
+        logger.info(f"SPACE id: {space_id}")
+        logger.info(f"MASKCTC id: {maskctc_id}")
         return token2id, id2token, vocab_list, unk_id, eos_id
 
diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index f5fc3097e..f83f1d4e1 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -49,7 +49,11 @@ def load_dict(dict_path: Optional[Text],
               maskctc=False) -> Optional[List[Text]]:
     with open(dict_path, "r") as f:
         dictionary = f.readlines()
-    char_list = [entry.strip().split(" ")[0] for entry in dictionary]
+    # first token is `<blank>`
+    # multi line: `<blank> 0\n`
+    # one line: `<blank>`
+    # space is replaced with <space>
+    char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
     if BLANK not in char_list:
         char_list.insert(0, BLANK)
     if EOS not in char_list:
diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py
index dda26358b..a2aa31f7f 100644
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@@ -218,14 +218,18 @@ class DeepSpeech2Model(nn.Layer):
             DeepSpeech2Model
                 The model built from pretrained result. 
""" - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id) + model = cls( + #feat_size=dataloader.collate_fn.feature_size, + feat_size=dataloader.dataset.feature_size, + #dict_size=dataloader.collate_fn.vocab_size, + dict_size=dataloader.dataset.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights, + blank_id=config.model.blank_id, + ctc_grad_norm_type=config.model.ctc_grad_norm_type, ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -244,36 +248,22 @@ class DeepSpeech2Model(nn.Layer): DeepSpeech2Model The model built from config. """ - model = cls(feat_size=config.feat_size, - dict_size=config.dict_size, - num_conv_layers=config.num_conv_layers, - num_rnn_layers=config.num_rnn_layers, - rnn_size=config.rnn_layer_size, - use_gru=config.use_gru, - share_rnn_weights=config.share_rnn_weights, - blank_id=config.blank_id) + model = cls( + feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, + ctc_grad_norm_type=config.ctc_grad_norm_type, ) return model class DeepSpeech2InferModel(DeepSpeech2Model): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True, - blank_id=0): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - blank_id=blank_id) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def forward(self, audio, audio_len): """export model function diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 29d207c44..52e0c7b17 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -255,22 +255,24 @@ class DeepSpeech2ModelOnline(nn.Layer): fc_layers_size_list=[512, 256], use_gru=True, #Use gru if set True. Use simple rnn if set False. 
blank_id=0, # index of blank in vocob.txt - )) + ctc_grad_norm_type='instance', )) if config is not None: config.merge_from_other_cfg(default) return default - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=4, - rnn_size=1024, - rnn_direction='forward', - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=False, - blank_id=0): + def __init__( + self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=4, + rnn_size=1024, + rnn_direction='forward', + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False, + blank_id=0, + ctc_grad_norm_type='instance', ): super().__init__() self.encoder = CRNNEncoder( feat_size=feat_size, @@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer): dropout_rate=0.0, reduction=True, # sum batch_average=True, # sum / batch_size - grad_norm_type='instance') + grad_norm_type=ctc_grad_norm_type) def forward(self, audio, audio_len, text, text_len): """Compute Model loss @@ -348,16 +350,18 @@ class DeepSpeech2ModelOnline(nn.Layer): DeepSpeech2ModelOnline The model built from pretrained result. """ - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id) + model = cls( + feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + rnn_direction=config.model.rnn_direction, + num_fc_layers=config.model.num_fc_layers, + fc_layers_size_list=config.model.fc_layers_size_list, + use_gru=config.model.use_gru, + blank_id=config.model.blank_id, + ctc_grad_norm_type=config.model.ctc_grad_norm_type, ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -376,42 +380,24 @@ class DeepSpeech2ModelOnline(nn.Layer): DeepSpeech2ModelOnline The model built from config. 
""" - model = cls(feat_size=config.feat_size, - dict_size=config.dict_size, - num_conv_layers=config.num_conv_layers, - num_rnn_layers=config.num_rnn_layers, - rnn_size=config.rnn_layer_size, - rnn_direction=config.rnn_direction, - num_fc_layers=config.num_fc_layers, - fc_layers_size_list=config.fc_layers_size_list, - use_gru=config.use_gru, - blank_id=config.blank_id) + model = cls( + feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, + ctc_grad_norm_type=config.ctc_grad_norm_type, ) return model class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=4, - rnn_size=1024, - rnn_direction='forward', - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=False, - blank_id=0): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - rnn_direction=rnn_direction, - num_fc_layers=num_fc_layers, - fc_layers_size_list=fc_layers_size_list, - use_gru=use_gru, - blank_id=blank_id) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box): diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py index 0b5088544..1790efdb1 100644 --- a/deepspeech/utils/log.py +++ b/deepspeech/utils/log.py @@ -127,6 +127,7 @@ class Autolog: else: gpu_id = None infer_config = inference.Config() + self.autolog = auto_log.AutoLogger( model_name=model_name, model_precision=model_precision, From f5ec6e34c6499980dca82918bf7c166289a48f79 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 9 Oct 2021 10:54:31 +0000 Subject: [PATCH 13/17] disable __pycache__ --- examples/1xt2x/aishell/path.sh | 1 + examples/1xt2x/baidu_en8k/path.sh | 1 + examples/1xt2x/librispeech/path.sh | 1 + examples/aishell/s0/path.sh | 1 + examples/aishell/s1/path.sh | 1 + examples/callcenter/s1/path.sh | 1 + examples/librispeech/s0/path.sh | 1 + examples/librispeech/s1/path.sh | 1 + examples/librispeech/s2/path.sh | 1 + examples/ted_en_zh/t0/path.sh | 1 + examples/timit/s1/path.sh | 1 + 11 files changed, 11 insertions(+) diff --git a/examples/1xt2x/aishell/path.sh b/examples/1xt2x/aishell/path.sh index 16a0ad63e..40c7cec23 100644 --- a/examples/1xt2x/aishell/path.sh +++ b/examples/1xt2x/aishell/path.sh @@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/1xt2x/baidu_en8k/path.sh b/examples/1xt2x/baidu_en8k/path.sh index 16a0ad63e..40c7cec23 100644 --- a/examples/1xt2x/baidu_en8k/path.sh +++ b/examples/1xt2x/baidu_en8k/path.sh @@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/1xt2x/librispeech/path.sh 
b/examples/1xt2x/librispeech/path.sh index e95de15b0..a146956ed 100644 --- a/examples/1xt2x/librispeech/path.sh +++ b/examples/1xt2x/librispeech/path.sh @@ -4,6 +4,7 @@ export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/aishell/s0/path.sh b/examples/aishell/s0/path.sh index e6d3a655b..a066676e1 100644 --- a/examples/aishell/s0/path.sh +++ b/examples/aishell/s0/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh index 6807a9505..dd3ccd8e0 100644 --- a/examples/aishell/s1/path.sh +++ b/examples/aishell/s1/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/s1/path.sh index 29841bc10..e4e278da4 100644 --- a/examples/callcenter/s1/path.sh +++ b/examples/callcenter/s1/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/s0/path.sh index 8a9345f2e..5ec461d74 100644 --- a/examples/librispeech/s0/path.sh +++ b/examples/librispeech/s0/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/s1/path.sh index 439f71ae2..581a350a5 100644 --- a/examples/librispeech/s1/path.sh +++ b/examples/librispeech/s1/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/s2/path.sh index 05a037af8..eec437b60 100644 --- a/examples/librispeech/s2/path.sh +++ b/examples/librispeech/s2/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/t0/path.sh index a7f60425f..f6801e436 100644 --- 
a/examples/ted_en_zh/t0/path.sh +++ b/examples/ted_en_zh/t0/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
diff --git a/examples/timit/s1/path.sh b/examples/timit/s1/path.sh index 29841bc10..e4e278da4 100644 --- a/examples/timit/s1/path.sh +++ b/examples/timit/s1/path.sh @@ -3,6 +3,7 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C +export PYTHONDONTWRITEBYTECODE=1 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
From 8b45c3e65e95ccfb4776d2e227f16949d09bc090 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 9 Oct 2021 11:28:15 +0000 Subject: [PATCH 14/17] refactor trainer.py and rm useless dir setup code --- deepspeech/exps/deepspeech2/model.py | 73 -------------- deepspeech/exps/u2/model.py | 48 --------- deepspeech/exps/u2_kaldi/model.py | 48 +-------- deepspeech/exps/u2_st/model.py | 48 --------- deepspeech/modules/loss.py | 1 - deepspeech/training/trainer.py | 140 +++++++++++++++++++++------ 6 files changed, 110 insertions(+), 248 deletions(-)
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 3dc8286d2..3ebbbe7a0 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -386,13 +386,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(msg) self.autolog.report() - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - exit(-1) - def export(self): if self.args.model_type == 'offline': infer_model = DeepSpeech2InferModel.from_pretrained( @@ -409,40 +402,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) - def run_export(self): - try: - self.export() - except KeyboardInterrupt: - exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output.
- """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path(self.args.export_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir - def setup_model(self): super().setup_model() speedyspeech_config = inference.Config( diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 65ec5174f..beb91d5de 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -551,13 +551,6 @@ class U2Tester(U2Trainer): }) f.write(data + '\n') - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - sys.exit(-1) - @paddle.no_grad() def align(self): if self.config.decoding.batch_size > 1: @@ -617,13 +610,6 @@ class U2Tester(U2Trainer): intervals=tierformat, output=str(textgrid_path)) - def run_align(self): - self.resume_or_scratch() - try: - self.align() - except KeyboardInterrupt: - sys.exit(-1) - def load_inferspec(self): """infer model and input spec. @@ -651,37 +637,3 @@ class U2Tester(U2Trainer): static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) - - def run_export(self): - try: - self.export() - except KeyboardInterrupt: - sys.exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 5a72e44d8..48950fc8b 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -525,13 +525,6 @@ class U2Tester(U2Trainer): }) f.write(data + '\n') - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - sys.exit(-1) - @paddle.no_grad() def align(self): if self.config.decoding.batch_size > 1: @@ -591,13 +584,6 @@ class U2Tester(U2Trainer): intervals=tierformat, output=str(textgrid_path)) - def run_align(self): - self.resume_or_scratch() - try: - self.align() - except KeyboardInterrupt: - sys.exit(-1) - def load_inferspec(self): """infer model and input spec. @@ -626,43 +612,11 @@ class U2Tester(U2Trainer): logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) - def run_export(self): - try: - self.export() - except KeyboardInterrupt: - sys.exit(-1) - def setup_dict(self): # load dictionary for debug log self.args.char_list = load_dict(self.args.dict_path, "maskctc" in self.args.model_name) def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - + super().setup() self.setup_dict() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. 
- """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 08060d975..2d228d294 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -545,13 +545,6 @@ class U2STTester(U2STTrainer): }) f.write(data + '\n') - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - sys.exit(-1) - @paddle.no_grad() def align(self): if self.config.decoding.batch_size > 1: @@ -611,13 +604,6 @@ class U2STTester(U2STTrainer): intervals=tierformat, output=str(textgrid_path)) - def run_align(self): - self.resume_or_scratch() - try: - self.align() - except KeyboardInterrupt: - sys.exit(-1) - def load_inferspec(self): """infer model and input spec. @@ -645,37 +631,3 @@ class U2STTester(U2STTrainer): static_model = paddle.jit.to_static(infer_model, input_spec=input_spec) logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) - - def run_export(self): - try: - self.export() - except KeyboardInterrupt: - sys.exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 71ecd2662..e06f26f81 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import inspect -from functools import partial import paddle from paddle import nn diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index c3e1bec88..a14cd7a02 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -14,6 +14,7 @@ import sys import time from collections import OrderedDict +from contextlib import contextmanager from pathlib import Path import paddle @@ -103,14 +104,28 @@ class Trainer(): self.iteration = 0 self.epoch = 0 self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + self._train = True + # print deps version all_version() - logger.info(f"Rank: {self.rank}/{dist.get_world_size()}") + logger.info(f"Rank: {self.rank}/{self.world_size}") + # set device + paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + if self.parallel: + self.init_parallel() + + self.checkpoint = Checkpoint( + kbest_n=self.config.training.checkpoint.kbest_n, + latest_n=self.config.training.checkpoint.latest_n) + + # set random seed if needed if args.seed: seed_all(args.seed) logger.info(f"Set seed {args.seed}") + # profiler and benchmark options if self.args.benchmark_batch_size: with UpdateConfig(self.config): self.config.collator.batch_size = self.args.benchmark_batch_size @@ -118,17 +133,18 @@ class Trainer(): logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") + @contextmanager + def eval(self): + self._train = False + yield + self._train = True + def setup(self): """Setup the experiment. """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') - if self.parallel: - self.init_parallel() - self.setup_output_dir() self.dump_config() self.setup_visualizer() - self.setup_checkpointer() self.setup_dataloader() self.setup_model() @@ -183,8 +199,8 @@ class Trainer(): if infos: # just restore ckpt # lr will resotre from optimizer ckpt - self.iteration = infos["step"] - self.epoch = infos["epoch"] + self.iteration = infos["step"] + 1 + self.epoch = infos["epoch"] + 1 scratch = False logger.info( f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!") @@ -302,37 +318,74 @@ class Trainer(): """The routine of the experiment after setup. This method is intended to be used by the user. """ - with Timer("Training Done: {}"): - try: + try: + with Timer("Training Done: {}"): self.train() - except KeyboardInterrupt: - exit(-1) - finally: - self.destory() + except KeyboardInterrupt: + exit(-1) + finally: + self.destory() + + def run_test(self): + """Do Test/Decode""" + try: + with Timer("Test/Decode Done: {}"): + with self.eval(): + self.resume_or_scratch() + self.test() + except KeyboardInterrupt: + exit(-1) + + def run_export(self): + """Do Model Export""" + try: + with Timer("Export Done: {}"): + with self.eval(): + self.export() + except KeyboardInterrupt: + exit(-1) + + def run_align(self): + """Do CTC alignment""" + try: + with Timer("Align Done: {}"): + with self.eval(): + self.resume_or_scratch() + self.align() + except KeyboardInterrupt: + sys.exit(-1) def setup_output_dir(self): """Create a directory used for output. """ - # output dir - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - + if self.args.output: + output_dir = Path(self.args.output).expanduser() + elif self.args.checkpoint_path: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent self.output_dir = output_dir + self.output_dir.mkdir(parents=True, exist_ok=True) - def setup_checkpointer(self): - """Create a directory used to save checkpoints into. 
+ self.checkpoint_dir = self.output_dir / "checkpoints" + self.checkpoint_dir.mkdir(parents=True, exist_ok=True) - It is "checkpoints" inside the output directory. - """ - # checkpoint dir - checkpoint_dir = self.output_dir / "checkpoints" - checkpoint_dir.mkdir(exist_ok=True) + self.log_dir = output_dir / "log" + self.log_dir.mkdir(parents=True, exist_ok=True) - self.checkpoint_dir = checkpoint_dir + self.test_dir = output_dir / "test" + self.test_dir.mkdir(parents=True, exist_ok=True) - self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + self.decode_dir = output_dir / "decode" + self.decode_dir.mkdir(parents=True, exist_ok=True) + + self.export_dir = output_dir / "export" + self.export_dir.mkdir(parents=True, exist_ok=True) + + self.visual_dir = output_dir / "visual" + self.visual_dir.mkdir(parents=True, exist_ok=True) + + self.config_dir = output_dir / "conf" + self.config_dir.mkdir(parents=True, exist_ok=True) @mp_tools.rank_zero_only def destory(self):
@@ -354,7 +407,7 @@ class Trainer(): unexpected behaviors. """ # visualizer - visualizer = SummaryWriter(logdir=str(self.output_dir)) + visualizer = SummaryWriter(logdir=str(self.visual_dir)) self.visualizer = visualizer @mp_tools.rank_zero_only
@@ -364,7 +417,14 @@ class Trainer(): It is saved in to ``config.yaml`` in the output directory at the beginning of the experiment. """ - with open(self.output_dir / "config.yaml", 'wt') as f: + config_file = self.config_dir / "config.yaml" + if self._train and config_file.exists(): + time_stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.gmtime()) + target_path = self.config_dir / ".".join( + [time_stamp, "config.yaml"]) + config_file.rename(target_path) + + with open(config_file, 'wt') as f: print(self.config, file=f) def train_batch(self):
@@ -378,6 +438,24 @@ class Trainer(): """ raise NotImplementedError("valid should be implemented.") + @paddle.no_grad() + def test(self): + """The test. A subclass should implement this method in Tester. + """ + raise NotImplementedError("test should be implemented.") + + @paddle.no_grad() + def export(self): + """The export. A subclass should implement this method in Tester. + """ + raise NotImplementedError("export should be implemented.") + + @paddle.no_grad() + def align(self): + """The align. A subclass should implement this method in Tester. + """ + raise NotImplementedError("align should be implemented.") + def setup_model(self): """Setup model, criterion and optimizer, etc. A subclass should implement this method.
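
The refactor above is easiest to read as a pattern: each run_* entry point now lives once in the base Trainer, wrapped in a Timer and in an eval() context manager that flips a _train flag around test/align/export. A minimal, self-contained Python sketch of that pattern follows; the Timer below is a simplified stand-in for the repo's timer utility, and TrainerSketch is an illustrative name rather than code from this patch.

    import time
    from contextlib import contextmanager


    class Timer:
        # simplified stand-in: print elapsed wall time on exit
        def __init__(self, message):
            self.message = message

        def __enter__(self):
            self.start = time.time()
            return self

        def __exit__(self, *exc):
            print(self.message.format(f"{time.time() - self.start:.3f}s"))


    class TrainerSketch:
        def __init__(self):
            self._train = True

        @contextmanager
        def eval(self):
            # flip to eval mode for test/align/export, then flip back
            self._train = False
            yield
            self._train = True

        def test(self):
            raise NotImplementedError("test should be implemented.")

        def run_test(self):
            # mirrors the run_test added to Trainer above
            try:
                with Timer("Test/Decode Done: {}"):
                    with self.eval():
                        self.test()
            except KeyboardInterrupt:
                exit(-1)

One caveat of the pattern as written: eval() restores _train after a bare yield, so an exception raised inside the with-body leaves _train stuck at False; wrapping the yield in try/finally would make the toggle exception-safe.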
From dd06472432300be1a118d47c9f1514db6177ea90 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 9 Oct 2021 11:41:44 +0000 Subject: [PATCH 15/17] extract ctc_align into ctc_utils --- deepspeech/exps/deepspeech2/model.py | 2 +- deepspeech/exps/u2/model.py | 65 ++------------------- deepspeech/exps/u2_kaldi/model.py | 65 ++------------------- deepspeech/exps/u2_st/model.py | 65 ++------------------- deepspeech/utils/ctc_utils.py | 84 ++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 181 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 3ebbbe7a0..92320dac8 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -16,7 +16,6 @@ import os import time from collections import defaultdict from contextlib import nullcontext -from pathlib import Path from typing import Optional import jsonlines @@ -386,6 +385,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(msg) self.autolog.report() + @paddle.no_grad() def export(self): if self.args.model_type == 'offline': infer_model = DeepSpeech2InferModel.from_pretrained( diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index beb91d5de..0976ec1ac 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -14,12 +14,10 @@ """Contains U2 model.""" import json import os -import sys import time from collections import defaultdict from collections import OrderedDict from contextlib import nullcontext -from pathlib import Path from typing import Optional import jsonlines @@ -44,8 +42,6 @@ from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools -from deepspeech.utils import text_grid -from deepspeech.utils import utility from deepspeech.utils.log import Log from deepspeech.utils.utility import UpdateConfig @@ -553,62 +549,10 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - if self.config.decoding.batch_size > 1: - logger.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - # xxx.align - assert self.args.result_file and self.args.result_file.endswith( - '.align') - - self.model.eval() - logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - - stride_ms = self.align_loader.collate_fn.stride_ms - token_dict = self.align_loader.collate_fn.vocab_list - with open(self.args.result_file, 'w') as fout: - # one example in batch - for i, batch in enumerate(self.align_loader): - key, feat, feats_length, target, target_length = batch - - # 1. Encoder - encoder_out, encoder_mask = self.model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.shape[1] - ctc_probs = self.model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - - # 2. alignment - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info(f"align ids: {key[0]} {alignment}") - fout.write('{} {}\n'.format(key[0], alignment)) - - # 3. 
gen praat - # segment alignment - align_segs = text_grid.segment_alignment(alignment) - logger.info(f"align tokens: {key[0]}, {align_segs}") - # IntervalTier, List["start end token\n"] - subsample = utility.get_subsample(self.config) - tierformat = text_grid.align_to_tierformat( - align_segs, subsample, token_dict) - # write tier - align_output_path = Path(self.args.result_file).parent / "align" - align_output_path.mkdir(parents=True, exist_ok=True) - tier_path = align_output_path / (key[0] + ".tier") - with tier_path.open('w') as f: - f.writelines(tierformat) - # write textgrid - textgrid_path = align_output_path / (key[0] + ".TextGrid") - second_per_frame = 1. / (1000. / - stride_ms) # 25ms window, 10ms stride - second_per_example = ( - len(alignment) + 1) * subsample * second_per_frame - text_grid.generate_textgrid( - maxtime=second_per_example, - intervals=tierformat, - output=str(textgrid_path)) + ctc_utils.ctc_align( + self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.align_loader.collate_fn.stride_ms, + self.align_loader.collate_fn.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec.
@@ -630,6 +574,7 @@ class U2Tester(U2Trainer): ] return infer_model, input_spec + @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec)
diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 48950fc8b..0151e208c 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py
@@ -14,11 +14,9 @@ """Contains U2 model.""" import json import os -import sys import time from collections import defaultdict from contextlib import nullcontext -from pathlib import Path from typing import Optional import jsonlines
@@ -39,8 +37,6 @@ from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools -from deepspeech.utils import text_grid -from deepspeech.utils import utility from deepspeech.utils.log import Log from deepspeech.utils.utility import UpdateConfig
@@ -527,62 +523,10 @@ class U2Tester(U2Trainer): @paddle.no_grad() def align(self): - if self.config.decoding.batch_size > 1: - logger.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - # xxx.align - assert self.args.result_file and self.args.result_file.endswith( - '.align') - - self.model.eval() - logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - - stride_ms = self.align_loader.collate_fn.stride_ms - token_dict = self.align_loader.collate_fn.vocab_list - with open(self.args.result_file, 'w') as fout: - # one example in batch - for i, batch in enumerate(self.align_loader): - key, feat, feats_length, target, target_length = batch - - # 1. Encoder - encoder_out, encoder_mask = self.model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.shape[1] - ctc_probs = self.model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - - # 2. alignment - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info(f"align ids: {key[0]} {alignment}") - fout.write('{} {}\n'.format(key[0], alignment)) - - # 3.
gen praat - # segment alignment - align_segs = text_grid.segment_alignment(alignment) - logger.info(f"align tokens: {key[0]}, {align_segs}") - # IntervalTier, List["start end token\n"] - subsample = utility.get_subsample(self.config) - tierformat = text_grid.align_to_tierformat( - align_segs, subsample, token_dict) - # write tier - align_output_path = Path(self.args.result_file).parent / "align" - align_output_path.mkdir(parents=True, exist_ok=True) - tier_path = align_output_path / (key[0] + ".tier") - with tier_path.open('w') as f: - f.writelines(tierformat) - # write textgrid - textgrid_path = align_output_path / (key[0] + ".TextGrid") - second_per_frame = 1. / (1000. / - stride_ms) # 25ms window, 10ms stride - second_per_example = ( - len(alignment) + 1) * subsample * second_per_frame - text_grid.generate_textgrid( - maxtime=second_per_example, - intervals=tierformat, - output=str(textgrid_path)) + ctc_utils.ctc_align( + self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.align_loader.collate_fn.stride_ms, + self.align_loader.collate_fn.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec.
@@ -604,6 +548,7 @@ class U2Tester(U2Trainer): ] return infer_model, input_spec + @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec)
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 08060d975..c5df44c67 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py
@@ -14,11 +14,9 @@ """Contains U2 model.""" import json import os -import sys import time from collections import defaultdict from contextlib import nullcontext -from pathlib import Path from typing import Optional import jsonlines
@@ -42,8 +40,6 @@ from deepspeech.utils import bleu_score from deepspeech.utils import ctc_utils from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools -from deepspeech.utils import text_grid -from deepspeech.utils import utility from deepspeech.utils.log import Log from deepspeech.utils.utility import UpdateConfig
@@ -547,62 +543,10 @@ class U2STTester(U2STTrainer): @paddle.no_grad() def align(self): - if self.config.decoding.batch_size > 1: - logger.fatal('alignment mode must be running with batch_size == 1') - sys.exit(1) - - # xxx.align - assert self.args.result_file and self.args.result_file.endswith( - '.align') - - self.model.eval() - logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") - - stride_ms = self.align_loader.collate_fn.stride_ms - token_dict = self.align_loader.collate_fn.vocab_list - with open(self.args.result_file, 'w') as fout: - # one example in batch - for i, batch in enumerate(self.align_loader): - key, feat, feats_length, target, target_length = batch - - # 1. Encoder - encoder_out, encoder_mask = self.model._forward_encoder( - feat, feats_length) # (B, maxlen, encoder_dim) - maxlen = encoder_out.shape[1] - ctc_probs = self.model.ctc.log_softmax( - encoder_out) # (1, maxlen, vocab_size) - - # 2. alignment - ctc_probs = ctc_probs.squeeze(0) - target = target.squeeze(0) - alignment = ctc_utils.forced_align(ctc_probs, target) - logger.info(f"align ids: {key[0]} {alignment}") - fout.write('{} {}\n'.format(key[0], alignment)) - - # 3.
gen praat - # segment alignment - align_segs = text_grid.segment_alignment(alignment) - logger.info(f"align tokens: {key[0]}, {align_segs}") - # IntervalTier, List["start end token\n"] - subsample = utility.get_subsample(self.config) - tierformat = text_grid.align_to_tierformat( - align_segs, subsample, token_dict) - # write tier - align_output_path = Path(self.args.result_file).parent / "align" - align_output_path.mkdir(parents=True, exist_ok=True) - tier_path = align_output_path / (key[0] + ".tier") - with tier_path.open('w') as f: - f.writelines(tierformat) - # write textgrid - textgrid_path = align_output_path / (key[0] + ".TextGrid") - second_per_frame = 1. / (1000. / - stride_ms) # 25ms window, 10ms stride - second_per_example = ( - len(alignment) + 1) * subsample * second_per_frame - text_grid.generate_textgrid( - maxtime=second_per_example, - intervals=tierformat, - output=str(textgrid_path)) + ctc_utils.ctc_align( + self.config, self.model, self.align_loader, + self.config.decoding.batch_size, + self.align_loader.collate_fn.stride_ms, + self.align_loader.collate_fn.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec.
@@ -624,6 +568,7 @@ class U2STTester(U2STTrainer): ] return infer_model, input_spec + @paddle.no_grad() def export(self): infer_model, input_spec = self.load_inferspec() assert isinstance(input_spec, list), type(input_spec)
diff --git a/deepspeech/utils/ctc_utils.py b/deepspeech/utils/ctc_utils.py index 70d99e6c2..7e8629c2d 100644 --- a/deepspeech/utils/ctc_utils.py +++ b/deepspeech/utils/ctc_utils.py
@@ -16,6 +16,10 @@ +import sys +from pathlib import Path from typing import List import numpy as np import paddle +from deepspeech.utils import text_grid +from deepspeech.utils import utility from deepspeech.utils.log import Log logger = Log(__name__).getlog()
@@ -134,3 +138,87 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, output_alignment.append(y_insert_blank[state_seq[t, 0]]) return output_alignment + + +# ctc_align( +# self.config, +# self.model, +# self.align_loader, +# self.config.decoding.batch_size, +# self.align_loader.collate_fn.stride_ms, +# self.align_loader.collate_fn.vocab_list, +# self.args.result_file, +# ) + + +def ctc_align(config, model, dataloader, batch_size, stride_ms, token_dict, + result_file): + """ctc alignment. + + Args: + config (CfgNode): config. + model (nn.Layer): U2 Model. + dataloader (io.DataLoader): dataloader. + batch_size (int): decoding batch size. + stride_ms (int): audio feature stride in ms unit. + token_dict (List[str]): vocab list, e.g. ['blank', 'unk', 'a', 'b', '<eos>']. + result_file (str): alignment output file, e.g. xxx.align. + """ + if batch_size > 1: + logger.fatal('alignment mode must be running with batch_size == 1') + sys.exit(1) + + assert result_file and result_file.endswith('.align') + + model.eval() + + logger.info(f"Align Total Examples: {len(dataloader.dataset)}") + + with open(result_file, 'w') as fout: + # one example in batch + for i, batch in enumerate(dataloader): + key, feat, feats_length, target, target_length = batch + + # 1. Encoder + encoder_out, encoder_mask = model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.shape[1] + ctc_probs = model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + + # 2. alignment + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = forced_align(ctc_probs, target) + + logger.info(f"align ids: {key[0]} {alignment}") + fout.write('{} {}\n'.format(key[0], alignment)) + + # 3.
gen praat + # segment alignment + align_segs = text_grid.segment_alignment(alignment) + logger.info(f"align tokens: {key[0]}, {align_segs}") + + # IntervalTier, List["start end token\n"] + subsample = utility.get_subsample(config) + + tierformat = text_grid.align_to_tierformat(align_segs, subsample, + token_dict) + + # write tier + align_output_path = Path(result_file).parent / "align" + align_output_path.mkdir(parents=True, exist_ok=True) + tier_path = align_output_path / (key[0] + ".tier") + with tier_path.open('w') as f: + f.writelines(tierformat) + + # write textgrid + textgrid_path = align_output_path / (key[0] + ".TextGrid") + second_per_frame = 1. / (1000. / + stride_ms) # 25ms window, 10ms stride + second_per_example = ( + len(alignment) + 1) * subsample * second_per_frame + text_grid.generate_textgrid( + maxtime=second_per_example, + intervals=tierformat, + output=str(textgrid_path))
From 92832ef590f7b07b1499e3e264d2e6a008cad587 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 9 Oct 2021 11:52:44 +0000 Subject: [PATCH 16/17] fix all_version print --- deepspeech/utils/utility.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py index 159b686e0..ba5acbb96 100644 --- a/deepspeech/utils/utility.py +++ b/deepspeech/utils/utility.py
@@ -18,6 +18,7 @@ import os import random import sys from contextlib import contextmanager +from pprint import pformat from typing import List import numpy as np
@@ -41,9 +42,7 @@ def all_version(): "paddle_commit": paddle.version.commit, "soundfile": soundfile.__version__, } - logger.info("Deps Module Version:") - for k, v in vers.items(): - logger.info(f"{k}: {v}") + logger.info(f"Deps Module Version: {pformat(vers)}") @contextmanager
@@ -54,7 +53,7 @@ def UpdateConfig(config): config.freeze() -def seed_all(seed: int=210329): +def seed_all(seed: int=20210329): """freeze random generator seed.""" np.random.seed(seed) random.seed(seed)
@@ -80,7 +79,7 @@ def print_arguments(args, info=None): if info: filename = info["__file__"] filename = os.path.basename(filename) - print(f"----------- {filename} Configuration Arguments -----------") + print(f"----------- {filename} Arguments -----------") for arg, value in sorted(vars(args).items()): print("%s: %s" % (arg, value)) print("-----------------------------------------------------------")
From 366e34c9257a91a7b45af2d0428cc0489b72c937 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 11 Oct 2021 03:10:00 +0000 Subject: [PATCH 17/17] update paddle version to 2.1.2 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md index 71bc63638..f89869778 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ All tested under: * Ubuntu 16.04 * python>=3.7 -* paddlepaddle>=2.2.0rc +* paddlepaddle==2.1.2 Please see [install](docs/src/install.md).
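
The TextGrid timing arithmetic that ctc_align writes out is worth sanity-checking by hand. A small worked example, assuming a 10 ms feature stride and a 4x encoder subsampling factor (typical values here; the alignment list is illustrative, not real model output):

    stride_ms = 10  # feature frame shift in ms (25 ms window, 10 ms stride)
    subsample = 4   # encoder subsampling factor, as returned by utility.get_subsample(config)

    second_per_frame = 1. / (1000. / stride_ms)  # 0.01 s per feature frame
    alignment = [0, 0, 5, 5, 0, 7]  # one token id per subsampled frame, 0 = blank
    second_per_example = (len(alignment) + 1) * subsample * second_per_frame
    print(second_per_frame, second_per_example)  # 0.01 0.28

So each alignment step spans subsample * second_per_frame = 40 ms of audio, and a 6-step alignment yields a TextGrid of (6 + 1) * 0.04 = 0.28 s.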