From f8719971b50b3ba70e9130829fd6bf5bbe25f541 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Mar 2021 03:21:41 +0000 Subject: [PATCH] large librispeech lr for batch_average ctc loss --- deepspeech/exps/deepspeech2/model.py | 1 - deepspeech/models/deepspeech2.py | 3 ++- deepspeech/modules/ctc.py | 11 ++++++++--- deepspeech/modules/loss.py | 10 +++++----- examples/aishell/README.md | 2 +- examples/librispeech/README.md | 8 ++++---- examples/librispeech/conf/deepspeech2.yaml | 4 ++-- examples/librispeech/local/train.sh | 5 +++-- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index c171089dc..717eea4bf 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -39,7 +39,6 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.dataset import ManifestDataset -from deepspeech.modules.loss import CTCLoss from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.models.deepspeech2 import DeepSpeech2InferModel diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py index ffe678a69..4e66a75f8 100644 --- a/deepspeech/models/deepspeech2.py +++ b/deepspeech/models/deepspeech2.py @@ -170,7 +170,8 @@ class DeepSpeech2Model(nn.Layer): odim=dict_size + 1, # is append after vocab blank_id=dict_size, # last token is dropout_rate=0.0, - reduction=True) + reduction=True, # sum + batch_average=True) # sum / batch_size def forward(self, audio, text, audio_len, text_len): """Compute Model loss diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py index 66737f599..74b21d395 100644 --- a/deepspeech/modules/ctc.py +++ b/deepspeech/modules/ctc.py @@ -36,14 +36,16 @@ class CTCDecoder(nn.Layer): odim, blank_id=0, dropout_rate: float=0.0, - reduction: bool=True): + reduction: bool=True, + batch_average: bool=False): """CTC decoder Args: enc_n_units ([int]): encoder output dimention vocab_size ([int]): text vocabulary size dropout_rate (float): dropout rate (0.0 ~ 1.0) - reduction (bool): reduce the CTC loss into a scalar + reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' + batch_average (bool): do batch dim wise average. """ assert check_argument_types() super().__init__() @@ -53,7 +55,10 @@ class CTCDecoder(nn.Layer): self.dropout_rate = dropout_rate self.ctc_lo = nn.Linear(enc_n_units, self.odim) reduction_type = "sum" if reduction else "none" - self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type) + self.criterion = CTCLoss( + blank=self.blank_id, + reduction=reduction_type, + batch_average=batch_average) # CTCDecoder LM Score handle self._ext_scorer = None diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 0ef7e2f73..a229e7ebe 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -53,10 +53,11 @@ F.ctc_loss = ctc_loss class CTCLoss(nn.Layer): - def __init__(self, blank=0, reduction='sum'): + def __init__(self, blank=0, reduction='sum', batch_average=False): super().__init__() # last token id as blank id self.loss = nn.CTCLoss(blank=blank, reduction=reduction) + self.batch_average = batch_average def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. @@ -76,8 +77,7 @@ class CTCLoss(nn.Layer): # logits: (B, L, D) -> (L, B, D) logits = logits.transpose([1, 0, 2]) loss = self.loss(logits, ys_pad, hlens, ys_lens) - - # wenet do batch-size average, deepspeech2 not do this - # Batch-size average - # loss = loss / B + if self.batch_average: + # Batch-size average + loss = loss / B return loss diff --git a/examples/aishell/README.md b/examples/aishell/README.md index fdb4c133b..ded740d10 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -4,4 +4,4 @@ | Model | Config | Test Set | CER | Valid Loss | | --- | --- | --- | --- | --- | | DeepSpeech2 | conf/deepspeech2.yaml | test | 0.077249 | 7.036566 | -| DeepSpeech2 | release 1.8.5 | test | 0.080447 | - | +| DeepSpeech2 | release 1.8.5 | test | 0.087004 | 8.575452 | diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 1e694df1c..3d22128bb 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,7 +1,7 @@ # LibriSpeech ## CTC -| Model | Config | Test set | WER | -| --- | --- | --- | --- | -| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 | -| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 | +| Model | Config | Test Set | WER | Valid Loss | +| --- | --- | --- | --- | --- | +| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.071391 | 15.078561 | +| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 | 15.351633 | diff --git a/examples/librispeech/conf/deepspeech2.yaml b/examples/librispeech/conf/deepspeech2.yaml index 15fd4cbe3..20d4e6402 100644 --- a/examples/librispeech/conf/deepspeech2.yaml +++ b/examples/librispeech/conf/deepspeech2.yaml @@ -29,8 +29,8 @@ model: use_gru: False share_rnn_weights: True training: - n_epoch: 20 - lr: 5e-4 + n_epoch: 50 + lr: 1e-3 lr_decay: 0.83 weight_decay: 1e-06 global_grad_clip: 5.0 diff --git a/examples/librispeech/local/train.sh b/examples/librispeech/local/train.sh index 758098679..cbccb1896 100644 --- a/examples/librispeech/local/train.sh +++ b/examples/librispeech/local/train.sh @@ -1,8 +1,9 @@ #! /usr/bin/env bash -export FLAGS_sync_nccl_allreduce=0 +#export FLAGS_sync_nccl_allreduce=0 + # https://github.com/PaddlePaddle/Paddle/pull/28484 -export NCCL_SHM_DISABLE=1 +#export NCCL_SHM_DISABLE=1 ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') echo "using $ngpu gpus..."