From f8719971b50b3ba70e9130829fd6bf5bbe25f541 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 22 Mar 2021 03:21:41 +0000
Subject: [PATCH] large librispeech lr for batch_average ctc loss

---
 deepspeech/exps/deepspeech2/model.py       |  1 -
 deepspeech/models/deepspeech2.py           |  3 ++-
 deepspeech/modules/ctc.py                  | 11 ++++++++---
 deepspeech/modules/loss.py                 | 10 +++++-----
 examples/aishell/README.md                 |  2 +-
 examples/librispeech/README.md             |  8 ++++----
 examples/librispeech/conf/deepspeech2.yaml |  4 ++--
 examples/librispeech/local/train.sh        |  5 +++--
 8 files changed, 25 insertions(+), 19 deletions(-)
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index c171089dc..717eea4bf 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -39,7 +39,6 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.dataset import ManifestDataset
 
-from deepspeech.modules.loss import CTCLoss
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.models.deepspeech2 import DeepSpeech2InferModel
 
diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py
index ffe678a69..4e66a75f8 100644
--- a/deepspeech/models/deepspeech2.py
+++ b/deepspeech/models/deepspeech2.py
@@ -170,7 +170,8 @@ class DeepSpeech2Model(nn.Layer):
             odim=dict_size + 1,  # <blank> is append after vocab
             blank_id=dict_size,  # last token is <blank>
             dropout_rate=0.0,
-            reduction=True)
+            reduction=True,  # sum
+            batch_average=True)  # sum / batch_size
 
     def forward(self, audio, text, audio_len, text_len):
         """Compute Model loss
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
index 66737f599..74b21d395 100644
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@@ -36,14 +36,16 @@ class CTCDecoder(nn.Layer):
                  odim,
                  blank_id=0,
                  dropout_rate: float=0.0,
-                 reduction: bool=True):
+                 reduction: bool=True,
+                 batch_average: bool=False):
         """CTC decoder
 
         Args:
             enc_n_units ([int]): encoder output dimention
             vocab_size ([int]): text vocabulary size
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
-            reduction (bool): reduce the CTC loss into a scalar
+            reduction (bool): reduce the CTC loss into a scalar; True for 'sum', False for 'none'
+            batch_average (bool): divide the loss by the batch size (batch-wise average).
         """
         assert check_argument_types()
         super().__init__()
@@ -53,7 +55,10 @@ class CTCDecoder(nn.Layer):
         self.dropout_rate = dropout_rate
         self.ctc_lo = nn.Linear(enc_n_units, self.odim)
         reduction_type = "sum" if reduction else "none"
-        self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type)
+        self.criterion = CTCLoss(
+            blank=self.blank_id,
+            reduction=reduction_type,
+            batch_average=batch_average)
 
         # CTCDecoder LM Score handle
         self._ext_scorer = None
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index 0ef7e2f73..a229e7ebe 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -53,10 +53,11 @@ F.ctc_loss = ctc_loss
 
 
 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum'):
+    def __init__(self, blank=0, reduction='sum', batch_average=False):
         super().__init__()
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
+        self.batch_average = batch_average
 
     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -76,8 +77,7 @@ class CTCLoss(nn.Layer):
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         loss = self.loss(logits, ys_pad, hlens, ys_lens)
-
-        # wenet do batch-size average, deepspeech2 not do this
-        # Batch-size average
-        # loss = loss / B
+        if self.batch_average:
+            # Batch-size average
+            loss = loss / B
         return loss
diff --git a/examples/aishell/README.md b/examples/aishell/README.md
index fdb4c133b..ded740d10 100644
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@@ -4,4 +4,4 @@
 | Model | Config | Test Set |  CER | Valid Loss |
 | --- | --- | --- | --- | --- |
 | DeepSpeech2 | conf/deepspeech2.yaml | test | 0.077249 | 7.036566 |
-| DeepSpeech2 | release 1.8.5 | test | 0.080447 | - |
+| DeepSpeech2 | release 1.8.5 | test | 0.087004 | 8.575452 |
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 1e694df1c..3d22128bb 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -1,7 +1,7 @@
 # LibriSpeech
 
 ## CTC
-| Model | Config | Test set |  WER |
-| --- | --- | --- | --- |
-| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
-| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |
+| Model | Config | Test Set |  WER | Valid Loss |
+| --- | --- | --- | --- | --- |
+| DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.071391 | 15.078561 |
+| DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 | 15.351633 |
diff --git a/examples/librispeech/conf/deepspeech2.yaml b/examples/librispeech/conf/deepspeech2.yaml
index 15fd4cbe3..20d4e6402 100644
--- a/examples/librispeech/conf/deepspeech2.yaml
+++ b/examples/librispeech/conf/deepspeech2.yaml
@@ -29,8 +29,8 @@ model:
   use_gru: False 
   share_rnn_weights: True 
 training:
-  n_epoch: 20
-  lr: 5e-4
+  n_epoch: 50
+  lr: 1e-3
   lr_decay: 0.83
   weight_decay: 1e-06
   global_grad_clip: 5.0
diff --git a/examples/librispeech/local/train.sh b/examples/librispeech/local/train.sh
index 758098679..cbccb1896 100644
--- a/examples/librispeech/local/train.sh
+++ b/examples/librispeech/local/train.sh
@@ -1,8 +1,9 @@
 #! /usr/bin/env bash
 
-export FLAGS_sync_nccl_allreduce=0
+#export FLAGS_sync_nccl_allreduce=0
+
 # https://github.com/PaddlePaddle/Paddle/pull/28484
-export NCCL_SHM_DISABLE=1
+#export NCCL_SHM_DISABLE=1
 
 ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
 echo "using $ngpu gpus..."