Merge pull request #817 from PaddlePaddle/ctc

export ctc grad norm config
Jackwaterveg committed 3 years ago (via GitHub)
commit 59766313b1

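This PR threads a new model-level knob, `ctc_grad_norm_type`, from the recipe YAML through `DeepSpeech2Model` / `DeepSpeech2ModelOnline` / `U2Model` / `U2STModel` down into `CTCLoss`, selecting how CTC gradients are normalized: `'instance'` (per utterance length), `'batchsize'` (per batch size), `'frame'` (per total logit frames), or `None` (no normalization). A minimal usage sketch, assuming the offline DS2 model and an import path matching the repo layout at the time (feature and vocab sizes are placeholders):

```python
# Sketch only: constructor arguments mirror the diff below; sizes are made up.
from deepspeech.models.ds2 import DeepSpeech2Model  # import path assumed

model = DeepSpeech2Model(
    feat_size=161,                  # placeholder feature dim
    dict_size=4233,                 # placeholder vocab size
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False,
    share_rnn_weights=True,
    blank_id=0,
    ctc_grad_norm_type='instance',  # the newly exported setting
)
```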
@@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,  #Number of stacking RNN layers.
                 rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
-            ))
+                share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
+                ctc_grad_norm_type='instance', ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -142,7 +142,8 @@ class DeepSpeech2Model(nn.Layer):
                  rnn_size=1024,
                  use_gru=False,
                  share_rnn_weights=True,
-                 blank_id=0):
+                 blank_id=0,
+                 ctc_grad_norm_type='instance'):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -160,7 +161,8 @@ class DeepSpeech2Model(nn.Layer):
             blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=ctc_grad_norm_type)

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss

@@ -289,7 +289,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type='instance')

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss

@@ -864,7 +864,8 @@ class U2Model(U2BaseModel):
             blank_id=0,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type='instance')

         return vocab_size, encoder, decoder, ctc

@@ -649,7 +649,8 @@ class U2STModel(U2STBaseModel):
                 blank_id=0,
                 dropout_rate=0.0,
                 reduction=True,  # sum
-                batch_average=True)  # sum / batch_size
+                batch_average=True,  # sum / batch_size
+                grad_norm_type='instance')

             return vocab_size, encoder, (st_decoder, decoder, ctc)
         else:

@@ -39,7 +39,8 @@ class CTCDecoder(nn.Layer):
                  blank_id=0,
                  dropout_rate: float=0.0,
                  reduction: bool=True,
-                 batch_average: bool=True):
+                 batch_average: bool=True,
+                 grad_norm_type: str="instance"):
         """CTC decoder

         Args:
@@ -48,6 +49,7 @@ class CTCDecoder(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
+            grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
         """
         assert check_argument_types()
         super().__init__()
@@ -60,7 +62,8 @@ class CTCDecoder(nn.Layer):
         self.criterion = CTCLoss(
             blank=self.blank_id,
             reduction=reduction_type,
-            batch_average=batch_average)
+            batch_average=batch_average,
+            grad_norm_type=grad_norm_type)

         # CTCDecoder LM Score handle
         self._ext_scorer = None

@@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]

 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum', batch_average=False):
+    def __init__(self,
+                 blank=0,
+                 reduction='sum',
+                 batch_average=False,
+                 grad_norm_type=None):
         super().__init__()
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
         self.batch_average = batch_average
         logger.info(
             f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
+        # instance for norm_by_times
+        # batchsize for norm_by_batchsize
+        # frame for norm_by_total_logits_len
+        assert grad_norm_type in ('instance', 'batchsize', 'frame', None)
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
+        if grad_norm_type == 'instance':
+            self.norm_by_times = True
+        if grad_norm_type == 'batchsize':
+            self.norm_by_batchsize = True
+        if grad_norm_type == 'frame':
+            self.norm_by_total_logits_len = True

     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -48,7 +69,13 @@ class CTCLoss(nn.Layer):
         logits = logits.transpose([1, 0, 2])
         ys_pad = ys_pad.astype(paddle.int32)
         loss = self.loss(
-            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
+            logits,
+            ys_pad,
+            hlens,
+            ys_lens,
+            norm_by_times=self.norm_by_times,
+            norm_by_batchsize=self.norm_by_batchsize,
+            norm_by_total_logits_len=self.norm_by_total_logits_len)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
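For clarity, the three accepted strings map one-to-one onto mutually exclusive flags forwarded to the underlying loss: `'instance'` sets `norm_by_times`, `'batchsize'` sets `norm_by_batchsize`, `'frame'` sets `norm_by_total_logits_len`, and `None` leaves all three off. (The `norm_by_batchsize` / `norm_by_total_logits_len` keywords presumably come from this repo's patched CTC loss; stock `paddle.nn.CTCLoss` only exposes `norm_by_times`.) A small sketch exercising the mapping, assuming the import path below:

```python
from deepspeech.modules.loss import CTCLoss  # import path assumed

# Exercise the grad_norm_type -> flag mapping defined in the hunk above.
for t in ('instance', 'batchsize', 'frame', None):
    crit = CTCLoss(blank=0, reduction='sum', batch_average=True, grad_norm_type=t)
    print(t, crit.norm_by_times, crit.norm_by_batchsize, crit.norm_by_total_logits_len)
# instance  True  False False
# batchsize False True  False
# frame     False False True
# None      False False False
```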

@@ -41,6 +41,7 @@ model:
   use_gru: True
   share_rnn_weights: False
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 80

@@ -43,6 +43,7 @@ model:
   fc_layers_size_list: -1,
   use_gru: False
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 50

@@ -41,6 +41,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 50

@@ -43,6 +43,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: False
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 50

@@ -42,6 +42,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 10

@@ -44,6 +44,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: True
   blank_id: 0
+  ctc_grad_norm_type: instance
 training:
   n_epoch: 10
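Each touched recipe config now pins the knob explicitly; before this change the effective behavior was hard-wired (the old call reused `batch_average` as `norm_by_times`). A hedged sketch of how such a YAML entry reaches the model via the yacs-style config flow shown in the first hunk (the file path is hypothetical):

```python
from yacs.config import CfgNode
from deepspeech.models.ds2 import DeepSpeech2Model  # import path assumed

# Illustrative only: load a recipe config carrying the new key, then merge
# the model defaults the same way params()/merge_from_other_cfg do above.
with open('conf/deepspeech2.yaml') as f:            # hypothetical recipe path
    config = CfgNode.load_cfg(f)
model_conf = DeepSpeech2Model.params(config.model)
assert model_conf.ctc_grad_norm_type in ('instance', 'batchsize', 'frame', None)
```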
