From 54b13722f50f92e4ec80249d7a075008bff9d667 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 20 Feb 2021 10:51:54 +0000
Subject: [PATCH] fix model and ctc

---
 model_utils/network.py | 70 ++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/model_utils/network.py b/model_utils/network.py
index 2b0f6765b..ea36c1493 100644
--- a/model_utils/network.py
+++ b/model_utils/network.py
@@ -26,6 +26,8 @@ from decoders.swig_wrapper import Scorer
 from decoders.swig_wrapper import ctc_greedy_decoder
 from decoders.swig_wrapper import ctc_beam_search_decoder_batch
 
+logger = logging.getLogger(__name__)
+
 __all__ = ['DeepSpeech2', 'DeepSpeech2Loss']
 
 
@@ -36,9 +38,9 @@ def ctc_loss(log_probs,
              blank=0,
              reduction='mean',
              norm_by_times=True):
-    #print("my ctc loss with norm by times")
-    loss_out = paddle.fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
+    #logger.info("my ctc loss with norm by times")
+    loss_out = paddle.fluid.layers.warpctc(
+        log_probs, labels, blank, norm_by_times, input_lengths, label_lengths)
     loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
 
     assert reduction in ['mean', 'sum', 'none']
@@ -48,6 +50,7 @@ def ctc_loss(log_probs,
         loss_out = paddle.sum(loss_out)
     return loss_out
 
+
 F.ctc_loss = ctc_loss
 
 
@@ -216,11 +219,12 @@ class RNNCell(nn.RNNCellBase):
             (hidden_size, hidden_size),
             weight_hh_attr,
             default_initializer=I.Uniform(-std, std))
-        self.bias_ih = self.create_parameter(
-            (hidden_size, ),
-            bias_ih_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std))
+        # self.bias_ih = self.create_parameter(
+        #     (hidden_size, ),
+        #     bias_ih_attr,
+        #     is_bias=True,
+        #     default_initializer=I.Uniform(-std, std))
+        self.bias_ih = None
         self.bias_hh = self.create_parameter(
             (hidden_size, ),
             bias_hh_attr,
@@ -287,11 +291,12 @@ class GRUCellShare(nn.RNNCellBase):
             (3 * hidden_size, hidden_size),
             weight_hh_attr,
             default_initializer=I.Uniform(-std, std))
-        self.bias_ih = self.create_parameter(
-            (3 * hidden_size, ),
-            bias_ih_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std))
+        # self.bias_ih = self.create_parameter(
+        #     (3 * hidden_size, ),
+        #     bias_ih_attr,
+        #     is_bias=True,
+        #     default_initializer=I.Uniform(-std, std))
+        self.bias_ih = None
         self.bias_hh = self.create_parameter(
             (3 * hidden_size, ),
             bias_hh_attr,
@@ -301,7 +306,8 @@ class GRUCellShare(nn.RNNCellBase):
         self.hidden_size = hidden_size
         self.input_size = input_size
         self._gate_activation = F.sigmoid
-        self._activation = paddle.tanh
+        #self._activation = paddle.tanh
+        self._activation = paddle.relu
 
     def forward(self, inputs, states=None):
         if states is None:
@@ -322,6 +328,8 @@ class GRUCellShare(nn.RNNCellBase):
         z = self._gate_activation(x_z + h_z)
         c = self._activation(x_c + r * h_c)  # apply reset gate after mm
         h = (pre_hidden - c) * z + c
+        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
+        #h = (1-z) * pre_hidden + z * c
 
         return h, h
 
@@ -353,24 +361,24 @@ class BiRNNWithBN(nn.Layer):
     def __init__(self, i_size, h_size, share_weights):
         super().__init__()
         self.share_weights = share_weights
-        self.pad_value = paddle.to_tensor(np.array([0.0], dtype=np.float32))
         if self.share_weights:
             #input-hidden weights shared between bi-directional rnn.
-            self.fw_fc = nn.Linear(i_size, h_size)
+            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
             # batch norm is only performed on input-state projection
-            self.fw_bn = nn.BatchNorm1D(h_size, data_format='NLC')
+            self.fw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
             self.bw_fc = self.fw_fc
             self.bw_bn = self.fw_bn
         else:
-            self.fw_fc = nn.Linear(i_size, h_size)
-            self.fw_bn = nn.BatchNorm1D(h_size, data_format='NLC')
-            self.bw_fc = nn.Linear(i_size, h_size)
-            self.bw_bn = nn.BatchNorm1D(h_size, data_format='NLC')
-
-        self.fw_cell = RNNCell(hidden_size=h_size, activation='relu')
-        self.bw_cell = RNNCell(
-            hidden_size=h_size,
-            activation='relu', )
+            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
+            self.fw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
+            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
+            self.bw_bn = nn.BatchNorm1D(
+                h_size, bias_attr=None, data_format='NLC')
+
+        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
+        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
         self.fw_rnn = nn.RNN(
             self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
         self.bw_rnn = nn.RNN(
@@ -405,10 +413,12 @@ class BiGRUWithBN(nn.Layer):
     def __init__(self, i_size, h_size, act):
         super().__init__()
         hidden_size = h_size * 3
-        self.fw_fc = nn.Linear(i_size, hidden_size)
-        self.fw_bn = nn.BatchNorm1D(hidden_size, data_format='NLC')
-        self.bw_fc = nn.Linear(i_size, hidden_size)
-        self.bw_bn = nn.BatchNorm1D(hidden_size, data_format='NLC')
+        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
+        self.fw_bn = nn.BatchNorm1D(
+            hidden_size, bias_attr=None, data_format='NLC')
+        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
+        self.bw_bn = nn.BatchNorm1D(
+            hidden_size, bias_attr=None, data_format='NLC')
 
         self.fw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
         self.bw_cell = GRUCellShare(input_size=hidden_size, hidden_size=h_size)
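
Review note: the `logger = logging.getLogger(__name__)` added in the first hunk assumes `import logging` already appears earlier in network.py; the import sits outside the hunk's context, so the patch alone cannot confirm it. The `F.ctc_loss = ctc_loss` line then rebinds the name inside `paddle.nn.functional`, so any later code that resolves `ctc_loss` through that module (presumably the CTC loss wrapper used elsewhere in this file) transparently picks up the warpctc-backed version with `norm_by_times=True`. A minimal, framework-free sketch of that module-level override pattern; the names below are illustrative, not from the repo:

    import types

    # Stand-in for a library module such as paddle.nn.functional.
    F = types.SimpleNamespace(ctc_loss=lambda *args, **kwargs: "library version")

    def ctc_loss(*args, **kwargs):
        # Replacement implementation, e.g. one backed by warpctc.
        return "patched version"

    F.ctc_loss = ctc_loss  # rebind, as the patch does

    # Any caller that looks the function up through the module
    # now receives the replacement.
    assert F.ctc_loss() == "patched version"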
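Review note: everywhere an `nn.Linear` feeds directly into a `BatchNorm1D` here, the patch drops the additive bias (`bias_attr=False` on the linear layers, `self.bias_ih = None` in the cells). The bias is redundant in that position because batch normalization subtracts the per-feature batch mean, which cancels any constant offset. A small NumPy check of that identity (NumPy used only to keep the sketch framework-free):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.normal(size=(8, 16))   # a batch of inputs
    W = rng.normal(size=(16, 4))   # projection weights
    b = rng.normal(size=(4,))      # the bias the patch removes

    def batch_norm(y, eps=1e-5):
        # Core of BN: normalize each feature over the batch dimension.
        mean = y.mean(axis=0, keepdims=True)
        var = y.var(axis=0, keepdims=True)
        return (y - mean) / np.sqrt(var + eps)

    # The bias shifts the batch mean by exactly b, so BN cancels it.
    assert np.allclose(batch_norm(x @ W + b), batch_norm(x @ W))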
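Review note: in the `GRUCellShare.forward` hunk, the retained update `h = (pre_hidden - c) * z + c` and the commented-out `dynamic_gru` form `h = (1-z) * pre_hidden + z * c` are not the same function; the retained form expands to `z * pre_hidden + (1-z) * c`, so the two conventions use the gate `z` in opposite roles. A quick NumPy check:

    import numpy as np

    rng = np.random.default_rng(1)
    pre_hidden = rng.normal(size=(5,))
    c = rng.normal(size=(5,))
    z = rng.uniform(size=(5,))  # update-gate activations in (0, 1)

    kept = (pre_hidden - c) * z + c        # form kept by the patch
    docs = (1 - z) * pre_hidden + z * c    # form cited from the dynamic_gru docs

    # The kept form equals the docs form with the gate complemented.
    assert np.allclose(kept, z * pre_hidden + (1 - z) * c)
    assert not np.allclose(kept, docs)  # they differ for a generic gate value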