default cmvn compute config; more grad clip logging; update ds2 cmvn compute script and config; step ds2 lr by epoch

pull/621/head
Hui Zhang 4 years ago
parent 7af055631b
commit 0ff57cec18

@@ -43,13 +43,11 @@ class DeepSpeech2Trainer(Trainer):
    def train_batch(self, batch_index, batch_data, msg):
        start = time.time()
        loss = self.model(*batch_data)
        loss.backward()
        layer_tools.print_grads(self.model, print_func=None)
        self.optimizer.step()
        self.optimizer.clear_grad()
        iteration_time = time.time() - start
        losses_np = {

@@ -31,7 +31,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
@@ -45,7 +45,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
            sum_square_list.append(sum_square)
            # debug log
            # logger.debug(f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
            if i < 10:
                logger.debug(f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
        # all parameters have been filterd out
        if len(sum_square_list) == 0:
@@ -62,7 +63,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
        for p, g in params_grads:
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
@@ -72,8 +73,9 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
            params_and_grads.append((p, new_grad))
            # debug log
            # logger.debug(
            #     f"Grad After Clip: {p.name}: {float(merge_grad.square().sum().sqrt())}"
            # )
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
                )
        return params_and_grads
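The hunks above replace the commented-out debug statements with logging of the first ten parameters' gradient norms before and after clipping. A minimal, self-contained sketch of the same global-norm clipping idea (plain NumPy, not the Paddle ClipGradByGlobalNorm internals; all names here are illustrative):

```python
import logging
import math

import numpy as np

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("clip_sketch")


def clip_grads_by_global_norm(named_grads, clip_norm=3.0, log_first_n=10):
    """Scale every gradient by clip_norm / max(global_norm, clip_norm)."""
    # global_norm = sqrt(sum over all params of ||g||^2)
    global_norm = math.sqrt(sum(float((g ** 2).sum()) for _, g in named_grads))
    scale = clip_norm / max(global_norm, clip_norm)

    clipped = []
    for i, (name, g) in enumerate(named_grads):
        if i < log_first_n:  # only log the first few, as in the patch
            logger.debug(f"Grad Before Clip: {name}: {np.linalg.norm(g)}")
        g = g * scale
        if i < log_first_n:
            logger.debug(f"Grad After Clip: {name}: {np.linalg.norm(g)}")
        clipped.append((name, g))
    return clipped


grads = [("w", np.array([3.0, 4.0])), ("b", np.array([12.0]))]
# global_norm = sqrt(3^2 + 4^2 + 12^2) = 13, so each grad shrinks by 3/13
print(clip_grads_by_global_norm(grads, clip_norm=3.0))
```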

@@ -226,6 +226,7 @@ class Trainer():
                    'lr': self.lr_scheduler()}, self.epoch)
            self.save(tag=self.epoch, infos={'val_loss': cv_loss})
            # step lr every epoch
            self.lr_scheduler.step()
            self.new_epoch()
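With this change the learning-rate scheduler is stepped once per epoch rather than per batch. Using the lr: 2e-3 and lr_decay: 0.83 values from the config further down, the effect is roughly the following (a sketch using Paddle's ExponentialDecay scheduler; the optimizer wiring is illustrative, not the Trainer's actual code, and assumes PaddlePaddle 2.x):

```python
import paddle

base_lr, lr_decay, n_epoch = 2e-3, 0.83, 5

scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=base_lr, gamma=lr_decay)
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                  parameters=paddle.nn.Linear(4, 4).parameters())

for epoch in range(n_epoch):
    # ... run all training batches, calling optimizer.step() for each batch ...
    scheduler.step()  # decay once per epoch: lr = base_lr * lr_decay ** (epoch + 1)
    print(f"epoch {epoch}: next lr = {scheduler.get_lr():.6f}")
```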
@@ -283,7 +284,6 @@ class Trainer():
        """
        # visualizer
        visualizer = SummaryWriter(logdir=str(self.output_dir))
        self.visualizer = visualizer
    @mp_tools.rank_zero_only
@@ -301,7 +301,6 @@ class Trainer():
        """
        raise NotImplementedError("train_batch should be implemented.")
    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        """The validation. A subclass should implement this method.

@@ -10,9 +10,9 @@ data:
  min_input_len: 0.0
  max_input_len: 27.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
  max_output_len: .inf
  min_output_input_ratio: 0.00
  max_output_input_ratio: .inf
  specgram_type: linear
  target_sample_rate: 16000
  max_freq: None
@@ -41,7 +41,7 @@ training:
  lr: 2e-3
  lr_decay: 0.83
  weight_decay: 1e-06
  global_grad_clip: 5.0
  global_grad_clip: 3.0
  log_interval: 100
decoding:
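The new max_output_len, min_output_input_ratio, and max_output_input_ratio values effectively disable utterance filtering, since YAML's .inf literal parses to a float infinity and every comparison against it passes. A small sketch of that behaviour (PyYAML; the keep() filter is illustrative, not the project's actual manifest loader):

```python
import yaml

cfg = yaml.safe_load("""
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
""")

def keep(utt_seconds, num_tokens, cfg):
    """Return True if an utterance passes the (now effectively disabled) length filters."""
    ratio = num_tokens / utt_seconds
    return (cfg["min_output_len"] <= num_tokens <= cfg["max_output_len"]
            and cfg["min_output_input_ratio"] <= ratio <= cfg["max_output_input_ratio"])

print(cfg["max_output_len"])   # inf
print(keep(12.5, 300, cfg))    # True: nothing is filtered out any more
```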

@@ -32,7 +32,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --unit_type="char" \
    --count_threshold=0 \
    --vocab_path="data/vocab.txt" \
    --manifest_paths "data/manifest.train.raw"
    --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
@@ -51,8 +51,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    --stride_ms=10.0 \
    --window_ms=20.0 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
    --use_dB_normalization=True \
    --num_samples=2000 \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
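These flags now compute the normalization statistics from a 2000-sample subset with dB normalization enabled, matching the new compute_mean_std.py defaults shown further down. Conceptually, CMVN only needs a per-dimension mean and standard deviation over sampled feature frames; a rough sketch of that computation (NumPy only, with a made-up featurizer standing in for the real one, and JSON field names that are illustrative rather than the file's real schema):

```python
import json

import numpy as np

rng = np.random.default_rng(0)

def extract_features(wav_path, feat_dim=161):
    """Stand-in for the real linear-spectrogram featurizer: (num_frames, feat_dim)."""
    return rng.standard_normal((int(rng.integers(100, 300)), feat_dim))

def compute_mean_std(manifest, num_samples=2000, output_path="mean_std.json"):
    # Sample a subset of utterances (num_samples <= 0 means "use everything").
    sampled = manifest[:num_samples] if num_samples > 0 else manifest
    frames = np.concatenate([extract_features(p) for p in sampled], axis=0)
    stats = {
        "mean_stat": frames.mean(axis=0).tolist(),
        "std_stat": (frames.std(axis=0) + 1e-20).tolist(),
        "frame_num": int(frames.shape[0]),
    }
    with open(output_path, "w") as f:
        json.dump(stats, f)

compute_mean_std([f"utt_{i}.wav" for i in range(10)], num_samples=5)
```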

@@ -26,25 +26,27 @@ class TestU2Model(unittest.TestCase):
        paddle.set_device('cpu')
        self.lengths = paddle.to_tensor([5, 3, 2])
        self.masks = np.array([
            [1, 1, 1, 1, 1],
            [1, 1, 1, 0, 0],
            [1, 1, 0, 0, 0],
            [True, True, True, True, True],
            [True, True, True, False, False],
            [True, True, False, False, False],
        ])
        self.pad_masks = np.array([
            [0, 0, 0, 0, 0],
            [0, 0, 0, 1, 1],
            [0, 0, 1, 1, 1],
            [False, False, False, False, False],
            [False, False, False, True, True],
            [False, False, True, True, True],
        ])
    def test_sequence_mask(self):
        res = sequence_mask(self.lengths)
        res = sequence_mask(self.lengths, dtype='bool')
        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
    def test_make_non_pad_mask(self):
        res = make_non_pad_mask(self.lengths)
        res1 = sequence_mask(self.lengths)
        res1 = sequence_mask(self.lengths, dtype='bool')
        res2 = make_pad_mask(self.lengths).logical_not()
        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
        self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
        self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist())
    def test_make_pad_mask(self):
        res = make_pad_mask(self.lengths)
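The updated tests expect boolean masks, with sequence_mask now called as dtype='bool'. For lengths [5, 3, 2], the non-pad mask marks valid positions True and the pad mask is its logical negation; a NumPy sketch of that relationship (the function names mirror the tested API, but this is not the project's implementation):

```python
import numpy as np

def make_non_pad_mask(lengths, max_len=None):
    """mask[b, t] is True for t < lengths[b] (valid frames)."""
    lengths = np.asarray(lengths)
    max_len = max_len or int(lengths.max())
    return np.arange(max_len)[None, :] < lengths[:, None]

def make_pad_mask(lengths, max_len=None):
    """True where the frame is padding; the logical negation of the non-pad mask."""
    return ~make_non_pad_mask(lengths, max_len)

lengths = [5, 3, 2]
print(make_non_pad_mask(lengths).tolist())
# [[True, True, True, True, True],
#  [True, True, True, False, False],
#  [True, True, False, False, False]]
print(make_pad_mask(lengths).tolist())
# [[False, False, False, False, False],
#  [False, False, False, True, True],
#  [False, False, True, True, True]]
```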

@@ -24,7 +24,7 @@ from deepspeech.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, -1, "# of samples to for statistics.")
add_arg('num_samples', int, 2000, "# of samples to for statistics.")
add_arg('specgram_type', str,
        'linear',
@@ -35,7 +35,7 @@ add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', float, 10.0, "stride length in ms.")
add_arg('window_ms', float, 20.0, "stride length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, False, "do dB normalization.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
add_arg('manifest_path', str,
