old grad clip has 0d tensor problem, fix it (#3334)
parent 5153ac8318
commit a2ae6396ef
@@ -1,86 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import base as imperative_base

from paddlespeech.s2t.utils.log import Log

__all__ = ["ClipGradByGlobalNormWithLog"]

logger = Log(__name__).getlog()


class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

    def __repr__(self):
        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
            sum_square_list.append(sum_square)

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")

        # all parameters have been filterd out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = paddle.concat(sum_square_list)
        global_norm_var = paddle.sum(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)

        # debug log
        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")

        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.multiply(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
                )

        return params_and_grads
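For context on the 0-D tensor problem named in the commit title: the removed _dygraph_clip builds sum_square_list from paddle.sum over each squared gradient, and paddle.sum over all axes returns a 0-D tensor on recent Paddle releases, which paddle.concat cannot join into the global norm. The snippet below is a minimal, hypothetical sketch of how that concat step could be made 0-D-safe; it is not the code introduced by this commit, and the helper name global_grad_norm as well as the reshape-to-[1] workaround are assumptions for illustration only.

# Hypothetical sketch, not part of this commit: one way the old concat step
# could tolerate 0-D per-parameter sums on newer Paddle releases.
import paddle

def global_grad_norm(grads):
    # Per-gradient squared sums; summing over all axes yields 0-D tensors
    # on recent Paddle, which paddle.concat cannot join directly.
    sum_squares = [paddle.sum(paddle.square(g)) for g in grads if g is not None]
    if not sum_squares:
        return None
    # Reshape each 0-D result to shape [1] so concat sees 1-D tensors again.
    sum_squares = [paddle.reshape(s, [1]) for s in sum_squares]
    return paddle.sqrt(paddle.sum(paddle.concat(sum_squares)))

With the per-parameter sums reshaped to shape [1], the rest of the removed logic (dividing clip_norm by max(global_norm, clip_norm) and scaling each gradient by that factor) would be unaffected.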