Old grad clip has a 0-D tensor problem; fix it (#3334)

pull/3336/head
Hui Zhang 2 years ago committed by GitHub
parent 5153ac8318
commit a2ae6396ef

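For context on the title: the likely failure mode is the 0-D tensor behavior of newer Paddle releases, where reductions such as paddle.sum return rank-0 tensors instead of shape-[1] tensors, which the custom clipper removed below was not written for. A minimal sketch of that assumption (illustrative only, not code from this repository):

import paddle

# In recent Paddle versions, reductions yield 0-D (rank-0) tensors.
s = paddle.sum(paddle.square(paddle.randn([4])))
print(s.shape)  # [] -- rank 0, not shape [1]

# The removed clipper concatenates such per-parameter sums with paddle.concat,
# which is presumably where the "0-D tensor problem" shows up.
# paddle.concat([s, s])  # left commented: behavior depends on the Paddle version
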
@@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.training.reporter import report
 from paddlespeech.s2t.training.timer import Timer
 from paddlespeech.s2t.training.trainer import Trainer
@@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer):
         if not self.train:
             return
-        grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
         lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
             learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
         optimizer = paddle.optimizer.Adam(
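
To show where the built-in clipper slots in, here is a self-contained sketch of the pattern the hunk above switches to; the model and hyperparameters are invented, not taken from the project's config.

import paddle

model = paddle.nn.Linear(16, 2)
grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)  # stands in for config.global_grad_clip
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=2e-3, gamma=0.9)
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    grad_clip=grad_clip)

loss = model(paddle.randn([8, 16])).mean()
loss.backward()
optimizer.step()  # gradients are clipped to a global norm of 5.0 before the update
optimizer.clear_grad()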

@@ -1,86 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-from paddle.fluid import core
-from paddle.fluid import layers
-from paddle.fluid.dygraph import base as imperative_base
-
-from paddlespeech.s2t.utils.log import Log
-
-__all__ = ["ClipGradByGlobalNormWithLog"]
-
-logger = Log(__name__).getlog()
-
-
-class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
-    def __init__(self, clip_norm):
-        super().__init__(clip_norm)
-
-    def __repr__(self):
-        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
-
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        params_and_grads = []
-        sum_square_list = []
-        for i, (p, g) in enumerate(params_grads):
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                continue
-            merge_grad = g
-            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-            square = paddle.square(merge_grad)
-            sum_square = paddle.sum(square)
-            sum_square_list.append(sum_square)
-
-            # debug log, not dump all since slow down train process
-            if i < 10:
-                logger.debug(
-                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")
-
-        # all parameters have been filterd out
-        if len(sum_square_list) == 0:
-            return params_grads
-
-        global_norm_var = paddle.concat(sum_square_list)
-        global_norm_var = paddle.sum(global_norm_var)
-        global_norm_var = paddle.sqrt(global_norm_var)
-
-        # debug log
-        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
-
-        max_global_norm = paddle.full(
-            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
-        clip_var = paddle.divide(
-            x=max_global_norm,
-            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
-        for i, (p, g) in enumerate(params_grads):
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
-                continue
-            new_grad = paddle.multiply(x=g, y=clip_var)
-            params_and_grads.append((p, new_grad))
-
-            # debug log, not dump all since slow down train process
-            if i < 10:
-                logger.debug(
-                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
-                )
-
-        return params_and_grads
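
The class removed above existed mainly to log per-parameter and global gradient norms while clipping. If that logging is still wanted after switching to paddle.nn.ClipGradByGlobalNorm, one option is a small standalone helper like the hypothetical one below (not part of this commit), called before optimizer.step():

import paddle

def global_grad_norm(parameters):
    """Global L2 norm over all parameters that currently hold a gradient."""
    squares = [
        paddle.sum(paddle.square(p.grad)) for p in parameters
        if p.grad is not None
    ]
    if not squares:
        return 0.0
    total = squares[0]
    for s in squares[1:]:
        total = total + s
    return float(paddle.sqrt(total))

# e.g. logger.debug(f"Grad Global Norm: {global_grad_norm(model.parameters())}")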

@@ -19,7 +19,7 @@ from typing import Text
 import paddle
 from paddle.optimizer import Optimizer
 from paddle.regularizer import L2Decay
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.dynamic_import import instance_class
 from paddlespeech.s2t.utils.log import Log
@@ -100,7 +100,7 @@ class OptimizerFactory():
         assert "parameters" in args, "parameters not in args."
         assert "learning_rate" in args, "learning_rate not in args."
-        grad_clip = ClipGradByGlobalNormWithLog(
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(
             args['grad_clip']) if "grad_clip" in args else None
         weight_decay = L2Decay(
             args['weight_decay']) if "weight_decay" in args else None
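
For illustration, the conditional construction this factory keeps looks roughly like the sketch below when driven from a plain dict; the values and the Linear layer are invented stand-ins.

import paddle
from paddle.regularizer import L2Decay

args = {"learning_rate": 1e-3, "grad_clip": 3.0, "weight_decay": 1e-6}
grad_clip = paddle.nn.ClipGradByGlobalNorm(args['grad_clip']) if "grad_clip" in args else None
weight_decay = L2Decay(args['weight_decay']) if "weight_decay" in args else None
optimizer = paddle.optimizer.Adam(
    learning_rate=args['learning_rate'],
    parameters=paddle.nn.Linear(4, 4).parameters(),
    grad_clip=grad_clip,
    weight_decay=weight_decay)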

@@ -72,3 +72,12 @@ if __name__ == '__main__':
     for i, sub in enumerate(outs):
         print(i, sub)
     print()
+
+    import json
+    import xmltodict
+    text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
+    ssml = xmltodict.parse(text)
+    print(json.dumps(ssml))
+    print(ssml['speak'].keys())
+    print(ssml['speak']['#text'])
+    print(ssml['speak']['say-as'])
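
As a usage note on the snippet added above: with xmltodict's defaults, element text is stored under '#text', attributes are prefixed with '@', and repeated <say-as> elements are collected into a list, so the pinyin overrides can be read back roughly like this (assuming the same ssml dict):

for item in ssml['speak']['say-as']:
    print(item['@pinyin'], item['#text'])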
