old grad clip has 0d tensor problem, fix it (#3334)
parent 5153ac8318
commit a2ae6396ef
@@ -1,86 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
from paddle.fluid import layers
from paddle.fluid.dygraph import base as imperative_base

from paddlespeech.s2t.utils.log import Log

__all__ = ["ClipGradByGlobalNormWithLog"]

logger = Log(__name__).getlog()


class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

    def __repr__(self):
        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
            sum_square_list.append(sum_square)

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt())}")

        # all parameters have been filterd out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = paddle.concat(sum_square_list)
        global_norm_var = paddle.sum(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)

        # debug log
        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")

        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
        for i, (p, g) in enumerate(params_grads):
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.multiply(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
                )

        return params_and_grads
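For context on the 0-D tensor problem named in the commit title: the removed _dygraph_clip builds sum_square_list from paddle.sum over each squared gradient, and paddle.sum over all axes returns a 0-D tensor on recent Paddle releases, which paddle.concat cannot join into the global norm. The snippet below is a minimal, hypothetical sketch of how that concat step could be made 0-D-safe; it is not the code introduced by this commit, and the helper name global_grad_norm as well as the reshape-to-[1] workaround are assumptions for illustration only.

# Hypothetical sketch, not part of this commit: one way the old concat step
# could tolerate 0-D per-parameter sums on newer Paddle releases.
import paddle

def global_grad_norm(grads):
    # Per-gradient squared sums; summing over all axes yields 0-D tensors
    # on recent Paddle, which paddle.concat cannot join directly.
    sum_squares = [paddle.sum(paddle.square(g)) for g in grads if g is not None]
    if not sum_squares:
        return None
    # Reshape each 0-D result to shape [1] so concat sees 1-D tensors again.
    sum_squares = [paddle.reshape(s, [1]) for s in sum_squares]
    return paddle.sqrt(paddle.sum(paddle.concat(sum_squares)))

With the per-parameter sums reshaped to shape [1], the rest of the removed logic (dividing clip_norm by max(global_norm, clip_norm) and scaling each gradient by that factor) would be unaffected.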