You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.8 KiB
75 lines
2.8 KiB
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import logging
|
|
|
|
import paddle
|
|
from paddle.fluid.dygraph import base as imperative_base
|
|
from paddle.fluid import layers
|
|
from paddle.fluid import core
|
|
|
|
# Module-level logger; the gradient-clipping class below uses it to report
# per-parameter and global gradient norms at INFO level.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MyClipGradByGlobalNorm(paddle.nn.ClipGradByGlobalNorm):
    """Global-norm gradient clipping with gradient-norm logging.

    Overrides ``_dygraph_clip`` of ``paddle.nn.ClipGradByGlobalNorm`` so that,
    in addition to scaling the gradients, it logs at INFO level:

    * the norm of every clipped parameter's gradient before clipping,
    * the global norm over all clipped gradients,
    * the norm of every clipped parameter's gradient after clipping.
    """

    def __init__(self, clip_norm):
        """Create the clipper.

        Args:
            clip_norm: Upper bound for the global gradient norm; forwarded
                unchanged to the parent constructor.
        """
        super().__init__(clip_norm)

    @staticmethod
    def _to_dense_grad(grad):
        """Return ``grad`` as a dense tensor, merging SELECTED_ROWS if needed."""
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            grad = layers.merge_selected_rows(grad)
            grad = layers.get_tensor_from_selected_rows(grad)
        return grad

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Scale gradients so their global norm does not exceed ``clip_norm``.

        Args:
            params_grads: Iterable of ``(parameter, gradient)`` pairs. Pairs
                whose gradient is ``None`` are dropped from the result; pairs
                whose parameter has ``need_clip`` set to ``False`` are passed
                through with their gradient untouched.

        Returns:
            A list of ``(parameter, gradient)`` pairs with clipped gradients,
            or ``params_grads`` itself when no gradient is eligible for
            clipping.
        """
        # Converting a tensor to ``float`` forces it to be materialised on the
        # host, so only pay that cost when the messages will be emitted.
        log_norms = logger.isEnabledFor(logging.INFO)

        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            sum_square = layers.reduce_sum(
                layers.square(self._to_dense_grad(g)))
            if log_norms:
                logger.info(
                    f"Grad Before Clip: {p.name}: {float(layers.sqrt(sum_square))}"
                )
            sum_square_list.append(sum_square)

        # All parameters have been filtered out: nothing to clip.
        if not sum_square_list:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        if log_norms:
            logger.info(f"Grad Global Norm: {float(global_norm_var)}!!!!")

        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
        # clip_norm / max(global_norm, clip_norm): equals 1 when the global
        # norm is within the bound, and shrinks the gradients otherwise.
        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))

        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            if log_norms:
                # Report the norm of the gradient that was just clipped, not
                # of a tensor left over from the first loop.
                clipped_norm = layers.sqrt(
                    layers.reduce_sum(
                        layers.square(self._to_dense_grad(new_grad))))
                logger.info(
                    f"Grad After Clip: {p.name}: {float(clipped_norm)}"
                )
            params_and_grads.append((p, new_grad))

        return params_and_grads
|