@@ -56,15 +56,15 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
         if len(sum_square_list) == 0:
             return params_grads
 
-        global_norm_var = layers.concat(sum_square_list)
+        global_norm_var = paddle.concat(sum_square_list)
         global_norm_var = paddle.sum(global_norm_var)
         global_norm_var = paddle.sqrt(global_norm_var)
 
         # debug log
         logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
 
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
         clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm))
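
For reference, below is a minimal standalone sketch (not part of the patch) of what the updated hunk computes with the Paddle 2.x API: per-tensor squared sums are concatenated and reduced to a global norm, and gradients are scaled by clip_norm / max(global_norm, clip_norm). The gradient shapes and the clip_norm value are illustrative assumptions.

```python
# Illustrative sketch of the clipping math in the updated hunk (Paddle 2.x API).
# The gradients, shapes, and clip_norm here are made-up example values.
import paddle

clip_norm = 1.0
grads = [paddle.rand([3, 4]), paddle.rand([5])]

# Per-tensor sums of squares, analogous to sum_square_list in the patch.
sum_square_list = [paddle.sum(paddle.square(g)).reshape([1]) for g in grads]

# Global norm: concat -> sum -> sqrt, as in the updated code.
global_norm_var = paddle.concat(sum_square_list)
global_norm_var = paddle.sum(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var)

# Scale factor clip_norm / max(global_norm, clip_norm), built with
# paddle.full / paddle.divide / paddle.maximum as in the patch.
max_global_norm = paddle.full(
    shape=[1], dtype=global_norm_var.dtype, fill_value=clip_norm)
clip_var = paddle.divide(
    x=max_global_norm,
    y=paddle.maximum(x=global_norm_var, y=max_global_norm))

clipped_grads = [g * clip_var for g in grads]
```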