|
|
@ -47,7 +47,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
|
|
|
|
sum_square = layers.reduce_sum(square)
|
|
|
|
sum_square = layers.reduce_sum(square)
|
|
|
|
sum_square_list.append(sum_square)
|
|
|
|
sum_square_list.append(sum_square)
|
|
|
|
|
|
|
|
|
|
|
|
# debug log
|
|
|
|
# debug log, not dump all since slow down train process
|
|
|
|
|
|
|
|
if i < 10:
|
|
|
|
logger.debug(
|
|
|
|
logger.debug(
|
|
|
|
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
|
|
|
|
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
|
|
|
|
|
|
|
|
|
|
|
@ -75,7 +76,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
|
|
|
|
new_grad = layers.elementwise_mul(x=g, y=clip_var)
|
|
|
|
new_grad = layers.elementwise_mul(x=g, y=clip_var)
|
|
|
|
params_and_grads.append((p, new_grad))
|
|
|
|
params_and_grads.append((p, new_grad))
|
|
|
|
|
|
|
|
|
|
|
|
# debug log
|
|
|
|
# debug log, not dump all since slow down train process
|
|
|
|
|
|
|
|
if i < 10:
|
|
|
|
logger.debug(
|
|
|
|
logger.debug(
|
|
|
|
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
|
|
|
|
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
|
|
|
|
)
|
|
|
|
)
|
|
|
|