|
|
@ -55,6 +55,7 @@ class PositionwiseFeedForward(nn.Layer):
|
|
|
|
self.dropout = nn.Dropout(dropout_rate)
|
|
|
|
self.dropout = nn.Dropout(dropout_rate)
|
|
|
|
self.w_2 = Linear(hidden_units, idim)
|
|
|
|
self.w_2 = Linear(hidden_units, idim)
|
|
|
|
self.adaptive_scale = adaptive_scale
|
|
|
|
self.adaptive_scale = adaptive_scale
|
|
|
|
|
|
|
|
if self.adaptive_scale:
|
|
|
|
ada_scale = self.create_parameter(
|
|
|
|
ada_scale = self.create_parameter(
|
|
|
|
[1, 1, idim], default_initializer=I.XavierUniform())
|
|
|
|
[1, 1, idim], default_initializer=I.XavierUniform())
|
|
|
|
self.add_parameter('ada_scale', ada_scale)
|
|
|
|
self.add_parameter('ada_scale', ada_scale)
|
|
|
@ -84,4 +85,6 @@ class PositionwiseFeedForward(nn.Layer):
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
output tensor, (B, Lmax, D)
|
|
|
|
output tensor, (B, Lmax, D)
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
|
|
|
|
if self.adaptive_scale:
|
|
|
|
|
|
|
|
xs = self.ada_scale * xs + self.ada_bias
|
|
|
|
return self.w_2(self.dropout(self.activation(self.w_1(xs))))
|
|
|
|
return self.w_2(self.dropout(self.activation(self.w_1(xs))))
|
|
|
|