diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py
index 895ff13f4..66518f3a3 100644
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -66,7 +66,12 @@ class Conv1d(nn.Layer):
         self.stride = stride
         self.dilation = dilation
         self.padding = padding
-        self.padding_mode = padding_mode
+
+        # padding_mode is forced to 'constant' on the npu device because npu only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            self.padding_mode = 'constant'
+        else:
+            self.padding_mode = padding_mode
 
         self.conv = nn.Conv1D(
             in_channels,
@@ -335,10 +340,16 @@ class AttentiveStatisticsPooling(nn.Layer):
         # Apply layers
         attn = self.conv(self.tanh(self.tdnn(attn)))
 
+        if paddle.get_device().startswith('npu'):
+            # full_like avoids the 'Broadcast dimension mismatch' error that
+            # occurs on the npu device when padding_mode is set to 'constant'.
+            inf_tensor = paddle.full_like(attn, float("-inf"))
+        else:
+            # the default way
+            inf_tensor = paddle.ones_like(attn) * float("-inf")
+
         # Filter out zero-paddings
-        attn = paddle.where(
-            mask.tile((1, C, 1)) == 0,
-            paddle.ones_like(attn) * float("-inf"), attn)
+        attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
 
         attn = F.softmax(attn, axis=2)
         mean, std = _compute_statistics(x, attn)
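
For context, a minimal sketch of why the two branches are interchangeable outside of the npu workaround: both build a `-inf` tensor of the same shape as `attn`, so `paddle.where` masks the padded frames identically. The shapes and the `mask` below are made-up stand-ins for the real activations, and the sketch assumes CPU execution:

```python
import paddle
import paddle.nn.functional as F

# Stand-in activations: (batch, channels, time)
attn = paddle.randn([2, 4, 8])
# Stand-in padding mask: pretend the last two frames are padding
mask = paddle.ones([2, 1, 8], dtype='int64')
mask[:, :, 6:] = 0

# The npu-safe branch and the default branch from the patch
inf_a = paddle.full_like(attn, float("-inf"))
inf_b = paddle.ones_like(attn) * float("-inf")

masked_a = paddle.where(mask.tile((1, 4, 1)) == 0, inf_a, attn)
masked_b = paddle.where(mask.tile((1, 4, 1)) == 0, inf_b, attn)

# Both yield the same attention weights; -inf entries get zero weight
print(paddle.allclose(F.softmax(masked_a, axis=2),
                      F.softmax(masked_b, axis=2)))  # True
```

The design point is that `paddle.full_like` allocates the constant-valued tensor directly instead of materializing a ones tensor and multiplying, which sidesteps the broadcast path that the patch reports failing on npu.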