diff --git a/.gitignore b/.gitignore index 4a0c43312..d31cfc06c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build *output/ .history +.idea audio/dist/ audio/fc_patch/ @@ -51,3 +52,5 @@ tools/onnx-simplifier/ speechx/fc_patch/ third_party/ctc_decoders/paddlespeech_ctcdecoders.py + +kernel_meta/ diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index a94ec4053..254a87f72 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -167,10 +167,15 @@ def _get_window(waveform: Tensor, energy_floor) # (m) if preemphasis_coefficient != 0.0: + # npu only supports mode=constant right now + if paddle.get_device().startswith('npu'): + mode = 'constant' + else: + mode = 'replicate' + offset_strided_input = paddle.nn.functional.pad( - strided_input.unsqueeze(0), (1, 0), - data_format='NCL', - mode='replicate').squeeze(0) # (m, window_size + 1) + strided_input.unsqueeze(0), (1, 0), data_format='NCL', + mode=mode).squeeze(0) # (m, window_size + 1) strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : -1] diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 54780fdd2..b27644733 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor): if isinstance(audio_file, (str, os.PathLike)): logger.debug("Preprocessing audio_file:" + audio_file) + # set 'pad_mode' to 'constant' when the device is npu, otherwise keep the default 'pad_mode' value + if paddle.get_device().startswith('npu'): + pad_mode_kwarg = {"pad_mode": "constant"} + else: + pad_mode_kwarg = {} + # Feature extraction feature_extractor = LogMelSpectrogram( sr=feat_conf['sample_rate'], @@ -153,7 +159,8 @@ class CLSExecutor(BaseExecutor): win_length=feat_conf['window_length'], f_min=feat_conf['f_min'], f_max=feat_conf['f_max'], - n_mels=feat_conf['n_mels'], ) + n_mels=feat_conf['n_mels'], + **pad_mode_kwarg, 
) feats = feature_extractor( paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0))) self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze( diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index acfaa012d..a374f3017 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -451,12 +451,27 @@ def get_voc_inference( voc_name = voc[:voc.rindex('_')] voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + + # npu only supports mode=constant right now + # this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator' + npu_pad_mode = { + "mode": "constant" + } if paddle.get_device().startswith('npu') else {} + if voc_name != 'wavernn': + if npu_pad_mode: + voc_config["generator_params"].setdefault("pad_params", {}) + voc_config["generator_params"]["pad_params"].update(npu_pad_mode) + voc = voc_class(**voc_config["generator_params"]) voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"]) voc.remove_weight_norm() voc.eval() else: + if npu_pad_mode: + voc_config["model"].setdefault("pad_params", {}) + voc_config["model"]["pad_params"].update(npu_pad_mode) + voc = voc_class(**voc_config["model"]) voc.set_state_dict(paddle.load(voc_ckpt)["main_params"]) voc.eval() diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index 895ff13f4..66518f3a3 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -66,7 +66,12 @@ class Conv1d(nn.Layer): self.stride = stride self.dilation = dilation self.padding = padding - self.padding_mode = padding_mode + + # padding_mode is forcibly set to 'constant' when using the npu device because npu only supports mode=constant right now + if paddle.get_device().startswith('npu'): + self.padding_mode = 'constant' + else: + self.padding_mode = padding_mode self.conv = 
nn.Conv1D( in_channels, @@ -335,10 +340,16 @@ class AttentiveStatisticsPooling(nn.Layer): # Apply layers attn = self.conv(self.tanh(self.tdnn(attn))) + if paddle.get_device().startswith('npu'): + # The following way is designed to fix the 'Broadcast dimension mismatch' error + # that occurs when using the npu device and setting padding_mode to 'constant'. + inf_tensor = paddle.full_like(attn, float("-inf")) + else: + # the default way + inf_tensor = paddle.ones_like(attn) * float("-inf") + # Filter out zero-paddings - attn = paddle.where( - mask.tile((1, C, 1)) == 0, - paddle.ones_like(attn) * float("-inf"), attn) + attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn) attn = F.softmax(attn, axis=2) mean, std = _compute_statistics(x, attn)