Feat: npu supported for default model (#4084)

* Feat: npu supported for default model

* Feat: npu supported for 'vector'

* typo
pull/4090/head
yzz 3 months ago committed by GitHub
parent 563217abb0
commit 78a952a66e

.gitignore (vendored): 3 additions

@@ -16,6 +16,7 @@
 build
 *output/
 .history
+.idea
 audio/dist/
 audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
 speechx/fc_patch/
 third_party/ctc_decoders/paddlespeech_ctcdecoders.py
 kernel_meta/

@@ -167,10 +167,15 @@ def _get_window(waveform: Tensor,
                                             energy_floor)  # (m)
     if preemphasis_coefficient != 0.0:
+        # npu only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            mode = 'constant'
+        else:
+            mode = 'replicate'
         offset_strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (1, 0),
-            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
+            strided_input.unsqueeze(0), (1, 0), data_format='NCL',
+            mode=mode).squeeze(0)  # (m, window_size + 1)
         strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                        -1]
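
For context, a minimal, self-contained sketch of the pattern this hunk introduces: the pre-emphasis offset is built by padding one sample on the left of each frame, and the pad mode is chosen by device because paddle.nn.functional.pad currently only accepts mode='constant' on NPU. The frame matrix and coefficient below are dummy values, not the function's real arguments.

import paddle

# Dummy (m, window_size) frame matrix standing in for strided_input.
strided_input = paddle.rand([5, 400])
preemphasis_coefficient = 0.97  # illustrative value

# NPU currently only supports mode='constant' in paddle.nn.functional.pad.
mode = 'constant' if paddle.get_device().startswith('npu') else 'replicate'

# Prepend one sample per frame, then apply y[t] = x[t] - coeff * x[t-1].
offset_strided_input = paddle.nn.functional.pad(
    strided_input.unsqueeze(0), (1, 0), data_format='NCL',
    mode=mode).squeeze(0)  # (m, window_size + 1)
strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]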

@@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor):
         if isinstance(audio_file, (str, os.PathLike)):
             logger.debug("Preprocessing audio_file:" + audio_file)
+        # set 'pad_mode' to 'constant' when the device is npu, otherwise keep the default 'pad_mode' value
+        if paddle.get_device().startswith('npu'):
+            pad_mode_kwarg = {"pad_mode": "constant"}
+        else:
+            pad_mode_kwarg = {}
         # Feature extraction
         feature_extractor = LogMelSpectrogram(
             sr=feat_conf['sample_rate'],
@@ -153,7 +159,8 @@ class CLSExecutor(BaseExecutor):
             win_length=feat_conf['window_length'],
             f_min=feat_conf['f_min'],
             f_max=feat_conf['f_max'],
-            n_mels=feat_conf['n_mels'], )
+            n_mels=feat_conf['n_mels'],
+            **pad_mode_kwarg, )
         feats = feature_extractor(
             paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
         self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
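
A small sketch of the same conditional-kwargs pattern in isolation. The import path is assumed, and the feature settings are illustrative stand-ins for the executor's feat_conf; the point is only that the dict stays empty off-NPU, so the extractor keeps its own default pad_mode.

import paddle
# Assumed import path for the extractor used by the CLS executor.
from paddlespeech.audio.features import LogMelSpectrogram

# Only force 'constant' padding on NPU; otherwise pass nothing and keep the default.
pad_mode_kwarg = {"pad_mode": "constant"} if paddle.get_device().startswith('npu') else {}

feature_extractor = LogMelSpectrogram(
    sr=32000,        # illustrative values standing in for feat_conf[...]
    win_length=512,
    f_min=50,
    f_max=14000,
    n_mels=64,
    **pad_mode_kwarg)

waveform = paddle.rand([32000])                     # one second of dummy audio
feats = feature_extractor(waveform.unsqueeze(0))    # (1, n_mels, num_frames)
feats = paddle.transpose(feats, [0, 2, 1])          # (1, num_frames, n_mels)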

@@ -451,12 +451,27 @@ def get_voc_inference(
     voc_name = voc[:voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
+    # npu only supports mode='constant' right now
+    # this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'
+    npu_pad_mode = {
+        "mode": "constant"
+    } if paddle.get_device().startswith('npu') else {}
     if voc_name != 'wavernn':
+        if npu_pad_mode:
+            voc_config["generator_params"].setdefault("pad_params", {})
+            voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
         voc = voc_class(**voc_config["generator_params"])
         voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
         voc.remove_weight_norm()
         voc.eval()
     else:
+        if npu_pad_mode:
+            voc_config["model"].setdefault("pad_params", {})
+            voc_config["model"]["pad_params"].update(npu_pad_mode)
         voc = voc_class(**voc_config["model"])
         voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
         voc.eval()
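
The config surgery above boils down to injecting a pad_params override only when running on NPU. A runnable sketch with a stand-in dict config (the real voc_config is the parsed vocoder yaml, which the generator constructor receives as keyword arguments):

import paddle

# Stand-in for the parsed vocoder yaml; only the keys touched here are shown.
voc_config = {"generator_params": {"in_channels": 80}}

# Empty off-NPU, so the generator keeps whatever padding its config already specifies.
npu_pad_mode = {"mode": "constant"} if paddle.get_device().startswith('npu') else {}

if npu_pad_mode:
    # setdefault keeps any pad_params already present in the yaml; update only overrides 'mode'.
    voc_config["generator_params"].setdefault("pad_params", {})
    voc_config["generator_params"]["pad_params"].update(npu_pad_mode)

print(voc_config["generator_params"].get("pad_params", {}))

Using setdefault plus update rather than assigning a fresh dict means any other pad_params keys from the yaml stay intact; only the padding mode is overridden on NPU.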

@@ -66,7 +66,12 @@ class Conv1d(nn.Layer):
         self.stride = stride
         self.dilation = dilation
         self.padding = padding
-        self.padding_mode = padding_mode
+        # padding_mode is forced to 'constant' on npu devices because npu only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            self.padding_mode = 'constant'
+        else:
+            self.padding_mode = padding_mode
         self.conv = nn.Conv1D(
             in_channels,
@@ -335,10 +340,16 @@ class AttentiveStatisticsPooling(nn.Layer):
         # Apply layers
         attn = self.conv(self.tanh(self.tdnn(attn)))
+        if paddle.get_device().startswith('npu'):
+            # The following way is designed to fix the 'Broadcast dimension mismatch' error
+            # that occurs when using the npu device and setting padding_mode to 'constant'.
+            inf_tensor = paddle.full_like(attn, float("-inf"))
+        else:
+            # the default way
+            inf_tensor = paddle.ones_like(attn) * float("-inf")
         # Filter out zero-paddings
-        attn = paddle.where(
-            mask.tile((1, C, 1)) == 0,
-            paddle.ones_like(attn) * float("-inf"), attn)
+        attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
         attn = F.softmax(attn, axis=2)
         mean, std = _compute_statistics(x, attn)
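
Two notes on the vector-model hunks above. First, the Conv1d change only swaps the stored padding mode; the class name and the 'same' padding arithmetic in this hedged sketch are illustrative, not the repo's Conv1d, but it shows the same idea of forcing constant padding on NPU before the underlying nn.Conv1D is applied.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class SamePadConv1D(nn.Layer):  # hypothetical name, not the class in the diff
    def __init__(self, in_channels, out_channels, kernel_size,
                 padding_mode='reflect'):
        super().__init__()
        # NPU currently only supports mode='constant' in F.pad, so override there.
        if paddle.get_device().startswith('npu'):
            self.padding_mode = 'constant'
        else:
            self.padding_mode = padding_mode
        self.kernel_size = kernel_size
        self.conv = nn.Conv1D(in_channels, out_channels, kernel_size)

    def forward(self, x):  # x: (N, C, L)
        # Pad to keep the output length equal to the input length (stride 1).
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size - 1 - left
        x = F.pad(x, [left, right], mode=self.padding_mode, data_format='NCL')
        return self.conv(x)

layer = SamePadConv1D(80, 128, kernel_size=5)
y = layer(paddle.rand([2, 80, 100]))  # -> (2, 128, 100)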
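Second, the pooling change replaces ones_like(attn) * float("-inf") with full_like(attn, float("-inf")) on NPU; both produce the same -inf fill, but the multiply form hit the 'Broadcast dimension mismatch' error there. A runnable sketch of the masking with dummy shapes:

import paddle
import paddle.nn.functional as F

attn = paddle.rand([2, 4, 10])                 # (N, C, L) attention logits
mask = paddle.ones([2, 1, 10], dtype='int64')  # 1 = real frame, 0 = zero-padding
mask[:, :, 7:] = 0                             # pretend the last three frames are padding
C = attn.shape[1]

if paddle.get_device().startswith('npu'):
    # full_like builds the -inf tensor directly, avoiding the ones_like * -inf multiply.
    inf_tensor = paddle.full_like(attn, float("-inf"))
else:
    inf_tensor = paddle.ones_like(attn) * float("-inf")

# Padded positions become -inf, so softmax assigns them zero attention weight.
attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
attn = F.softmax(attn, axis=2)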
