Feat: npu supported for default model (#4084)

* Feat: npu supported for default model
* Feat: npu supported for 'vector'
* typo
3 months ago · 78a952a66e
parent 563217abb0
commit 78a952a66e
5 changed files with 49 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 build
 *output/
 .history
 .idea
 audio/dist/
 audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
 speechx/fc_patch/
 third_party/ctc_decoders/paddlespeech_ctcdecoders.py
 kernel_meta/
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@@ -167,10 +167,15 @@ def _get_window(waveform: Tensor,
                                            energy_floor)  # (m)
    if preemphasis_coefficient != 0.0:
        # NPU only supports mode='constant' right now
        if paddle.get_device().startswith('npu'):
            mode = 'constant'
        else:
            mode = 'replicate'
        offset_strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (1, 0),
-            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
+            strided_input.unsqueeze(0), (1, 0), data_format='NCL',
+            mode=mode).squeeze(0)  # (m, window_size + 1)
        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                       -1]
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor):
        if isinstance(audio_file, (str, os.PathLike)):
            logger.debug("Preprocessing audio_file:" + audio_file)
        # Set 'pad_mode' to 'constant' when the device is an NPU; otherwise keep the default 'pad_mode' value.
        if paddle.get_device().startswith('npu'):
            pad_mode_kwarg = {"pad_mode": "constant"}
        else:
            pad_mode_kwarg = {}
        # Feature extraction
        feature_extractor = LogMelSpectrogram(
            sr=feat_conf['sample_rate'],
@@ -153,7 +159,8 @@ class CLSExecutor(BaseExecutor):
            win_length=feat_conf['window_length'],
            f_min=feat_conf['f_min'],
            f_max=feat_conf['f_max'],
-            n_mels=feat_conf['n_mels'], )
+            n_mels=feat_conf['n_mels'],
+            **pad_mode_kwarg, )
        feats = feature_extractor(
            paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
        self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -451,12 +451,27 @@ def get_voc_inference(
    voc_name = voc[:voc.rindex('_')]
    voc_class = dynamic_import(voc_name, model_alias)
    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
    # NPU only supports mode='constant' right now
    # this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'
    npu_pad_mode = {
        "mode": "constant"
    } if paddle.get_device().startswith('npu') else {}
    if voc_name != 'wavernn':
        if npu_pad_mode:
            voc_config["generator_params"].setdefault("pad_params", {})
            voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
        voc = voc_class(**voc_config["generator_params"])
        voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
        voc.remove_weight_norm()
        voc.eval()
    else:
        if npu_pad_mode:
            voc_config["model"].setdefault("pad_params", {})
            voc_config["model"]["pad_params"].update(npu_pad_mode)
        voc = voc_class(**voc_config["model"])
        voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
        voc.eval()
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -66,7 +66,12 @@ class Conv1d(nn.Layer):
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
-        self.padding_mode = padding_mode
+
        # padding_mode is forced to 'constant' on NPU devices because NPU only supports mode='constant' right now
        if paddle.get_device().startswith('npu'):
            self.padding_mode = 'constant'
        else:
            self.padding_mode = padding_mode
        self.conv = nn.Conv1D(
            in_channels,
@@ -335,10 +340,16 @@ class AttentiveStatisticsPooling(nn.Layer):
        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))
        if paddle.get_device().startswith('npu'):
            # The following way is designed to fix the 'Broadcast dimension mismatch' error
            # that occurs when using the npu device and setting padding_mode to 'constant'.
            inf_tensor = paddle.full_like(attn, float("-inf"))
        else:
            # the default way
            inf_tensor = paddle.ones_like(attn) * float("-inf")
        # Filter out zero-paddings
-        attn = paddle.where(
-            mask.tile((1, C, 1)) == 0,
-            paddle.ones_like(attn) * float("-inf"), attn)
+        attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
        attn = F.softmax(attn, axis=2)
        mean, std = _compute_statistics(x, attn)