Feat: support NPU for the default model

4 months ago · 3cb284f18c
parent 563217abb0
commit 3cb284f18c
4 changed files with 32 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 build
 *output/
 .history
+.idea

 audio/dist/
 audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
 speechx/fc_patch/

 third_party/ctc_decoders/paddlespeech_ctcdecoders.py
+
+kernel_meta/
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@@ -167,10 +167,16 @@ def _get_window(waveform: Tensor,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
+        # NPU only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            mode = 'constant'
+        else:
+            mode = 'replicate'
+
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
+            mode=mode).squeeze(0)  # (m, window_size + 1)
        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                       -1]

--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor):
        if isinstance(audio_file, (str, os.PathLike)):
            logger.debug("Preprocessing audio_file:" + audio_file)

+        # Use pad_mode='constant' when the device is an NPU; otherwise keep the default 'pad_mode' value
+        if paddle.get_device().startswith('npu'):
+            pad_mode_kwarg = {"pad_mode": "constant"}
+        else:
+            pad_mode_kwarg = {}
+
        # Feature extraction
        feature_extractor = LogMelSpectrogram(
            sr=feat_conf['sample_rate'],
@@ -153,7 +159,9 @@ class CLSExecutor(BaseExecutor):
            win_length=feat_conf['window_length'],
            f_min=feat_conf['f_min'],
            f_max=feat_conf['f_max'],
-            n_mels=feat_conf['n_mels'], )
+            n_mels=feat_conf['n_mels'],
+            **pad_mode_kwarg,
+        )
        feats = feature_extractor(
            paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
        self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -451,12 +451,25 @@ def get_voc_inference(
    voc_name = voc[:voc.rindex('_')]
    voc_class = dynamic_import(voc_name, model_alias)
    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
+
+    # NPU only supports mode='constant' right now.
+    # This code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'.
+    npu_pad_mode = {"mode": "constant"} if paddle.get_device().startswith('npu') else {}
+
    if voc_name != 'wavernn':
+        if npu_pad_mode:
+            voc_config["generator_params"].setdefault("pad_params", {})
+            voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
+
        voc = voc_class(**voc_config["generator_params"])
        voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
        voc.remove_weight_norm()
        voc.eval()
    else:
+        if npu_pad_mode:
+            voc_config["model"].setdefault("pad_params", {})
+            voc_config["model"]["pad_params"].update(npu_pad_mode)
+
        voc = voc_class(**voc_config["model"])
        voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
        voc.eval()