diff --git a/.gitignore b/.gitignore index 4a0c43312..d31cfc06c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build *output/ .history +.idea audio/dist/ audio/fc_patch/ @@ -51,3 +52,5 @@ tools/onnx-simplifier/ speechx/fc_patch/ third_party/ctc_decoders/paddlespeech_ctcdecoders.py + +kernel_meta/ diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py index a94ec4053..f15fdfd5d 100644 --- a/paddlespeech/audio/compliance/kaldi.py +++ b/paddlespeech/audio/compliance/kaldi.py @@ -167,10 +167,16 @@ def _get_window(waveform: Tensor, energy_floor) # (m) if preemphasis_coefficient != 0.0: + # npu only supports mode=constant right now + if paddle.get_device().startswith('npu'): + mode = 'constant' + else: + mode = 'replicate' + offset_strided_input = paddle.nn.functional.pad( strided_input.unsqueeze(0), (1, 0), data_format='NCL', - mode='replicate').squeeze(0) # (m, window_size + 1) + mode=mode).squeeze(0) # (m, window_size + 1) strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : -1] diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 54780fdd2..e1be8bad0 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor): if isinstance(audio_file, (str, os.PathLike)): logger.debug("Preprocessing audio_file:" + audio_file) + # set 'pad_mode' to 'constant' when the device is npu, otherwise keep the default 'pad_mode' value + if paddle.get_device().startswith('npu'): + pad_mode_kwarg = {"pad_mode": "constant"} + else: + pad_mode_kwarg = {} + # Feature extraction feature_extractor = LogMelSpectrogram( sr=feat_conf['sample_rate'], @@ -153,7 +159,9 @@ class CLSExecutor(BaseExecutor): win_length=feat_conf['window_length'], f_min=feat_conf['f_min'], f_max=feat_conf['f_max'], - n_mels=feat_conf['n_mels'], ) + n_mels=feat_conf['n_mels'], + **pad_mode_kwarg, + ) feats = feature_extractor ( 
paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0))) self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze( diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index acfaa012d..53c88d6d0 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -451,12 +451,25 @@ def get_voc_inference( voc_name = voc[:voc.rindex('_')] voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + + # npu only supports mode=constant right now + # this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator' + npu_pad_mode = {"mode": "constant"} if paddle.get_device().startswith('npu') else {} + if voc_name != 'wavernn': + if npu_pad_mode: + voc_config["generator_params"].setdefault("pad_params", {}) + voc_config["generator_params"]["pad_params"].update(npu_pad_mode) + voc = voc_class(**voc_config["generator_params"]) voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"]) voc.remove_weight_norm() voc.eval() else: + if npu_pad_mode: + voc_config["model"].setdefault("pad_params", {}) + voc_config["model"]["pad_params"].update(npu_pad_mode) + voc = voc_class(**voc_config["model"]) voc.set_state_dict(paddle.load(voc_ckpt)["main_params"]) voc.eval()