Feat: npu supported for default model (#4084)

* Feat: npu supported for default model

* Feat: npu supported for 'vector'

* typo
pull/4090/head
yzz 3 months ago committed by GitHub
parent 563217abb0
commit 78a952a66e

.gitignore (vendored): 3 additions

@@ -16,6 +16,7 @@
 build
 *output/
 .history
+.idea
 audio/dist/
 audio/fc_patch/
@@ -51,3 +52,5 @@ tools/onnx-simplifier/
 speechx/fc_patch/
 third_party/ctc_decoders/paddlespeech_ctcdecoders.py
 kernel_meta/

@@ -167,10 +167,15 @@ def _get_window(waveform: Tensor,
                                             energy_floor)  # (m)
     if preemphasis_coefficient != 0.0:
+        # npu only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            mode = 'constant'
+        else:
+            mode = 'replicate'
         offset_strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (1, 0),
-            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
+            strided_input.unsqueeze(0), (1, 0), data_format='NCL',
+            mode=mode).squeeze(0)  # (m, window_size + 1)
         strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                        -1]
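
For context, a minimal, self-contained sketch of the pattern this hunk introduces: the pre-emphasis offset is built by padding one sample on the left of each frame, and the pad mode is chosen by device because paddle.nn.functional.pad currently only accepts mode='constant' on NPU. The frame matrix and coefficient below are dummy values, not the function's real arguments.

import paddle

# Dummy (m, window_size) frame matrix standing in for strided_input.
strided_input = paddle.rand([5, 400])
preemphasis_coefficient = 0.97  # illustrative value

# NPU currently only supports mode='constant' in paddle.nn.functional.pad.
mode = 'constant' if paddle.get_device().startswith('npu') else 'replicate'

# Prepend one sample per frame, then apply y[t] = x[t] - coeff * x[t-1].
offset_strided_input = paddle.nn.functional.pad(
    strided_input.unsqueeze(0), (1, 0), data_format='NCL',
    mode=mode).squeeze(0)  # (m, window_size + 1)
strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]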

@@ -144,6 +144,12 @@ class CLSExecutor(BaseExecutor):
         if isinstance(audio_file, (str, os.PathLike)):
             logger.debug("Preprocessing audio_file:" + audio_file)
+        # set 'pad_mode' to 'constant' when the device is npu, otherwise keep the default 'pad_mode' value
+        if paddle.get_device().startswith('npu'):
+            pad_mode_kwarg = {"pad_mode": "constant"}
+        else:
+            pad_mode_kwarg = {}
         # Feature extraction
         feature_extractor = LogMelSpectrogram(
             sr=feat_conf['sample_rate'],
@@ -153,7 +159,8 @@ class CLSExecutor(BaseExecutor):
             win_length=feat_conf['window_length'],
             f_min=feat_conf['f_min'],
             f_max=feat_conf['f_max'],
-            n_mels=feat_conf['n_mels'], )
+            n_mels=feat_conf['n_mels'],
+            **pad_mode_kwarg, )
         feats = feature_extractor(
             paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))
         self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
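
A small sketch of the same conditional-kwargs pattern in isolation. The import path is assumed, and the feature settings are illustrative stand-ins for the executor's feat_conf; the point is only that the dict stays empty off-NPU, so the extractor keeps its own default pad_mode.

import paddle
# Assumed import path for the extractor used by the CLS executor.
from paddlespeech.audio.features import LogMelSpectrogram

# Only force 'constant' padding on NPU; otherwise pass nothing and keep the default.
pad_mode_kwarg = {"pad_mode": "constant"} if paddle.get_device().startswith('npu') else {}

feature_extractor = LogMelSpectrogram(
    sr=32000,        # illustrative values standing in for feat_conf[...]
    win_length=512,
    f_min=50,
    f_max=14000,
    n_mels=64,
    **pad_mode_kwarg)

waveform = paddle.rand([32000])                     # one second of dummy audio
feats = feature_extractor(waveform.unsqueeze(0))    # (1, n_mels, num_frames)
feats = paddle.transpose(feats, [0, 2, 1])          # (1, num_frames, n_mels)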

@@ -451,12 +451,27 @@ def get_voc_inference(
     voc_name = voc[:voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
+    # npu only supports mode='constant' right now
+    # this code has been adapted to support 'paddlespeech.t2s.models.melgan.melgan.MelGANGenerator'
+    npu_pad_mode = {
+        "mode": "constant"
+    } if paddle.get_device().startswith('npu') else {}
     if voc_name != 'wavernn':
+        if npu_pad_mode:
+            voc_config["generator_params"].setdefault("pad_params", {})
+            voc_config["generator_params"]["pad_params"].update(npu_pad_mode)
         voc = voc_class(**voc_config["generator_params"])
         voc.set_state_dict(paddle.load(voc_ckpt)["generator_params"])
         voc.remove_weight_norm()
         voc.eval()
     else:
+        if npu_pad_mode:
+            voc_config["model"].setdefault("pad_params", {})
+            voc_config["model"]["pad_params"].update(npu_pad_mode)
         voc = voc_class(**voc_config["model"])
         voc.set_state_dict(paddle.load(voc_ckpt)["main_params"])
         voc.eval()
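
The config surgery above boils down to injecting a pad_params override only when running on NPU. A runnable sketch with a stand-in dict config (the real voc_config is the parsed vocoder yaml, which the generator constructor receives as keyword arguments):

import paddle

# Stand-in for the parsed vocoder yaml; only the keys touched here are shown.
voc_config = {"generator_params": {"in_channels": 80}}

# Empty off-NPU, so the generator keeps whatever padding its config already specifies.
npu_pad_mode = {"mode": "constant"} if paddle.get_device().startswith('npu') else {}

if npu_pad_mode:
    # setdefault keeps any pad_params already present in the yaml; update only overrides 'mode'.
    voc_config["generator_params"].setdefault("pad_params", {})
    voc_config["generator_params"]["pad_params"].update(npu_pad_mode)

print(voc_config["generator_params"].get("pad_params", {}))

Using setdefault plus update rather than assigning a fresh dict means any other pad_params keys from the yaml stay intact; only the padding mode is overridden on NPU.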

@@ -66,7 +66,12 @@ class Conv1d(nn.Layer):
         self.stride = stride
         self.dilation = dilation
         self.padding = padding
-        self.padding_mode = padding_mode
+        # padding_mode is forced to 'constant' on npu devices because npu only supports mode='constant' right now
+        if paddle.get_device().startswith('npu'):
+            self.padding_mode = 'constant'
+        else:
+            self.padding_mode = padding_mode
         self.conv = nn.Conv1D(
             in_channels,
@@ -335,10 +340,16 @@ class AttentiveStatisticsPooling(nn.Layer):
         # Apply layers
         attn = self.conv(self.tanh(self.tdnn(attn)))
+        if paddle.get_device().startswith('npu'):
+            # The following way is designed to fix the 'Broadcast dimension mismatch' error
+            # that occurs when using the npu device and setting padding_mode to 'constant'.
+            inf_tensor = paddle.full_like(attn, float("-inf"))
+        else:
+            # the default way
+            inf_tensor = paddle.ones_like(attn) * float("-inf")
         # Filter out zero-paddings
-        attn = paddle.where(
-            mask.tile((1, C, 1)) == 0,
-            paddle.ones_like(attn) * float("-inf"), attn)
+        attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
         attn = F.softmax(attn, axis=2)
         mean, std = _compute_statistics(x, attn)
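
Two notes on the vector-model hunks above. First, the Conv1d change only swaps the stored padding mode; the class name and the 'same' padding arithmetic in this hedged sketch are illustrative, not the repo's Conv1d, but it shows the same idea of forcing constant padding on NPU before the underlying nn.Conv1D is applied.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class SamePadConv1D(nn.Layer):  # hypothetical name, not the class in the diff
    def __init__(self, in_channels, out_channels, kernel_size,
                 padding_mode='reflect'):
        super().__init__()
        # NPU currently only supports mode='constant' in F.pad, so override there.
        if paddle.get_device().startswith('npu'):
            self.padding_mode = 'constant'
        else:
            self.padding_mode = padding_mode
        self.kernel_size = kernel_size
        self.conv = nn.Conv1D(in_channels, out_channels, kernel_size)

    def forward(self, x):  # x: (N, C, L)
        # Pad to keep the output length equal to the input length (stride 1).
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size - 1 - left
        x = F.pad(x, [left, right], mode=self.padding_mode, data_format='NCL')
        return self.conv(x)

layer = SamePadConv1D(80, 128, kernel_size=5)
y = layer(paddle.rand([2, 80, 100]))  # -> (2, 128, 100)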
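Second, the pooling change replaces ones_like(attn) * float("-inf") with full_like(attn, float("-inf")) on NPU; both produce the same -inf fill, but the multiply form hit the 'Broadcast dimension mismatch' error there. A runnable sketch of the masking with dummy shapes:

import paddle
import paddle.nn.functional as F

attn = paddle.rand([2, 4, 10])                 # (N, C, L) attention logits
mask = paddle.ones([2, 1, 10], dtype='int64')  # 1 = real frame, 0 = zero-padding
mask[:, :, 7:] = 0                             # pretend the last three frames are padding
C = attn.shape[1]

if paddle.get_device().startswith('npu'):
    # full_like builds the -inf tensor directly, avoiding the ones_like * -inf multiply.
    inf_tensor = paddle.full_like(attn, float("-inf"))
else:
    inf_tensor = paddle.ones_like(attn) * float("-inf")

# Padded positions become -inf, so softmax assigns them zero attention weight.
attn = paddle.where(mask.tile((1, C, 1)) == 0, inf_tensor, attn)
attn = F.softmax(attn, axis=2)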
