diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7a7aef8b0..231a00f4d 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,9 +25,6 @@ import librosa import numpy as np import paddle import soundfile -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -37,6 +34,9 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py index dce7c7781..44fbb4257 100644 --- a/paddlespeech/cli/ssl/infer.py +++ b/paddlespeech/cli/ssl/infer.py @@ -245,7 +245,7 @@ class SSLExecutor(BaseExecutor): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py index ebcca890b..17e8c0b8c 100644 --- a/paddlespeech/cli/whisper/infer.py +++ b/paddlespeech/cli/whisper/infer.py @@ -253,7 +253,7 @@ class WhisperExecutor(BaseExecutor): # fbank audio = log_mel_spectrogram(audio, resource_path=self.resource_path) - audio_len = paddle.to_tensor(audio.shape[0]) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len