diff --git a/demos/whisper/README.md b/demos/whisper/README.md
index d332fca13..455bca92b 100644
--- a/demos/whisper/README.md
+++ b/demos/whisper/README.md
@@ -65,7 +65,7 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
         print('ASR Result: \n{}'.format(text))
 
         # to recognize text and translate to English
-        feature = ssl_executor(
+        feature = whisper_executor(
             model='whisper-large',
             task='translate',
             sample_rate=16000,
@@ -86,4 +86,4 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
     Translate Result:
     Detected language: Chinese
     [00:00.000 --> 00:05.000]  I think the most important thing about running is that it brings me good health.
-    {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
\ No newline at end of file
+    {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
diff --git a/demos/whisper/README_cn.md b/demos/whisper/README_cn.md
index e559ba041..784952761 100644
--- a/demos/whisper/README_cn.md
+++ b/demos/whisper/README_cn.md
@@ -66,7 +66,7 @@ Whisper model trained by OpenAI Whisper https://github.com/openai/whisper
         print('ASR Result: \n{}'.format(text))
 
         # to translate the speech into English
-        feature = ssl_executor(
+        feature = whisper_executor(
             model='whisper-large',
             task='translate',
             sample_rate=16000,
@@ -88,4 +88,4 @@ Whisper model trained by OpenAI Whisper https://github.com/openai/whisper
     Translate Result:
     Detected language: Chinese
     [00:00.000 --> 00:05.000]  I think the most important thing about running is that it brings me good health.
-    {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
\ No newline at end of file
+    {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
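The rename in both READMEs fixes a copy-paste leftover: the snippet constructs a `WhisperExecutor` but then called `ssl_executor`. For reference, here is a minimal sketch of the corrected call as it would run end to end. Only `model`, `task`, and `sample_rate` appear in the hunks above; the `config`, `ckpt_path`, `audio_file`, and `device` arguments are assumed from the surrounding demo code, not shown in this diff:

```python
import paddle
from paddlespeech.cli.whisper import WhisperExecutor

whisper_executor = WhisperExecutor()

# Translate 16 kHz Mandarin speech to English text; zh.wav is the sample
# downloaded by run.sh below.
feature = whisper_executor(
    model='whisper-large',
    task='translate',
    sample_rate=16000,
    config=None,     # assumed: None falls back to the bundled pretrained config
    ckpt_path=None,  # assumed: None downloads the pretrained checkpoint
    audio_file='./zh.wav',
    device=paddle.get_device())
print('Translate Result: \n{}'.format(feature))
```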
diff --git a/demos/whisper/run.sh b/demos/whisper/run.sh
index 095743bb0..1d758108d 100644
--- a/demos/whisper/run.sh
+++ b/demos/whisper/run.sh
@@ -6,5 +6,5 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 # to recognize text
 paddlespeech whisper --task transcribe --input ./zh.wav
 
-# to recognize text and transcribe to English
+# to recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
\ No newline at end of file
diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index ada888bfc..3b1771b2d 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -181,18 +181,13 @@ class WhisperExecutor(BaseExecutor):
                     resource_md5)
         else:
             raise Exception("wrong type")
-        #model_name = model_type
-        #model_class = self.task_resource.get_model_class(model_name)
-        #model_conf = self.config
-        #model = model_class.from_config(model_conf)
-        #self.model = model
-        #self.model.eval()
 
         # load model
         model_dict = paddle.load(self.ckpt_path)
         dims = ModelDimensions(**model_dict["dims"])
         self.model = Whisper(dims)
         self.model.load_dict(model_dict)
+        self.model.eval()
 
         #set task
         if task is not None:
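The `infer.py` hunk deletes a block of dead, commented-out loading code and restores the `self.model.eval()` call that had been lost along with it. Paddle layers are constructed in training mode, so without `eval()` dropout stays active at inference time and degrades recognition quality. Stripped of executor state, the loading sequence now amounts to the following sketch; the checkpoint path here is hypothetical, since the executor resolves the real one from its resource registry:

```python
import paddle
from paddlespeech.s2t.models.whisper import ModelDimensions, Whisper

ckpt_path = './whisper-large-model.pdparams'  # hypothetical path, for illustration

model_dict = paddle.load(ckpt_path)           # dict holding "dims" plus the weights
dims = ModelDimensions(**model_dict["dims"])  # architecture hyperparameters
model = Whisper(dims)                         # build the layer tree from dims
model.load_dict(model_dict)                   # load the weights
model.eval()                                  # switch to inference mode (no dropout)
```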
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
index 1c8adba56..98ab23610 100644
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -2,4 +2,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
-from .whipser import *
+from paddlespeech.s2t.models.whisper.whipser import decode
+from paddlespeech.s2t.models.whisper.whipser import DecodingOptions
+from paddlespeech.s2t.models.whisper.whipser import DecodingResult
+from paddlespeech.s2t.models.whisper.whipser import detect_language
+from paddlespeech.s2t.models.whisper.whipser import log_mel_spectrogram
+from paddlespeech.s2t.models.whisper.whipser import ModelDimensions
+from paddlespeech.s2t.models.whisper.whipser import transcribe
+from paddlespeech.s2t.models.whisper.whipser import Whisper
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 5a495f634..853e11b85 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -194,7 +194,7 @@ class AudioEncoder(nn.Layer):
     def forward(self, x: paddle.Tensor):
         """
-        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+        x : paddle.Tensor, shape = (batch_size, n_mels, n_ctx)
             the mel spectrogram of the audio
         """
         x = F.gelu(self.conv1(x))
@@ -241,9 +241,9 @@ class TextDecoder(nn.Layer):
                 xa: paddle.Tensor,
                 kv_cache: Optional[dict]=None):
         """
-        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+        x : paddle.LongTensor, shape = (batch_size, <= n_ctx)
             the text tokens
-        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+        xa : paddle.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
             the encoded audio features to be attended on
         """
         offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
@@ -436,7 +436,7 @@ def transcribe(
     model: Whisper
         The Whisper model instance
 
-    mel: torch.Tensor
+    mel: paddle.Tensor
         The audio feature
 
     verbose: bool
@@ -1264,7 +1264,7 @@ def decode(model: "Whisper",
     model: Whisper
         the Whisper model instance
 
-    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
+    mel: paddle.Tensor, shape = (80, 3000) or (*, 80, 3000)
        A tensor containing the Mel spectrogram(s)
 
     options: DecodingOptions
@@ -1331,7 +1331,7 @@ class Whisper(nn.Layer):
         Returns
         -------
-        cache : Dict[nn.Module, torch.Tensor]
+        cache : Dict[nn.Layer, paddle.Tensor]
             A dictionary object mapping the key/value projection modules to its cache
         hooks : List[RemovableHandle]
             List of PyTorch RemovableHandle objects to stop the hooks to be called
@@ -1429,7 +1429,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],
     Parameters
     ----------
-    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+    audio: Union[str, np.ndarray, paddle.Tensor], shape = (*)
         The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
 
     n_mels: int
@@ -1437,7 +1437,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],
     Returns
     -------
-    torch.Tensor, shape = (80, n_frames)
+    paddle.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not paddle.is_tensor(audio):
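Replacing the wildcard import pins down the module's public surface: consumers now see exactly eight names, and tooling can trace where `Whisper` or `transcribe` comes from. (The module file itself is named `whipser.py`, a pre-existing typo in the repo that this patch deliberately leaves alone, since renaming it would touch every import site.) Based only on the docstrings corrected above, the exports compose roughly as follows. Treat this as a shape-level sketch: PaddleSpeech's port may require extra arguments that this diff does not show, such as a resource path for tokenizer and mel-filter assets.

```python
from paddlespeech.s2t.models.whisper import (DecodingOptions, decode,
                                             log_mel_spectrogram)

# Per the corrected docstring, log_mel_spectrogram accepts a path, a NumPy
# array, or a paddle.Tensor holding a 16 kHz waveform and returns a
# paddle.Tensor of shape (80, n_frames). (This port may additionally need a
# resource path for the mel filter assets -- an assumption, not shown here.)
mel = log_mel_spectrogram('./zh.wav')

# decode expects a segment of shape (80, 3000) or (*, 80, 3000) plus options;
# `model` would be a Whisper instance loaded as in infer.py above.
# result = decode(model, mel_segment, DecodingOptions(task='transcribe'))
```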