Fix several problems raised in review comments.

pull/2640/head
zxcd 3 years ago
parent ee3a784c8b
commit dc009521df

@ -65,7 +65,7 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
print('ASR Result: \n{}'.format(text))
# to recognize text and translate to English
feature = ssl_executor(
feature = whisper_executor(
model='whisper-large',
task='translate',
sample_rate=16000,
@ -86,4 +86,4 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
Translate Result:
Detected language: Chinese
[00:00.000 --> 00:05.000] I think the most important thing about running is that it brings me good health.
{'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
{'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}

@ -66,7 +66,7 @@ Whisper模型由OpenAI Whisper训练 https://github.com/openai/whisper
print('ASR Result: \n{}'.format(text))
# 将语音翻译成英语
feature = ssl_executor(
feature = whisper_executor(
model='whisper-large',
task='translate',
sample_rate=16000,
@ -88,4 +88,4 @@ Whisper模型由OpenAI Whisper训练 https://github.com/openai/whisper
Translate Result:
Detected language: Chinese
[00:00.000 --> 00:05.000] I think the most important thing about running is that it brings me good health.
{'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
{'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}

@ -6,5 +6,5 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
# to recognize text
paddlespeech whisper --task transcribe --input ./zh.wav
# to recognize text and transcribe to English
# to recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav

@ -181,18 +181,13 @@ class WhisperExecutor(BaseExecutor):
resource_md5)
else:
raise Exception("wrong type")
#model_name = model_type
#model_class = self.task_resource.get_model_class(model_name)
#model_conf = self.config
#model = model_class.from_config(model_conf)
#self.model = model
#self.model.eval()
# load model
model_dict = paddle.load(self.ckpt_path)
dims = ModelDimensions(**model_dict["dims"])
self.model = Whisper(dims)
self.model.load_dict(model_dict)
self.model.eval()
#set task
if task is not None:

@ -2,4 +2,11 @@
# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
from .whipser import *
from paddlespeech.s2t.models.whisper.whipser import decode
from paddlespeech.s2t.models.whisper.whipser import DecodingOptions
from paddlespeech.s2t.models.whisper.whipser import DecodingResult
from paddlespeech.s2t.models.whisper.whipser import detect_language
from paddlespeech.s2t.models.whisper.whipser import log_mel_spectrogram
from paddlespeech.s2t.models.whisper.whipser import ModelDimensions
from paddlespeech.s2t.models.whisper.whipser import transcribe
from paddlespeech.s2t.models.whisper.whipser import Whisper

@ -194,7 +194,7 @@ class AudioEncoder(nn.Layer):
def forward(self, x: paddle.Tensor):
"""
x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
x : paddle.Tensor, shape = (batch_size, n_mels, n_ctx)
the mel spectrogram of the audio
"""
x = F.gelu(self.conv1(x))
@ -241,9 +241,9 @@ class TextDecoder(nn.Layer):
xa: paddle.Tensor,
kv_cache: Optional[dict]=None):
"""
x : torch.LongTensor, shape = (batch_size, <= n_ctx)
x : paddle.LongTensor, shape = (batch_size, <= n_ctx)
the text tokens
xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
xa : paddle.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
the encoded audio features to be attended on
"""
offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
@ -436,7 +436,7 @@ def transcribe(
model: Whisper
The Whisper model instance
mel: torch.Tensor
mel: paddle.Tensor
The audio feature
verbose: bool
@ -1264,7 +1264,7 @@ def decode(model: "Whisper",
model: Whisper
the Whisper model instance
mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
mel: paddle.Tensor, shape = (80, 3000) or (*, 80, 3000)
A tensor containing the Mel spectrogram(s)
options: DecodingOptions
@ -1331,7 +1331,7 @@ class Whisper(nn.Layer):
Returns
-------
cache : Dict[nn.Module, torch.Tensor]
cache : Dict[nn.Layer, paddle.Tensor]
A dictionary object mapping each key/value projection module to its cache
hooks : List[RemovableHandle]
List of RemovableHandle objects used to stop the hooks from being called
@ -1429,7 +1429,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],
Parameters
----------
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
audio: Union[str, np.ndarray, paddle.Tensor], shape = (*)
The path to an audio file, or a NumPy array or Tensor containing the audio waveform sampled at 16 kHz
n_mels: int
@ -1437,7 +1437,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],
Returns
-------
torch.Tensor, shape = (80, n_frames)
paddle.Tensor, shape = (80, n_frames)
A Tensor that contains the Mel spectrogram
"""
if not paddle.is_tensor(audio):

Loading…
Cancel
Save