Fix some problems raised in review comments.

3 years ago · dc009521df
parent ee3a784c8b
commit dc009521df
6 changed files with 22 additions and 20 deletions
--- a/demos/whisper/README.md
+++ b/demos/whisper/README.md
@@ -65,7 +65,7 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
   print('ASR Result: \n{}'.format(text))

   # to recognize text and translate to English
-   feature = ssl_executor(
+   feature = whisper_executor(
       model='whisper-large',
       task='translate',
       sample_rate=16000,
@@ -86,4 +86,4 @@ Whisper model trained by OpenAI whisper https://github.com/openai/whisper
   Translate Result:
   Detected language: Chinese
   [00:00.000 --> 00:05.000]  I think the most important thing about running is that it brings me good health.
-   {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
+   {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
--- a/demos/whisper/README_cn.md
+++ b/demos/whisper/README_cn.md
@@ -66,7 +66,7 @@ Whisper模型由OpenAI Whisper训练 https://github.com/openai/whisper
   print('ASR Result: \n{}'.format(text))

    # 将语音翻译成英语
-   feature = ssl_executor(
+   feature = whisper_executor(
       model='whisper-large',
       task='translate',
       sample_rate=16000,
@@ -88,4 +88,4 @@ Whisper模型由OpenAI Whisper训练 https://github.com/openai/whisper
   Translate Result:
   Detected language: Chinese
   [00:00.000 --> 00:05.000]  I think the most important thing about running is that it brings me good health.
-   {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
+   {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
--- a/demos/whisper/run.sh
+++ b/demos/whisper/run.sh
@@ -6,5 +6,5 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 # to recognize text 
 paddlespeech whisper --task transcribe --input ./zh.wav

-# to recognize text and transcribe to English
+# to recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -181,18 +181,13 @@ class WhisperExecutor(BaseExecutor):
                                       resource_md5)
            else:
                raise Exception("wrong type")
-        #model_name = model_type
-        #model_class = self.task_resource.get_model_class(model_name)
-        #model_conf = self.config
-        #model = model_class.from_config(model_conf)
-        #self.model = model
-        #self.model.eval()

        # load model
        model_dict = paddle.load(self.ckpt_path)
        dims = ModelDimensions(**model_dict["dims"])
        self.model = Whisper(dims)
        self.model.load_dict(model_dict)
+        self.model.eval()

        #set task
        if task is not None:
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -2,4 +2,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
 # 
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
-from .whipser import *
+from paddlespeech.s2t.models.whisper.whipser import decode
+from paddlespeech.s2t.models.whisper.whipser import DecodingOptions
+from paddlespeech.s2t.models.whisper.whipser import DecodingResult
+from paddlespeech.s2t.models.whisper.whipser import detect_language
+from paddlespeech.s2t.models.whisper.whipser import log_mel_spectrogram
+from paddlespeech.s2t.models.whisper.whipser import ModelDimensions
+from paddlespeech.s2t.models.whisper.whipser import transcribe
+from paddlespeech.s2t.models.whisper.whipser import Whisper
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -194,7 +194,7 @@ class AudioEncoder(nn.Layer):

    def forward(self, x: paddle.Tensor):
        """
-        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+        x : paddle.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
@@ -241,9 +241,9 @@ class TextDecoder(nn.Layer):
                xa: paddle.Tensor,
                kv_cache: Optional[dict]=None):
        """
-        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+        x : paddle.Tensor (integer dtype), shape = (batch_size, <= n_ctx)
            the text tokens
-        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+        xa : paddle.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        """
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
@@ -436,7 +436,7 @@ def transcribe(
    model: Whisper
        The Whisper model instance

-    mel: torch.Tensor
+    mel: paddle.Tensor
        The audio feature

    verbose: bool
@@ -1264,7 +1264,7 @@ def decode(model: "Whisper",
    model: Whisper
        the Whisper model instance

-    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
+    mel: paddle.Tensor, shape = (80, 3000) or (*, 80, 3000)
        A tensor containing the Mel spectrogram(s)

    options: DecodingOptions
@@ -1331,7 +1331,7 @@ class Whisper(nn.Layer):

        Returns
        -------
-        cache : Dict[nn.Module, torch.Tensor]
+        cache : Dict[nn.Layer, paddle.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of hook handle objects used to stop the hooks from being called
@@ -1429,7 +1429,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],

    Parameters
    ----------
-    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+    audio: Union[str, np.ndarray, paddle.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
@@ -1437,7 +1437,7 @@ def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor],

    Returns
    -------
-    torch.Tensor, shape = (80, n_frames)
+    paddle.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not paddle.is_tensor(audio):