Merge branch 'PaddlePaddle:develop' into develop

pull/2932/head
HuangLiangJie 3 years ago committed by GitHub
commit 859e8d2339

@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
         Init model and other resources from a specific path.
         """
         logger.debug("start to init the model")
-        # default max_len: unit:second
-        self.max_len = 50
         if hasattr(self, 'model'):
             logger.debug('Model had been initialized.')
             return
@@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor):
         try:
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
-            audio_duration = audio.shape[0] / audio_sample_rate
-            if audio_duration > self.max_len:
-                logger.error(
-                    f"Please input audio file less then {self.max_len} seconds.\n"
-                )
-                return False
         except Exception as e:
             logger.exception(e)
             logger.error(
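
For reference, a minimal standalone version of the audio-duration check shown in the hunk above: read the file with soundfile, derive the length in seconds, and compare it against a budget. The helper name `within_duration_limit` and the 50-second constant are illustrative assumptions, not PaddleSpeech API.

```python
import soundfile

# Illustrative cap in seconds, mirroring `self.max_len = 50` above.
MAX_LEN_SECONDS = 50


def within_duration_limit(audio_file: str, max_len: float=MAX_LEN_SECONDS) -> bool:
    """Return True if the clip is no longer than `max_len` seconds."""
    # always_2d=True yields shape (num_frames, num_channels) even for mono audio.
    audio, sample_rate = soundfile.read(
        audio_file, dtype="int16", always_2d=True)
    duration = audio.shape[0] / sample_rate  # frames / frames-per-second
    return duration <= max_len
```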

@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
 from paddlespeech.s2t.models.whisper.whipser import decode

@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
 import os

@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
 import zlib

@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os

@@ -360,6 +360,8 @@ class GaussianDiffusion(nn.Layer):
             num_inference_steps: Optional[int]=1000,
             strength: Optional[float]=None,
             scheduler_type: Optional[str]="ddpm",
+            clip_noise: Optional[bool]=True,
+            clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
             callback: Optional[Callable[[int, int, int, paddle.Tensor],
                                         None]]=None,
             callback_steps: Optional[int]=1):
@@ -380,6 +382,10 @@ class GaussianDiffusion(nn.Layer):
             scheduler_type (str, optional):
                 Noise scheduler for generate noises.
                 Choose a great scheduler can skip many denoising step, by default 'ddpm'.
+            clip_noise (bool, optional):
+                Whether to clip each denoised output, by default True.
+            clip_noise_range (tuple, optional):
+                denoised output min and max value range after clip, by default (-1, 1).
             callback (Callable[[int,int,int,Tensor], None], optional):
                 Callback function during denoising steps.
@@ -440,6 +446,9 @@ class GaussianDiffusion(nn.Layer):
         # denoising loop
         denoised_output = noisy_input
+        if clip_noise:
+            n_min, n_max = clip_noise_range
+            denoised_output = paddle.clip(denoised_output, n_min, n_max)
         num_warmup_steps = len(
             timesteps) - num_inference_steps * scheduler.order
         for i, t in enumerate(timesteps):
@@ -451,6 +460,8 @@ class GaussianDiffusion(nn.Layer):
             # compute the previous noisy sample x_t -> x_t-1
             denoised_output = scheduler.step(noise_pred, t,
                                              denoised_output).prev_sample
+            if clip_noise:
+                denoised_output = paddle.clip(denoised_output, n_min, n_max)
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
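
The hunks above thread new `clip_noise` / `clip_noise_range` arguments through `GaussianDiffusion.inference` and clip the sample before the loop and after every scheduler step. Below is a self-contained sketch of that clipping pattern using only `paddle`; `clipped_denoise_loop` and its `denoise_step` callable are stand-ins for the real model/scheduler pair, not the project's API.

```python
import paddle


def clipped_denoise_loop(noisy_input,
                         denoise_step,
                         num_steps=10,
                         clip_noise=True,
                         clip_noise_range=(-1.0, 1.0)):
    """Toy denoising loop that clips every intermediate sample."""
    n_min, n_max = clip_noise_range
    denoised = noisy_input
    if clip_noise:
        # Clip the starting noise into the allowed value range.
        denoised = paddle.clip(denoised, n_min, n_max)
    for t in range(num_steps):
        denoised = denoise_step(denoised, t)
        if clip_noise:
            # Clip again after each step, as the new arguments enable.
            denoised = paddle.clip(denoised, n_min, n_max)
    return denoised


# Usage with a stand-in "denoiser" that simply damps the sample.
sample = paddle.randn([1, 80, 100])  # a mel-spectrogram-shaped tensor
out = clipped_denoise_loop(sample, lambda x, t: 0.9 * x)
```

The default (-1, 1) range suggests the denoised features are expected to live in a normalized interval; clipping keeps each step's output from drifting outside it.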

@@ -129,7 +129,7 @@ decoders_module = [
 setup(
     name='paddlespeech_ctcdecoders',
-    version='0.2.0',
+    version='0.2.2',
     description="CTC decoders in paddlespeech",
     author="PaddlePaddle Speech and Language Team",
     author_email="paddlesl@baidu.com",
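
The last hunk only bumps the packaging metadata for the `paddlespeech_ctcdecoders` wheel. One way to confirm an installed build picked up the bump, using only the standard library (the expected string comes from the `version=` field above; this check is generic, not part of the package's own tooling):

```python
from importlib.metadata import version

# After rebuilding and reinstalling the wheel, the installed distribution
# should report the bumped version string.
print(version("paddlespeech_ctcdecoders"))  # expected: 0.2.2
```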
