diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index c016b453a..ebcca890b 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
         Init model and other resources from a specific path.
         """
         logger.debug("start to init the model")
-        # default max_len: unit:second
-        self.max_len = 50
+
         if hasattr(self, 'model'):
             logger.debug('Model had been initialized.')
             return
@@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor):
         try:
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
-            audio_duration = audio.shape[0] / audio_sample_rate
-            if audio_duration > self.max_len:
-                logger.error(
-                    f"Please input audio file less then {self.max_len} seconds.\n"
-                )
-                return False
         except Exception as e:
             logger.exception(e)
             logger.error(
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
index 98ab23610..b78dece8a 100644
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
 from paddlespeech.s2t.models.whisper.whipser import decode
diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py
index 1e1aea044..e8b201bcc 100644
--- a/paddlespeech/s2t/models/whisper/tokenizer.py
+++ b/paddlespeech/s2t/models/whisper/tokenizer.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
 import os
diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py
index d067af7d2..5528f9604 100644
--- a/paddlespeech/s2t/models/whisper/utils.py
+++ b/paddlespeech/s2t/models/whisper/utils.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
 import zlib
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 9cf9a9eca..a28013e4b 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
index eb67ffb0d..be684ce38 100644
--- a/paddlespeech/t2s/modules/diffusion.py
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -360,6 +360,8 @@ class GaussianDiffusion(nn.Layer):
                   num_inference_steps: Optional[int]=1000,
                   strength: Optional[float]=None,
                   scheduler_type: Optional[str]="ddpm",
+                  clip_noise: Optional[bool]=True,
+                  clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
                   callback: Optional[Callable[[int, int, int, paddle.Tensor],
                                               None]]=None,
                   callback_steps: Optional[int]=1):
@@ -380,6 +382,10 @@ class GaussianDiffusion(nn.Layer):
             scheduler_type (str, optional):
                 Noise scheduler for generate noises.
                 Choose a great scheduler can skip many denoising step, by default 'ddpm'.
+            clip_noise (bool, optional):
+                Whether to clip each denoised output, by default True.
+            clip_noise_range (tuple, optional):
+                denoised output min and max value range after clip, by default (-1, 1).
             callback (Callable[[int,int,int,Tensor], None], optional):
                 Callback function during denoising steps.
@@ -440,6 +446,9 @@ class GaussianDiffusion(nn.Layer):
         # denoising loop
         denoised_output = noisy_input
+        if clip_noise:
+            n_min, n_max = clip_noise_range
+            denoised_output = paddle.clip(denoised_output, n_min, n_max)
         num_warmup_steps = len(
             timesteps) - num_inference_steps * scheduler.order
         for i, t in enumerate(timesteps):
@@ -451,6 +460,8 @@ class GaussianDiffusion(nn.Layer):
             # compute the previous noisy sample x_t -> x_t-1
             denoised_output = scheduler.step(noise_pred, t,
                                              denoised_output).prev_sample
+            if clip_noise:
+                denoised_output = paddle.clip(denoised_output, n_min, n_max)
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py
index c13f3df99..5ae5b3bf6 100644
--- a/third_party/ctc_decoders/setup.py
+++ b/third_party/ctc_decoders/setup.py
@@ -129,7 +129,7 @@ decoders_module = [
 setup(
     name='paddlespeech_ctcdecoders',
-    version='0.2.0',
+    version='0.2.2',
     description="CTC decoders in paddlespeech",
     author="PaddlePaddle Speech and Language Team",
     author_email="paddlesl@baidu.com",
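Note on the diffusion.py hunks above: the new clip_noise / clip_noise_range arguments clamp the running sample once before the denoising loop and again after every scheduler step, so intermediate outputs stay inside a fixed range (by default (-1, 1)). The following is a minimal standalone sketch of that pattern in plain NumPy, not PaddleSpeech's API; step_fn is a hypothetical stand-in for scheduler.step(...).prev_sample.

import numpy as np

def denoise_with_clipping(noisy_input, timesteps, step_fn,
                          clip_noise=True, clip_noise_range=(-1.0, 1.0)):
    """Toy denoising loop that optionally clips the sample after each step."""
    n_min, n_max = clip_noise_range
    x = noisy_input
    if clip_noise:
        # clip the initial noisy input before the loop, as the hunk above does
        x = np.clip(x, n_min, n_max)
    for t in timesteps:
        # step_fn(x, t) stands in for scheduler.step(...).prev_sample (x_t -> x_t-1)
        x = step_fn(x, t)
        if clip_noise:
            # keep every intermediate output inside the configured range
            x = np.clip(x, n_min, n_max)
    return x

# usage: a dummy "scheduler" that just shrinks the sample at each step
mel = denoise_with_clipping(
    np.random.randn(80, 100), timesteps=range(10), step_fn=lambda x, t: 0.9 * x)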