diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index db1842b2..5fc51365 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -55,8 +55,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 4871bca7..8adab0fe 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur y, _ = librosa.load(str(fp), sr=config.fs) - if len(y.shape) != 1 or np.abs(y).max() > 1.0: + if len(y.shape) != 1: return record + max_value = np.abs(y).max() + if max_value > 1.0: + y = y / max_value assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(y).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index e833d139..6c6b443f 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index 14a0d7ea..95349d59 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -51,8 +51,11 @@ def process_sentence(config: Dict[str, Any], if utt_id in sentences: # reading, resampling may occur wav, _ = librosa.load(str(fp), sr=config.fs) - if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + if len(wav.shape) != 1: return record + max_value = np.abs(wav).max() + if max_value > 1.0: + wav = wav / max_value assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." assert np.abs(wav).max( ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."