From 167aaa65b97471fe39e6bd3b0075d1c362ff4617 Mon Sep 17 00:00:00 2001
From: Jerryuhoo <jerryuhoo@gmail.com>
Date: Thu, 12 May 2022 14:39:50 +0800
Subject: [PATCH] rescale wav to peak 1.0 in preprocess when its max abs value exceeds 1.0, test=tts

---
 paddlespeech/t2s/exps/fastspeech2/preprocess.py  | 5 ++++-
 paddlespeech/t2s/exps/gan_vocoder/preprocess.py  | 5 ++++-
 paddlespeech/t2s/exps/speedyspeech/preprocess.py | 5 ++++-
 paddlespeech/t2s/exps/tacotron2/preprocess.py    | 5 ++++-
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index db1842b2..5fc51365 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -55,8 +55,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
index 4871bca7..8adab0fe 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         y, _ = librosa.load(str(fp), sr=config.fs)
-        if len(y.shape) != 1 or np.abs(y).max() > 1.0:
+        if len(y.shape) != 1:
             return record
+        max_value = np.abs(y).max()
+        if max_value > 1.0:
+            y = y / max_value
         assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(y).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
index e833d139..6c6b443f 100644
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."
diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py
index 14a0d7ea..95349d59 100644
--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@@ -51,8 +51,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."