fix some librosa bugs, test=tts

4 years ago · 7dc1f2daa3
parent 30085ac229
commit 7dc1f2daa3
10 changed files with 62 additions and 16 deletions
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@ -34,7 +34,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
 Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
-Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|94.95MB|
+Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
 TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
 SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
 FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
@ -54,6 +54,8 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
 |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
 WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
 ### Voice Cloning
 Model Type | Dataset| Example Link | Pretrained Models
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
@ -56,7 +56,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
@ -77,7 +77,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
@ -91,3 +91,24 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=tacotron2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
 fi
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=speedyspeech_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir=${train_output_path}/inference
 fi
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@ -102,9 +102,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
-        --voc_config=wavernn_test/default.yaml \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
-        --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
-        --voc_stat=wavernn_test/feats_stats.npy \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
--- a/paddleaudio/features/core.py
+++ b/paddleaudio/features/core.py
@ -415,11 +415,11 @@ def mfcc(x,
        **kwargs)
    # librosa mfcc:
-    spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
+    spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512,
                                              win_length=512,
                                              hop_length=320,
                                              n_mels=64, fmin=50)
-    b = librosa.feature.mfcc(x,
+    b = librosa.feature.mfcc(y=x,
        sr=16000,
        S=spect,
        n_mfcc=20,
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
    x = np.stack(
        [
            librosa.istft(
-                y=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                stft_matrix=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                hop_length=n_shift,
                win_length=win_length,
                window=window,
--- a/paddlespeech/t2s/audio/audio.py
+++ b/paddlespeech/t2s/audio/audio.py
@ -53,8 +53,8 @@ class AudioProcessor(object):
    def _create_mel_filter(self):
        mel_filter = librosa.filters.mel(
-            self.sample_rate,
+            sr=self.sample_rate,
-            self.n_fft,
+            n_fft=self.n_fft,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax)
--- a/paddlespeech/t2s/datasets/common.py
+++ b/paddlespeech/t2s/datasets/common.py
@ -38,7 +38,7 @@ class AudioSegmentDataset(Dataset):
    def __getitem__(self, i):
        fpath = self.file_paths[i]
-        y, sr = librosa.load(fpath, self.sr)
+        y, sr = librosa.load(fpath, sr=self.sr)
        y, _ = librosa.effects.trim(y, top_db=self.top_db)
        y = librosa.util.normalize(y)
        y = y.astype(np.float32)
@ -70,7 +70,7 @@ class AudioDataset(Dataset):
    def __getitem__(self, i):
        fpath = self.file_paths[i]
-        y, sr = librosa.load(fpath, self.sr)
+        y, sr = librosa.load(fpath, sr=self.sr)
        y, _ = librosa.effects.trim(y, top_db=self.top_db)
        y = librosa.util.normalize(y)
        y = y.astype(np.float32)
--- a/paddlespeech/t2s/exps/wavernn/synthesize.py
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@ -31,7 +31,7 @@ from paddlespeech.t2s.models.wavernn import WaveRNN
 def main():
    parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")
-    parser.add_argument("--config", type=str, help="GANVocoder config file.")
+    parser.add_argument("--config", type=str, help="Vocoder config file.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument("--test-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@ -179,7 +179,7 @@ def train_sp(args, config):
 def main():
    # parse args and config and redirect to train_sp
-    parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
+    parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")