diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 9227a7c5..3b37e169 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -34,7 +34,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
 Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
-Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|94.95MB|
+Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
 TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
 SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
 FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
@@ -54,6 +54,8 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
 |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
+WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
+
 ### Voice Cloning
 
 Model Type | Dataset| Example Link | Pretrained Models
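The stage-4 blocks added to the `synthesize_e2e.sh` scripts below reference this release by its unpacked directory name (`wavernn_csmsc_ckpt_0.2.0/default.yaml` and friends), so the archive has to be fetched and extracted next to the script first. A minimal sketch using only the Python standard library; the URL comes from the table above, and the assumption that the zip unpacks to a top-level `wavernn_csmsc_ckpt_0.2.0/` directory is inferred from the `--voc_*` paths in the scripts:

```python
import urllib.request
import zipfile

# URL from the released-models table above.
URL = ("https://paddlespeech.bj.bcebos.com/Parakeet/released_models/"
       "wavernn/wavernn_csmsc_ckpt_0.2.0.zip")

archive = "wavernn_csmsc_ckpt_0.2.0.zip"
urllib.request.urlretrieve(URL, archive)  # download the pretrained checkpoint
with zipfile.ZipFile(archive) as zf:
    # Assumed layout: the zip unpacks to wavernn_csmsc_ckpt_0.2.0/
    # containing default.yaml, snapshot_iter_400000.pdz, feats_stats.npy.
    zf.extractall(".")
```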
diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh
index ea0f11d6..79bb9f83 100755
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
     python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
@@ -56,7 +56,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
     python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
@@ -77,7 +77,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
     python3 ${BIN_DIR}/../synthesize_e2e.py \
-        --am=fastspeech2_csmsc \
+        --am=tacotron2_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
@@ -91,3 +91,24 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt
 fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
+fi
\ No newline at end of file
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 0a4cf69b..35fcf251 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
 fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
+fi
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index d1fadf77..44356e4b 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -102,9 +102,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=wavernn_csmsc \
-        --voc_config=wavernn_test/default.yaml \
-        --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \
-        --voc_stat=wavernn_test/feats_stats.npy \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
diff --git a/paddleaudio/features/core.py b/paddleaudio/features/core.py
index d3c2e290..01925ec6 100644
--- a/paddleaudio/features/core.py
+++ b/paddleaudio/features/core.py
@@ -415,11 +415,11 @@ def mfcc(x,
                  **kwargs)
     # librosa mfcc:
-    spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
+    spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512,
         win_length=512,
         hop_length=320,
         n_mels=64,
         fmin=50)
-    b = librosa.feature.mfcc(x,
+    b = librosa.feature.mfcc(y=x,
         sr=16000,
         S=spect,
         n_mfcc=20,
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 988fd627..889cd349 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
     x = np.stack(
         [
             librosa.istft(
-                y=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                stft_matrix=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                 hop_length=n_shift,
                 win_length=win_length,
                 window=window,
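The `paddleaudio` and `paddlespeech.s2t` hunks above all chase the same upstream change: librosa 0.10 turned most feature-extraction arguments into keyword-only parameters, so positional calls such as `librosa.feature.melspectrogram(x, ...)` now raise a `TypeError`. A small self-contained sketch of the corrected call pattern, using a synthetic signal and the parameter values from the `mfcc` docstring above (not a PaddleSpeech API):

```python
import librosa
import numpy as np

x = np.random.randn(16000).astype(np.float32)  # 1 s of noise at 16 kHz

# librosa >= 0.10: y, sr, etc. must be passed by keyword.
spect = librosa.feature.melspectrogram(
    y=x, sr=16000, n_fft=512, win_length=512, hop_length=320,
    n_mels=64, fmin=50)
mfcc = librosa.feature.mfcc(y=x, sr=16000, S=spect, n_mfcc=20)

# istft's first parameter is stft_matrix, not y: it consumes a complex
# spectrogram rather than a waveform, so naming it keeps the call unambiguous.
D = librosa.stft(x, n_fft=512, hop_length=320, win_length=512)
x_rec = librosa.istft(stft_matrix=D, hop_length=320, win_length=512)
```

Passing `stft_matrix` by keyword is also what makes the rename in `spectrogram.py` above necessary: the old `y=` keyword never matched `istft`'s parameter name.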
diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py
index ab9a45d3..59ea8c87 100644
--- a/paddlespeech/t2s/audio/audio.py
+++ b/paddlespeech/t2s/audio/audio.py
@@ -53,8 +53,8 @@ class AudioProcessor(object):
 
     def _create_mel_filter(self):
         mel_filter = librosa.filters.mel(
-            self.sample_rate,
-            self.n_fft,
+            sr=self.sample_rate,
+            n_fft=self.n_fft,
             n_mels=self.n_mels,
             fmin=self.fmin,
             fmax=self.fmax)
diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py
index d6fa3a84..122a35ae 100644
--- a/paddlespeech/t2s/datasets/common.py
+++ b/paddlespeech/t2s/datasets/common.py
@@ -38,7 +38,7 @@ class AudioSegmentDataset(Dataset):
 
     def __getitem__(self, i):
         fpath = self.file_paths[i]
-        y, sr = librosa.load(fpath, self.sr)
+        y, sr = librosa.load(fpath, sr=self.sr)
         y, _ = librosa.effects.trim(y, top_db=self.top_db)
         y = librosa.util.normalize(y)
         y = y.astype(np.float32)
@@ -70,7 +70,7 @@ class AudioDataset(Dataset):
 
     def __getitem__(self, i):
         fpath = self.file_paths[i]
-        y, sr = librosa.load(fpath, self.sr)
+        y, sr = librosa.load(fpath, sr=self.sr)
         y, _ = librosa.effects.trim(y, top_db=self.top_db)
         y = librosa.util.normalize(y)
         y = y.astype(np.float32)
diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py
index 61723e03..4357b282 100644
--- a/paddlespeech/t2s/exps/wavernn/synthesize.py
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@@ -31,7 +31,7 @@ from paddlespeech.t2s.models.wavernn import WaveRNN
 
 def main():
     parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")
-    parser.add_argument("--config", type=str, help="GANVocoder config file.")
+    parser.add_argument("--config", type=str, help="Vocoder config file.")
     parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
     parser.add_argument("--test-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
index aec745f7..c9a6ba01 100644
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -179,7 +179,7 @@ def train_sp(args, config):
 
 def main():
     # parse args and config and redirect to train_sp
-    parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
+    parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
     parser.add_argument(
         "--config", type=str, help="config file to overwrite default config.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
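`librosa.load` and `librosa.filters.mel` are covered by the same keyword-only migration, which is why the dataset and `AudioProcessor` hunks above switch to named arguments. A sketch of the corrected preprocessing path; the file name is a placeholder and the filter-bank values are typical TTS settings, not taken from any config in this patch:

```python
import librosa
import numpy as np

# Mirrors AudioSegmentDataset.__getitem__ after the fix: sr is
# keyword-only in librosa >= 0.10, so load(fpath, 24000) would fail.
y, sr = librosa.load("example.wav", sr=24000)  # placeholder path
y, _ = librosa.effects.trim(y, top_db=60)      # strip edge silence
y = librosa.util.normalize(y)                  # peak-normalize to [-1, 1]
y = y.astype(np.float32)

# librosa.filters.mel takes keyword-only arguments as well, hence the
# AudioProcessor change above. Values here are assumed, for illustration.
mel_filter = librosa.filters.mel(
    sr=sr, n_fft=2048, n_mels=80, fmin=0, fmax=sr / 2)
```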