[tts]Update mb melgan (#1272)

* update mb melgan

* update mb melgan, test=tts
Author: TianYuan (committed via GitHub)
parent 5125ac8912
commit 680eac02b9

@@ -49,7 +49,7 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe
 Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
 Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
 Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
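For readers who want to try the renamed CSMSC checkpoint directly, a minimal download-and-unpack sketch is below. It only uses the Python standard library and the URL from the table above; the expected archive contents (default.yaml, feats_stats.npy, snapshot_iter_1000000.pdz) are the ones listed later in this commit.

```python
# Minimal sketch: fetch and unpack the renamed CSMSC MB MelGAN checkpoint.
# Only the Python standard library is used; the URL is the one in the table above.
import urllib.request
import zipfile

URL = ("https://paddlespeech.bj.bcebos.com/Parakeet/released_models/"
       "mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip")

zip_path, _ = urllib.request.urlretrieve(URL, "mb_melgan_csmsc_ckpt_0.1.1.zip")
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(".")
    # Expected members: default.yaml, feats_stats.npy, snapshot_iter_1000000.pdz
    print(zf.namelist())
```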

@@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/feats_stats.npy \
         --voc=mb_melgan_csmsc \
-        --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
-        --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
-        --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \

@@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
-        --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
-        --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
-        --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
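Both synthesis scripts above now point `--voc_config`, `--voc_ckpt`, and `--voc_stat` at the default 1M-step CSMSC checkpoint instead of the 2M-step finetuned one. A small, hypothetical pre-flight check such as the sketch below can catch a stale unpacked directory before the shell script fails mid-run; the file names are exactly the ones referenced by the new flags.

```python
# Hypothetical pre-flight check for the paths referenced by the updated flags.
from pathlib import Path

voc_dir = Path("mb_melgan_csmsc_ckpt_0.1.1")
expected = ["default.yaml", "snapshot_iter_1000000.pdz", "feats_stats.npy"]

missing = [name for name in expected if not (voc_dir / name).exists()]
if missing:
    raise FileNotFoundError(
        f"{voc_dir} is missing {missing}; re-download and unzip the checkpoint")
print("vocoder checkpoint files look complete")
```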

@@ -152,22 +152,22 @@ TODO:
 The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set).
 
 ## Pretrained Models
-The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
+The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip).
 
 The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
 
-The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip)
+The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
 
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------:
-default| 1(gpu) x 1000000| ——|—— |—— |—— | ——|
+default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777|
 finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 |
 
 Multi Band MelGAN checkpoint contains files listed below.
 
 ```text
-mb_melgan_baker_ckpt_0.5
+mb_melgan_csmsc_ckpt_0.1.1
 ├── default.yaml               # default config used to train multi band melgan
 ├── feats_stats.npy            # statistics used to normalize spectrogram when training multi band melgan
 └── snapshot_iter_1000000.pdz  # generator parameters of multi band melgan
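The `default` row of the evaluation table now carries real loss values instead of placeholders, and the archive keeps the same three files as before, only under the new directory name. A rough inspection sketch is below; it assumes the zip has been extracted locally and that `yaml` and `yacs` are available (both are regular PaddleSpeech dependencies), and it loads the weights with the same `paddle.load` call that appears in the ASR executor hunk further down.

```python
# Rough sketch: inspect the unzipped checkpoint directory.
# Assumes mb_melgan_csmsc_ckpt_0.1.1/ sits next to this script.
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

ckpt_dir = "mb_melgan_csmsc_ckpt_0.1.1"

with open(f"{ckpt_dir}/default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))  # training config shipped with the release

stats = np.load(f"{ckpt_dir}/feats_stats.npy")  # spectrogram normalization statistics
state = paddle.load(f"{ckpt_dir}/snapshot_iter_1000000.pdz")  # generator parameters

print(sorted(config.keys()))
print("feats_stats shape:", stats.shape)
print("checkpoint entries:", list(state.keys()))  # assumed to be a dict-like snapshot
```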

@@ -211,7 +211,6 @@ class ASRExecutor(BaseExecutor):
             model_dict = paddle.load(self.ckpt_path)
             self.model.set_state_dict(model_dict)
-
     def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
         """
         Input preprocess and return paddle.Tensor stored in self.input.

@@ -168,13 +168,13 @@ pretrained_models = {
     # mb_melgan
     "mb_melgan_csmsc-zh": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip',
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
         'md5':
-        'b69322ab4ea766d955bd3d9af7dc5f2d',
+        'ee5f0604e20091f0d495b6ec4618b90d',
         'config':
-        'finetune.yaml',
+        'default.yaml',
         'ckpt':
-        'snapshot_iter_2000000.pdz',
+        'snapshot_iter_1000000.pdz',
         'speech_stats':
         'feats_stats.npy',
     },
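The CLI registry entry now points at the 1M-step default checkpoint, with the matching md5 and file names. For a manually downloaded archive, a stand-alone check against the registered digest might look like the sketch below; the hashing logic here is illustrative, the actual CLI has its own download-and-verify path.

```python
# Illustrative md5 check for a manually downloaded checkpoint zip.
# The expected digest is the one registered above for "mb_melgan_csmsc-zh".
import hashlib

EXPECTED_MD5 = "ee5f0604e20091f0d495b6ec4618b90d"

def md5sum(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

actual = md5sum("mb_melgan_csmsc_ckpt_0.1.1.zip")
assert actual == EXPECTED_MD5, f"md5 mismatch: {actual}"
print("checkpoint archive verified")
```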

@@ -54,6 +54,7 @@ def speedyspeech_single_spk_batch_fn(examples):
     }
     return batch
 
+
 def speedyspeech_multi_spk_batch_fn(examples):
     # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
     phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
@@ -95,6 +96,7 @@ def speedyspeech_multi_spk_batch_fn(examples):
     batch["spk_id"] = spk_id
     return batch
 
+
 def fastspeech2_single_spk_batch_fn(examples):
     # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"]
     text = [np.array(item["text"], dtype=np.int64) for item in examples]

@@ -13,7 +13,6 @@
 # limitations under the License.
 import argparse
 import re
-import os
 from concurrent.futures import ThreadPoolExecutor
 from operator import itemgetter
 from pathlib import Path
@@ -32,8 +31,9 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank
 from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
 from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
 from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones
-from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
 from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
 
+
 def process_sentence(config: Dict[str, Any],
                      fp: Path,

@@ -27,8 +27,8 @@ from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from yacs.config import CfgNode
 
-from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn
 from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.models.speedyspeech import SpeedySpeech
 from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator
@@ -58,7 +58,9 @@ def train_sp(args, config):
         f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
     )
 
-    fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
+    fields = [
+        "phones", "tones", "num_phones", "num_frames", "feats", "durations"
+    ]
 
     spk_num = None
     if args.speaker_dict is not None:
@@ -137,7 +139,10 @@ def train_sp(args, config):
     print("tone_size:", tone_size)
 
     model = SpeedySpeech(
-        vocab_size=vocab_size, tone_size=tone_size, spk_num=spk_num, **config["model"])
+        vocab_size=vocab_size,
+        tone_size=tone_size,
+        spk_num=spk_num,
+        **config["model"])
     if world_size > 1:
         model = DataParallel(model)
     print("model done!")

@@ -14,7 +14,7 @@
 import numpy as np
 import paddle
 from paddle import nn
-import paddle.nn.functional as F
 
 from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
@@ -95,8 +95,13 @@ class TextEmbedding(nn.Layer):
 
 class SpeedySpeechEncoder(nn.Layer):
-    def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
-                 dilations, spk_num=None):
+    def __init__(self,
+                 vocab_size,
+                 tone_size,
+                 hidden_size,
+                 kernel_size,
+                 dilations,
+                 spk_num=None):
         super().__init__()
         self.embedding = TextEmbedding(
             vocab_size,
@@ -104,7 +109,7 @@ class SpeedySpeechEncoder(nn.Layer):
             tone_size,
             padding_idx=0,
             tone_padding_idx=0)
         if spk_num:
             self.spk_emb = nn.Embedding(
                 num_embeddings=spk_num,
@@ -112,7 +117,7 @@ class SpeedySpeechEncoder(nn.Layer):
                 padding_idx=0)
         else:
             self.spk_emb = None
 
         self.prenet = nn.Sequential(
             nn.Linear(hidden_size, hidden_size),
             nn.ReLU(), )
@@ -171,19 +176,18 @@ class SpeedySpeechDecoder(nn.Layer):
 
 class SpeedySpeech(nn.Layer):
-    def __init__(
-            self,
-            vocab_size,
-            encoder_hidden_size,
-            encoder_kernel_size,
-            encoder_dilations,
-            duration_predictor_hidden_size,
-            decoder_hidden_size,
-            decoder_output_size,
-            decoder_kernel_size,
-            decoder_dilations,
-            tone_size=None,
-            spk_num=None):
+    def __init__(self,
+                 vocab_size,
+                 encoder_hidden_size,
+                 encoder_kernel_size,
+                 encoder_dilations,
+                 duration_predictor_hidden_size,
+                 decoder_hidden_size,
+                 decoder_output_size,
+                 decoder_kernel_size,
+                 decoder_dilations,
+                 tone_size=None,
+                 spk_num=None):
         super().__init__()
         encoder = SpeedySpeechEncoder(vocab_size, tone_size,
                                       encoder_hidden_size, encoder_kernel_size,
@@ -255,6 +259,7 @@ class SpeedySpeech(nn.Layer):
         decoded = self.decoder(encodings)
         return decoded[0]
 
+
 class SpeedySpeechInference(nn.Layer):
     def __init__(self, normalizer, speedyspeech_model):
         super().__init__()
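The constructor reformatting above makes the full keyword list of `SpeedySpeech` easy to read at a glance. For orientation only, here is a hedged instantiation sketch: the keyword names match the signature in this diff, but every value below is a placeholder and would normally come from `config["model"]` in the experiment's `default.yaml` plus the vocab/tone dictionaries, as the train.py hunk above does.

```python
# Hypothetical instantiation matching the reformatted SpeedySpeech signature.
# All sizes are placeholders; real values come from the experiment config.
from paddlespeech.t2s.models.speedyspeech import SpeedySpeech

model = SpeedySpeech(
    vocab_size=140,                                # placeholder: size of the phone id map
    encoder_hidden_size=128,                       # placeholder
    encoder_kernel_size=3,                         # placeholder
    encoder_dilations=[1, 2, 4, 1, 2, 4, 1, 1],    # placeholder dilation schedule
    duration_predictor_hidden_size=128,            # placeholder
    decoder_hidden_size=128,                       # placeholder
    decoder_output_size=80,                        # placeholder: mel bins
    decoder_kernel_size=3,                         # placeholder
    decoder_dilations=[1, 2, 4, 8, 1, 2, 4, 8],    # placeholder
    tone_size=10,                                  # placeholder: size of the tone id map
    spk_num=None)                                  # single-speaker CSMSC setup
```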

@@ -57,8 +57,7 @@ class SpeedySpeechUpdater(StandardUpdater):
             text=batch["phones"],
             tones=batch["tones"],
             durations=batch["durations"],
-            spk_id=spk_id
-        )
+            spk_id=spk_id)
 
         target_mel = batch["feats"]
         spec_mask = F.sequence_mask(
@@ -123,8 +122,7 @@ class SpeedySpeechEvaluator(StandardEvaluator):
             text=batch["phones"],
             tones=batch["tones"],
             durations=batch["durations"],
-            spk_id=spk_id
-        )
+            spk_id=spk_id)
 
         target_mel = batch["feats"]
         spec_mask = F.sequence_mask(
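Right after the call reformatted above, both the updater and the evaluator build `spec_mask` with `F.sequence_mask` to restrict the loss to valid frames. As a stand-alone illustration of what that mask looks like (assuming Paddle's `paddle.nn.functional.sequence_mask` applied to per-utterance frame counts; the counts below are made up):

```python
# Standalone illustration of the mask built via F.sequence_mask.
# Frame counts are made up; in the updater they come from the batch.
import paddle
import paddle.nn.functional as F

num_frames = paddle.to_tensor([3, 5])  # valid mel frames per utterance
spec_mask = F.sequence_mask(num_frames, dtype=paddle.get_default_dtype())
print(spec_mask)
# Tensor of shape [2, 5]:
# [[1., 1., 1., 0., 0.],
#  [1., 1., 1., 1., 1.]]
```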
