update tacotron2 voice cloning in aishell3 with new tacotron2, test=tts (#1419)
parent a3ff7c0df1
commit 348a1a33bf
@ -0,0 +1,86 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate (Hz).
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples), i.e. 12.5 ms.
win_length: 1200   # Window length (samples), i.e. 50 ms.
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                       MODEL SETTING                     #
###########################################################
model: # keyword arguments for the selected model
    embed_dim: 512              # char or phn embedding dimension
    elayers: 1                  # number of blstm layers in encoder
    eunits: 512                 # number of blstm units
    econv_layers: 3             # number of convolutional layers in encoder
    econv_chans: 512            # number of channels in convolutional layer
    econv_filts: 5              # filter size of convolutional layer
    atype: location             # attention function type
    adim: 512                   # attention dimension
    aconv_chans: 32             # number of channels in convolutional layer of attention
    aconv_filts: 15             # filter size of convolutional layer of attention
    cumulate_att_w: True        # whether to cumulate attention weight
    dlayers: 2                  # number of lstm layers in decoder
    dunits: 1024                # number of lstm units in decoder
    prenet_layers: 2            # number of layers in prenet
    prenet_units: 256           # number of units in prenet
    postnet_layers: 5           # number of layers in postnet
    postnet_chans: 512          # number of channels in postnet
    postnet_filts: 5            # filter size of postnet layer
    output_activation: null     # activation function for the final output
    use_batch_norm: True        # whether to use batch normalization in encoder
    use_concate: True           # whether to concatenate encoder embedding with decoder outputs
    use_residual: False         # whether to use residual connection in encoder
    dropout_rate: 0.5           # dropout rate
    zoneout_rate: 0.1           # zoneout rate
    reduction_factor: 1         # reduction factor
    spk_embed_dim: 256          # speaker embedding dimension
    spk_embed_integration_type: concat # how to integrate speaker embedding


###########################################################
#                      UPDATER SETTING                    #
###########################################################
updater:
    use_masking: True             # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 5.0           # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: True    # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4   # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0  # strength of guided attention loss


##########################################################
#                     OPTIMIZER SETTING                  #
##########################################################
optimizer:
    optim: adam                 # optimizer type
    learning_rate: 1.0e-03      # learning rate
    epsilon: 1.0e-06            # epsilon
    weight_decay: 0.0           # weight decay coefficient

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 42
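For reference, the frame geometry defined above maps onto a mel spectrogram computation roughly like the following (a minimal sketch using librosa; the extractor actually invoked by preprocess.py may differ in details such as power/normalization):

import librosa

# Illustrative only: reproduces the frame geometry from the config above.
wav, _ = librosa.load("sample.wav", sr=24000)   # fs: 24000
mel = librosa.feature.melspectrogram(
    y=wav,
    sr=24000,
    n_fft=2048,        # n_fft
    hop_length=300,    # n_shift: 300 / 24000 = 12.5 ms per frame
    win_length=1200,   # win_length: 1200 / 24000 = 50 ms window
    window="hann",
    n_mels=80,
    fmin=80,
    fmax=7600)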
@ -1,36 +1,72 @@
#!/bin/bash

stage=0
stage=3
stop_stage=100

input=$1
preprocess_path=$2
alignment=$3
ge2e_ckpt_path=$4
config_path=$1
ge2e_ckpt_path=$2

# gen speaker embedding
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
        --input=${input}/wav \
        --output=${preprocess_path}/embed \
        --input=~/datasets/data_aishell3/train/wav/ \
        --output=dump/embed \
        --checkpoint_path=${ge2e_ckpt_path}
fi

# copied from tts3/preprocess
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Process wav ..."
    python3 ${BIN_DIR}/process_wav.py \
        --input=${input}/wav \
        --output=${preprocess_path}/normalized_wav \
        --alignment=${alignment}
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/preprocess_transcription.py \
        --input=${input} \
        --output=${preprocess_path}
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --spk_emb_dir=dump/embed
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/extract_mel.py \
        --input=${preprocess_path}/normalized_wav \
        --output=${preprocess_path}/mel
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # normalize and convert phone to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
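The compute_statistics.py step in stage 3 amounts to a per-dimension mean and standard deviation over all training frames. A sketch of the idea, assuming each metadata.jsonl record carries a "speech" field pointing at a saved mel .npy file (the exact record layout here is an assumption, not the script's documented format):

import json

import numpy as np

frames = []
with open("dump/train/raw/metadata.jsonl") as f:
    for line in f:
        record = json.loads(line)
        frames.append(np.load(record["speech"]))   # (T, n_mels), assumed layout

all_frames = np.concatenate(frames, axis=0)
mean, std = all_frames.mean(axis=0), all_frames.std(axis=0)
# Note that dev and test are later normalized with these *training* statistics.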
@ -0,0 +1,22 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --am=tacotron2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=${train_output_path}/test \
    --phones_dict=dump/phone_id_map.txt \
    --speaker_dict=dump/speaker_id_map.txt \
    --voice-cloning=True
@ -1,9 +1,13 @@
#!/bin/bash

preprocess_path=$1
config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --ngpu=1
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --voice-cloning=True
@ -1,14 +1,24 @@
#!/bin/bash

ge2e_params_path=$1
tacotron2_params_path=$2
waveflow_params_path=$3
vc_input=$4
vc_output=$5
config_path=$1
train_output_path=$2
ckpt_name=$3
ge2e_params_path=$4
ref_audio_dir=$5

python3 ${BIN_DIR}/voice_cloning.py \
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../voice_cloning.py \
    --am=tacotron2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --ge2e_params_path=${ge2e_params_path} \
    --tacotron2_params_path=${tacotron2_params_path} \
    --waveflow_params_path=${waveflow_params_path} \
    --input-dir=${vc_input} \
    --output-dir=${vc_output}
    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
    --input-dir=${ref_audio_dir} \
    --output-dir=${train_output_path}/vc_syn \
    --phones-dict=dump/phone_id_map.txt
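Conceptually, the cloning pipeline this script wires up is: a GE2E speaker encoder turns each reference recording into a fixed-size utterance embedding, Tacotron2 synthesizes a spectrogram for the given text conditioned on that embedding, and the Parallel WaveGAN vocoder renders the waveform. In pseudocode (the function names below are hypothetical stand-ins, not the actual PaddleSpeech API):

# Hypothetical names; the real entry point is the voice_cloning.py invoked above.
spk_emb = ge2e_encoder(ref_wav)              # (256,), matches spk_embed_dim in the config
mel = tacotron2(phone_ids, spk_emb=spk_emb)  # spectrogram in the reference speaker's voice
wav = pwgan_vocoder(mel)                     # waveform via Parallel WaveGAN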
@ -1,13 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,75 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=64,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        fmax=8000,  # Hz, max frequency when converting to mel
        fmin=0,  # Hz, min frequency when converting to mel
        n_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))

_C.model = CN(
    dict(
        vocab_size=37,  # set this according to the frontend's vocab size
        n_tones=None,
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        d_global_condition=None,
        use_stop_token=True,  # whether to use a binary classifier to predict when to stop
        use_guided_attention_loss=False,  # whether to use guided attention loss
        guided_attention_loss_sigma=0.2  # sigma in guided attention loss
    ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip.
        plot_interval=1000,  # plot attention and spectrogram
        valid_interval=1000,  # validation
        save_interval=1000,  # checkpoint
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
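The guided_attention_loss_sigma entry above refers to the soft diagonal attention penalty of Tachibana et al. (2017). A common formulation, sketched here for reference (the loss actually implemented in Tacotron2Loss may differ in detail), weights attention mass by its distance from the diagonal:

import numpy as np

def guided_attention_weights(N: int, T: int, sigma: float = 0.2) -> np.ndarray:
    """W[n, t] = 1 - exp(-((n/N - t/T)^2) / (2 * sigma^2))."""
    grid_n = np.arange(N)[:, None] / N   # normalized encoder positions
    grid_t = np.arange(T)[None, :] / T   # normalized decoder positions
    return 1.0 - np.exp(-((grid_n - grid_t) ** 2) / (2.0 * sigma ** 2))

# The loss is then mean(W * A) over attention matrix A, which pushes the
# alignment toward the diagonal; a smaller sigma enforces a tighter band.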
@ -1,91 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path

import numpy as np
from paddle.io import Dataset

from paddlespeech.t2s.data.batch import batch_spec
from paddlespeech.t2s.data.batch import batch_text_id


class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        records = []
        with open(self.root / "metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        for mel_name, text, ids in metadata:
            mel_name = self.root / "mel" / (mel_name + ".npy")
            records.append((mel_name, text, ids))
        self.records = records

    def __getitem__(self, i):
        mel_name, _, ids = self.records[i]
        mel = np.load(mel_name)
        return ids, mel

    def __len__(self):
        return len(self.records)


class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_idx=0, padding_value=0., padding_stop_token=1.0):
        self.padding_idx = padding_idx
        self.padding_value = padding_value
        self.padding_stop_token = padding_stop_token

    def __call__(self, examples):
        texts = []
        mels = []
        text_lens = []
        mel_lens = []

        for data in examples:
            text, mel = data
            text = np.array(text, dtype=np.int64)
            text_lens.append(len(text))
            mels.append(mel)
            texts.append(text)
            mel_lens.append(mel.shape[1])

        # Sort by text_len in descending order
        texts = [
            i for i, _ in sorted(
                zip(texts, text_lens), key=lambda x: x[1], reverse=True)
        ]
        mels = [
            i for i, _ in sorted(
                zip(mels, text_lens), key=lambda x: x[1], reverse=True)
        ]

        mel_lens = [
            i for i, _ in sorted(
                zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
        ]

        mel_lens = np.array(mel_lens, dtype=np.int64)
        text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64)

        # Pad sequences to the largest length in the batch
        texts, _ = batch_text_id(texts, pad_id=self.padding_idx)
        mels, _ = batch_spec(mels, pad_value=self.padding_value)
        mels = np.transpose(mels, axes=(0, 2, 1))

        return texts, mels, text_lens, mel_lens
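A usage sketch (the dataset path is hypothetical): the collator sorts each batch by text length in descending order before padding, which is the layout packed RNN kernels expect.

from paddle.io import DataLoader

# LJSpeech and LJSpeechCollector as defined above; path is illustrative.
dataset = LJSpeech("~/datasets/ljspeech/processed")
loader = DataLoader(dataset, batch_size=32, collate_fn=LJSpeechCollector())
texts, mels, text_lens, mel_lens = next(iter(loader))
# text_lens is sorted descending; texts, mels, mel_lens follow the same order.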
@ -1,98 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import pickle
from pathlib import Path

import numpy as np
import tqdm

from paddlespeech.t2s.audio import AudioProcessor
from paddlespeech.t2s.audio import LogMagnitude
from paddlespeech.t2s.datasets import LJSpeechMetaData
from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.frontend import EnglishCharacter


def create_dataset(config, source_path, target_path, verbose=False):
    # create output dir
    target_path = Path(target_path).expanduser()
    mel_path = target_path / "mel"
    os.makedirs(mel_path, exist_ok=True)

    meta_data = LJSpeechMetaData(source_path)
    frontend = EnglishCharacter()
    processor = AudioProcessor(
        sample_rate=config.data.sample_rate,
        n_fft=config.data.n_fft,
        n_mels=config.data.n_mels,
        win_length=config.data.win_length,
        hop_length=config.data.hop_length,
        fmax=config.data.fmax,
        fmin=config.data.fmin)
    normalizer = LogMagnitude()

    records = []
    for (fname, text, _) in tqdm.tqdm(meta_data):
        wav = processor.read_wav(fname)
        mel = processor.mel_spectrogram(wav)
        mel = normalizer.transform(mel)
        ids = frontend(text)
        mel_name = os.path.splitext(os.path.basename(fname))[0]

        # save mel spectrogram
        records.append((mel_name, text, ids))
        np.save(mel_path / mel_name, mel)
    if verbose:
        print("save mel spectrograms into {}".format(mel_path))

    # save meta data as pickle archive
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path / "metadata.pkl"))

    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config.data)

    create_dataset(config, args.input, args.output, args.verbose)
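LogMagnitude is constructed without arguments here and with a 1e-5 floor elsewhere in this PR; judging from the name, the transform is log compression with a small floor for numerical stability. A guess at the behaviour, not the actual implementation in paddlespeech.t2s.audio:

import numpy as np

def log_magnitude(spec: np.ndarray, eps: float = 1e-5) -> np.ndarray:
    # Clamp before the log so that silent frames do not produce -inf.
    return np.log(np.maximum(spec, eps))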
File diff suppressed because one or more lines are too long
@ -1,103 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import numpy as np
import paddle
from matplotlib import pyplot as plt

from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.frontend import EnglishCharacter
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.utils import display


def main(config, args):
    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")

    # model
    frontend = EnglishCharacter()
    model = Tacotron2.from_pretrained(config, args.checkpoint_path)
    model.eval()

    # inputs
    input_path = Path(args.input).expanduser()
    sentences = []
    with open(input_path, "rt") as f:
        for line in f:
            line_list = line.strip().split()
            utt_id = line_list[0]
            sentence = " ".join(line_list[1:])
            sentences.append((utt_id, sentence))

    if args.output is None:
        output_dir = input_path.parent / "synthesis"
    else:
        output_dir = Path(args.output).expanduser()
    output_dir.mkdir(exist_ok=True)

    for i, sentence in enumerate(sentences):
        sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)
        outputs = model.infer(sentence)
        mel_output = outputs["mel_outputs_postnet"][0].numpy().T
        alignment = outputs["alignments"][0].numpy().T

        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        display.plot_alignment(alignment)
        plt.savefig(str(output_dir / f"sentence_{i}.png"))
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /
                                                   f"sentence_{i}.npy"))


if __name__ == "__main__":
    config = get_cfg_defaults()

    parser = argparse.ArgumentParser(
        description="generate mel spectrogram with Tacotron2.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument("--input", type=str, help="path of the text sentences")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
@ -1,220 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict

import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.t2s.data import dataset
from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeech
from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeechCollector
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
from paddlespeech.t2s.training.cli import default_argument_parser
from paddlespeech.t2s.training.experiment import ExperimentBase
from paddlespeech.t2s.utils import display
from paddlespeech.t2s.utils import mp_tools


class Experiment(ExperimentBase):
    def compute_losses(self, inputs, outputs):
        texts, mel_targets, plens, slens = inputs

        mel_outputs = outputs["mel_output"]
        mel_outputs_postnet = outputs["mel_outputs_postnet"]
        attention_weight = outputs["alignments"]
        if self.config.model.use_stop_token:
            stop_logits = outputs["stop_logits"]
        else:
            stop_logits = None

        losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
                                attention_weight, slens, plens, stop_logits)
        return losses

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        texts, mels, text_lens, output_lens = batch
        outputs = self.model(texts, text_lens, mels, output_lens)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        valid_losses = defaultdict(list)
        for i, batch in enumerate(self.valid_loader):
            texts, mels, text_lens, output_lens = batch
            outputs = self.model(texts, text_lens, mels, output_lens)
            losses = self.compute_losses(batch, outputs)
            for k, v in losses.items():
                valid_losses[k].append(float(v))

            attention_weights = outputs["alignments"]
            self.visualizer.add_figure(
                f"valid_sentence_{i}_alignments",
                display.plot_alignment(attention_weights[0].numpy().T),
                self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_target_spectrogram",
                display.plot_spectrogram(mels[0].numpy().T), self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_predicted_spectrogram",
                display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
                                         .numpy().T), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

        # logging
        msg = "Valid: "
        msg += "step: {}, ".format(self.iteration)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in valid_losses.items())
        self.logger.info(msg)

        for k, v in valid_losses.items():
            self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)

    def setup_model(self):
        config = self.config
        model = Tacotron2(
            vocab_size=config.model.vocab_size,
            d_mels=config.data.n_mels,
            d_encoder=config.model.d_encoder,
            encoder_conv_layers=config.model.encoder_conv_layers,
            encoder_kernel_size=config.model.encoder_kernel_size,
            d_prenet=config.model.d_prenet,
            d_attention_rnn=config.model.d_attention_rnn,
            d_decoder_rnn=config.model.d_decoder_rnn,
            attention_filters=config.model.attention_filters,
            attention_kernel_size=config.model.attention_kernel_size,
            d_attention=config.model.d_attention,
            d_postnet=config.model.d_postnet,
            postnet_kernel_size=config.model.postnet_kernel_size,
            postnet_conv_layers=config.model.postnet_conv_layers,
            reduction_factor=config.model.reduction_factor,
            p_encoder_dropout=config.model.p_encoder_dropout,
            p_prenet_dropout=config.model.p_prenet_dropout,
            p_attention_dropout=config.model.p_attention_dropout,
            p_decoder_dropout=config.model.p_decoder_dropout,
            p_postnet_dropout=config.model.p_postnet_dropout,
            use_stop_token=config.model.use_stop_token)

        if self.parallel:
            model = paddle.DataParallel(model)

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            parameters=model.parameters(),
            weight_decay=paddle.regularizer.L2Decay(
                config.training.weight_decay),
            grad_clip=grad_clip)
        criterion = Tacotron2Loss(
            use_stop_token_loss=config.model.use_stop_token,
            use_guided_attention_loss=config.model.use_guided_attention_loss,
            sigma=config.model.guided_attention_loss_sigma)
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def setup_dataloader(self):
        args = self.args
        config = self.config
        ljspeech_dataset = LJSpeech(args.data)

        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)
        batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

        if not self.parallel:
            self.train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(
                train_set, batch_sampler=sampler, collate_fn=batch_fn)

        self.valid_loader = DataLoader(
            valid_set,
            batch_size=config.data.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)


def main_sp(config, args):
    exp = Experiment(config, args)
    exp.setup()
    exp.resume_or_load()
    exp.run()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
@ -1,13 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,13 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,89 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path

import numpy as np
from paddle.io import Dataset

from paddlespeech.t2s.data import batch_spec
from paddlespeech.t2s.data import batch_text_id
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones
from paddlespeech.t2s.frontend import Vocab

voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
voc_tones = Vocab(sorted(list(_tones)))
print("vocab_tones:\n", voc_tones)


class AiShell3(Dataset):
    """Processed AiShell3 dataset."""

    def __init__(self, root):
        super().__init__()
        self.root = Path(root).expanduser()
        self.embed_dir = self.root / "embed"
        self.mel_dir = self.root / "mel"

        with open(self.root / "metadata.pickle", 'rb') as f:
            self.records = pickle.load(f)

    def __getitem__(self, index):
        metadatum = self.records[index]
        sentence_id = metadatum["sentence_id"]
        speaker_id = sentence_id[:7]
        phones = metadatum["phones"]
        tones = metadatum["tones"]
        phones = np.array(
            [voc_phones.lookup(item) for item in phones], dtype=np.int64)
        tones = np.array(
            [voc_tones.lookup(item) for item in tones], dtype=np.int64)
        mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
        embed = np.load(
            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
        return phones, tones, mel, embed

    def __len__(self):
        return len(self.records)


def collate_aishell3_examples(examples):
    phones, tones, mel, embed = list(zip(*examples))

    text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
    spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
    T_dec = np.max(spec_lengths)
    stop_tokens = (
        np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
    phones, _ = batch_text_id(phones)
    tones, _ = batch_text_id(tones)
    mel, _ = batch_spec(mel)
    mel = np.transpose(mel, (0, 2, 1))
    embed = np.stack(embed)
    # 7 fields
    # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
    return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens


if __name__ == "__main__":
    dataset = AiShell3("~/datasets/aishell3/train")
    example = dataset[0]

    examples = [dataset[i] for i in range(10)]
    batch = collate_aishell3_examples(examples)

    for field in batch:
        print(field.shape, field.dtype)
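The stop_tokens line above builds per-frame binary targets by broadcasting: frames at or beyond an utterance's true length are marked 1 (stop). A small worked example of the same expression:

import numpy as np

spec_lengths = np.array([3, 5])
T_dec = np.max(spec_lengths)                       # 5
stop_tokens = (np.arange(T_dec) >=
               np.expand_dims(spec_lengths, -1)).astype(np.float32)
# [[0. 0. 0. 1. 1.]   <- utterance of length 3: frames 3 and 4 are padding
#  [0. 0. 0. 0. 0.]]  <- utterance of length 5: no padding frames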
@ -1,42 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple

from pypinyin import lazy_pinyin
from pypinyin import Style

from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable


def convert_to_pinyin(text: str) -> List[str]:
    """Convert text into a list of syllables; characters that are not
    Chinese, and thus cannot be converted to pinyin, are split out as-is.
    """
    syllables = lazy_pinyin(
        text, style=Style.TONE3, neutral_tone_with_five=True)
    return syllables


def convert_sentence(text: str) -> List[Tuple[str]]:
    """Convert a sentence into two lists: phones and tones."""
    syllables = convert_to_pinyin(text)
    phones = []
    tones = []
    for syllable in syllables:
        p, t = split_syllable(syllable)
        phones.extend(p)
        tones.extend(t)

    return phones, tones
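For reference, Style.TONE3 with neutral_tone_with_five=True yields syllables with a trailing tone digit, which is exactly the form split_syllable consumes:

from pypinyin import Style, lazy_pinyin

print(lazy_pinyin("你好", style=Style.TONE3, neutral_tone_with_five=True))
# ['ni3', 'hao3'] -- the trailing digit is the tone; neutral tone becomes '5'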
@ -1,81 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=64,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        fmax=8000,  # Hz, max frequency when converting to mel
        fmin=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))

_C.model = CN(
    dict(
        vocab_size=70,
        n_tones=10,
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        # hidden size of the first rnn layer in tacotron2 decoder
        d_attention_rnn=1024,
        # hidden size of the second rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet

        # dropout probability of first rnn layer in decoder
        p_attention_dropout=0.1,
        # dropout probability of second rnn layer in decoder
        p_decoder_dropout=0.1,
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        guided_attention_loss_sigma=0.2,
        d_global_condition=256,

        # whether to use a classifier to predict stop probability
        use_stop_token=False,
        # whether to use guided attention loss in training
        use_guided_attention_loss=True, ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip.
        valid_interval=1000,  # validation
        save_interval=1000,  # checkpoint
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
@ -1,95 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path

import numpy as np
import tqdm

from paddlespeech.t2s.audio import AudioProcessor
from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude
from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults


def extract_mel(fname: Path,
                input_dir: Path,
                output_dir: Path,
                p: AudioProcessor,
                n: NormalizerBase):
    relative_path = fname.relative_to(input_dir)
    out_path = (output_dir / relative_path).with_suffix(".npy")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    wav = p.read_wav(fname)
    mel = p.mel_spectrogram(wav)
    mel = n.transform(mel)
    np.save(out_path, mel)


def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
    input_dir = Path(input_dir).expanduser()
    fnames = list(input_dir.rglob(f"*{extension}"))
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
                       config.hop_length, config.d_mels, config.fmin,
                       config.fmax)
    n = LogMagnitude(1e-5)

    func = partial(
        extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)

    with mp.Pool(16) as pool:
        list(
            tqdm.tqdm(
                pool.imap(func, fnames), total=len(fnames), unit="utterance"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
    )
    parser.add_argument(
        "--config",
        type=str,
        help="yaml config file to overwrite the default config")
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train/normalized_wav",
        help="path of the processed wav folder")
    parser.add_argument(
        "--output",
        type=str,
        default="~/datasets/aishell3/train/mel",
        help="path of the folder to save mel spectrograms")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    default_config = get_cfg_defaults()

    args = parser.parse_args()
    if args.config:
        default_config.merge_from_file(args.config)
    if args.opts:
        default_config.merge_from_list(args.opts)
    default_config.freeze()
    audio_config = default_config.data

    extract_mel_multispeaker(audio_config, args.input, args.output)
File diff suppressed because it is too large
@ -1,257 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import pickle
import re
from pathlib import Path

import tqdm
import yaml

zh_pattern = re.compile("[\u4e00-\u9fa5]")

_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}

_pauses = {'%', '$'}

_initials = {
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x',
    'zh', 'ch', 'sh', 'r', 'z', 'c', 's',
}

_finals = {
    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
    'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
    'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen',
    'uang', 'ueng', 'v', 've', 'van', 'ven', 'veng',
}

_ernized_symbol = {'&r'}

_specials = {'<pad>', '<unk>', '<s>', '</s>'}

_phones = _initials | _finals | _ernized_symbol | _specials | _pauses


def is_zh(word):
    global zh_pattern
    match = zh_pattern.search(word)
    return match is not None


def ernized(syllable):
    return syllable[:2] != "er" and syllable[-2] == 'r'


def convert(syllable):
    # expansion of o -> uo
    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
    # syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
    # expansion for iong, ong
    syllable = syllable.replace("iong", "veng").replace("ong", "ueng")

    # expansion for ing, in
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")

    # expansion for un, ui, iu
    syllable = syllable.replace("un", "uen").replace("ui",
                                                     "uei").replace("iu", "iou")

    # rule for variants of i
    syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
        .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
        .replace("ri", "riii")

    # rule for y preceding i, u
    syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")

    # rule for w
    syllable = syllable.replace("wu", "u").replace("w", "u")

    # rule for v following j, q, x
    syllable = syllable.replace("ju", "jv").replace("qu",
                                                    "qv").replace("xu", "xv")

    return syllable


def split_syllable(syllable: str):
    """Split a syllable in pinyin into a list of phones and a list of tones.
    Initials have no tone, represented by '0', while finals have tones from
    '1,2,3,4,5'.

    e.g.

    zhang1 -> ['zh', 'ang'], ['0', '1']
    """
    if syllable in _pauses:
        # syllable, tone
        return [syllable], ['0']

    tone = syllable[-1]
    syllable = convert(syllable[:-1])

    phones = []
    tones = []

    global _initials
    if syllable[:2] in _initials:
        phones.append(syllable[:2])
        tones.append('0')
        phones.append(syllable[2:])
        tones.append(tone)
    elif syllable[0] in _initials:
        phones.append(syllable[0])
        tones.append('0')
        phones.append(syllable[1:])
        tones.append(tone)
    else:
        phones.append(syllable)
        tones.append(tone)
    return phones, tones


def load_aishell3_transcription(line: str):
    sentence_id, pinyin, text = line.strip().split("|")
    syllables = pinyin.strip().split()

    results = []

    for syllable in syllables:
        if syllable in _pauses:
            results.append(syllable)
        elif not ernized(syllable):
            results.append(syllable)
        else:
            results.append(syllable[:-2] + syllable[-1])
            results.append('&r5')

    phones = []
    tones = []
    for syllable in results:
        p, t = split_syllable(syllable)
        phones.extend(p)
        tones.extend(t)
    for p in phones:
        assert p in _phones, p
    return {
        "sentence_id": sentence_id,
        "text": text,
        "syllables": results,
        "phones": phones,
        "tones": tones
    }


def process_aishell3(dataset_root, output_dir):
    dataset_root = Path(dataset_root).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    prosody_label_path = dataset_root / "label_train-set.txt"
    with open(prosody_label_path, 'rt') as f:
        lines = [line.strip() for line in f]

    records = lines[5:]

    processed_records = []
    for record in tqdm.tqdm(records):
        new_record = load_aishell3_transcription(record)
        processed_records.append(new_record)
        print(new_record)

    with open(output_dir / "metadata.pickle", 'wb') as f:
        pickle.dump(processed_records, f)

    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
        yaml.safe_dump(
            processed_records, f, default_flow_style=None, allow_unicode=True)

    print("metadata done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocess transcription of AiShell3 and save it in a compact file (yaml and pickle)."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train",
        help="path of the training dataset (contains a label_train-set.txt).")
    parser.add_argument(
        "--output",
        type=str,
        help="the directory to save the processed transcription."
        "If not provided, it would be the same as the input.")
    args = parser.parse_args()
    if args.output is None:
        args.output = args.input

    process_aishell3(args.input, args.output)
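Working through the rules above by hand (illustrative expected outputs, not captured program output):

split_syllable("zhang1")  # (['zh', 'ang'], ['0', '1'])   two-char initial 'zh' is split off
split_syllable("%")       # (['%'], ['0'])                pauses carry a dummy tone
split_syllable("yu2")     # (['v'], ['2'])                convert() rewrites "yu" -> "v"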
@ -1,94 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
from functools import partial
|
||||
from multiprocessing import Pool
|
||||
from pathlib import Path
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from praatio import textgrid
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def get_valid_part(fpath):
|
||||
f = textgrid.openTextgrid(fpath, includeEmptyIntervals=True)
|
||||
|
||||
start = 0
|
||||
phone_entry_list = f.tierDict['phones'].entryList
|
||||
first_entry = phone_entry_list[0]
|
||||
if first_entry.label == "sil":
|
||||
start = first_entry.end
|
||||
|
||||
last_entry = phone_entry_list[-1]
|
||||
if last_entry.label == "sp":
|
||||
end = last_entry.start
|
||||
else:
|
||||
end = last_entry.end
|
||||
return start, end
|
||||
|
||||
|
||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||
rel_path = fpath.relative_to(source_dir)
|
||||
opath = target_dir / rel_path
|
||||
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
|
||||
opath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
start, end = get_valid_part(apath)
|
||||
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
|
||||
normalized_wav = wav / np.max(wav) * 0.999
|
||||
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
|
||||
# print(f"{fpath} => {opath}")


def preprocess_aishell3(source_dir, target_dir, alignment_dir):
    source_dir = Path(source_dir).expanduser()
    target_dir = Path(target_dir).expanduser()
    alignment_dir = Path(alignment_dir).expanduser()

    wav_paths = list(source_dir.rglob("*.wav"))
    print(f"there are {len(wav_paths)} audio files in total")
    fx = partial(
        process_utterance,
        source_dir=source_dir,
        target_dir=target_dir,
        alignment_dir=alignment_dir)
    with Pool(16) as p:
        list(
            tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process audio in AISHELL-3: trim silence according to "
        "the alignment files generated by MFA, and normalize volume by peak.")
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train/wav",
        help="path of the original audio folder in aishell3.")
    parser.add_argument(
        "--output",
        type=str,
        default="~/datasets/aishell3/train/normalized_wav",
        help="path of the folder to save the processed audio files.")
    parser.add_argument(
        "--alignment",
        type=str,
        default="~/datasets/aishell3/train/alignment",
        help="path of the alignment files.")
    args = parser.parse_args()

    preprocess_aishell3(args.input, args.output, args.alignment)
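
A minimal sketch of trimming a single utterance with the functions above, assuming the script's default paths and that an MFA alignment exists for the chosen file:

from pathlib import Path

source = Path("~/datasets/aishell3/train/wav").expanduser()
target = Path("~/datasets/aishell3/train/normalized_wav").expanduser()
alignment = Path("~/datasets/aishell3/train/alignment").expanduser()

# pick any utterance; process_utterance mirrors what the Pool does per file
wav_path = next(source.rglob("*.wav"))
process_utterance(wav_path, source, target, alignment)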
@ -1,263 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict
from pathlib import Path

import numpy as np
import paddle
from matplotlib import pyplot as plt
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.t2s.data import dataset
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
from paddlespeech.t2s.training.cli import default_argument_parser
from paddlespeech.t2s.training.experiment import ExperimentBase
from paddlespeech.t2s.utils import display
from paddlespeech.t2s.utils import mp_tools


class Experiment(ExperimentBase):
    def compute_losses(self, inputs, outputs):
        texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs

        mel_outputs = outputs["mel_output"]
        mel_outputs_postnet = outputs["mel_outputs_postnet"]
        alignments = outputs["alignments"]

        losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
                                alignments, output_lens, text_lens)
        return losses

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
        outputs = self.model(
            texts,
            text_lens,
            mels,
            output_lens,
            tones=tones,
            global_condition=utterance_embeds)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for key, value in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{key}", value,
                                           self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        valid_losses = defaultdict(list)
        for i, batch in enumerate(self.valid_loader):
            texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
            outputs = self.model(
                texts,
                text_lens,
                mels,
                output_lens,
                tones=tones,
                global_condition=utterance_embeds)
            losses = self.compute_losses(batch, outputs)
            for key, value in losses.items():
                valid_losses[key].append(float(value))

            attention_weights = outputs["alignments"]
            self.visualizer.add_figure(
                f"valid_sentence_{i}_alignments",
                display.plot_alignment(attention_weights[0].numpy().T),
                self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_target_spectrogram",
                display.plot_spectrogram(mels[0].numpy().T), self.iteration)
            mel_pred = outputs['mel_outputs_postnet']
            self.visualizer.add_figure(
                f"valid_sentence_{i}_predicted_spectrogram",
                display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

        # logging
        msg = "Valid: "
        msg += "step: {}, ".format(self.iteration)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in valid_losses.items())
        self.logger.info(msg)

        for key, value in valid_losses.items():
            self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def eval(self):
        """Evaluation of Tacotron2 in autoregressive manner."""
        self.model.eval()
        mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
        mel_dir.mkdir(parents=True, exist_ok=True)
        for i, batch in enumerate(self.test_loader):
            texts, tones, mels, utterance_embeds, *_ = batch
            outputs = self.model.infer(
                texts, tones=tones, global_condition=utterance_embeds)

            display.plot_alignment(outputs["alignments"][0].numpy().T)
            plt.savefig(mel_dir / f"sentence_{i}.png")
            plt.close()
            np.save(mel_dir / f"sentence_{i}",
                    outputs["mel_outputs_postnet"][0].numpy().T)
            print(f"sentence_{i}")

    def setup_model(self):
        config = self.config
        model = Tacotron2(
            vocab_size=config.model.vocab_size,
            n_tones=config.model.n_tones,
            d_mels=config.data.d_mels,
            d_encoder=config.model.d_encoder,
            encoder_conv_layers=config.model.encoder_conv_layers,
            encoder_kernel_size=config.model.encoder_kernel_size,
            d_prenet=config.model.d_prenet,
            d_attention_rnn=config.model.d_attention_rnn,
            d_decoder_rnn=config.model.d_decoder_rnn,
            attention_filters=config.model.attention_filters,
            attention_kernel_size=config.model.attention_kernel_size,
            d_attention=config.model.d_attention,
            d_postnet=config.model.d_postnet,
            postnet_kernel_size=config.model.postnet_kernel_size,
            postnet_conv_layers=config.model.postnet_conv_layers,
            reduction_factor=config.model.reduction_factor,
            p_encoder_dropout=config.model.p_encoder_dropout,
            p_prenet_dropout=config.model.p_prenet_dropout,
            p_attention_dropout=config.model.p_attention_dropout,
            p_decoder_dropout=config.model.p_decoder_dropout,
            p_postnet_dropout=config.model.p_postnet_dropout,
            d_global_condition=config.model.d_global_condition,
            use_stop_token=config.model.use_stop_token, )

        if self.parallel:
            model = paddle.DataParallel(model)

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            parameters=model.parameters(),
            weight_decay=paddle.regularizer.L2Decay(
                config.training.weight_decay),
            grad_clip=grad_clip)
        criterion = Tacotron2Loss(
            use_stop_token_loss=config.model.use_stop_token,
            use_guided_attention_loss=config.model.use_guided_attention_loss,
            sigma=config.model.guided_attention_loss_sigma)
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def setup_dataloader(self):
        args = self.args
        config = self.config
        aishell3_dataset = AiShell3(args.data)

        valid_set, train_set = dataset.split(aishell3_dataset,
                                             config.data.valid_size)
        batch_fn = collate_aishell3_examples

        if not self.parallel:
            self.train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(
                train_set, batch_sampler=sampler, collate_fn=batch_fn)

        self.valid_loader = DataLoader(
            valid_set,
            batch_size=config.data.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)

        self.test_loader = DataLoader(
            valid_set,
            batch_size=1,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)


def main_sp(config, args):
    exp = Experiment(config, args)
    exp.setup()
    exp.resume_or_load()
    if not args.test:
        exp.run()
    else:
        exp.eval()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    parser.add_argument("--test", action="store_true")
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
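
For intuition about guided_attention_loss_sigma: a guided attention loss penalizes attention mass far from the text/mel diagonal with a soft weight matrix. A numpy sketch of the usual construction (not the Tacotron2Loss internals):

import numpy as np

def guided_attention_weight(text_len, mel_len, sigma=0.4):
    # ~0 on the diagonal, approaching 1 far away from it
    n = np.arange(text_len).reshape(-1, 1) / text_len
    t = np.arange(mel_len).reshape(1, -1) / mel_len
    return 1.0 - np.exp(-((n - t)**2) / (2.0 * sigma**2))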
@ -1,166 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path

import numpy as np
import paddle
import soundfile as sf
from matplotlib import pyplot as plt

from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder


def voice_cloning(args):
    # speaker encoder
    p = SpeakerVerificationPreprocessor(
        sampling_rate=16000,
        audio_norm_target_dBFS=-30,
        vad_window_length=30,
        vad_moving_average_width=8,
        vad_max_silence_length=6,
        mel_window_length=25,
        mel_window_step=10,
        n_mels=40,
        partial_n_frames=160,
        min_pad_coverage=0.75,
        partial_overlap_ratio=0.5)
    print("Audio Processor Done!")

    speaker_encoder = LSTMSpeakerEncoder(
        n_mels=40, num_layers=3, hidden_size=256, output_size=256)
    speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
    speaker_encoder.eval()
    print("GE2E Done!")

    synthesizer = Tacotron2(
        vocab_size=68,
        n_tones=10,
        d_mels=80,
        d_encoder=512,
        encoder_conv_layers=3,
        encoder_kernel_size=5,
        d_prenet=256,
        d_attention_rnn=1024,
        d_decoder_rnn=1024,
        attention_filters=32,
        attention_kernel_size=31,
        d_attention=128,
        d_postnet=512,
        postnet_kernel_size=5,
        postnet_conv_layers=5,
        reduction_factor=1,
        p_encoder_dropout=0.5,
        p_prenet_dropout=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,
        p_postnet_dropout=0.5,
        d_global_condition=256,
        use_stop_token=False, )
    synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
    synthesizer.eval()
    print("Tacotron2 Done!")

    # vocoder
    vocoder = ConditionalWaveFlow(
        upsample_factors=[16, 16],
        n_flows=8,
        n_layers=8,
        n_group=16,
        channels=128,
        n_mels=80,
        kernel_size=[3, 3])
    vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
    vocoder.eval()
    print("WaveFlow Done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    input_dir = Path(args.input_dir)

    # In AISHELL-3, % and $ mark the boundaries of prosodic words and prosodic
    # phrases; they roughly correspond to shorter and longer pauses, so % and $
    # can be used in the input text to adjust the prosody.
    # Note that the valid character set contains only Chinese characters plus
    # % and $, so an input sentence may contain only these characters.
    sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
    phones, tones = convert_sentence(sentence)
    phones = np.array(
        [voc_phones.lookup(item) for item in phones], dtype=np.int64)
    tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
    phones = paddle.to_tensor(phones).unsqueeze(0)
    tones = paddle.to_tensor(tones).unsqueeze(0)

    for name in os.listdir(input_dir):
        utt_id = name.split(".")[0]
        ref_audio_path = input_dir / name
        mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
        print("mel_sequences: ", mel_sequences.shape)
        with paddle.no_grad():
            embed = speaker_encoder.embed_utterance(
                paddle.to_tensor(mel_sequences))
        print("embed shape: ", embed.shape)
        utterance_embeds = paddle.unsqueeze(embed, 0)
        outputs = synthesizer.infer(
            phones, tones=tones, global_condition=utterance_embeds)
        mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
        alignment = outputs["alignments"][0].numpy().T
        display.plot_alignment(alignment)
        plt.savefig(str(output_dir / (utt_id + ".png")))

        with paddle.no_grad():
            wav = vocoder.infer(mel_input)
        wav = wav.numpy()[0]
        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)


def main():
    # parse args and run voice cloning
    parser = argparse.ArgumentParser(
        description="Voice cloning with GE2E, Tacotron2 and WaveFlow.")
    parser.add_argument(
        "--ge2e_params_path", type=str, help="ge2e params path.")
    parser.add_argument(
        "--tacotron2_params_path", type=str, help="tacotron2 params path.")
    parser.add_argument(
        "--waveflow_params_path", type=str, help="waveflow params path.")

    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")

    parser.add_argument(
        "--input-dir",
        type=str,
        help="input dir of *.wav files; they will be resampled to 16 kHz.")
    parser.add_argument("--output-dir", type=str, help="output dir.")

    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0!")

    voice_cloning(args)


if __name__ == "__main__":
    main()
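
voice_cloning can also be driven programmatically instead of through the CLI. A minimal sketch; every path below is a placeholder, not a checkpoint shipped with this repo:

from argparse import Namespace

args = Namespace(
    ge2e_params_path="ge2e.pdparams",            # placeholder
    tacotron2_params_path="tacotron2.pdparams",  # placeholder
    waveflow_params_path="waveflow.pdparams",    # placeholder
    ngpu=0,
    input_dir="ref_wavs",        # folder with reference *.wav files
    output_dir="cloned_output")  # placeholder
voice_cloning(args)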
File diff suppressed because it is too large