commit
a2e7ccac4b
@ -0,0 +1,10 @@
# [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/)

VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube.

VoxCeleb contains speech from speakers spanning a wide range of different ethnicities, accents, professions and ages.
All speaking face-tracks are captured "in the wild", with background chatter, laughter, overlapping speech, pose variation and different lighting conditions.
VoxCeleb consists of both audio and video. Each segment is at least 3 seconds long.

The dataset consists of two versions, VoxCeleb1 and VoxCeleb2. Each version has its own train/test split. For each we provide YouTube URLs, face detections and tracks, audio files, cropped face videos and speaker meta-data. There is no overlap between the two versions.

For more details, please refer to http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
@ -0,0 +1,188 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare VoxCeleb1 dataset

Create manifest files.
A manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.

Researchers should download the VoxCeleb1 dataset themselves
through the Google form to get the username & password, then unpack the data.
"""
import argparse
import codecs
import glob
import json
import os
import subprocess
from pathlib import Path

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip

# all the data will be downloaded into the current data/voxceleb directory by default
DATA_HOME = os.path.expanduser('.')

# if you use http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url,
# you need to get the username & password via the Google form

# if you use https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url,
# you need to use --no-check-certificate to connect to the download url

BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a"

# dev data
DEV_LIST = {
    "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96",
    "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020",
    "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512",
    "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19",
}
DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b"

# test data
TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"

# kaldi trial
# this trial file is organized by Kaldi according to the official file,
# and differs slightly from the official trial file veri_test2.txt
KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/voxceleb1/",
    type=str,
    help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")

args = parser.parse_args()
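
# Hypothetical invocation of this script (the script filename is assumed):
#   python3 voxceleb1.py --target_dir=./data/voxceleb1 --manifest_prefix=data/manifest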

def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_path = os.path.join(data_dir, "wav", "**", "*.wav")
    total_sec = 0.0
    total_text = 0.0
    total_num = 0
    speakers = set()
    for audio_path in glob.glob(data_path, recursive=True):
        audio_id = "-".join(audio_path.split("/")[-3:])
        utt2spk = audio_path.split("/")[-3]
        duration = soundfile.info(audio_path).duration
        text = ""
        json_lines.append(
            json.dumps(
                {
                    "utt": audio_id,
                    "utt2spk": str(utt2spk),
                    "feat": audio_path,
                    "feat_shape": (duration, ),
                    "text": text  # compatible with asr data format
                },
                ensure_ascii=False))

        total_sec += duration
        total_text += len(text)
        total_num += 1
        speakers.add(utt2spk)

    # data_dir_name refers to dev or test
    # voxceleb1 is given explicitly in the path
    data_dir_name = Path(data_dir).name
    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
        for line in json_lines:
            f.write(line + "\n")

    manifest_dir = os.path.dirname(manifest_path_prefix)
    meta_path = os.path.join(manifest_dir, "voxceleb1." +
                             data_dir_name) + ".meta"
    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
        print(f"{total_num} utts", file=f)
        print(f"{len(speakers)} speakers", file=f)
        print(f"{total_sec / (60 * 60)} h", file=f)
        print(f"{total_text} text", file=f)
        print(f"{total_text / total_sec} text/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(base_url, data_list, target_dir, manifest_path,
                    target_data):
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    # if the wav directory already exists, there is nothing to do
    if not os.path.exists(os.path.join(target_dir, "wav")):
        # download all dataset parts
        for zip_part in data_list.keys():
            download_url = " --no-check-certificate " + base_url + "/" + zip_part
            download(
                url=download_url,
                md5sum=data_list[zip_part],
                target_dir=target_dir)

        # concatenate all the parts into the target zip file
        all_target_part, target_name, target_md5sum = target_data.split()
        target_name = os.path.join(target_dir, target_name)
        if not os.path.exists(target_name):
            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
                                                    target_name)
            subprocess.call(pack_part_cmd, shell=True)

        # check the target zip file's md5sum
        if not check_md5sum(target_name, target_md5sum):
            raise RuntimeError("{} MD5 checksum failed".format(target_name))
        else:
            print("Checked {} md5sum successfully".format(target_name))

        # unzip the target zip file
        if target_name.endswith(".zip"):
            unzip(target_name, target_dir)

    # create the manifest file
    create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        base_url=BASE_URL,
        data_list=DEV_LIST,
        target_dir=os.path.join(args.target_dir, "dev"),
        manifest_path=args.manifest_prefix,
        target_data=DEV_TARGET_DATA)

    prepare_dataset(
        base_url=BASE_URL,
        data_list=TEST_LIST,
        target_dir=os.path.join(args.target_dir, "test"),
        manifest_path=args.manifest_prefix,
        target_data=TEST_TARGET_DATA)

    print("Manifest prepare done!")


if __name__ == '__main__':
    main()
@ -0,0 +1,86 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # Sampling rate.
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:   # keyword arguments for the selected model
    embed_dim: 512            # char or phn embedding dimension
    elayers: 1                # number of blstm layers in encoder
    eunits: 512               # number of blstm units
    econv_layers: 3           # number of convolutional layers in encoder
    econv_chans: 512          # number of channels in convolutional layer
    econv_filts: 5            # filter size of convolutional layer
    atype: location           # attention function type
    adim: 512                 # attention dimension
    aconv_chans: 32           # number of channels in convolutional layer of attention
    aconv_filts: 15           # filter size of convolutional layer of attention
    cumulate_att_w: True      # whether to cumulate attention weight
    dlayers: 2                # number of lstm layers in decoder
    dunits: 1024              # number of lstm units in decoder
    prenet_layers: 2          # number of layers in prenet
    prenet_units: 256         # number of units in prenet
    postnet_layers: 5         # number of layers in postnet
    postnet_chans: 512        # number of channels in postnet
    postnet_filts: 5          # filter size of postnet layer
    output_activation: null   # activation function for the final output
    use_batch_norm: True      # whether to use batch normalization in encoder
    use_concate: True         # whether to concatenate encoder embedding with decoder outputs
    use_residual: False       # whether to use residual connection in encoder
    dropout_rate: 0.5         # dropout rate
    zoneout_rate: 0.1         # zoneout rate
    reduction_factor: 1       # reduction factor
    spk_embed_dim: 256        # speaker embedding dimension
    spk_embed_integration_type: concat   # how to integrate speaker embedding


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True               # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 5.0             # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: True      # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4     # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0    # strength of guided attention loss


##########################################################
#                  OPTIMIZER SETTING                     #
##########################################################
optimizer:
    optim: adam             # optimizer type
    learning_rate: 1.0e-03  # learning rate
    epsilon: 1.0e-06        # epsilon
    weight_decay: 0.0       # weight decay coefficient

###########################################################
#                    TRAINING SETTING                     #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 42
@ -1,36 +1,72 @@
#!/bin/bash

stage=0
stage=3
stop_stage=100
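
# Each numbered block below runs when stage <= N <= stop_stage, so setting
# stage and stop_stage selects a contiguous range of pipeline steps to execute.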

input=$1
preprocess_path=$2
alignment=$3
ge2e_ckpt_path=$4
config_path=$1
ge2e_ckpt_path=$2

# gen speaker embedding
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
        --input=${input}/wav \
        --output=${preprocess_path}/embed \
        --input=~/datasets/data_aishell3/train/wav/ \
        --output=dump/embed \
        --checkpoint_path=${ge2e_ckpt_path}
fi

# copy from tts3/preprocess
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Process wav ..."
    python3 ${BIN_DIR}/process_wav.py \
        --input=${input}/wav \
        --output=${preprocess_path}/normalized_wav \
        --alignment=${alignment}
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./aishell3_alignment_tone \
        --output durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/preprocess_transcription.py \
        --input=${input} \
        --output=${preprocess_path}
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=aishell3 \
        --rootdir=~/datasets/data_aishell3/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True \
        --spk_emb_dir=dump/embed
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/extract_mel.py \
        --input=${preprocess_path}/normalized_wav \
        --output=${preprocess_path}/mel
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # normalize and convert phone to id, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi

@ -0,0 +1,22 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --am=tacotron2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=${train_output_path}/test \
    --phones_dict=dump/phone_id_map.txt \
    --speaker_dict=dump/speaker_id_map.txt \
    --voice-cloning=True
@ -1,9 +1,13 @@
#!/bin/bash

preprocess_path=$1
config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --ngpu=1
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --voice-cloning=True
@ -1,14 +1,24 @@
#!/bin/bash

ge2e_params_path=$1
tacotron2_params_path=$2
waveflow_params_path=$3
vc_input=$4
vc_output=$5
config_path=$1
train_output_path=$2
ckpt_name=$3
ge2e_params_path=$4
ref_audio_dir=$5

python3 ${BIN_DIR}/voice_cloning.py \
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../voice_cloning.py \
    --am=tacotron2_aishell3 \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_aishell3 \
    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
    --ge2e_params_path=${ge2e_params_path} \
    --tacotron2_params_path=${tacotron2_params_path} \
    --waveflow_params_path=${waveflow_params_path} \
    --input-dir=${vc_input} \
    --output-dir=${vc_output}
    --text="凯莫瑞安联合体的经济崩溃迫在眉睫。" \
    --input-dir=${ref_audio_dir} \
    --output-dir=${train_output_path}/vc_syn \
    --phones-dict=dump/phone_id_map.txt

@ -0,0 +1,51 @@
#!/bin/bash

train_output_path=$1

stage=0
stop_stage=0

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# style melgan
# style melgan's dygraph-to-static-graph conversion is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=style_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=tacotron2_csmsc \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
fi
@ -0,0 +1,67 @@

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # Sampling rate.
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
n_mels: 80         # Number of mel basis.
fmin: 80           # Minimum freq in mel basis calculation. (Hz)
fmax: 7600         # Maximum frequency in mel basis calculation. (Hz)
mu_law: True       # Recommended to suppress noise if using raw bits.


###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    rnn_dims: 512           # Hidden dims of RNN Layers.
    fc_dims: 512
    bits: 9                 # Bit depth of signal
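    # Note (added for clarity): with bits: 9 and mode: RAW below, the network
    # outputs a 2**9 = 512-way softmax over quantized sample values.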
    aux_context_window: 2   # Context window size for auxiliary feature.
                            # If set to 2, previous 2 and future 2 frames will be considered.
    aux_channels: 80        # Number of channels for auxiliary feature conv.
                            # Must be the same as num_mels.
    upsample_scales: [4, 5, 3, 5]   # Upsampling scales. The product of these must equal the hop size; same as pwgan here.
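    # Sanity check: 4 * 5 * 3 * 5 = 300, which matches n_shift above.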
    compute_dims: 128       # Dims of Conv1D in MelResNet.
    res_out_dims: 128       # Dims of output in MelResNet.
    res_blocks: 10          # Number of residual blocks.
    mode: RAW               # either 'RAW' (softmax on raw bits) or 'MOLD' (sample from a mixture of logistics)
inference:
    gen_batched: True       # whether to generate samples in batched mode
    target: 12000           # target number of samples to be generated in each batch entry
    overlap: 600            # number of samples for crossfading between batches


###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 64          # Batch size.
batch_max_steps: 4500   # Length of each audio in batch. Make sure dividable by hop_size.
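# e.g. with n_shift: 300, batch_max_steps: 4500 corresponds to 15 mel frames per batch item.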
num_workers: 2          # Number of workers in DataLoader.

###########################################################
#                   OPTIMIZER SETTING                     #
###########################################################
grad_clip: 4.0
learning_rate: 1.0e-4


###########################################################
#                    INTERVAL SETTING                     #
###########################################################

train_max_steps: 400000                 # Number of training steps.
save_interval_steps: 5000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.
gen_eval_samples_interval_steps: 5000   # the iteration interval of generating valid samples
generate_num: 5                         # number of samples to generate at each checkpoint

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_snapshots: 10   # max number of snapshots to keep while training
seed: 42            # random seed for paddle, random, and np.random
@ -0,0 +1,55 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy

    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
fi
@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test
@ -0,0 +1,13 @@
#!/bin/bash

config_path=$1
train_output_path=$2

FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=wavernn
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
@ -0,0 +1,30 @@
#!/bin/bash

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
test_input=dump/dump_gta_test
ckpt_name=snapshot_iter_100000.pdz

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
@ -1,89 +0,0 @@
# Tacotron2 with LJSpeech
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).

## Dataset
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).

```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize mels.
```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
### Model Training
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
                [--checkpoint_path CHECKPOINT_PATH] [--ngpu NGPU] [--opts ...]

optional arguments:
  -h, --help            show this help message and exit
  --config FILE         path of the config file to overwrite the default
                        config with.
  --data DATA_DIR       path to the dataset.
  --output OUTPUT_DIR   path to save checkpoint and logs.
  --checkpoint_path CHECKPOINT_PATH
                        path of the checkpoint to load
  --ngpu NGPU           if ngpu == 0, use cpu.
  --opts ...            options to overwrite --config file and the default
                        config, passing in KEY VALUE pairs
```

If you want to train on CPU, just set `--ngpu=0`.
If you want to train on multiple GPUs, just set `--ngpu` to the number of GPUs.
By default, training will be resumed from the latest checkpoint in `--output`. If you want to start a new training run, please use a new `${OUTPUTPATH}` with no checkpoint.
If you want to resume from another existing model, set `--checkpoint_path` to the checkpoint path you want to load.
**Note: The checkpoint path cannot contain the file extension.**

### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from a text list.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
                     [--input INPUT] [--output OUTPUT] [--ngpu NGPU]
                     [--opts ...] [-v]

generate mel spectrogram with TransformerTTS.

optional arguments:
  -h, --help            show this help message and exit
  --config FILE         extra config to overwrite the default config
  --checkpoint_path CHECKPOINT_PATH
                        path of the checkpoint to load.
  --input INPUT         path of the text sentences
  --output OUTPUT       path to save outputs
  --ngpu NGPU           if ngpu == 0, use cpu.
  --opts ...            options to overwrite --config file and the default
                        config, passing in KEY VALUE pairs
  -v, --verbose         print msg
```
**P.S.** You can use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder to synthesize mels into wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example.)

## Pretrained Models
Pretrained models can be downloaded from the links below. We provide 2 models with different configurations.

1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)

2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
@ -0,0 +1,87 @@
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB memory and it takes ~1 day to finish the
# training on a Titan V.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 22050          # Sampling rate.
n_fft: 1024        # FFT size (samples).
n_shift: 256       # Hop size (samples). 11.6ms
win_length: null   # Window length (samples).
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
n_mels: 80         # Number of mel basis.
fmin: 80           # Minimum freq in mel basis calculation. (Hz)
fmax: 7600         # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
#                      MODEL SETTING                      #
###########################################################
model:   # keyword arguments for the selected model
    embed_dim: 512            # char or phn embedding dimension
    elayers: 1                # number of blstm layers in encoder
    eunits: 512               # number of blstm units
    econv_layers: 3           # number of convolutional layers in encoder
    econv_chans: 512          # number of channels in convolutional layer
    econv_filts: 5            # filter size of convolutional layer
    atype: location           # attention function type
    adim: 512                 # attention dimension
    aconv_chans: 32           # number of channels in convolutional layer of attention
    aconv_filts: 15           # filter size of convolutional layer of attention
    cumulate_att_w: True      # whether to cumulate attention weight
    dlayers: 2                # number of lstm layers in decoder
    dunits: 1024              # number of lstm units in decoder
    prenet_layers: 2          # number of layers in prenet
    prenet_units: 256         # number of units in prenet
    postnet_layers: 5         # number of layers in postnet
    postnet_chans: 512        # number of channels in postnet
    postnet_filts: 5          # filter size of postnet layer
    output_activation: null   # activation function for the final output
    use_batch_norm: True      # whether to use batch normalization in encoder
    use_concate: True         # whether to concatenate encoder embedding with decoder outputs
    use_residual: False       # whether to use residual connection in encoder
    dropout_rate: 0.5         # dropout rate
    zoneout_rate: 0.1         # zoneout rate
    reduction_factor: 1       # reduction factor
    spk_embed_dim: null       # speaker embedding dimension


###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True               # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 5.0             # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: True      # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4     # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0    # strength of guided attention loss


##########################################################
#                  OPTIMIZER SETTING                     #
##########################################################
optimizer:
    optim: adam             # optimizer type
    learning_rate: 1.0e-03  # learning rate
    epsilon: 1.0e-06        # epsilon
    weight_decay: 0.0       # weight decay coefficient

###########################################################
#                    TRAINING SETTING                     #
###########################################################
max_epoch: 300
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 42
@ -1,8 +1,62 @@
#!/bin/bash

preprocess_path=$1
stage=0
stop_stage=100

python3 ${BIN_DIR}/preprocess.py \
    --input=~/datasets/LJSpeech-1.1 \
    --output=${preprocess_path} \
    -v \
config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./ljspeech_alignment \
        --output=durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=ljspeech \
        --rootdir=~/datasets/LJSpeech-1.1/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone to id, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi

@ -1,11 +1,20 @@
#!/bin/bash

train_output_path=$1
ckpt_name=$2
config_path=$1
train_output_path=$2
ckpt_name=$3

python3 ${BIN_DIR}/synthesize.py \
    --config=${train_output_path}/config.yaml \
    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
    --input=${BIN_DIR}/../sentences_en.txt \
    --output=${train_output_path}/test \
    --ngpu=1
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --am=tacotron2_ljspeech \
    --am_config=${config_path} \
    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
    --am_stat=dump/train/speech_stats.npy \
    --voc=pwgan_ljspeech \
    --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
    --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
    --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
    --test_metadata=dump/test/norm/metadata.jsonl \
    --output_dir=${train_output_path}/test \
    --phones_dict=dump/phone_id_map.txt

@ -1,9 +1,12 @@
#!/bin/bash

preprocess_path=$1
config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
@ -0,0 +1,51 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle


# x: [0, 2**bits - 1], return: [-1, 1]
def label_2_float(x, bits):
    return 2 * x / (2**bits - 1.) - 1.


# x: [-1, 1], return: [0, 2**bits - 1]
def float_2_label(x, bits):
    assert abs(x).max() <= 1.0
    x = (x + 1.) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


# x: [-1, 1], mu: 2**bits, return: [0, 2**bits - 1]
# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
# note that the input `mu` here is one greater than the mu in the link above
def encode_mu_law(x, mu):
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


# from_labels = True:
#   y: [0, 2**bits - 1], mu: 2**bits, return: [-1, 1]
# from_labels = False:
#   y: [-1, 1], return: [-1, 1]
def decode_mu_law(y, mu, from_labels=True):
    # TODO: get rid of log2 - makes no sense
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
    return x
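

# A small round-trip sketch (values assumed, with bits = 9 so mu = 512):
#   x = np.array([-0.5, 0.0, 0.5])
#   labels = encode_mu_law(x, mu=512)    # integer labels in [0, 511]
#   x_hat = decode_mu_law(paddle.to_tensor(labels), mu=512, from_labels=True)
#   # x_hat approximately recovers x, up to quantization error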
@ -1,75 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=64,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        fmax=8000,  # Hz, max frequency when converting to mel
        fmin=0,  # Hz, min frequency when converting to mel
        n_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))

_C.model = CN(
    dict(
        vocab_size=37,  # set this according to the frontend's vocab size
        n_tones=None,
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        d_attention_rnn=1024,  # hidden size of the first rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,  # hidden size of the second rnn layer in tacotron2 decoder
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet
        p_attention_dropout=0.1,  # dropout probability of first rnn layer in decoder
        p_decoder_dropout=0.1,  # dropout probability of second rnn layer in decoder
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        d_global_condition=None,
        use_stop_token=True,  # whether to use a binary classifier to predict when to stop
        use_guided_attention_loss=False,  # whether to use guided attention loss
        guided_attention_loss_sigma=0.2  # sigma in guided attention loss
    ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip.
        plot_interval=1000,  # plot attention and spectrogram
        valid_interval=1000,  # validation
        save_interval=1000,  # checkpoint
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
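

# Typical usage (this pattern appears in the exps scripts elsewhere in this diff):
#   config = get_cfg_defaults()
#   config.merge_from_file("custom.yaml")  # optional user override
#   config.merge_from_list(["data.batch_size", "16"])  # optional KEY VALUE overrides
#   config.freeze()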
@ -1,91 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path

import numpy as np
from paddle.io import Dataset

from paddlespeech.t2s.data.batch import batch_spec
from paddlespeech.t2s.data.batch import batch_text_id


class LJSpeech(Dataset):
    """A simple dataset adaptor for the processed ljspeech dataset."""

    def __init__(self, root):
        self.root = Path(root).expanduser()
        records = []
        with open(self.root / "metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        for mel_name, text, ids in metadata:
            mel_name = self.root / "mel" / (mel_name + ".npy")
            records.append((mel_name, text, ids))
        self.records = records

    def __getitem__(self, i):
        mel_name, _, ids = self.records[i]
        mel = np.load(mel_name)
        return ids, mel

    def __len__(self):
        return len(self.records)


class LJSpeechCollector(object):
    """A simple callable to batch LJSpeech examples."""

    def __init__(self, padding_idx=0, padding_value=0.,
                 padding_stop_token=1.0):
        self.padding_idx = padding_idx
        self.padding_value = padding_value
        self.padding_stop_token = padding_stop_token

    def __call__(self, examples):
        texts = []
        mels = []
        text_lens = []
        mel_lens = []

        for data in examples:
            text, mel = data
            text = np.array(text, dtype=np.int64)
            text_lens.append(len(text))
            mels.append(mel)
            texts.append(text)
            mel_lens.append(mel.shape[1])

        # Sort by text_len in descending order
        texts = [
            i for i, _ in sorted(
                zip(texts, text_lens), key=lambda x: x[1], reverse=True)
        ]
        mels = [
            i for i, _ in sorted(
                zip(mels, text_lens), key=lambda x: x[1], reverse=True)
        ]

        mel_lens = [
            i for i, _ in sorted(
                zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
        ]

        mel_lens = np.array(mel_lens, dtype=np.int64)
        text_lens = np.array(sorted(text_lens, reverse=True), dtype=np.int64)

        # Pad sequences to the largest length in the batch
        texts, _ = batch_text_id(texts, pad_id=self.padding_idx)
        mels, _ = batch_spec(mels, pad_value=self.padding_value)
        mels = np.transpose(mels, axes=(0, 2, 1))

        return texts, mels, text_lens, mel_lens
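
        # Expected shapes, derived from the code above, for a batch of B examples:
        #   texts: (B, max_text_len) int64, mels: (B, max_mel_len, n_mels) float32,
        #   text_lens: (B,) int64 (sorted descending), mel_lens: (B,) int64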
@ -1,98 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import pickle
from pathlib import Path

import numpy as np
import tqdm

from paddlespeech.t2s.audio import AudioProcessor
from paddlespeech.t2s.audio import LogMagnitude
from paddlespeech.t2s.datasets import LJSpeechMetaData
from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.frontend import EnglishCharacter


def create_dataset(config, source_path, target_path, verbose=False):
    # create output dir
    target_path = Path(target_path).expanduser()
    mel_path = target_path / "mel"
    os.makedirs(mel_path, exist_ok=True)

    meta_data = LJSpeechMetaData(source_path)
    frontend = EnglishCharacter()
    processor = AudioProcessor(
        sample_rate=config.data.sample_rate,
        n_fft=config.data.n_fft,
        n_mels=config.data.n_mels,
        win_length=config.data.win_length,
        hop_length=config.data.hop_length,
        fmax=config.data.fmax,
        fmin=config.data.fmin)
    normalizer = LogMagnitude()

    records = []
    for (fname, text, _) in tqdm.tqdm(meta_data):
        wav = processor.read_wav(fname)
        mel = processor.mel_spectrogram(wav)
        mel = normalizer.transform(mel)
        ids = frontend(text)
        mel_name = os.path.splitext(os.path.basename(fname))[0]

        # save mel spectrogram
        records.append((mel_name, text, ids))
        np.save(mel_path / mel_name, mel)
    if verbose:
        print("save mel spectrograms into {}".format(mel_path))

    # save meta data as pickle archive
    with open(target_path / "metadata.pkl", 'wb') as f:
        pickle.dump(records, f)
    if verbose:
        print("saved metadata into {}".format(target_path / "metadata.pkl"))

    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="create dataset")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--input", type=str, help="path of the ljspeech dataset")
    parser.add_argument(
        "--output", type=str, help="path to save output dataset")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    config = get_cfg_defaults()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config.data)

    create_dataset(config, args.input, args.output, args.verbose)
File diff suppressed because one or more lines are too long
@ -1,103 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import numpy as np
import paddle
from matplotlib import pyplot as plt

from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.frontend import EnglishCharacter
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.utils import display


def main(config, args):
    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    # model
    frontend = EnglishCharacter()
    model = Tacotron2.from_pretrained(config, args.checkpoint_path)
    model.eval()

    # inputs
    input_path = Path(args.input).expanduser()
    sentences = []
    with open(input_path, "rt") as f:
        for line in f:
            line_list = line.strip().split()
            utt_id = line_list[0]
            sentence = " ".join(line_list[1:])
            sentences.append((utt_id, sentence))
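            # Each line of the input file is expected to look like
            #   <utt_id> <sentence text ...>
            # (the first whitespace-separated token is the utterance id).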

    if args.output is None:
        output_dir = input_path.parent / "synthesis"
    else:
        output_dir = Path(args.output).expanduser()
    output_dir.mkdir(exist_ok=True)

    for i, sentence in enumerate(sentences):
        sentence = paddle.to_tensor(frontend(sentence)).unsqueeze(0)
        outputs = model.infer(sentence)
        mel_output = outputs["mel_outputs_postnet"][0].numpy().T
        alignment = outputs["alignments"][0].numpy().T

        np.save(str(output_dir / f"sentence_{i}"), mel_output)
        display.plot_alignment(alignment)
        plt.savefig(str(output_dir / f"sentence_{i}.png"))
        if args.verbose:
            print("spectrogram saved at {}".format(output_dir /
                                                   f"sentence_{i}.npy"))


if __name__ == "__main__":
    config = get_cfg_defaults()

    parser = argparse.ArgumentParser(
        description="generate mel spectrogram with TransformerTTS.")
    parser.add_argument(
        "--config",
        type=str,
        metavar="FILE",
        help="extra config to overwrite the default config")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path of the checkpoint to load.")
    parser.add_argument("--input", type=str, help="path of the text sentences")
    parser.add_argument("--output", type=str, help="path to save outputs")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print msg")

    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
@ -1,220 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict

import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.t2s.data import dataset
from paddlespeech.t2s.exps.tacotron2.config import get_cfg_defaults
from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeech
from paddlespeech.t2s.exps.tacotron2.ljspeech import LJSpeechCollector
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
from paddlespeech.t2s.training.cli import default_argument_parser
from paddlespeech.t2s.training.experiment import ExperimentBase
from paddlespeech.t2s.utils import display
from paddlespeech.t2s.utils import mp_tools


class Experiment(ExperimentBase):
    def compute_losses(self, inputs, outputs):
        texts, mel_targets, plens, slens = inputs

        mel_outputs = outputs["mel_output"]
        mel_outputs_postnet = outputs["mel_outputs_postnet"]
        attention_weight = outputs["alignments"]
        if self.config.model.use_stop_token:
            stop_logits = outputs["stop_logits"]
        else:
            stop_logits = None

        losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
                                attention_weight, slens, plens, stop_logits)
        return losses

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        texts, mels, text_lens, output_lens = batch
        outputs = self.model(texts, text_lens, mels, output_lens)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for k, v in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{k}", v, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        valid_losses = defaultdict(list)
        for i, batch in enumerate(self.valid_loader):
            texts, mels, text_lens, output_lens = batch
            outputs = self.model(texts, text_lens, mels, output_lens)
            losses = self.compute_losses(batch, outputs)
            for k, v in losses.items():
                valid_losses[k].append(float(v))

            attention_weights = outputs["alignments"]
            self.visualizer.add_figure(
                f"valid_sentence_{i}_alignments",
                display.plot_alignment(attention_weights[0].numpy().T),
                self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_target_spectrogram",
                display.plot_spectrogram(mels[0].numpy().T), self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_predicted_spectrogram",
                display.plot_spectrogram(outputs['mel_outputs_postnet'][0]
                                         .numpy().T), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

        # logging
        msg = "Valid: "
        msg += "step: {}, ".format(self.iteration)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in valid_losses.items())
        self.logger.info(msg)

        for k, v in valid_losses.items():
            self.visualizer.add_scalar(f"valid/{k}", v, self.iteration)

    def setup_model(self):
        config = self.config
        model = Tacotron2(
            vocab_size=config.model.vocab_size,
            d_mels=config.data.n_mels,
            d_encoder=config.model.d_encoder,
            encoder_conv_layers=config.model.encoder_conv_layers,
            encoder_kernel_size=config.model.encoder_kernel_size,
            d_prenet=config.model.d_prenet,
            d_attention_rnn=config.model.d_attention_rnn,
            d_decoder_rnn=config.model.d_decoder_rnn,
            attention_filters=config.model.attention_filters,
            attention_kernel_size=config.model.attention_kernel_size,
            d_attention=config.model.d_attention,
            d_postnet=config.model.d_postnet,
            postnet_kernel_size=config.model.postnet_kernel_size,
            postnet_conv_layers=config.model.postnet_conv_layers,
            reduction_factor=config.model.reduction_factor,
            p_encoder_dropout=config.model.p_encoder_dropout,
            p_prenet_dropout=config.model.p_prenet_dropout,
            p_attention_dropout=config.model.p_attention_dropout,
            p_decoder_dropout=config.model.p_decoder_dropout,
            p_postnet_dropout=config.model.p_postnet_dropout,
            use_stop_token=config.model.use_stop_token)

        if self.parallel:
            model = paddle.DataParallel(model)

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            parameters=model.parameters(),
            weight_decay=paddle.regularizer.L2Decay(
                config.training.weight_decay),
            grad_clip=grad_clip)
        criterion = Tacotron2Loss(
            use_stop_token_loss=config.model.use_stop_token,
            use_guided_attention_loss=config.model.use_guided_attention_loss,
            sigma=config.model.guided_attention_loss_sigma)
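        # The guided attention loss (Tachibana et al., 2017) penalizes
        # attention weights that stray from the diagonal; sigma controls
        # how quickly the penalty grows off-diagonal.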
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def setup_dataloader(self):
        args = self.args
        config = self.config
        ljspeech_dataset = LJSpeech(args.data)

        valid_set, train_set = dataset.split(ljspeech_dataset,
                                             config.data.valid_size)
        batch_fn = LJSpeechCollector(padding_idx=config.data.padding_idx)

        if not self.parallel:
            self.train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(
                train_set, batch_sampler=sampler, collate_fn=batch_fn)

        self.valid_loader = DataLoader(
            valid_set,
            batch_size=config.data.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)


def main_sp(config, args):
    exp = Experiment(config, args)
    exp.setup()
    exp.resume_or_load()
    exp.run()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
@ -1,13 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1,89 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path

import numpy as np
from paddle.io import Dataset

from paddlespeech.t2s.data import batch_spec
from paddlespeech.t2s.data import batch_text_id
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones
from paddlespeech.t2s.frontend import Vocab

voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
voc_tones = Vocab(sorted(list(_tones)))
print("vocab_tones:\n", voc_tones)


class AiShell3(Dataset):
    """Processed AiShell3 dataset."""

    def __init__(self, root):
        super().__init__()
        self.root = Path(root).expanduser()
        self.embed_dir = self.root / "embed"
        self.mel_dir = self.root / "mel"

        with open(self.root / "metadata.pickle", 'rb') as f:
            self.records = pickle.load(f)

    def __getitem__(self, index):
        metadatum = self.records[index]
        sentence_id = metadatum["sentence_id"]
        speaker_id = sentence_id[:7]
        phones = metadatum["phones"]
        tones = metadatum["tones"]
        phones = np.array(
            [voc_phones.lookup(item) for item in phones], dtype=np.int64)
        tones = np.array(
            [voc_tones.lookup(item) for item in tones], dtype=np.int64)
        mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
        embed = np.load(
            str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
        return phones, tones, mel, embed

    def __len__(self):
        return len(self.records)


def collate_aishell3_examples(examples):
    phones, tones, mel, embed = list(zip(*examples))

    text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
    spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
    T_dec = np.max(spec_lengths)
    stop_tokens = (
        np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
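    # stop_tokens[b, t] is 1.0 for padded frames (t >= spec_lengths[b]) and
    # 0.0 for valid frames, i.e. the target for a stop-probability predictor.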
    phones, _ = batch_text_id(phones)
    tones, _ = batch_text_id(tones)
    mel, _ = batch_spec(mel)
    mel = np.transpose(mel, (0, 2, 1))
    embed = np.stack(embed)
    # 7 fields
    # (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
    return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens


if __name__ == "__main__":
    dataset = AiShell3("~/datasets/aishell3/train")
    example = dataset[0]

    examples = [dataset[i] for i in range(10)]
    batch = collate_aishell3_examples(examples)

    for field in batch:
        print(field.shape, field.dtype)
@ -1,42 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple

from pypinyin import lazy_pinyin
from pypinyin import Style

from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable


def convert_to_pinyin(text: str) -> List[str]:
    """Convert text into a list of syllables; characters that are not
    Chinese, and thus cannot be converted to pinyin, are split out.
    """
    syllables = lazy_pinyin(
        text, style=Style.TONE3, neutral_tone_with_five=True)
    return syllables


def convert_sentence(text: str) -> Tuple[List[str], List[str]]:
    """Convert a sentence into two lists: phones and tones."""
    syllables = convert_to_pinyin(text)
    phones = []
    tones = []
    for syllable in syllables:
        p, t = split_syllable(syllable)
        phones.extend(p)
        tones.extend(t)

    return phones, tones
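

# For example (illustrative; the exact pypinyin output may vary):
# convert_sentence("中国") first yields syllables like ["zhong1", "guo2"],
# which split into phones ['zh', 'ueng', 'g', 'uo'] and tones
# ['0', '1', '0', '2'].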
@ -1,81 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN

_C = CN()
_C.data = CN(
    dict(
        batch_size=32,  # batch size
        valid_size=64,  # the first N examples are reserved for validation
        sample_rate=22050,  # Hz, sample rate
        n_fft=1024,  # fft frame size
        win_length=1024,  # window size
        hop_length=256,  # hop size between adjacent frames
        fmax=8000,  # Hz, max frequency when converting to mel
        fmin=0,  # Hz, min frequency when converting to mel
        d_mels=80,  # mel bands
        padding_idx=0,  # text embedding's padding index
    ))

_C.model = CN(
    dict(
        vocab_size=70,
        n_tones=10,
        reduction_factor=1,  # reduction factor
        d_encoder=512,  # embedding & encoder's internal size
        encoder_conv_layers=3,  # number of conv layers in tacotron2 encoder
        encoder_kernel_size=5,  # kernel size of conv layers in tacotron2 encoder
        d_prenet=256,  # hidden size of decoder prenet
        # hidden size of the first rnn layer in tacotron2 decoder
        d_attention_rnn=1024,
        # hidden size of the second rnn layer in tacotron2 decoder
        d_decoder_rnn=1024,
        d_attention=128,  # hidden size of decoder location linear layer
        attention_filters=32,  # number of filters in decoder location conv layer
        attention_kernel_size=31,  # kernel size of decoder location conv layer
        d_postnet=512,  # hidden size of decoder postnet
        postnet_kernel_size=5,  # kernel size of conv layers in postnet
        postnet_conv_layers=5,  # number of conv layers in decoder postnet
        p_encoder_dropout=0.5,  # dropout probability in encoder
        p_prenet_dropout=0.5,  # dropout probability in decoder prenet

        # dropout probability of first rnn layer in decoder
        p_attention_dropout=0.1,
        # dropout probability of second rnn layer in decoder
        p_decoder_dropout=0.1,
        p_postnet_dropout=0.5,  # dropout probability in decoder postnet
        guided_attention_loss_sigma=0.2,
        d_global_condition=256,

        # whether to use a classifier to predict stop probability
        use_stop_token=False,
        # whether to use guided attention loss in training
        use_guided_attention_loss=True, ))

_C.training = CN(
    dict(
        lr=1e-3,  # learning rate
        weight_decay=1e-6,  # the coeff of weight decay
        grad_clip_thresh=1.0,  # the clip norm of grad clip.
        valid_interval=1000,  # validation interval (iterations)
        save_interval=1000,  # checkpoint interval (iterations)
        max_iteration=500000,  # max iteration to train
    ))


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for this experiment."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()
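

# A minimal usage sketch (the YAML path and the override key are hypothetical):
#
#     config = get_cfg_defaults()
#     config.merge_from_file("conf/tacotron2.yaml")      # optional YAML overrides
#     config.merge_from_list(["training.lr", "5e-4"])    # KEY VALUE pairs
#     config.freeze()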
@ -1,95 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path

import numpy as np
import tqdm

from paddlespeech.t2s.audio import AudioProcessor
from paddlespeech.t2s.audio.spec_normalizer import LogMagnitude
from paddlespeech.t2s.audio.spec_normalizer import NormalizerBase
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults


def extract_mel(fname: Path,
                input_dir: Path,
                output_dir: Path,
                p: AudioProcessor,
                n: NormalizerBase):
    relative_path = fname.relative_to(input_dir)
    out_path = (output_dir / relative_path).with_suffix(".npy")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    wav = p.read_wav(fname)
    mel = p.mel_spectrogram(wav)
    mel = n.transform(mel)
    np.save(out_path, mel)


def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
    input_dir = Path(input_dir).expanduser()
    fnames = list(input_dir.rglob(f"*{extension}"))
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
                       config.hop_length, config.d_mels, config.fmin,
                       config.fmax)
    n = LogMagnitude(1e-5)
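    # LogMagnitude log-compresses the spectrogram; the 1e-5 argument is
    # presumably a floor applied before the log, so that log(0) is avoided.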

    func = partial(
        extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)

    with mp.Pool(16) as pool:
        list(
            tqdm.tqdm(
                pool.imap(func, fnames), total=len(fnames), unit="utterance"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
    )
    parser.add_argument(
        "--config",
        type=str,
        help="yaml config file to overwrite the default config")
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train/normalized_wav",
        help="path of the processed wav folder")
    parser.add_argument(
        "--output",
        type=str,
        default="~/datasets/aishell3/train/mel",
        help="path of the folder to save mel spectrograms")
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
    )
    default_config = get_cfg_defaults()

    args = parser.parse_args()
    if args.config:
        default_config.merge_from_file(args.config)
    if args.opts:
        default_config.merge_from_list(args.opts)
    default_config.freeze()
    audio_config = default_config.data

    extract_mel_multispeaker(audio_config, args.input, args.output)
File diff suppressed because it is too large
@ -1,257 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import pickle
import re
from pathlib import Path

import tqdm
import yaml

zh_pattern = re.compile("[\u4e00-\u9fa5]")

_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}

_pauses = {'%', '$'}

_initials = {
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x',
    'zh', 'ch', 'sh', 'r', 'z', 'c', 's',
}

_finals = {
    'ii', 'iii', 'a', 'o', 'e', 'ea', 'ai', 'ei', 'ao', 'ou', 'an', 'en',
    'ang', 'eng', 'er', 'i', 'ia', 'io', 'ie', 'iai', 'iao', 'iou', 'ian',
    'ien', 'iang', 'ieng', 'u', 'ua', 'uo', 'uai', 'uei', 'uan', 'uen',
    'uang', 'ueng', 'v', 've', 'van', 'ven', 'veng',
}

_ernized_symbol = {'&r'}

_specials = {'<pad>', '<unk>', '<s>', '</s>'}

_phones = _initials | _finals | _ernized_symbol | _specials | _pauses


def is_zh(word):
    global zh_pattern
    match = zh_pattern.search(word)
    return match is not None


def ernized(syllable):
    return syllable[:2] != "er" and syllable[-2] == 'r'


def convert(syllable):
    # expansion of o -> uo
    syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
    # syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
    # expansion for iong, ong
    syllable = syllable.replace("iong", "veng").replace("ong", "ueng")

    # expansion for ing, in
    syllable = syllable.replace("ing", "ieng").replace("in", "ien")

    # expansion for un, ui, iu
    syllable = syllable.replace("un", "uen").replace("ui",
                                                     "uei").replace("iu", "iou")

    # rule for variants of i
    syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
        .replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
        .replace("ri", "riii")

    # rule for y preceding i, u
    syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")

    # rule for w
    syllable = syllable.replace("wu", "u").replace("w", "u")

    # rule for v following j, q, x
    syllable = syllable.replace("ju", "jv").replace("qu",
                                                    "qv").replace("xu", "xv")

    return syllable
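

# Illustrative rewrites produced by `convert` (tone digit already stripped):
#   "bo"    -> "buo"    (o -> uo after b/p/m/f)
#   "xiong" -> "xveng"  (iong -> veng)
#   "zhi"   -> "zhiii"  (variant of i after zh/ch/sh/r)
#   "yu"    -> "v"      (y/yu rewriting)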


def split_syllable(syllable: str):
    """Split a syllable in pinyin into a list of phones and a list of tones.
    Initials have no tone, represented by '0', while finals have tones from
    '1,2,3,4,5'.

    e.g.

    zhang1 -> ['zh', 'ang'], ['0', '1']
    """
    if syllable in _pauses:
        # syllable, tone
        return [syllable], ['0']

    tone = syllable[-1]
    syllable = convert(syllable[:-1])

    phones = []
    tones = []

    global _initials
    if syllable[:2] in _initials:
        phones.append(syllable[:2])
        tones.append('0')
        phones.append(syllable[2:])
        tones.append(tone)
    elif syllable[0] in _initials:
        phones.append(syllable[0])
        tones.append('0')
        phones.append(syllable[1:])
        tones.append(tone)
    else:
        phones.append(syllable)
        tones.append(tone)
    return phones, tones


def load_aishell3_transcription(line: str):
    sentence_id, pinyin, text = line.strip().split("|")
    syllables = pinyin.strip().split()

    results = []

    for syllable in syllables:
        if syllable in _pauses:
            results.append(syllable)
        elif not ernized(syllable):
            results.append(syllable)
        else:
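            # erhua: e.g. "huar4" becomes "hua4" plus the rhotic token
            # "&r5", so the erhua ending is modeled as a separate phone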
            results.append(syllable[:-2] + syllable[-1])
            results.append('&r5')

    phones = []
    tones = []
    for syllable in results:
        p, t = split_syllable(syllable)
        phones.extend(p)
        tones.extend(t)
    for p in phones:
        assert p in _phones, p
    return {
        "sentence_id": sentence_id,
        "text": text,
        "syllables": results,
        "phones": phones,
        "tones": tones
    }


def process_aishell3(dataset_root, output_dir):
    dataset_root = Path(dataset_root).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    prosody_label_path = dataset_root / "label_train-set.txt"
    with open(prosody_label_path, 'rt') as f:
        lines = [line.strip() for line in f]

    records = lines[5:]

    processed_records = []
    for record in tqdm.tqdm(records):
        new_record = load_aishell3_transcription(record)
        processed_records.append(new_record)
        print(new_record)

    with open(output_dir / "metadata.pickle", 'wb') as f:
        pickle.dump(processed_records, f)

    with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
        yaml.safe_dump(
            processed_records, f, default_flow_style=None, allow_unicode=True)

    print("metadata done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocess transcription of AiShell3 and save it in compact files (yaml and pickle)."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train",
        help="path of the training dataset (contains a label_train-set.txt).")
    parser.add_argument(
        "--output",
        type=str,
        help="the directory to save the processed transcription. "
        "If not provided, it would be the same as the input.")
    args = parser.parse_args()
    if args.output is None:
        args.output = args.input

    process_aishell3(args.input, args.output)
@ -1,94 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
from praatio import textgrid
from tqdm import tqdm


def get_valid_part(fpath):
    f = textgrid.openTextgrid(fpath, includeEmptyIntervals=True)

    start = 0
    phone_entry_list = f.tierDict['phones'].entryList
    first_entry = phone_entry_list[0]
    if first_entry.label == "sil":
        start = first_entry.end

    last_entry = phone_entry_list[-1]
    if last_entry.label == "sp":
        end = last_entry.start
    else:
        end = last_entry.end
    return start, end
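

# `get_valid_part` trims a leading silence ("sil") and a trailing short pause
# ("sp") using the MFA alignment TextGrid, returning the start/end times (in
# seconds) of the speech region.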


def process_utterance(fpath, source_dir, target_dir, alignment_dir):
    rel_path = fpath.relative_to(source_dir)
    opath = target_dir / rel_path
    apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
    opath.parent.mkdir(parents=True, exist_ok=True)

    start, end = get_valid_part(apath)
    wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
    normalized_wav = wav / np.max(wav) * 0.999
    sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
    # print(f"{fpath} => {opath}")


def preprocess_aishell3(source_dir, target_dir, alignment_dir):
    source_dir = Path(source_dir).expanduser()
    target_dir = Path(target_dir).expanduser()
    alignment_dir = Path(alignment_dir).expanduser()

    wav_paths = list(source_dir.rglob("*.wav"))
    print(f"there are {len(wav_paths)} audio files in total")
    fx = partial(
        process_utterance,
        source_dir=source_dir,
        target_dir=target_dir,
        alignment_dir=alignment_dir)
    with Pool(16) as p:
        list(
            tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process audio in AiShell3, trim silence according to the alignment "
        "files generated by MFA, and normalize volume by peak.")
    parser.add_argument(
        "--input",
        type=str,
        default="~/datasets/aishell3/train/wav",
        help="path of the original audio folder in aishell3.")
    parser.add_argument(
        "--output",
        type=str,
        default="~/datasets/aishell3/train/normalized_wav",
        help="path of the folder to save the processed audio files.")
    parser.add_argument(
        "--alignment",
        type=str,
        default="~/datasets/aishell3/train/alignment",
        help="path of the alignment files.")
    args = parser.parse_args()

    preprocess_aishell3(args.input, args.output, args.alignment)
@ -1,263 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict
from pathlib import Path

import numpy as np
import paddle
from matplotlib import pyplot as plt
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler

from paddlespeech.t2s.data import dataset
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.tacotron2 import Tacotron2Loss
from paddlespeech.t2s.training.cli import default_argument_parser
from paddlespeech.t2s.training.experiment import ExperimentBase
from paddlespeech.t2s.utils import display
from paddlespeech.t2s.utils import mp_tools


class Experiment(ExperimentBase):
    def compute_losses(self, inputs, outputs):
        texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs

        mel_outputs = outputs["mel_output"]
        mel_outputs_postnet = outputs["mel_outputs_postnet"]
        alignments = outputs["alignments"]

        losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
                                alignments, output_lens, text_lens)
        return losses

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
        outputs = self.model(
            texts,
            text_lens,
            mels,
            output_lens,
            tones=tones,
            global_condition=utterance_embeds)
        losses = self.compute_losses(batch, outputs)
        loss = losses["loss"]
        loss.backward()
        self.optimizer.step()
        iteration_time = time.time() - start

        losses_np = {k: float(v) for k, v in losses.items()}
        # logging
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        self.logger.info(msg)

        if dist.get_rank() == 0:
            for key, value in losses_np.items():
                self.visualizer.add_scalar(f"train_loss/{key}", value,
                                           self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def valid(self):
        valid_losses = defaultdict(list)
        for i, batch in enumerate(self.valid_loader):
            texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
            outputs = self.model(
                texts,
                text_lens,
                mels,
                output_lens,
                tones=tones,
                global_condition=utterance_embeds)
            losses = self.compute_losses(batch, outputs)
            for key, value in losses.items():
                valid_losses[key].append(float(value))

            attention_weights = outputs["alignments"]
            self.visualizer.add_figure(
                f"valid_sentence_{i}_alignments",
                display.plot_alignment(attention_weights[0].numpy().T),
                self.iteration)
            self.visualizer.add_figure(
                f"valid_sentence_{i}_target_spectrogram",
                display.plot_spectrogram(mels[0].numpy().T), self.iteration)
            mel_pred = outputs['mel_outputs_postnet']
            self.visualizer.add_figure(
                f"valid_sentence_{i}_predicted_spectrogram",
                display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)

        # write visual log
        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

        # logging
        msg = "Valid: "
        msg += "step: {}, ".format(self.iteration)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in valid_losses.items())
        self.logger.info(msg)

        for key, value in valid_losses.items():
            self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def eval(self):
        """Evaluation of Tacotron2 in autoregressive manner."""
        self.model.eval()
        mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
        mel_dir.mkdir(parents=True, exist_ok=True)
        for i, batch in enumerate(self.test_loader):
            texts, tones, mels, utterance_embeds, *_ = batch
            outputs = self.model.infer(
                texts, tones=tones, global_condition=utterance_embeds)

            display.plot_alignment(outputs["alignments"][0].numpy().T)
            plt.savefig(mel_dir / f"sentence_{i}.png")
            plt.close()
            np.save(mel_dir / f"sentence_{i}",
                    outputs["mel_outputs_postnet"][0].numpy().T)
            print(f"sentence_{i}")

    def setup_model(self):
        config = self.config
        model = Tacotron2(
            vocab_size=config.model.vocab_size,
            n_tones=config.model.n_tones,
            d_mels=config.data.d_mels,
            d_encoder=config.model.d_encoder,
            encoder_conv_layers=config.model.encoder_conv_layers,
            encoder_kernel_size=config.model.encoder_kernel_size,
            d_prenet=config.model.d_prenet,
            d_attention_rnn=config.model.d_attention_rnn,
            d_decoder_rnn=config.model.d_decoder_rnn,
            attention_filters=config.model.attention_filters,
            attention_kernel_size=config.model.attention_kernel_size,
            d_attention=config.model.d_attention,
            d_postnet=config.model.d_postnet,
            postnet_kernel_size=config.model.postnet_kernel_size,
            postnet_conv_layers=config.model.postnet_conv_layers,
            reduction_factor=config.model.reduction_factor,
            p_encoder_dropout=config.model.p_encoder_dropout,
            p_prenet_dropout=config.model.p_prenet_dropout,
            p_attention_dropout=config.model.p_attention_dropout,
            p_decoder_dropout=config.model.p_decoder_dropout,
            p_postnet_dropout=config.model.p_postnet_dropout,
            d_global_condition=config.model.d_global_condition,
            use_stop_token=config.model.use_stop_token, )

        if self.parallel:
            model = paddle.DataParallel(model)

        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            config.training.grad_clip_thresh)
        optimizer = paddle.optimizer.Adam(
            learning_rate=config.training.lr,
            parameters=model.parameters(),
            weight_decay=paddle.regularizer.L2Decay(
                config.training.weight_decay),
            grad_clip=grad_clip)
        criterion = Tacotron2Loss(
            use_stop_token_loss=config.model.use_stop_token,
            use_guided_attention_loss=config.model.use_guided_attention_loss,
            sigma=config.model.guided_attention_loss_sigma)
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def setup_dataloader(self):
        args = self.args
        config = self.config
        aishell3_dataset = AiShell3(args.data)

        valid_set, train_set = dataset.split(aishell3_dataset,
                                             config.data.valid_size)
        batch_fn = collate_aishell3_examples

        if not self.parallel:
            self.train_loader = DataLoader(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True,
                collate_fn=batch_fn)
        else:
            sampler = DistributedBatchSampler(
                train_set,
                batch_size=config.data.batch_size,
                shuffle=True,
                drop_last=True)
            self.train_loader = DataLoader(
                train_set, batch_sampler=sampler, collate_fn=batch_fn)

        self.valid_loader = DataLoader(
            valid_set,
            batch_size=config.data.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)

        self.test_loader = DataLoader(
            valid_set,
            batch_size=1,
            shuffle=False,
            drop_last=False,
            collate_fn=batch_fn)


def main_sp(config, args):
    exp = Experiment(config, args)
    exp.setup()
    exp.resume_or_load()
    if not args.test:
        exp.run()
    else:
        exp.eval()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    parser.add_argument("--test", action="store_true")
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
@ -1,166 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path

import numpy as np
import paddle
import soundfile as sf
from matplotlib import pyplot as plt

from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder


def voice_cloning(args):
    # speaker encoder
    p = SpeakerVerificationPreprocessor(
        sampling_rate=16000,
        audio_norm_target_dBFS=-30,
        vad_window_length=30,
        vad_moving_average_width=8,
        vad_max_silence_length=6,
        mel_window_length=25,
        mel_window_step=10,
        n_mels=40,
        partial_n_frames=160,
        min_pad_coverage=0.75,
        partial_overlap_ratio=0.5)
    print("Audio Processor Done!")

    speaker_encoder = LSTMSpeakerEncoder(
        n_mels=40, num_layers=3, hidden_size=256, output_size=256)
    speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
    speaker_encoder.eval()
    print("GE2E Done!")

    synthesizer = Tacotron2(
        vocab_size=68,
        n_tones=10,
        d_mels=80,
        d_encoder=512,
        encoder_conv_layers=3,
        encoder_kernel_size=5,
        d_prenet=256,
        d_attention_rnn=1024,
        d_decoder_rnn=1024,
        attention_filters=32,
        attention_kernel_size=31,
        d_attention=128,
        d_postnet=512,
        postnet_kernel_size=5,
        postnet_conv_layers=5,
        reduction_factor=1,
        p_encoder_dropout=0.5,
        p_prenet_dropout=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,
        p_postnet_dropout=0.5,
        d_global_condition=256,
        use_stop_token=False, )
    synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
    synthesizer.eval()
    print("Tacotron2 Done!")

    # vocoder
    vocoder = ConditionalWaveFlow(
        upsample_factors=[16, 16],
        n_flows=8,
        n_layers=8,
        n_group=16,
        channels=128,
        n_mels=80,
        kernel_size=[3, 3])
    vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
    vocoder.eval()
    print("WaveFlow Done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    input_dir = Path(args.input_dir)

    # In the AISHELL-3 dataset, % and $ mark the boundaries of prosodic words
    # and prosodic phrases, roughly corresponding to shorter and longer
    # pauses; % and $ can be used in the text to adjust the prosody.
    # Note that the valid character set of a sentence contains only Chinese
    # characters plus % and $, so an input sentence may contain only these
    # characters.
    sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
    phones, tones = convert_sentence(sentence)
    phones = np.array(
        [voc_phones.lookup(item) for item in phones], dtype=np.int64)
    tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
    phones = paddle.to_tensor(phones).unsqueeze(0)
    tones = paddle.to_tensor(tones).unsqueeze(0)

    for name in os.listdir(input_dir):
        utt_id = name.split(".")[0]
        ref_audio_path = input_dir / name
        mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
        print("mel_sequences: ", mel_sequences.shape)
        with paddle.no_grad():
            embed = speaker_encoder.embed_utterance(
                paddle.to_tensor(mel_sequences))
        print("embed shape: ", embed.shape)
        utterance_embeds = paddle.unsqueeze(embed, 0)
        outputs = synthesizer.infer(
            phones, tones=tones, global_condition=utterance_embeds)
        mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
        alignment = outputs["alignments"][0].numpy().T
        display.plot_alignment(alignment)
        plt.savefig(str(output_dir / (utt_id + ".png")))

        with paddle.no_grad():
            wav = vocoder.infer(mel_input)
        wav = wav.numpy()[0]
        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)


def main():
    # parse args and redirect to voice_cloning
    parser = argparse.ArgumentParser(
        description="voice cloning with tacotron2 (ge2e speaker encoder + waveflow vocoder).")
    parser.add_argument(
        "--ge2e_params_path", type=str, help="ge2e params path.")
    parser.add_argument(
        "--tacotron2_params_path", type=str, help="tacotron2 params path.")
    parser.add_argument(
        "--waveflow_params_path", type=str, help="waveflow params path.")

    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")

    parser.add_argument(
        "--input-dir",
        type=str,
        help="input dir of *.wav; the audio will be resampled to 16k.")
    parser.add_argument("--output-dir", type=str, help="output dir.")

    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    voice_cloning(args)


if __name__ == "__main__":
    main()
@ -0,0 +1,108 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path

import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from timer import timer
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.wavernn import WaveRNN


def main():
    parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")

    parser.add_argument("--config", type=str, help="WaveRNN config file.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument("--test-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")

    args = parser.parse_args()

    with open(args.config) as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master sees the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    model = WaveRNN(
        hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
    state_dict = paddle.load(args.checkpoint)
    model.set_state_dict(state_dict["main_params"])

    model.eval()

    with jsonlines.open(args.test_metadata, 'r') as reader:
        metadata = list(reader)
    test_dataset = DataTable(
        metadata,
        fields=['utt_id', 'feats'],
        converters={
            'utt_id': None,
            'feats': np.load,
        })
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    N = 0
    T = 0
    for example in test_dataset:
        utt_id = example['utt_id']
        mel = example['feats']
        mel = paddle.to_tensor(mel)  # (T, C)
        with timer() as t:
            with paddle.no_grad():
                wav = model.generate(
                    c=mel,
                    batched=config.inference.gen_batched,
                    target=config.inference.target,
                    overlap=config.inference.overlap,
                    mu_law=config.mu_law,
                    gen_display=True)
            wav = wav.numpy()
            N += wav.size
            T += t.elapse
            speed = wav.size / t.elapse
            rtf = config.fs / speed
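            # RTF (real-time factor) = wall-clock time per second of audio;
            # values below 1.0 mean faster-than-real-time synthesis.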
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
    print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T)}")


if __name__ == "__main__":
    main()
@ -0,0 +1,212 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
from pathlib import Path

import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip
from paddlespeech.t2s.models.wavernn import WaveRNN
from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator
from paddlespeech.t2s.models.wavernn import WaveRNNUpdater
from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer


def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    batch_fn = WaveRNNClip(
        mode=config.model.mode,
        aux_context_window=config.model.aux_context_window,
        hop_size=config.n_shift,
        batch_max_steps=config.batch_max_steps,
        bits=config.model.bits)
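
    # WaveRNNClip presumably crops each (wave, mel) pair to fixed-length
    # segments (batch_max_steps samples, plus aux_context_window frames of
    # mel context) so that training batches have a uniform length.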

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)
    dev_sampler = DistributedBatchSampler(
        dev_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False)
    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=batch_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        collate_fn=batch_fn,
        batch_sampler=dev_sampler,
        num_workers=config.num_workers)

    valid_generate_loader = DataLoader(dev_dataset, batch_size=1)

    print("dataloaders done!")

    model = WaveRNN(
        hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
    if world_size > 1:
        model = DataParallel(model)
    print("model done!")
|
||||
|
||||
if config.model.mode == 'RAW':
|
||||
criterion = paddle.nn.CrossEntropyLoss(axis=1)
|
||||
elif config.model.mode == 'MOL':
|
||||
criterion = discretized_mix_logistic_loss
|
||||
else:
|
||||
criterion = None
|
||||
RuntimeError('Unknown model mode value - ', config.model.mode)
|
||||
print("criterions done!")
|
||||
clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip)
|
||||
optimizer = Adam(
|
||||
parameters=model.parameters(),
|
||||
learning_rate=config.learning_rate,
|
||||
grad_clip=clip)
|
||||
|
||||
print("optimizer done!")
|
||||
|
||||
output_dir = Path(args.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
if dist.get_rank() == 0:
|
||||
config_name = args.config.split("/")[-1]
|
||||
# copy conf to output_dir
|
||||
shutil.copyfile(args.config, output_dir / config_name)
|
||||
|
||||
updater = WaveRNNUpdater(
|
||||
model=model,
|
||||
optimizer=optimizer,
|
||||
criterion=criterion,
|
||||
dataloader=train_dataloader,
|
||||
output_dir=output_dir,
|
||||
mode=config.model.mode)
|
||||
|
||||
evaluator = WaveRNNEvaluator(
|
||||
model=model,
|
||||
dataloader=dev_dataloader,
|
||||
criterion=criterion,
|
||||
output_dir=output_dir,
|
||||
valid_generate_loader=valid_generate_loader,
|
||||
config=config)
|
||||
|
||||
trainer = Trainer(
|
||||
updater,
|
||||
stop_trigger=(config.train_max_steps, "iteration"),
|
||||
out=output_dir)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
trainer.extend(
|
||||
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
|
||||
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
|
||||
trainer.extend(
|
||||
Snapshot(max_size=config.num_snapshots),
|
||||
trigger=(config.save_interval_steps, 'iteration'))
|
||||
|
||||
print("Trainer Done!")
|
||||
trainer.run()
|
||||
|
||||
|
||||
def main():
|
||||
# parse args and config and redirect to train_sp
|
||||
|
||||
parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
|
||||
parser.add_argument(
|
||||
"--config", type=str, help="config file to overwrite default config.")
|
||||
parser.add_argument("--train-metadata", type=str, help="training data.")
|
||||
parser.add_argument("--dev-metadata", type=str, help="dev data.")
|
||||
parser.add_argument("--output-dir", type=str, help="output dir.")
|
||||
parser.add_argument(
|
||||
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.config, 'rt') as f:
|
||||
config = CfgNode(yaml.safe_load(f))
|
||||
|
||||
print("========Args========")
|
||||
print(yaml.safe_dump(vars(args)))
|
||||
print("========Config========")
|
||||
print(config)
|
||||
print(
|
||||
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
|
||||
)
|
||||
|
||||
# dispatch
|
||||
if args.ngpu > 1:
|
||||
dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
|
||||
else:
|
||||
train_sp(args, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
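
For reference, a minimal sketch of how this entry point is typically invoked (the paths below are hypothetical placeholders, not part of this commit):

    python3 train.py \
        --config=conf/default.yaml \
        --train-metadata=dump/train/norm/metadata.jsonl \
        --dev-metadata=dump/dev/norm/metadata.jsonl \
        --output-dir=exp/default \
        --ngpu=1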
File diff suppressed because it is too large
@ -0,0 +1,627 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
from typing import List

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F

from paddlespeech.t2s.audio.codec import decode_mu_law
from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.upsample import Stretch2D


class ResBlock(nn.Layer):
    def __init__(self, dims):
        super().__init__()
        self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.batch_norm1 = nn.BatchNorm1D(dims)
        self.batch_norm2 = nn.BatchNorm1D(dims)

    def forward(self, x):
        '''
        conv -> bn -> relu -> conv -> bn + residual connection
        '''
        residual = x
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.batch_norm2(x)
        return x + residual


class MelResNet(nn.Layer):
    def __init__(self,
                 res_blocks: int=10,
                 compute_dims: int=128,
                 res_out_dims: int=128,
                 aux_channels: int=80,
                 aux_context_window: int=0):
        super().__init__()
        k_size = aux_context_window * 2 + 1
        # pay attention here: the unpadded convolution shrinks the time dim
        # by aux_context_window * 2
        self.conv_in = nn.Conv1D(
            aux_channels, compute_dims, kernel_size=k_size, bias_attr=False)
        self.batch_norm = nn.BatchNorm1D(compute_dims)
        self.layers = nn.LayerList()
        for _ in range(res_blocks):
            self.layers.append(ResBlock(compute_dims))
        self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1)

    def forward(self, x):
        '''
        Parameters
        ----------
        x : Tensor
            Input tensor (B, in_dims, T).
        Returns
        ----------
        Tensor
            Output tensor (B, res_out_dims, T - 2 * aux_context_window).
        '''
        x = self.conv_in(x)
        x = self.batch_norm(x)
        x = F.relu(x)
        for f in self.layers:
            x = f(x)
        x = self.conv_out(x)
        return x
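
# A concrete shape example (a sketch, assuming aux_context_window=2 so that
# k_size = 5): a mel input of shape [B, 80, T] passes through the unpadded
# Conv1D above, so MelResNet returns [B, res_out_dims, T - 4].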


class UpsampleNetwork(nn.Layer):
    def __init__(self,
                 aux_channels: int=80,
                 upsample_scales: List[int]=[4, 5, 3, 5],
                 compute_dims: int=128,
                 res_blocks: int=10,
                 res_out_dims: int=128,
                 aux_context_window: int=2):
        super().__init__()
        # total_scale is the total upsampling factor
        total_scale = np.prod(upsample_scales)
        # TODO pad*total_scale is numpy.int64
        self.indent = int(aux_context_window * total_scale)
        self.resnet = MelResNet(
            res_blocks=res_blocks,
            aux_channels=aux_channels,
            compute_dims=compute_dims,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        self.resnet_stretch = Stretch2D(total_scale, 1)
        self.up_layers = nn.LayerList()
        for scale in upsample_scales:
            k_size = (1, scale * 2 + 1)
            padding = (0, scale)
            stretch = Stretch2D(scale, 1)

            conv = nn.Conv2D(
                1, 1, kernel_size=k_size, padding=padding, bias_attr=False)
            weight_ = paddle.full_like(conv.weight, 1. / k_size[1])
            conv.weight.set_value(weight_)
            self.up_layers.append(stretch)
            self.up_layers.append(conv)

    def forward(self, m):
        '''
        Parameters
        ----------
        m : Tensor
            Input tensor (B, C_aux, T).
        Returns
        ----------
        Tensor
            Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
        Tensor
            Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
        '''
        # aux: [B, C_aux, T]
        # -> [B, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, T - 2 * aux_context_window]
        aux = self.resnet(m).unsqueeze(1)
        # aux: [B, 1, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = self.resnet_stretch(aux)
        # aux: [B, 1, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        # -> [B, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = aux.squeeze(1)
        # m: [B, C_aux, T] -> [B, 1, C_aux, T]
        m = m.unsqueeze(1)
        for f in self.up_layers:
            m = f(m)
        # m: [B, 1, C_aux, T * prod(upsample_scales)]
        # -> [B, C_aux, T * prod(upsample_scales)]
        # -> [B, C_aux, (T - 2 * pad) * prod(upsample_scales)]
        m = m.squeeze(1)[:, :, self.indent:-self.indent]
        # m: [B, (T - 2 * pad) * prod(upsample_scales), C_aux]
        # aux: [B, (T - 2 * pad) * prod(upsample_scales), res_out_dims]
        return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1])
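
# Worked example (a sketch, assuming the defaults above): with
# upsample_scales=[4, 5, 3, 5], prod(upsample_scales) = 300, and
# aux_context_window=2, an input m of shape [B, 80, T] yields
#   m:   [B, (T - 4) * 300, 80]
#   aux: [B, (T - 4) * 300, 128]
# i.e. one conditioning vector per output waveform sample when
# hop_length == 300.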


class WaveRNN(nn.Layer):
    def __init__(
            self,
            rnn_dims: int=512,
            fc_dims: int=512,
            bits: int=9,
            aux_context_window: int=2,
            upsample_scales: List[int]=[4, 5, 3, 5],
            aux_channels: int=80,
            compute_dims: int=128,
            res_out_dims: int=128,
            res_blocks: int=10,
            hop_length: int=300,
            sample_rate: int=24000,
            mode='RAW',
            init_type: str="xavier_uniform", ):
        '''
        Parameters
        ----------
        rnn_dims : int, optional
            Hidden dims of RNN Layers.
        fc_dims : int, optional
            Dims of FC Layers.
        bits : int, optional
            Bit depth of the signal.
        aux_context_window : int, optional
            The context window size of the first convolution applied to the
            auxiliary input, by default 2
        upsample_scales : List[int], optional
            Upsample scales of the upsample network.
        aux_channels : int, optional
            Auxiliary channel of the residual blocks.
        compute_dims : int, optional
            Dims of Conv1D in MelResNet.
        res_out_dims : int, optional
            Dims of output in MelResNet.
        res_blocks : int, optional
            Number of residual blocks.
        mode : str, optional
            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic
            Distribution, and `RAW` for quantized bits as the model's output.
        init_type : str
            How to initialize parameters.
        '''
        super().__init__()
        self.mode = mode
        self.aux_context_window = aux_context_window
        if self.mode == 'RAW':
            self.n_classes = 2**bits
        elif self.mode == 'MOL':
            self.n_classes = 10 * 3
        else:
            raise RuntimeError('Unknown model mode value - ', self.mode)

        # List of rnns to call 'flatten_parameters()' on
        self._to_flatten = []

        self.rnn_dims = rnn_dims
        self.aux_dims = res_out_dims // 4
        self.hop_length = hop_length
        self.sample_rate = sample_rate

        # initialize parameters
        initialize(self, init_type)

        self.upsample = UpsampleNetwork(
            aux_channels=aux_channels,
            upsample_scales=upsample_scales,
            compute_dims=compute_dims,
            res_blocks=res_blocks,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims)

        self.rnn1 = nn.GRU(rnn_dims, rnn_dims)
        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims)

        self._to_flatten += [self.rnn1, self.rnn2]

        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
        self.fc3 = nn.Linear(fc_dims, self.n_classes)

        # Avoid fragmentation of RNN parameters and associated warning
        self._flatten_parameters()

        nn.initializer.set_global_initializer(None)

    def forward(self, x, c):
        '''
        Parameters
        ----------
        x : Tensor
            wav sequence, [B, T]
        c : Tensor
            mel spectrogram [B, C_aux, T']

            T = (T' - 2 * aux_context_window) * hop_length
        Returns
        ----------
        Tensor
            [B, T, n_classes]
        '''
        # Although we `_flatten_parameters()` on init, when using DataParallel
        # the model gets replicated, making it no longer guaranteed that the
        # weights are contiguous in GPU memory. Hence, we must call it again
        self._flatten_parameters()

        bsize = paddle.shape(x)[0]
        h1 = paddle.zeros([1, bsize, self.rnn_dims])
        h2 = paddle.zeros([1, bsize, self.rnn_dims])
        # c: [B, T, C_aux]
        # aux: [B, T, res_out_dims]
        c, aux = self.upsample(c)

        aux_idx = [self.aux_dims * i for i in range(5)]
        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]

        x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2)
        x = self.I(x)
        res = x
        x, _ = self.rnn1(x, h1)

        x = x + res
        res = x
        x = paddle.concat([x, a2], axis=2)
        x, _ = self.rnn2(x, h2)

        x = x + res
        x = paddle.concat([x, a3], axis=2)
        x = F.relu(self.fc1(x))

        x = paddle.concat([x, a4], axis=2)
        x = F.relu(self.fc2(x))

        return self.fc3(x)
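
    # Worked example of the time relation above (a sketch, assuming the
    # defaults hop_length=300 and aux_context_window=2): a mel input c with
    # T' = 100 frames conditions T = (100 - 4) * 300 = 28800 waveform
    # samples, so the returned logits have shape [B, 28800, n_classes].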

    @paddle.no_grad()
    def generate(self,
                 c,
                 batched: bool=True,
                 target: int=12000,
                 overlap: int=600,
                 mu_law: bool=True,
                 gen_display: bool=False):
        """
        Parameters
        ----------
        c : Tensor
            input mels, (T', C_aux)
        batched : bool
            generate in batch or not
        target : int
            target number of samples to be generated in each batch entry
        overlap : int
            number of samples for crossfading between batches
        mu_law : bool
            use mu law or not
        Returns
        ----------
        wav sequence
            Output (T' * prod(upsample_scales), out_channels, C_out).
        """

        self.eval()

        mu_law = mu_law if self.mode == 'RAW' else False

        output = []
        start = time.time()

        # pseudo batch
        # (T, C_aux) -> (1, C_aux, T)
        c = paddle.transpose(c, [1, 0]).unsqueeze(0)
        T = paddle.shape(c)[-1]
        wave_len = T * self.hop_length
        # TODO remove two transpose op by modifying function pad_tensor
        c = self.pad_tensor(
            c.transpose([0, 2, 1]), pad=self.aux_context_window,
            side='both').transpose([0, 2, 1])

        c, aux = self.upsample(c)

        if batched:
            # (num_folds, target + 2 * overlap, features)
            c = self.fold_with_overlap(c, target, overlap)
            aux = self.fold_with_overlap(aux, target, overlap)

        # for dygraph-to-static-graph conversion: if we used seq_len from
        # `b_size, seq_len, _ = paddle.shape(c)` in the for loop below,
        # we would not get a TensorArray
        # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray
        # b_size, seq_len, _ = paddle.shape(c)
        b_size = paddle.shape(c)[0]
        seq_len = paddle.shape(c)[1]

        h1 = paddle.zeros([b_size, self.rnn_dims])
        h2 = paddle.zeros([b_size, self.rnn_dims])
        x = paddle.zeros([b_size, 1])

        d = self.aux_dims
        aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

        for i in range(seq_len):
            m_t = c[:, i, :]
            # for dygraph to static graph
            # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
            a1_t = aux_split[0][:, i, :]
            a2_t = aux_split[1][:, i, :]
            a3_t = aux_split[2][:, i, :]
            a4_t = aux_split[3][:, i, :]
            x = paddle.concat([x, m_t, a1_t], axis=1)
            x = self.I(x)
            # use GRUCell here
            h1, _ = self.rnn1[0].cell(x, h1)
            x = x + h1
            inp = paddle.concat([x, a2_t], axis=1)
            # use GRUCell here
            h2, _ = self.rnn2[0].cell(inp, h2)

            x = x + h2
            x = paddle.concat([x, a3_t], axis=1)
            x = F.relu(self.fc1(x))

            x = paddle.concat([x, a4_t], axis=1)
            x = F.relu(self.fc2(x))

            logits = self.fc3(x)

            if self.mode == 'MOL':
                sample = sample_from_discretized_mix_logistic(
                    logits.unsqueeze(0).transpose([0, 2, 1]))
                output.append(sample.reshape([-1]))
                x = sample.transpose([1, 0, 2])

            elif self.mode == 'RAW':
                posterior = F.softmax(logits, axis=1)
                distrib = paddle.distribution.Categorical(posterior)
                # inverts the operation [np.floor((fx + 1) / 2 * mu + 0.5)]
                # in encode_mu_law
                # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
                # sample: [-1, 1]
                sample = 2 * distrib.sample([1])[0].cast('float32') / (
                    self.n_classes - 1.) - 1.
                output.append(sample)
                x = sample.unsqueeze(-1)
            else:
                raise RuntimeError('Unknown model mode value - ', self.mode)

            if gen_display:
                if i % 1000 == 0:
                    self.gen_display(i, int(seq_len), int(b_size), start)

        output = paddle.stack(output).transpose([1, 0])

        if mu_law:
            output = decode_mu_law(output, self.n_classes, False)

        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        # Fade out at the end to avoid the signal cutting off suddenly
        fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
        output = output[:wave_len]
        output[-10 * self.hop_length:] *= fade_out

        self.train()

        # add a C_out dimension
        return output.unsqueeze(-1)

    def _flatten_parameters(self):
        [m.flatten_parameters() for m in self._to_flatten]

    def pad_tensor(self, x, pad, side='both'):
        '''
        Parameters
        ----------
        x : Tensor
            mel, [1, n_frames, 80]
        pad : int
        side : str
            'both', 'before' or 'after'
        Returns
        ----------
        Tensor
        '''
        b, t, _ = paddle.shape(x)
        # for dygraph to static graph
        c = x.shape[-1]
        total = t + 2 * pad if side == 'both' else t + pad
        padded = paddle.zeros([b, total, c])
        if side == 'before' or side == 'both':
            padded[:, pad:pad + t, :] = x
        elif side == 'after':
            padded[:, :t, :] = x
        return padded

    def fold_with_overlap(self, x, target, overlap):
        '''
        Fold the tensor with overlap for quick batched inference.
        Overlap will be used for crossfading in xfade_and_unfold().

        Parameters
        ----------
        x : Tensor
            Upsampled conditioning features. mels or aux
            shape=(1, T, features)
            mels: [1, T, 80]
            aux: [1, T, 128]
        target : int
            Target timesteps for each index of batch
        overlap : int
            Timesteps for both xfade and rnn warmup
            overlap = hop_length * 2

        Returns
        ----------
        Tensor
            shape=(num_folds, target + 2 * overlap, features)
            num_folds = (time_seq - overlap) // (target + overlap)
            mel: [num_folds, target + 2 * overlap, 80]
            aux: [num_folds, target + 2 * overlap, 128]

        Details
        ----------
        x = [[h1, h2, ... hn]]

        Where each h is a vector of conditioning features

        Eg: target=2, overlap=1 with x.size(1)=10

        folded = [[h1, h2, h3, h4],
                  [h4, h5, h6, h7],
                  [h7, h8, h9, h10]]
        '''

        _, total_len, features = paddle.shape(x)

        # Calculate variables needed
        num_folds = (total_len - overlap) // (target + overlap)
        extended_len = num_folds * (overlap + target) + overlap
        remaining = total_len - extended_len

        # Pad if some time steps are poking out
        if remaining != 0:
            num_folds += 1
            padding = target + 2 * overlap - remaining
            x = self.pad_tensor(x, padding, side='after')

        folded = paddle.zeros([num_folds, target + 2 * overlap, features])

        # Get the values for the folded tensor
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            folded[i] = x[0][start:end, :]
        return folded
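
    # Numeric check of the docstring example (a sketch): with target=2,
    # overlap=1 and total_len=10, num_folds = (10 - 1) // (2 + 1) = 3 and
    # each fold spans target + 2 * overlap = 4 timesteps; the last `overlap`
    # steps of one fold reappear as the first `overlap` steps of the next,
    # which is what xfade_and_unfold() crossfades over.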

    def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
        ''' Applies a crossfade and unfolds into a 1d array.

        Parameters
        ----------
        y : Tensor
            Batched sequences of audio samples
            shape=(num_folds, target + 2 * overlap)
            dtype=paddle.float32
        overlap : int
            Timesteps for both xfade and rnn warmup

        Returns
        ----------
        Tensor
            audio samples in a 1d array
            shape=(total_len)
            dtype=paddle.float32

        Details
        ----------
        y = [[seq1],
             [seq2],
             [seq3]]

        Apply a gain envelope at both ends of the sequences

        y = [[seq1_in, seq1_target, seq1_out],
             [seq2_in, seq2_target, seq2_out],
             [seq3_in, seq3_target, seq3_out]]

        Stagger and add up the groups of samples:

        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]

        '''
        # num_folds = (total_len - overlap) // (target + overlap)
        num_folds, length = paddle.shape(y)
        target = length - 2 * overlap
        total_len = num_folds * (target + overlap) + overlap

        # Need some silence for the rnn warmup
        silence_len = overlap // 2
        fade_len = overlap - silence_len
        silence = paddle.zeros([silence_len], dtype=paddle.float32)
        linear = paddle.ones([fade_len], dtype=paddle.float32)

        # Equal power crossfade
        # fade_in increases from 0 to 1, fade_out decreases from 1 to 0
        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
        fade_in = paddle.sqrt(0.5 * (1 + t))
        fade_out = paddle.sqrt(0.5 * (1 - t))
        # Concat the silence to the fades
        fade_out = paddle.concat([linear, fade_out])
        fade_in = paddle.concat([silence, fade_in])

        # Apply the gain to the overlap samples
        y[:, :overlap] *= fade_in
        y[:, -overlap:] *= fade_out

        unfolded = paddle.zeros([total_len], dtype=paddle.float32)

        # Loop to add up all the samples
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            unfolded[start:end] += y[i]

        return unfolded
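
    # Why "equal power" (an editorial note, not an original comment): within
    # the fade region, fade_in = sqrt(0.5 * (1 + t)) and
    # fade_out = sqrt(0.5 * (1 - t)) satisfy fade_in**2 + fade_out**2 == 1,
    # so the summed signal power stays constant across the seam between
    # neighbouring folds.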

    def gen_display(self, i, seq_len, b_size, start):
        gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
        pbar = self.progbar(i, seq_len)
        msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
        sys.stdout.write(f"\r{msg}")

    def progbar(self, i, n, size=16):
        done = int(i * size) // n
        bar = ''
        # use a separate loop variable so the step index `i` is not shadowed
        for j in range(size):
            bar += '█' if j <= done else '░'
        return bar


class WaveRNNInference(nn.Layer):
    def __init__(self, normalizer, wavernn):
        super().__init__()
        self.normalizer = normalizer
        self.wavernn = wavernn

    def forward(self,
                logmel,
                batched: bool=True,
                target: int=12000,
                overlap: int=600,
                mu_law: bool=True,
                gen_display: bool=False):
        normalized_mel = self.normalizer(logmel)

        wav = self.wavernn.generate(
            normalized_mel, )
        # batched=batched,
        # target=target,
        # overlap=overlap,
        # mu_law=mu_law,
        # gen_display=gen_display)

        return wav
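
A minimal copy-synthesis sketch of the generation path above (not part of this commit; it assumes the `paddlespeech.t2s.models.wavernn` import path used by the training script, and since the model is randomly initialized the output is noise):

    import paddle
    from paddlespeech.t2s.models.wavernn import WaveRNN

    # a small RAW-mode model with the defaults defined in this file
    model = WaveRNN(mode='RAW', hop_length=300, sample_rate=24000)
    # a stand-in for a real log-mel feature, shape (T', C_aux)
    mel = paddle.randn([20, 80])
    wav = model.generate(
        mel, batched=True, target=12000, overlap=600, mu_law=True)
    # generate() trims to T' * hop_length samples and appends a C_out axis
    print(wav.shape)  # [6000, 1]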
@ -0,0 +1,201 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path

import paddle
import soundfile as sf
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer

from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def calculate_grad_norm(parameters, norm_type: float=2):
    '''
    Calculate the gradient norm of a model's parameters.

    Parameters
    ----------
    parameters:
        model's parameters
    norm_type: float
        order of the norm
    Returns
    ----------
    Tensor
        grad_norm
    '''

    grad_list = [
        paddle.to_tensor(p.grad) for p in parameters if p.grad is not None
    ]
    norm_list = paddle.stack(
        [paddle.norm(grad, norm_type) for grad in grad_list])
    total_norm = paddle.norm(norm_list)
    return total_norm


# for save name in gen_valid_samples()
ITERATION = 0


class WaveRNNUpdater(StandardUpdater):
    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
                 criterion: Layer,
                 dataloader: DataLoader,
                 init_state=None,
                 output_dir: Path=None,
                 mode='RAW'):
        super().__init__(model, optimizer, dataloader, init_state=init_state)

        self.criterion = criterion
        # self.scheduler = scheduler

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""
        self.mode = mode

    def update_core(self, batch):

        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}
        # parse batch
        self.model.train()
        self.optimizer.clear_grad()

        wav, y, mel = batch

        y_hat = self.model(wav, mel)
        if self.mode == 'RAW':
            y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
        elif self.mode == 'MOL':
            # cast the target, not the prediction: the MOL loss expects a
            # float target together with the raw model output
            y = paddle.cast(y, dtype='float32')

        y = y.unsqueeze(-1)
        loss = self.criterion(y_hat, y)
        loss.backward()
        grad_norm = float(
            calculate_grad_norm(self.model.parameters(), norm_type=2))

        self.optimizer.step()

        report("train/loss", float(loss))
        report("train/grad_norm", float(grad_norm))

        losses_dict["loss"] = float(loss)
        losses_dict["grad_norm"] = float(grad_norm)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        global ITERATION
        ITERATION = self.state.iteration + 1


class WaveRNNEvaluator(StandardEvaluator):
    def __init__(self,
                 model: Layer,
                 criterion: Layer,
                 dataloader: DataLoader,
                 output_dir: Path=None,
                 valid_generate_loader=None,
                 config=None):
        super().__init__(model, dataloader)

        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

        self.criterion = criterion
        self.valid_generate_loader = valid_generate_loader
        self.config = config
        self.mode = config.model.mode

        self.valid_samples_dir = output_dir / "valid_samples"
        self.valid_samples_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}
        # parse batch
        wav, y, mel = batch
        y_hat = self.model(wav, mel)

        if self.mode == 'RAW':
            y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
        elif self.mode == 'MOL':
            # as in update_core, cast the target rather than the prediction
            y = paddle.cast(y, dtype='float32')

        y = y.unsqueeze(-1)
        loss = self.criterion(y_hat, y)
        report("eval/loss", float(loss))

        losses_dict["loss"] = float(loss)

        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)

    def gen_valid_samples(self):

        for i, item in enumerate(self.valid_generate_loader):
            if i >= self.config.generate_num:
                break
            print(
                '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num))

            mel = item['feats']
            wav = item['wave']
            wav = wav.squeeze(0)

            origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format(
                self.iteration, i)
            sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs)

            if self.config.inference.gen_batched:
                batch_str = 'gen_batched_target{}_overlap{}'.format(
                    self.config.inference.target, self.config.inference.overlap)
            else:
                batch_str = 'gen_not_batched'
            gen_save_path = str(self.valid_samples_dir /
                                '{}_steps_{}_{}.wav'.format(self.iteration, i,
                                                            batch_str))
            # (1, T, C_aux) -> (T, C_aux)
            mel = mel.squeeze(0)
            gen_sample = self.model.generate(
                mel, self.config.inference.gen_batched,
                self.config.inference.target, self.config.inference.overlap,
                self.config.mu_law)
            sf.write(
                gen_save_path, gen_sample.numpy(), samplerate=self.config.fs)

    def __call__(self, trainer=None):
        summary = self.evaluate()
        for k, v in summary.items():
            report(k, v)
        # generate samples at the end of evaluation
        self.iteration = ITERATION
        if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
            self.gen_valid_samples()