Merge pull request #1314 from yt605155624/add_new_tacotron2

[TTS]Add new tacotron2
Hui Zhang 3 years ago committed by GitHub
commit 97db74ca60

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
@ -84,7 +84,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -0,0 +1,91 @@
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB of memory, and it takes ~1 day to finish the
# training on a Titan V.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sample rate (Hz)
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2
###########################################################
# MODEL SETTING #
###########################################################
model: # keyword arguments for the selected model
embed_dim: 512 # char or phn embedding dimension
elayers: 1 # number of blstm layers in encoder
eunits: 512 # number of blstm units
econv_layers: 3 # number of convolutional layers in encoder
econv_chans: 512 # number of channels in convolutional layer
econv_filts: 5 # filter size of convolutional layer
atype: location # attention function type
adim: 512 # attention dimension
aconv_chans: 32 # number of channels in convolutional layer of attention
aconv_filts: 15 # filter size of convolutional layer of attention
cumulate_att_w: True # whether to cumulate attention weight
dlayers: 2 # number of lstm layers in decoder
dunits: 1024 # number of lstm units in decoder
prenet_layers: 2 # number of layers in prenet
prenet_units: 256 # number of units in prenet
postnet_layers: 5 # number of layers in postnet
postnet_chans: 512 # number of channels in postnet
postnet_filts: 5 # filter size of postnet layer
output_activation: null # activation function for the final output
use_batch_norm: True # whether to use batch normalization in encoder
use_concate: True # whether to concatenate encoder embedding with decoder outputs
use_residual: False # whether to use residual connection in encoder
dropout_rate: 0.5 # dropout rate
zoneout_rate: 0.1 # zoneout rate
reduction_factor: 1 # reduction factor
spk_embed_dim: null # speaker embedding dimension
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
guided_attn_loss_lambda: 1.0 # strength of guided attention loss
##########################################################
# OPTIMIZER SETTING #
##########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 1.0e-03 # learning rate
epsilon: 1.0e-06 # epsilon
weight_decay: 0.0 # weight decay coefficient
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 42
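
The config above is consumed as-is by the new training entry point (see the train.py hunk later in this diff): the whole file is parsed into a yacs CfgNode, and the "model" section becomes the keyword arguments of the Tacotron2 constructor. A minimal loading sketch under those assumptions, with the vocab size taken from the phone map that preprocessing writes to dump/phone_id_map.txt. Assuming this port follows the usual ESPnet/Tachibana formulation, guided_attn_loss_sigma and guided_attn_loss_lambda are the sigma and weight of a guided-attention penalty of roughly 1 - exp(-(n/N - t/T)^2 / (2*sigma^2)) applied to each attention weight.

import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2

# parse the YAML shown above into an attribute-style config object
with open("conf/default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

# vocab size = number of entries in the phone id map written by preprocess.py
with open("dump/phone_id_map.txt") as f:
    vocab_size = len(f.readlines())

# idim/odim plus the "model" section are exactly the constructor's arguments
model = Tacotron2(idim=vocab_size, odim=config.n_mels, **config["model"])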

@ -0,0 +1,62 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats (mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone to id; dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi
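
The three stages above leave everything under dump/. A small sketch (hypothetical run location, same relative paths as the script) of inspecting one record of the raw metadata written by preprocess.py; the fields mirror the record dict built in the preprocess.py hunk further down (utt_id, phones, text_lengths, speech_lengths, speech, speaker):

import jsonlines

with jsonlines.open("dump/train/raw/metadata.jsonl") as reader:
    first = next(iter(reader))
# "speech" is the path of the saved *_speech.npy log-mel, not the array itself
print(first["utt_id"], first["text_lengths"], first["speech_lengths"], first["speech"])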

@ -0,0 +1,20 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
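
Internally, synthesize.py (its diff is near the end of this PR) rebuilds the acoustic model from the same config, restores --am_ckpt, and maps phone ids to a mel spectrogram that the vocoder turns into a waveform. A condensed sketch of those steps; the Tacotron2Inference and vocoder wrappers are only named in comments because their constructors are not shown in this diff:

import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2

with open("conf/default.yaml") as f:
    am_config = CfgNode(yaml.safe_load(f))
with open("dump/phone_id_map.txt") as f:
    vocab_size = len(f.readlines())

# same construction/restore calls as in the synthesize.py hunk below
am = Tacotron2(idim=vocab_size, odim=am_config.n_mels, **am_config["model"])
am.set_state_dict(
    paddle.load("exp/default/checkpoints/snapshot_iter_153.pdz")["main_params"])
am.eval()
# synthesize.py then wraps `am` (plus the --am_stat normalizer) in Tacotron2Inference
# and the chosen vocoder in its own inference wrapper, so for each test utterance:
#   mel = am_inference(phone_ids)   # phone ids come from dump/test/norm/metadata.jsonl
#   wav = voc_inference(mel)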

@ -0,0 +1,91 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph export is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
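
The --am / --voc flags such as tacotron2_csmsc and pwgan_csmsc encode a model family plus a dataset suffix; the family is looked up in the model_alias table that this PR extends in synthesize.py and synthesize_e2e.py (hunks below), whose values are "module:ClassName" strings. A hedged sketch of how such a string can be resolved with plain importlib (the repository may use its own dynamic-import helper to the same effect):

from importlib import import_module

def resolve(alias: str):
    # split "package.module:ClassName" into module path and attribute name
    module_path, class_name = alias.split(":")
    return getattr(import_module(module_path), class_name)

# one of the aliases added in this PR
Tacotron2 = resolve("paddlespeech.t2s.models.new_tacotron2:Tacotron2")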

@ -0,0 +1,12 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=new_tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,37 @@
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -53,8 +53,8 @@ model:
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
use_macaron_style_in_conformer: True # whether to use macaron style in conformer
use_cnn_in_conformer: True # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
@ -70,14 +70,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
@ -82,7 +82,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################

@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/preprocess.sh ${conf_path} || exit -1
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

@ -34,10 +34,10 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
bias: true # use bias in residual blocks
use_weight_norm: true # Whether to use weight norm.
bias: True # use bias in residual blocks
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
use_causal_conv: false # use causal conv in residual blocks and upsample layers
use_causal_conv: False # use causal conv in residual blocks and upsample layers
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
interpolate_mode: "nearest" # upsample net interpolation mode
freq_axis_kernel_size: 1 # upsampling net: convolution kernel size in frequency axis
@ -53,8 +53,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################

@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss.
@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################

@ -65,7 +65,7 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
###########################################################
# DATA LOADER SETTING #

@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
use_additional_convs: true # Whether to use additional conv layer in residual blocks.
bias: true # Whether to use bias parameter in conv.
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
bias: true
bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true # Whether to follow the official norm setting.
follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: true # Whether to use bias parameter in conv layer.
bias: True # Whether to use bias parameter in conv layer.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_spectral_norm: false # Whether to apply spectral normalization.
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: false # Whether to use multi-resolution STFT loss.
use_mel_loss: true # Whether to use Mel-spectrogram loss.
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_layers: false # Whether to average loss by #layers in each discriminator.
include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
average_by_discriminators: False # Whether to average loss by #discriminators.
average_by_layers: False # Whether to average loss by #layers in each discriminator.
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #

@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
use_additional_convs: true # Whether to use additional conv layer in residual blocks.
bias: true # Whether to use bias parameter in conv.
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
bias: true
bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true # Whether to follow the official norm setting.
follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: true # Whether to use bias parameter in conv layer.
bias: True # Whether to use bias parameter in conv layer.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_spectral_norm: false # Whether to apply spectral normalization.
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: false # Whether to use multi-resolution STFT loss.
use_mel_loss: true # Whether to use Mel-spectrogram loss.
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_layers: false # Whether to average loss by #layers in each discriminator.
include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
average_by_discriminators: False # Whether to average loss by #discriminators.
average_by_layers: False # Whether to average loss by #layers in each discriminator.
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #

@ -63,9 +63,9 @@ model: # keyword arguments for the selected model
# UPDATER SETTING #
###########################################################
updater:
use_masking: true # whether to apply masking for padded part in loss calculation
use_masking: True # whether to apply masking for padded part in loss calculation
loss_type: L1
use_guided_attn_loss: true # whether to use guided attention loss
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma in guided attention loss
guided_attn_loss_lambda: 10.0 # lambda in guided attention loss
modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -17,6 +17,35 @@ import paddle
from paddlespeech.t2s.data.batch import batch_sequences
def tacotron2_single_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
text_lengths = [
np.array(item["text_lengths"], dtype=np.int64) for item in examples
]
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
text = batch_sequences(text)
speech = batch_sequences(speech)
# convert each batch to paddle.Tensor
text = paddle.to_tensor(text)
speech = paddle.to_tensor(speech)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
batch = {
"text": text,
"text_lengths": text_lengths,
"speech": speech,
"speech_lengths": speech_lengths,
}
return batch
def speedyspeech_single_spk_batch_fn(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
@ -56,7 +85,7 @@ def speedyspeech_single_spk_batch_fn(examples):
def speedyspeech_multi_spk_batch_fn(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
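
The tacotron2_single_spk_batch_fn added above is the collate function that train.py passes to its DataLoaders: it pads the variable-length text and mel sequences of a batch and converts everything to paddle Tensors. A toy usage sketch with made-up phone ids and random mel values:

import numpy as np
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn

examples = [
    {"text": np.array([1, 2, 3], dtype=np.int64), "text_lengths": 3,
     "speech": np.random.randn(7, 80).astype(np.float32), "speech_lengths": 7},
    {"text": np.array([4, 5, 6, 7, 8], dtype=np.int64), "text_lengths": 5,
     "speech": np.random.randn(9, 80).astype(np.float32), "speech_lengths": 9},
]
batch = tacotron2_single_spk_batch_fn(examples)
print(batch["text"].shape)    # expect [2, 5]: padded to the longest text
print(batch["speech"].shape)  # expect [2, 9, 80]: padded to the longest mel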

@ -15,14 +15,14 @@
# for mb melgan finetune
# what should we do if the length does not match the original mel?
import argparse
import os
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from tqdm import tqdm
import os
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
@ -50,11 +50,14 @@ def evaluate(args, fastspeech2_config):
spk_id_list = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id_list)
else:
spk_num=None
spk_num = None
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num)
idim=vocab_size,
odim=odim,
**fastspeech2_config["model"],
spk_num=spk_num)
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
@ -99,9 +102,15 @@ def evaluate(args, fastspeech2_config):
else:
train_wav_files += wav_files
train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files]
dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files]
test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files]
train_wav_files = [
os.path.basename(str(str_path)) for str_path in train_wav_files
]
dev_wav_files = [
os.path.basename(str(str_path)) for str_path in dev_wav_files
]
test_wav_files = [
os.path.basename(str(str_path)) for str_path in test_wav_files
]
for i, utt_id in enumerate(tqdm(sentences)):
phones = sentences[utt_id][0]
@ -122,7 +131,8 @@ def evaluate(args, fastspeech2_config):
phone_ids = paddle.to_tensor(np.array(phone_ids))
if args.speaker_dict:
speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0])
speaker_id = int(
[item[1] for item in spk_id_list if speaker == item[0]][0])
speaker_id = paddle.to_tensor(speaker_id)
else:
speaker_id = None
@ -143,7 +153,8 @@ def evaluate(args, fastspeech2_config):
sub_output_dir.mkdir(parents=True, exist_ok=True)
with paddle.no_grad():
mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id)
mel = fastspeech2_inference(
phone_ids, durations=durations, spk_id=speaker_id)
np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
@ -175,12 +186,9 @@ def main():
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict",
type=str,
default=None,
help="speaker id map file.")
"--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1 @@
../transformer_tts/normalize.py

@ -0,0 +1,328 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
import jsonlines
import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.data.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
def process_sentence(config: Dict[str, Any],
fp: Path,
sentences: Dict,
output_dir: Path,
mel_extractor=None,
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
utt_id = utt_id[:-5]
record = None
if utt_id in sentences:
# reading, resampling may occur
wav, _ = librosa.load(str(fp), sr=config.fs)
if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
return record
assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(wav).max(
) <= 1.0, f"{utt_id} does not seem to be 16-bit PCM."
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
speaker = sentences[utt_id][2]
d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
# slightly less precise than using *.TextGrid directly
times = librosa.frames_to_time(
d_cumsum, sr=config.fs, hop_length=config.n_shift)
if cut_sil:
start = 0
end = d_cumsum[-1]
if phones[0] == "sil" and len(durations) > 1:
start = times[1]
durations = durations[1:]
phones = phones[1:]
if phones[-1] == 'sil' and len(durations) > 1:
end = times[-2]
durations = durations[:-1]
phones = phones[:-1]
sentences[utt_id][0] = phones
sentences[utt_id][1] = durations
start, end = librosa.time_to_samples([start, end], sr=config.fs)
wav = wav[start:end]
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(wav)
# change duration according to mel_length
compare_duration_and_mel_length(sentences, utt_id, logmel)
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
num_frames = logmel.shape[0]
assert sum(durations) == num_frames
mel_dir = output_dir / "data_speech"
mel_dir.mkdir(parents=True, exist_ok=True)
mel_path = mel_dir / (utt_id + "_speech.npy")
np.save(mel_path, logmel)
record = {
"utt_id": utt_id,
"phones": phones,
"text_lengths": len(phones),
"speech_lengths": num_frames,
"speech": str(mel_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
def process_sentences(config,
fps: List[Path],
sentences: Dict,
output_dir: Path,
mel_extractor=None,
nprocs: int=1,
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp in fps:
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
results = []
for ft in futures:
record = ft.result()
if record:
results.append(record)
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
print("Done")
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--dataset",
default="baker",
type=str,
help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
parser.add_argument(
"--rootdir", default=None, type=str, help="directory to dataset.")
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump feature files.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")
parser.add_argument("--config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--cut-sil",
type=str2bool,
default=True,
help="whether cut sil in the edge of audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory to speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
dumpdir = Path(args.dumpdir).expanduser()
# use absolute path
dumpdir = dumpdir.resolve()
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
if args.verbose > 1:
print(vars(args))
print(config)
sentences, speaker_set = get_phn_dur(dur_file)
merge_silence(sentences)
phone_id_map_path = dumpdir / "phone_id_map.txt"
speaker_id_map_path = dumpdir / "speaker_id_map.txt"
get_input_token(sentences, phone_id_map_path, args.dataset)
get_spk_id_map(speaker_set, speaker_id_map_path)
if args.dataset == "baker":
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
elif args.dataset == "aishell3":
sub_num_dev = 5
wav_dir = rootdir / "train" / "wav"
train_wav_files = []
dev_wav_files = []
test_wav_files = []
for speaker in os.listdir(wav_dir):
wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
if len(wav_files) > 100:
train_wav_files += wav_files[:-sub_num_dev * 2]
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
num_train = 12900
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
elif args.dataset == "vctk":
sub_num_dev = 5
wav_dir = rootdir / "wav48_silence_trimmed"
train_wav_files = []
dev_wav_files = []
test_wav_files = []
for speaker in os.listdir(wav_dir):
wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
if len(wav_files) > 100:
train_wav_files += wav_files[:-sub_num_dev * 2]
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
else:
print("dataset should in {baker, aishell3, ljspeech, vctk} now!")
train_dump_dir = dumpdir / "train" / "raw"
train_dump_dir.mkdir(parents=True, exist_ok=True)
dev_dump_dir = dumpdir / "dev" / "raw"
dev_dump_dir.mkdir(parents=True, exist_ok=True)
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
# Extractor
mel_extractor = LogMelFBank(
sr=config.fs,
n_fft=config.n_fft,
hop_length=config.n_shift,
win_length=config.win_length,
window=config.window,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# process for the 3 sections
if train_wav_files:
process_sentences(
config,
train_wav_files,
sentences,
train_dump_dir,
mel_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
dev_wav_files,
sentences,
dev_dump_dir,
mel_extractor,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
test_wav_files,
sentences,
test_dump_dir,
mel_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":
main()
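
The feature extractor at the core of preprocess.py is LogMelFBank, configured from the same fields as conf/default.yaml. A standalone sketch of extracting features for a single file (hypothetical wav path):

import librosa
from paddlespeech.t2s.data.get_feats import LogMelFBank

mel_extractor = LogMelFBank(
    sr=24000, n_fft=2048, hop_length=300, win_length=1200,
    window="hann", n_mels=80, fmin=80, fmax=7600)

# mono float waveform in [-1, 1], resampled to 24 kHz on load
wav, _ = librosa.load("BZNSYP/Wave/000001.wav", sr=24000)
logmel = mel_extractor.get_log_mel_fbank(wav)
# expected shape: (num_frames, 80); num_frames must match the summed phone durations
print(logmel.shape)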

@ -0,0 +1,190 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed; it is required for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
# the dataloader logger is too verbose
logging.getLogger("DataLoader").disabled = True
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=[
"text",
"text_lengths",
"speech",
"speech_lengths",
],
converters={
"speech": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=[
"text",
"text_lengths",
"speech",
"speech_lengths",
],
converters={
"speech": np.load,
}, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=True,
drop_last=True)
print("samplers done!")
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=tacotron2_single_spk_batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
shuffle=False,
drop_last=False,
batch_size=config.batch_size,
collate_fn=tacotron2_single_spk_batch_fn,
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = config.n_mels
model = Tacotron2(idim=vocab_size, odim=odim, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
optimizer = build_optimizers(model, **config["optimizer"])
print("optimizer done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if dist.get_rank() == 0:
config_name = args.config.split("/")[-1]
# copy conf to output_dir
shutil.copyfile(args.config, output_dir / config_name)
updater = Tacotron2Updater(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
output_dir=output_dir,
**config["updater"])
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
evaluator = Tacotron2Evaluator(
model, dev_dataloader, output_dir=output_dir, **config["updater"])
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
trainer.extend(
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
# print(trainer.extensions)
trainer.run()
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a Tacotron2 model.")
parser.add_argument("--config", type=str, help="tacotron2 config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
args = parser.parse_args()
with open(args.config) as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
# dispatch
if args.ngpu > 1:
dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
else:
train_sp(args, config)
if __name__ == "__main__":
main()

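For reference, the entry point above can also be driven directly from Python instead of the CLI. The sketch below does that with placeholder paths; none of these values come from this change, and `CfgNode` is assumed to be `yacs.config.CfgNode`, matching the way `main()` builds the config.

import argparse
import yaml
from yacs.config import CfgNode  # assumed import; main() above builds the config the same way

# hypothetical paths -- adjust to the actual dump/exp layout
args = argparse.Namespace(
    config="conf/default.yaml",
    train_metadata="dump/train/norm/metadata.jsonl",
    dev_metadata="dump/dev/norm/metadata.jsonl",
    output_dir="exp/new_tacotron2",
    ngpu=1,
    phones_dict="dump/phone_id_map.txt")
with open(args.config) as f:
    config = CfgNode(yaml.safe_load(f))
train_sp(args, config)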
@ -36,6 +36,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@ -91,6 +95,8 @@ def evaluate(args):
print("spk_num:", spk_num)
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
fields = ["utt_id", "text"]
test_dataset = DataTable(data=test_metadata, fields=fields)
@ -117,6 +123,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@ -168,6 +176,9 @@ def evaluate(args):
phone_ids = paddle.to_tensor(datum["phones"])
tone_ids = paddle.to_tensor(datum["tones"])
mel = am_inference(phone_ids, tone_ids)
elif am_name == 'tacotron2':
phone_ids = paddle.to_tensor(datum["text"])
mel = am_inference(phone_ids)
# vocoder
wav = voc_inference(mel)
sf.write(
@ -188,7 +199,7 @@ def main():
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk'
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(

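The new `tacotron2_csmsc` choice is split into a model name and a dataset suffix before the `model_alias` lookup. The split helper itself is not shown in this hunk, so the sketch below only mirrors the usual pattern and is an assumption:

am = "tacotron2_csmsc"                        # value passed via --am
am_name = am[:am.rindex('_')]                 # "tacotron2"
am_dataset = am[am.rindex('_') + 1:]          # "csmsc"
print(model_alias[am_name])                   # paddlespeech.t2s.models.new_tacotron2:Tacotron2
print(model_alias[am_name + "_inference"])    # paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference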
@ -38,6 +38,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@ -126,6 +130,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@ -237,6 +243,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
part_tone_ids = tone_ids[i]
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
# vocoder
wav = voc_inference(mel)
if flags == 0:
@ -262,7 +270,7 @@ def main():
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk'
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(

@ -14,6 +14,7 @@
from .fastspeech2 import *
from .hifigan import *
from .melgan import *
from .new_tacotron2 import *
from .parallel_wavegan import *
from .speedyspeech import *
from .tacotron2 import *

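With this export both Tacotron2 implementations live side by side under `paddlespeech.t2s.models`; a quick import check (only the new class name is confirmed by this diff, the legacy module is imported as a whole here):

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2        # ESPnet-style model added in this PR
from paddlespeech.t2s.models import tacotron2 as legacy_tacotron2  # pre-existing implementation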
@ -556,8 +556,7 @@ class FastSpeech2(nn.Layer):
tone_id=tone_id)
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = paddle.to_tensor(
[olen - olen % self.reduction_factor for olen in olens.numpy()])
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]

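The removed list comprehension and the new tensor expression compute the same thing while keeping `olens` on device; a quick check of the rounding:

import paddle
olens = paddle.to_tensor([7, 10, 12])
reduction_factor = 3
# each length is rounded down to a multiple of the reduction factor
print(olens - olens % reduction_factor)   # Tensor([6, 9, 12])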
@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
@ -28,20 +32,17 @@ logger.setLevel(logging.INFO)
class FastSpeech2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None):
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.criterion = FastSpeech2Loss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking)
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -107,14 +108,12 @@ class FastSpeech2Updater(StandardUpdater):
class FastSpeech2Evaluator(StandardEvaluator):
def __init__(self,
model,
dataloader,
use_masking=False,
use_weighted_masking=False,
output_dir=None):
model: Layer,
dataloader: DataLoader,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None):
super().__init__(model, dataloader)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -123,8 +122,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
self.msg = ""
self.criterion = FastSpeech2Loss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking)
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
def evaluate_core(self, batch):
self.msg = "Evaluate: "

@ -0,0 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tacotron2 import *
from .tacotron2_updater import *

@ -0,0 +1,500 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron 2 related modules for paddle"""
import logging
from typing import Dict
from typing import Optional
from typing import Tuple
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.tacotron2.attentions import AttForward
from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
from paddlespeech.t2s.modules.tacotron2.decoder import Decoder
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder
class Tacotron2(nn.Layer):
"""Tacotron2 module for end-to-end text-to-speech.
This is a module of the spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
which converts the sequence of characters into the sequence of Mel-filterbanks.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
"""
def __init__(
self,
# network structure related
idim: int,
odim: int,
embed_dim: int=512,
elayers: int=1,
eunits: int=512,
econv_layers: int=3,
econv_chans: int=512,
econv_filts: int=5,
atype: str="location",
adim: int=512,
aconv_chans: int=32,
aconv_filts: int=15,
cumulate_att_w: bool=True,
dlayers: int=2,
dunits: int=1024,
prenet_layers: int=2,
prenet_units: int=256,
postnet_layers: int=5,
postnet_chans: int=512,
postnet_filts: int=5,
output_activation: str=None,
use_batch_norm: bool=True,
use_concate: bool=True,
use_residual: bool=False,
reduction_factor: int=1,
# extra embedding related
spk_num: Optional[int]=None,
lang_num: Optional[int]=None,
spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="concat",
dropout_rate: float=0.5,
zoneout_rate: float=0.1,
# training related
init_type: str="xavier_uniform", ):
"""Initialize Tacotron2 module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
embed_dim : int
Dimension of the token embedding.
elayers : int
Number of encoder blstm layers.
eunits : int
Number of encoder blstm units.
econv_layers : int
Number of encoder conv layers.
econv_filts : int
Size of encoder conv filters.
econv_chans : int
Number of encoder conv filter channels.
dlayers : int
Number of decoder lstm layers.
dunits : int
Number of decoder lstm units.
prenet_layers : int
Number of prenet layers.
prenet_units : int
Number of prenet units.
postnet_layers : int
Number of postnet layers.
postnet_filts : int
Size of postnet filters.
postnet_chans : int
Number of postnet filter channels.
output_activation : str
Name of activation function for outputs.
adim : int
Dimension of the MLP in attention.
aconv_chans : int
Number of attention conv filter channels.
aconv_filts : int
Size of attention conv filters.
cumulate_att_w : bool
Whether to cumulate previous attention weight.
use_batch_norm : bool
Whether to use batch normalization.
use_concate : bool
Whether to concat enc outputs w/ dec lstm outputs.
reduction_factor : int
Reduction factor.
spk_num : Optional[int]
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
lang_num : Optional[int]
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use lid embedding layer.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If set to > 0,
assume that spk_emb will be provided as the input.
spk_embed_integration_type : str
How to integrate speaker embedding.
dropout_rate : float
Dropout rate.
zoneout_rate : float
Zoneout rate.
"""
assert check_argument_types()
super().__init__()
# store hyperparameters
self.idim = idim
self.odim = odim
self.eos = idim - 1
self.cumulate_att_w = cumulate_att_w
self.reduction_factor = reduction_factor
# define activation function for the final output
if output_activation is None:
self.output_activation_fn = None
elif hasattr(F, output_activation):
self.output_activation_fn = getattr(F, output_activation)
else:
raise ValueError(f"there is no such an activation function. "
f"({output_activation})")
# set padding idx
padding_idx = 0
self.padding_idx = padding_idx
# initialize parameters
initialize(self, init_type)
# define network modules
self.enc = Encoder(
idim=idim,
embed_dim=embed_dim,
elayers=elayers,
eunits=eunits,
econv_layers=econv_layers,
econv_chans=econv_chans,
econv_filts=econv_filts,
use_batch_norm=use_batch_norm,
use_residual=use_residual,
dropout_rate=dropout_rate,
padding_idx=padding_idx, )
self.spk_num = None
if spk_num is not None and spk_num > 1:
self.spk_num = spk_num
self.sid_emb = nn.Embedding(spk_num, eunits)
self.lang_num = None
if lang_num is not None and lang_num > 1:
self.lang_num = lang_num
self.lid_emb = nn.Embedding(lang_num, eunits)
self.spk_embed_dim = None
if spk_embed_dim is not None and spk_embed_dim > 0:
self.spk_embed_dim = spk_embed_dim
self.spk_embed_integration_type = spk_embed_integration_type
if self.spk_embed_dim is None:
dec_idim = eunits
elif self.spk_embed_integration_type == "concat":
dec_idim = eunits + spk_embed_dim
elif self.spk_embed_integration_type == "add":
dec_idim = eunits
self.projection = nn.Linear(self.spk_embed_dim, eunits)
else:
raise ValueError(f"{spk_embed_integration_type} is not supported.")
if atype == "location":
att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
elif atype == "forward":
att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
if self.cumulate_att_w:
logging.warning("cumulation of attention weights is disabled "
"in forward attention.")
self.cumulate_att_w = False
elif atype == "forward_ta":
att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts,
odim)
if self.cumulate_att_w:
logging.warning("cumulation of attention weights is disabled "
"in forward attention.")
self.cumulate_att_w = False
else:
raise NotImplementedError("Support only location or forward")
self.dec = Decoder(
idim=dec_idim,
odim=odim,
att=att,
dlayers=dlayers,
dunits=dunits,
prenet_layers=prenet_layers,
prenet_units=prenet_units,
postnet_layers=postnet_layers,
postnet_chans=postnet_chans,
postnet_filts=postnet_filts,
output_activation_fn=self.output_activation_fn,
cumulate_att_w=self.cumulate_att_w,
use_batch_norm=use_batch_norm,
use_concate=use_concate,
dropout_rate=dropout_rate,
zoneout_rate=zoneout_rate,
reduction_factor=reduction_factor, )
nn.initializer.set_global_initializer(None)
def forward(
self,
text: paddle.Tensor,
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spk_emb: Optional[paddle.Tensor]=None,
spk_id: Optional[paddle.Tensor]=None,
lang_id: Optional[paddle.Tensor]=None
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
Parameters
----------
text : Tensor(int64)
Batch of padded character ids (B, T_text).
text_lengths : Tensor(int64)
Batch of lengths of each input batch (B,).
speech : Tensor
Batch of padded target features (B, T_feats, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spk_emb : Optional[Tensor]
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Optional[Tensor]
Batch of speaker IDs (B, 1).
lang_id : Optional[Tensor]
Batch of language IDs (B, 1).
Returns
----------
Tensor
Loss scalar value.
Dict
Statistics to be monitored.
Tensor
Weight value if not joint training else model outputs.
"""
text = text[:, :text_lengths.max()]
speech = speech[:, :speech_lengths.max()]
batch_size = paddle.shape(text)[0]
# Add eos at the last of sequence
xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
for i, l in enumerate(text_lengths):
xs[i, l] = self.eos
ilens = text_lengths + 1
ys = speech
olens = speech_lengths
# make labels for stop prediction
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
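# stop_labels is now 1.0 from the last real frame onward (padding included) and 0.0 elsewhere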
# calculate tacotron2 outputs
after_outs, before_outs, logits, att_ws = self._forward(
xs=xs,
ilens=ilens,
ys=ys,
olens=olens,
spk_emb=spk_emb,
spk_id=spk_id,
lang_id=lang_id, )
# modify mod part of groundtruth
if self.reduction_factor > 1:
assert olens.ge(self.reduction_factor).all(
), "Output length must be greater than or equal to reduction factor."
olens = olens - olens % self.reduction_factor
max_out = max(olens)
ys = ys[:, :max_out]
stop_labels = stop_labels[:, :max_out]
stop_labels = paddle.scatter(stop_labels, 1,
(olens - 1).unsqueeze(1), 1.0)
olens_in = olens // self.reduction_factor
else:
olens_in = olens
return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
def _forward(
self,
xs: paddle.Tensor,
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spk_emb: paddle.Tensor,
spk_id: paddle.Tensor,
lang_id: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
hs, hlens = self.enc(xs, ilens)
if self.spk_num is not None:
sid_embs = self.sid_emb(spk_id.reshape([-1]))
hs = hs + sid_embs.unsqueeze(1)
if self.lang_num is not None:
lid_embs = self.lid_emb(lang_id.reshape([-1]))
hs = hs + lid_embs.unsqueeze(1)
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
return self.dec(hs, hlens, ys)
def inference(
self,
text: paddle.Tensor,
speech: Optional[paddle.Tensor]=None,
spk_emb: Optional[paddle.Tensor]=None,
spk_id: Optional[paddle.Tensor]=None,
lang_id: Optional[paddle.Tensor]=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
use_att_constraint: bool=False,
backward_window: int=1,
forward_window: int=3,
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T_text,).
speech : Optional[Tensor]
Feature sequence to extract style (N, idim).
spk_emb : Optional[Tensor]
Speaker embedding (spk_embed_dim,).
spk_id : Optional[Tensor]
Speaker ID (1,).
lang_id : Optional[Tensor]
Language ID (1,).
threshold : float
Threshold in inference.
minlenratio : float
Minimum length ratio in inference.
maxlenratio : float
Maximum length ratio in inference.
use_att_constraint : bool
Whether to apply attention constraint.
backward_window : int
Backward window in attention constraint.
forward_window : int
Forward window in attention constraint.
use_teacher_forcing : bool
Whether to use teacher forcing.
Returns
----------
Dict[str, Tensor]
Output dict including the following items:
* feat_gen (Tensor): Output sequence of features (T_feats, odim).
* prob (Tensor): Output sequence of stop probabilities (T_feats,).
* att_w (Tensor): Attention weights (T_feats, T).
"""
x = text
y = speech
# add eos at the last of sequence
x = F.pad(x, [0, 1], "constant", self.eos)
# inference with teacher forcing
if use_teacher_forcing:
assert speech is not None, "speech must be provided with teacher forcing."
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.shape(xs)[1]
olens = paddle.shape(ys)[1]
outs, _, _, att_ws = self._forward(
xs=xs,
ilens=ilens,
ys=ys,
olens=olens,
spk_emb=spk_emb,
spk_id=spk_id,
lang_id=lang_id, )
return dict(feat_gen=outs[0], att_w=att_ws[0])
# inference
h = self.enc.inference(x)
if self.spk_num is not None:
sid_emb = self.sid_emb(spk_id.reshape([-1]))
h = h + sid_emb
if self.lang_num is not None:
lid_emb = self.lid_emb(lang_id.reshape([-1]))
h = h + lid_emb
if self.spk_embed_dim is not None:
hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0)
h = self._integrate_with_spk_embed(hs, spk_emb)[0]
out, prob, att_w = self.dec.inference(
h,
threshold=threshold,
minlenratio=minlenratio,
maxlenratio=maxlenratio,
use_att_constraint=use_att_constraint,
backward_window=backward_window,
forward_window=forward_window, )
return dict(feat_gen=out, prob=prob, att_w=att_w)
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, eunits).
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
----------
Tensor
Batch of integrated hidden state sequences (B, Tmax, eunits) if
integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
-1, paddle.shape(hs)[1], -1)
hs = paddle.concat([hs, spk_emb], axis=-1)
else:
raise NotImplementedError("support only add or concat.")
return hs
class Tacotron2Inference(nn.Layer):
def __init__(self, normalizer, model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, spk_id=None, spk_emb=None):
out = self.acoustic_model.inference(
text, spk_id=spk_id, spk_emb=spk_emb)
normalized_mel = out["feat_gen"]
logmel = self.normalizer.inverse(normalized_mel)
return logmel

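A minimal smoke-test sketch of the class above; the vocabulary size, mel dimension, and utterance length are placeholders, and the normalizer mentioned at the end is assumed to be a fitted statistics object as used elsewhere in PaddleSpeech.

import paddle

model = Tacotron2(idim=68, odim=80)            # 68 token ids incl. <eos>, 80 mel bins (placeholders)
model.eval()
phone_ids = paddle.randint(0, 67, shape=[12])  # a 12-token utterance
out = model.inference(phone_ids)
print(out["feat_gen"].shape)                   # (T_feats, 80)
# With a fitted normalizer, Tacotron2Inference returns denormalized log-mels:
# am_inference = Tacotron2Inference(normalizer, model)
# logmel = am_inference(phone_ids)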
@ -0,0 +1,219 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.modules.losses import GuidedAttentionLoss
from paddlespeech.t2s.modules.losses import Tacotron2Loss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Tacotron2Updater(StandardUpdater):
def __init__(self,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking: bool=True,
use_weighted_masking: bool=False,
bce_pos_weight: float=5.0,
loss_type: str="L1+L2",
use_guided_attn_loss: bool=True,
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0,
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.taco2_loss = Tacotron2Loss(
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight, )
if self.use_guided_attn_loss:
self.attn_loss = GuidedAttentionLoss(
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
def update_core(self, batch):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
# spk_id is not None in multi-speaker tacotron2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
spk_id=spk_id,
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss
elif self.loss_type == "L1":
loss = l1_loss + bce_loss
elif self.loss_type == "L2":
loss = mse_loss + bce_loss
else:
raise ValueError(f"unknown --loss-type {self.loss_type}")
# calculate attention loss
if self.use_guided_attn_loss:
# NOTE: length of output for auto-regressive
# input will be changed when r > 1
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
optimizer = self.optimizer
optimizer.clear_grad()
loss.backward()
optimizer.step()
report("train/l1_loss", float(l1_loss))
report("train/mse_loss", float(mse_loss))
report("train/bce_loss", float(bce_loss))
report("train/attn_loss", float(attn_loss))
report("train/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
class Tacotron2Evaluator(StandardEvaluator):
def __init__(self,
model: Layer,
dataloader: DataLoader,
use_masking: bool=True,
use_weighted_masking: bool=False,
bce_pos_weight: float=5.0,
loss_type: str="L1+L2",
use_guided_attn_loss: bool=True,
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0,
output_dir=None):
super().__init__(model, dataloader)
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.taco2_loss = Tacotron2Loss(
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight, )
if self.use_guided_attn_loss:
self.attn_loss = GuidedAttentionLoss(
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
# spk_id is not None in multi-speaker tacotron2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
spk_id=spk_id,
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss
elif self.loss_type == "L1":
loss = l1_loss + bce_loss
elif self.loss_type == "L2":
loss = mse_loss + bce_loss
else:
raise ValueError(f"unknown --loss-type {self.loss_type}")
# calculate attention loss
if self.use_guided_attn_loss:
# NOTE: length of output for auto-regressive
# input will be changed when r > 1
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
report("eval/l1_loss", float(l1_loss))
report("eval/mse_loss", float(mse_loss))
report("eval/bce_loss", float(bce_loss))
report("eval/attn_loss", float(attn_loss))
report("eval/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
self.logger.info(self.msg)

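For orientation, with the constructor defaults shown above (loss_type="L1+L2", use_guided_attn_loss=True), the total objective assembled in update_core reduces to the sum below.

# loss = l1_loss + mse_loss + bce_loss   # Tacotron2Loss terms (L1, MSE, stop-token BCE)
#        + attn_loss                     # GuidedAttentionLoss, scaled internally by guided_attn_loss_lambda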
@ -12,11 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
from paddle.io import DataLoader
from paddle.nn import functional as F
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.modules.losses import masked_l1_loss
from paddlespeech.t2s.modules.losses import ssim
@ -33,11 +37,11 @@ logger.setLevel(logging.INFO)
class SpeedySpeechUpdater(StandardUpdater):
def __init__(self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
output_dir=None):
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
@ -103,7 +107,10 @@ class SpeedySpeechUpdater(StandardUpdater):
class SpeedySpeechEvaluator(StandardEvaluator):
def __init__(self, model, dataloader, output_dir=None):
def __init__(self,
model: Layer,
dataloader: DataLoader,
output_dir: Path=None):
super().__init__(model, dataloader)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())

@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer):
olens = paddle.cast(speech_lengths, 'int64')
# make labels for stop prediction
labels = make_pad_mask(olens - 1)
labels = numpy.pad(
labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0)
labels = paddle.to_tensor(labels)
labels = paddle.cast(labels, dtype="float32")
# labels = F.pad(labels, [0, 1], "constant", 1.0)
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
@ -447,12 +445,15 @@ class TransformerTTS(nn.Layer):
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = paddle.to_tensor(
[olen - olen % self.reduction_factor for olen in olens.numpy()])
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
labels = labels[:, :max_olen]
labels[:, -1] = 1.0 # make sure at least one frame has 1
stop_labels = stop_labels[:, :max_olen]
stop_labels[:, -1] = 1.0 # make sure at least one frame has 1
olens_in = olens // self.reduction_factor
else:
olens_in = olens
need_dict = {}
need_dict['encoder'] = self.encoder
need_dict['decoder'] = self.decoder
@ -462,7 +463,7 @@ class TransformerTTS(nn.Layer):
'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc
return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict
def _forward(
self,
@ -488,8 +489,7 @@ class TransformerTTS(nn.Layer):
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
olens_in = olens.new(
[olen // self.reduction_factor for olen in olens])
olens_in = olens // self.reduction_factor
else:
ys_in, olens_in = ys, olens
@ -769,318 +769,3 @@ class TransformerTTSInference(nn.Layer):
normalized_mel = self.acoustic_model.inference(text)[0]
logmel = self.normalizer.inverse(normalized_mel)
return logmel
class TransformerTTSLoss(nn.Layer):
"""Loss function module for Tacotron2."""
def __init__(self,
use_masking=True,
use_weighted_masking=False,
bce_pos_weight=5.0):
"""Initialize Tactoron2 loss module.
Parameters
----------
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float
Weight of positive sample of stop token.
"""
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.bce_criterion = nn.BCEWithLogitsLoss(
reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
def forward(self, after_outs, before_outs, logits, ys, labels, olens):
"""Calculate forward propagation.
Parameters
----------
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
logits : Tensor
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
labels : LongTensor
Batch of the sequences of stop token labels (B, Lmax).
olens : LongTensor
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Mean square error loss value.
Tensor
Binary cross entropy loss value.
"""
# make mask and apply it
if self.use_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
ys = ys.masked_select(masks.broadcast_to(ys.shape))
after_outs = after_outs.masked_select(
masks.broadcast_to(after_outs.shape))
before_outs = before_outs.masked_select(
masks.broadcast_to(before_outs.shape))
# Operator slice does not have kernel for data_type[bool]
tmp_masks = paddle.cast(masks, dtype='int64')
tmp_masks = tmp_masks[:, :, 0]
tmp_masks = paddle.cast(tmp_masks, dtype='bool')
labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
# calculate loss
l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
before_outs, ys)
mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
before_outs, ys)
bce_loss = self.bce_criterion(logits, labels)
# make weighted mask and apply it
if self.use_weighted_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
out_weights = weights.div(ys.shape[0] * ys.shape[2])
logit_weights = weights.div(ys.shape[0])
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(
masks.broadcast_to(l1_loss.shape)).sum()
mse_loss = mse_loss.multiply(out_weights)
mse_loss = mse_loss.masked_select(
masks.broadcast_to(mse_loss.shape)).sum()
bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
bce_loss = bce_loss.masked_select(
masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
return l1_loss, mse_loss, bce_loss
class GuidedAttentionLoss(nn.Layer):
"""Guided attention loss function module.
This module calculates the guided attention loss described
in `Efficiently Trainable Text-to-Speech System Based
on Deep Convolutional Networks with Guided Attention`_,
which forces the attention to be diagonal.
.. _`Efficiently Trainable Text-to-Speech System
Based on Deep Convolutional Networks with Guided Attention`:
https://arxiv.org/abs/1710.08969
"""
def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
"""Initialize guided attention loss module.
Parameters
----------
sigma : float, optional
Standard deviation to control how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
super(GuidedAttentionLoss, self).__init__()
self.sigma = sigma
self.alpha = alpha
self.reset_always = reset_always
self.guided_attn_masks = None
self.masks = None
def _reset_masks(self):
self.guided_attn_masks = None
self.masks = None
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of attention weights (B, T_max_out, T_max_in).
ilens : LongTensor
Batch of input lengths (B,).
olens : LongTensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = self._make_guided_attention_masks(ilens,
olens)
if self.masks is None:
self.masks = self._make_masks(ilens, olens)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
def _make_guided_attention_masks(self, ilens, olens):
n_batches = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
ilen = int(ilen)
olen = int(olen)
guided_attn_masks[idx, :olen, :
ilen] = self._make_guided_attention_mask(
ilen, olen, self.sigma)
return guided_attn_masks
@staticmethod
def _make_guided_attention_mask(ilen, olen, sigma):
"""Make guided attention mask.
Examples
----------
>>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
>>> guided_attn_mask.shape
[5, 5]
>>> guided_attn_mask
tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
[0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
[0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
[0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
[0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
>>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
>>> guided_attn_mask.shape
[6, 3]
>>> guided_attn_mask
tensor([[0.0000, 0.2934, 0.7506],
[0.0831, 0.0831, 0.5422],
[0.2934, 0.0000, 0.2934],
[0.5422, 0.0831, 0.0831],
[0.7506, 0.2934, 0.0000],
[0.8858, 0.5422, 0.0831]])
"""
grid_x, grid_y = paddle.meshgrid(
paddle.arange(olen), paddle.arange(ilen))
grid_x = grid_x.cast(dtype=paddle.float32)
grid_y = grid_y.cast(dtype=paddle.float32)
return 1.0 - paddle.exp(-(
(grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
@staticmethod
def _make_masks(ilens, olens):
"""Make masks indicating non-padded part.
Parameters
----------
ilens (LongTensor or List): Batch of lengths (B,).
olens (LongTensor or List): Batch of lengths (B,).
Returns
----------
Tensor
Mask tensor indicating non-padded part.
Examples
----------
>>> ilens, olens = [5, 2], [8, 5]
>>> _make_mask(ilens, olens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]], dtype=paddle.uint8)
"""
# (B, T_in)
in_masks = make_non_pad_mask(ilens)
# (B, T_out)
out_masks = make_non_pad_mask(olens)
# (B, T_out, T_in)
return paddle.logical_and(
out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Guided attention loss function module for multi head attention.
Parameters
----------
sigma : float, optional
Standard deviation to control
how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens : Tensor
Batch of input lengths (B,).
olens : Tensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = (
self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
if self.masks is None:
self.masks = self._make_masks(ilens, olens).unsqueeze(1)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss

@ -12,13 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from typing import Sequence
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
@ -32,38 +36,34 @@ logger.setLevel(logging.INFO)
class TransformerTTSUpdater(StandardUpdater):
def __init__(
self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None,
bce_pos_weight=5.0,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None,
bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, optimizer, dataloader, init_state=None)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.bce_pos_weight = bce_pos_weight
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.guided_attn_loss_sigma = guided_attn_loss_sigma
self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking,
bce_pos_weight=self.bce_pos_weight)
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
sigma=self.guided_attn_loss_sigma,
alpha=self.guided_attn_loss_lambda, )
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("train/bce_loss", float(bce_loss))
@ -120,7 +120,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
enc_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@ -137,7 +140,8 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
dec_attn_loss = self.attn_criterion(
att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("train/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@ -154,7 +158,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
enc_dec_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=olens_in)
report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss
@ -182,37 +189,33 @@ class TransformerTTSUpdater(StandardUpdater):
class TransformerTTSEvaluator(StandardEvaluator):
def __init__(
self,
model,
dataloader,
model: Layer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None,
bce_pos_weight=5.0,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None,
bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, dataloader)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.bce_pos_weight = bce_pos_weight
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.guided_attn_loss_sigma = guided_attn_loss_sigma
self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking,
bce_pos_weight=self.bce_pos_weight)
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
sigma=self.guided_attn_loss_sigma,
alpha=self.guided_attn_loss_lambda, )
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -223,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -234,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("eval/bce_loss", float(bce_loss))
@ -268,7 +271,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
enc_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@ -285,7 +291,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
dec_attn_loss = self.attn_criterion(
att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("eval/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@ -303,7 +310,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
enc_dec_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=olens_in)
report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss

@ -20,6 +20,314 @@ from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from scipy import signal
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
# Loss for new Tacotron2
class GuidedAttentionLoss(nn.Layer):
"""Guided attention loss function module.
This module calculates the guided attention loss described
in `Efficiently Trainable Text-to-Speech System Based
on Deep Convolutional Networks with Guided Attention`_,
which forces the attention to be diagonal.
.. _`Efficiently Trainable Text-to-Speech System
Based on Deep Convolutional Networks with Guided Attention`:
https://arxiv.org/abs/1710.08969
"""
def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
"""Initialize guided attention loss module.
Parameters
----------
sigma : float, optional
Standard deviation to control how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
super().__init__()
self.sigma = sigma
self.alpha = alpha
self.reset_always = reset_always
self.guided_attn_masks = None
self.masks = None
def _reset_masks(self):
self.guided_attn_masks = None
self.masks = None
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of attention weights (B, T_max_out, T_max_in).
ilens : Tensor(int64)
Batch of input lengths (B,).
olens : Tensor(int64)
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = self._make_guided_attention_masks(ilens,
olens)
if self.masks is None:
self.masks = self._make_masks(ilens, olens)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
def _make_guided_attention_masks(self, ilens, olens):
n_batches = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
guided_attn_masks[idx, :olen, :
ilen] = self._make_guided_attention_mask(
ilen, olen, self.sigma)
return guided_attn_masks
@staticmethod
def _make_guided_attention_mask(ilen, olen, sigma):
"""Make guided attention mask.
Examples
----------
>>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
>>> guided_attn_mask.shape
[5, 5]
>>> guided_attn_mask
tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
[0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
[0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
[0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
[0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
>>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
>>> guided_attn_mask.shape
[6, 3]
>>> guided_attn_mask
tensor([[0.0000, 0.2934, 0.7506],
[0.0831, 0.0831, 0.5422],
[0.2934, 0.0000, 0.2934],
[0.5422, 0.0831, 0.0831],
[0.7506, 0.2934, 0.0000],
[0.8858, 0.5422, 0.0831]])
"""
grid_x, grid_y = paddle.meshgrid(
paddle.arange(olen), paddle.arange(ilen))
grid_x = grid_x.cast(dtype=paddle.float32)
grid_y = grid_y.cast(dtype=paddle.float32)
return 1.0 - paddle.exp(-(
(grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
@staticmethod
def _make_masks(ilens, olens):
"""Make masks indicating non-padded part.
Parameters
----------
ilens : Tensor(int64) or List
Batch of lengths (B,).
olens : Tensor(int64) or List
Batch of lengths (B,).
Returns
----------
Tensor
Mask tensor indicating non-padded part.
Examples
----------
>>> ilens, olens = [5, 2], [8, 5]
>>> _make_mask(ilens, olens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]], dtype=paddle.uint8)
"""
# (B, T_in)
in_masks = make_non_pad_mask(ilens)
# (B, T_out)
out_masks = make_non_pad_mask(olens)
# (B, T_out, T_in)
return paddle.logical_and(
out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Guided attention loss function module for multi head attention.
Parameters
----------
sigma : float, optional
Standard deviation to control
how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens : Tensor
Batch of input lengths (B,).
olens : Tensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = (
self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
if self.masks is None:
self.masks = self._make_masks(ilens, olens).unsqueeze(1)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
class Tacotron2Loss(nn.Layer):
"""Loss function module for Tacotron2."""
def __init__(self,
use_masking=True,
use_weighted_masking=False,
bce_pos_weight=20.0):
"""Initialize Tactoron2 loss module.
Parameters
----------
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float
Weight of positive sample of stop token.
"""
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.bce_criterion = nn.BCEWithLogitsLoss(
reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
"""Calculate forward propagation.
Parameters
----------
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
logits : Tensor
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
stop_labels : Tensor(int64)
Batch of the sequences of stop token labels (B, Lmax).
olens : Tensor(int64)
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Mean square error loss value.
Tensor
Binary cross entropy loss value.
"""
# make mask and apply it
if self.use_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
ys = ys.masked_select(masks.broadcast_to(ys.shape))
after_outs = after_outs.masked_select(
masks.broadcast_to(after_outs.shape))
before_outs = before_outs.masked_select(
masks.broadcast_to(before_outs.shape))
stop_labels = stop_labels.masked_select(
masks[:, :, 0].broadcast_to(stop_labels.shape))
logits = logits.masked_select(
masks[:, :, 0].broadcast_to(logits.shape))
# calculate loss
l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
before_outs, ys)
mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
before_outs, ys)
bce_loss = self.bce_criterion(logits, stop_labels)
# make weighted mask and apply it
if self.use_weighted_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
weights = masks.float() / masks.sum(axis=1, keepdim=True).float()
out_weights = weights.divide(
paddle.shape(ys)[0] * paddle.shape(ys)[2])
logit_weights = weights.divide(paddle.shape(ys)[0])
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum()
mse_loss = mse_loss.multiply(out_weights)
mse_loss = mse_loss.masked_select(
masks.broadcast_to(mse_loss)).sum()
bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
bce_loss = bce_loss.masked_select(
masks.squeeze(-1).broadcast_to(bce_loss)).sum()
return l1_loss, mse_loss, bce_loss
# Loss for Tacotron2
def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
@ -80,7 +388,7 @@ def stft(x,
details. Defaults to "hann".
center : bool, optional
Whether to pad `x` so that the :math:`t`-th frame is centered at
:math:`t \times hop\_length`. Default: `True`.
:math:`t \times hop\\_length`. Default: `True`.
pad_mode : str, optional
Choose padding pattern when `center` is `True`.
Returns
@ -433,7 +741,8 @@ def weighted_mean(input, weight):
Weighted mean tensor with the same dtype as input.
"""
weight = paddle.cast(weight, input.dtype)
broadcast_ratio = input.size / weight.size
# paddle.Tensor.size differs from torch.Tensor.size() and has been overridden in s2t.__init__
broadcast_ratio = input.numel() / weight.numel()
return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio)

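The docstring example in `GuidedAttentionLoss` can be reproduced directly from the static helper; a quick numerical check:

import paddle
from paddlespeech.t2s.modules.losses import GuidedAttentionLoss

mask = GuidedAttentionLoss._make_guided_attention_mask(5, 5, 0.4)
print(mask.shape)   # [5, 5]
print(mask[0])      # approximately [0.0000, 0.1175, 0.3935, 0.6753, 0.8647]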
@ -0,0 +1,519 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Attention modules for RNN."""
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.t2s.modules.masked_fill import masked_fill
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
def _apply_attention_constraint(e,
last_attended_idx,
backward_window=1,
forward_window=3):
"""Apply monotonic attention constraint.
This function applies the monotonic attention constraint
introduced in `Deep Voice 3: Scaling
Text-to-Speech with Convolutional Sequence Learning`_.
Parameters
----------
e : Tensor
Attention energy before applying softmax (1, T).
last_attended_idx : int
Index of the last attended input, in [0, T].
backward_window : int, optional
Backward window size in attention constraint.
forward_window : int, optional
Forward window size in attention constraint.
Returns
----------
Tensor
Monotonic constrained attention energy (1, T).
.. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
https://arxiv.org/abs/1710.07654
"""
if paddle.shape(e)[0] != 1:
raise NotImplementedError(
"Batch attention constraining is not yet supported.")
backward_idx = last_attended_idx - backward_window
forward_idx = last_attended_idx + forward_window
if backward_idx > 0:
e[:, :backward_idx] = -float("inf")
if forward_idx < paddle.shape(e)[1]:
e[:, forward_idx:] = -float("inf")
return e
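# Hypothetical usage sketch (shapes are illustrative, not part of the original code):
# constrain the energies of one utterance so attention stays near the last index.
def _example_attention_constraint():
    e = paddle.randn([1, 100])  # attention energies for one utterance (1, T)
    e = _apply_attention_constraint(
        e, last_attended_idx=42, backward_window=1, forward_window=3)
    # positions outside the [41, 45) window are now -inf, so softmax keeps them at 0
    return F.softmax(e, axis=1)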
class AttLoc(nn.Layer):
"""location-aware attention module.
Reference: Attention-Based Models for Speech Recognition
(https://arxiv.org/pdf/1506.07503.pdf)
Parameters
----------
eprojs : int
projection-units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
han_mode : bool
flag to switch on hierarchical attention mode and not store pre_compute_enc_h
"""
def __init__(self,
eprojs,
dunits,
att_dim,
aconv_chans,
aconv_filts,
han_mode=False):
super().__init__()
self.mlp_enc = nn.Linear(eprojs, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eprojs = eprojs
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.han_mode = han_mode
def reset(self):
"""reset states"""
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
scaling=2.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttLoc forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len : paddle.Tensor
padded encoder hidden state length (B)
dec_z : paddle.Tensor
decoder hidden state (B, D_dec)
att_prev : paddle.Tensor
previous attention weight (B, T_max)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, D_enc)
paddle.Tensor
previous attention weights (B, T_max)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None or self.han_mode:
# (utt, frame, hdim)
self.enc_h = enc_hs_pad
self.h_length = paddle.shape(self.enc_h)[1]
# (utt, frame, att_dim)
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
# initialize attention weight with uniform dist.
if att_prev is None:
# if no bias, the 0-padded positions stay 0
att_prev = 1.0 - make_pad_mask(enc_hs_len)
att_prev = att_prev / enc_hs_len.unsqueeze(-1)
# att_prev: (utt, frame) -> (utt, 1, 1, frame)
# -> (utt, att_conv_chans, 1, frame)
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans)
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim)
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: (utt, frame, att_dim)
dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
# dot with gvec
# (utt, frame, att_dim) -> (utt, frame)
e = self.gvec(
paddle.tanh(att_conv + self.pre_compute_enc_h +
dec_z_tiled)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# weighted sum over frames
# utt x hdim
c = paddle.sum(
self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
return c, w
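# Hypothetical usage sketch of AttLoc (sizes are illustrative, not part of the
# original code): one location-aware attention step for a batch of two utterances.
def _example_att_loc():
    enc = paddle.randn([2, 100, 512])            # padded encoder states (B, T_max, eprojs)
    enc_lens = paddle.to_tensor([100, 80])       # true lengths per utterance
    dec_z = paddle.zeros([2, 1024])              # decoder state (B, dunits)
    att_prev = paddle.full([2, 100], 1.0 / 100)  # uniform previous attention (B, T_max)
    att = AttLoc(eprojs=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15)
    att.reset()
    # c: attention context (B, eprojs), w: new attention weights (B, T_max)
    c, w = att(enc, enc_lens, dec_z, att_prev)
    return c, w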
class AttForward(nn.Layer):
"""Forward attention module.
Reference
----------
Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
(https://arxiv.org/pdf/1807.06736.pdf)
Parameters
----------
eprojs : int
projection-units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
"""
def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
super().__init__()
self.mlp_enc = nn.Linear(eprojs, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eprojs = eprojs
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def reset(self):
"""reset states"""
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
scaling=1.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttForward forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len : list
padded encoder hidden state length (B,)
dec_z : paddle.Tensor
decoder hidden state (B, D_dec)
att_prev : paddle.Tensor
attention weights of previous step (B, T_max)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, D_enc)
paddle.Tensor
previous attention weights (B, T_max)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None:
self.enc_h = enc_hs_pad # utt x frame x hdim
self.h_length = paddle.shape(self.enc_h)[1]
# utt x frame x att_dim
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
if att_prev is None:
# initial attention will be [1, 0, 0, ...]
att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
att_prev[:, 0] = 1.0
# att_prev: utt x frame -> utt x 1 x 1 x frame
# -> utt x att_conv_chans x 1 x frame
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: utt x frame x att_dim
dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1)
# dot with gvec
# utt x frame x att_dim -> utt x frame
e = self.gvec(
paddle.tanh(self.pre_compute_enc_h + dec_z_tiled +
att_conv)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# forward attention
att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
w = (att_prev + att_prev_shift) * w
# NOTE: clip is needed to avoid nan gradient
w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
# weighted sum over frames
# utt x hdim
# NOTE use bmm instead of sum(*)
c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1)
return c, w
class AttForwardTA(nn.Layer):
"""Forward attention with transition agent module.
Reference
----------
Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
(https://arxiv.org/pdf/1807.06736.pdf)
Parameters
----------
eunits : int
units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
odim : int
output dimension
"""
def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
super().__init__()
self.mlp_enc = nn.Linear(eunits, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_ta = nn.Linear(eunits + dunits + odim, 1)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eunits = eunits
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.trans_agent_prob = 0.5
def reset(self):
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.trans_agent_prob = 0.5
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
out_prev,
scaling=1.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttForwardTA forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, Tmax, eunits)
enc_hs_len : paddle.Tensor
padded encoder hidden state length (B,)
dec_z : paddle.Tensor
decoder hidden state (B, dunits)
att_prev : paddle.Tensor
attention weights of previous step (B, T_max)
out_prev : paddle.Tensor
decoder outputs of previous step (B, odim)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, dunits)
paddle.Tensor
previous attention weights (B, Tmax)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None:
self.enc_h = enc_hs_pad # utt x frame x hdim
self.h_length = paddle.shape(self.enc_h)[1]
# utt x frame x att_dim
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
if att_prev is None:
# initial attention will be [1, 0, 0, ...]
att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
att_prev[:, 0] = 1.0
# att_prev: utt x frame -> utt x 1 x 1 x frame
# -> utt x att_conv_chans x 1 x frame
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: utt x frame x att_dim
dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
# dot with gvec
# utt x frame x att_dim -> utt x frame
e = self.gvec(
paddle.tanh(att_conv + self.pre_compute_enc_h +
dec_z_tiled)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# forward attention
# att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1]
att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
w = (self.trans_agent_prob * att_prev +
(1 - self.trans_agent_prob) * att_prev_shift) * w
# NOTE: clip is needed to avoid nan gradient
w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
# weighted sum over frames
# utt x hdim
# NOTE use bmm instead of sum(*)
c = paddle.sum(
self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
# update transition agent prob
self.trans_agent_prob = F.sigmoid(
self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1)))
return c, w
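# Hypothetical usage sketch of AttForwardTA (sizes are illustrative, not part of
# the original code): one step of forward attention with a transition agent.
def _example_att_forward_ta():
    enc = paddle.randn([2, 100, 512])       # padded encoder states (B, Tmax, eunits)
    enc_lens = paddle.to_tensor([100, 80])  # true lengths per utterance
    dec_z = paddle.zeros([2, 1024])         # decoder state (B, dunits)
    prev_out = paddle.zeros([2, 80])        # decoder output of the previous step (B, odim)
    att = AttForwardTA(
        eunits=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15, odim=80)
    att.reset()
    # att_prev=None lets the module initialize attention to [1, 0, 0, ...]
    c, w = att(enc, enc_lens, dec_z, None, prev_out)
    return c, w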

@ -13,10 +13,13 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron2 decoder related modules."""
import paddle
import paddle.nn.functional as F
import six
from paddle import nn
from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
class Prenet(nn.Layer):
"""Prenet module for decoder of Spectrogram prediction network.
@ -196,3 +199,527 @@ class Postnet(nn.Layer):
for i in six.moves.range(len(self.postnet)):
xs = self.postnet[i](xs)
return xs
class ZoneOutCell(nn.Layer):
"""ZoneOut Cell module.
This is a module of zoneout described in
`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_.
This code is modified from `eladhoffer/seq2seq.pytorch`_.
Examples
----------
>>> lstm = paddle.nn.LSTMCell(16, 32)
>>> lstm = ZoneOutCell(lstm, 0.5)
.. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`:
https://arxiv.org/abs/1606.01305
.. _`eladhoffer/seq2seq.pytorch`:
https://github.com/eladhoffer/seq2seq.pytorch
"""
def __init__(self, cell, zoneout_rate=0.1):
"""Initialize zone out cell module.
Parameters
----------
cell : nn.Layer
Paddle recurrent cell module
e.g. `paddle.nn.LSTMCell`.
zoneout_rate : float, optional
Probability of zoneout from 0.0 to 1.0.
"""
super().__init__()
self.cell = cell
self.hidden_size = cell.hidden_size
self.zoneout_rate = zoneout_rate
if zoneout_rate > 1.0 or zoneout_rate < 0.0:
raise ValueError(
"zoneout probability must be in the range from 0.0 to 1.0.")
def forward(self, inputs, hidden):
"""Calculate forward propagation.
Parameters
----------
inputs : Tensor
Batch of input tensor (B, input_size).
hidden : tuple
- Tensor: Batch of initial hidden states (B, hidden_size).
- Tensor: Batch of initial cell states (B, hidden_size).
Returns
----------
Tensor
Batch of next hidden states (B, hidden_size).
tuple:
- Tensor: Batch of next hidden states (B, hidden_size).
- Tensor: Batch of next cell states (B, hidden_size).
"""
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.cell(inputs, hidden)
next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate)
# to keep the same output format as LSTMCell in paddle
return next_hidden[0], next_hidden
def _zoneout(self, h, next_h, prob):
# apply recursively
if isinstance(h, tuple):
num_h = len(h)
if not isinstance(prob, tuple):
prob = tuple([prob] * num_h)
return tuple(
[self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)])
if self.training:
mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob)
return mask * h + (1 - mask) * next_h
else:
return prob * h + (1 - prob) * next_h
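# Hypothetical usage sketch (sizes are illustrative, not part of the original code):
# wrap a paddle LSTMCell with zoneout and run one step.
def _example_zoneout_cell():
    cell = ZoneOutCell(nn.LSTMCell(16, 32), zoneout_rate=0.1)
    x = paddle.randn([4, 16])                                # batch of inputs (B, input_size)
    states = (paddle.zeros([4, 32]), paddle.zeros([4, 32]))  # (hidden, cell) states
    out, (h, c) = cell(x, states)                            # out equals the new hidden state
    return out, h, c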
class Decoder(nn.Layer):
"""Decoder module of Spectrogram prediction network.
This is a module of the decoder of the Spectrogram prediction network in Tacotron2,
which is described in `Natural TTS
Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
The decoder generates the sequence of
features from the sequence of the hidden states.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
"""
def __init__(
self,
idim,
odim,
att,
dlayers=2,
dunits=1024,
prenet_layers=2,
prenet_units=256,
postnet_layers=5,
postnet_chans=512,
postnet_filts=5,
output_activation_fn=None,
cumulate_att_w=True,
use_batch_norm=True,
use_concate=True,
dropout_rate=0.5,
zoneout_rate=0.1,
reduction_factor=1, ):
"""Initialize Tacotron2 decoder module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
att : nn.Layer
Instance of attention class.
dlayers : int, optional
The number of decoder lstm layers.
dunits : int, optional
The number of decoder lstm units.
prenet_layers : int, optional
The number of prenet layers.
prenet_units : int, optional
The number of prenet units.
postnet_layers : int, optional
The number of postnet layers.
postnet_filts : int, optional
The number of postnet filter size.
postnet_chans : int, optional
The number of postnet filter channels.
output_activation_fn : nn.Layer, optional
Activation function for outputs.
cumulate_att_w : bool, optional
Whether to cumulate previous attention weight.
use_batch_norm : bool, optional
Whether to use batch normalization.
use_concate : bool, optional
Whether to concatenate encoder embedding with decoder lstm outputs.
dropout_rate : float, optional
Dropout rate.
zoneout_rate : float, optional
Zoneout rate.
reduction_factor : int, optional
Reduction factor.
"""
super().__init__()
# store the hyperparameters
self.idim = idim
self.odim = odim
self.att = att
self.output_activation_fn = output_activation_fn
self.cumulate_att_w = cumulate_att_w
self.use_concate = use_concate
self.reduction_factor = reduction_factor
# check attention type
if isinstance(self.att, AttForwardTA):
self.use_att_extra_inputs = True
else:
self.use_att_extra_inputs = False
# define lstm network
prenet_units = prenet_units if prenet_layers != 0 else odim
self.lstm = nn.LayerList()
for layer in six.moves.range(dlayers):
iunits = idim + prenet_units if layer == 0 else dunits
lstm = nn.LSTMCell(iunits, dunits)
if zoneout_rate > 0.0:
lstm = ZoneOutCell(lstm, zoneout_rate)
self.lstm.append(lstm)
# define prenet
if prenet_layers > 0:
self.prenet = Prenet(
idim=odim,
n_layers=prenet_layers,
n_units=prenet_units,
dropout_rate=dropout_rate, )
else:
self.prenet = None
# define postnet
if postnet_layers > 0:
self.postnet = Postnet(
idim=idim,
odim=odim,
n_layers=postnet_layers,
n_chans=postnet_chans,
n_filts=postnet_filts,
use_batch_norm=use_batch_norm,
dropout_rate=dropout_rate, )
else:
self.postnet = None
# define projection layers
iunits = idim + dunits if use_concate else dunits
self.feat_out = nn.Linear(
iunits, odim * reduction_factor, bias_attr=False)
self.prob_out = nn.Linear(iunits, reduction_factor)
# initialize
# self.apply(decoder_init)
def _zero_state(self, hs):
init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
return init_hs
def forward(self, hs, hlens, ys):
"""Calculate forward propagation.
Parameters
----------
hs : Tensor
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens : Tensor(int64)
Batch of lengths of each input batch (B,).
ys : Tensor
Batch of the sequences of padded target features (B, Lmax, odim).
Returns
----------
Tensor
Batch of output tensors after postnet (B, Lmax, odim).
Tensor
Batch of output tensors before postnet (B, Lmax, odim).
Tensor
Batch of logits of stop prediction (B, Lmax).
Tensor
Batch of attention weights (B, Lmax, Tmax).
Note
----------
This computation is performed in teacher-forcing manner.
"""
# thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
if self.reduction_factor > 1:
ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
# length list should be list of int
# hlens = list(map(int, hlens))
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# loop for an output sequence
outs, logits, att_ws = [], [], []
for y in ys.transpose([1, 0, 2]):
if self.use_att_extra_inputs:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w,
prev_out)
else:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w)
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
zcs = (paddle.concat([z_list[-1], att_c], axis=1)
if self.use_concate else z_list[-1])
outs += [
self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1])
]
logits += [self.prob_out(zcs)]
att_ws += [att_w]
# teacher forcing
prev_out = y
if self.cumulate_att_w and prev_att_w is not None:
prev_att_w = prev_att_w + att_w # Note: error when use +=
else:
prev_att_w = att_w
# (B, Lmax)
logits = paddle.concat(logits, axis=1)
# (B, odim, Lmax)
before_outs = paddle.concat(outs, axis=2)
# (B, Lmax, Tmax)
att_ws = paddle.stack(att_ws, axis=1)
if self.reduction_factor > 1:
# (B, odim, Lmax)
before_outs = before_outs.reshape(
[paddle.shape(before_outs)[0], self.odim, -1])
if self.postnet is not None:
# (B, odim, Lmax)
after_outs = before_outs + self.postnet(before_outs)
else:
after_outs = before_outs
# (B, Lmax, odim)
before_outs = before_outs.transpose([0, 2, 1])
# (B, Lmax, odim)
after_outs = after_outs.transpose([0, 2, 1])
logits = logits
# apply activation function for scaling
if self.output_activation_fn is not None:
before_outs = self.output_activation_fn(before_outs)
after_outs = self.output_activation_fn(after_outs)
return after_outs, before_outs, logits, att_ws
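# Hypothetical teacher-forcing sketch (sizes are illustrative, not part of the
# original code): decode a padded batch of targets with a location-aware attention.
def _example_decoder_forward():
    from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
    att = AttLoc(eprojs=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15)
    dec = Decoder(idim=512, odim=80, att=att)
    hs = paddle.randn([2, 100, 512])     # encoder states (B, Tmax, idim)
    hlens = paddle.to_tensor([100, 80])  # encoder lengths (B,)
    ys = paddle.randn([2, 300, 80])      # padded target features (B, Lmax, odim)
    after_outs, before_outs, logits, att_ws = dec(hs, hlens, ys)
    return after_outs, before_outs, logits, att_ws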
def inference(
self,
h,
threshold=0.5,
minlenratio=0.0,
maxlenratio=10.0,
use_att_constraint=False,
backward_window=None,
forward_window=None, ):
"""Generate the sequence of features given the sequences of characters.
Parameters
----------
h : Tensor
Input sequence of encoder hidden states (T, C).
threshold : float, optional
Threshold to stop generation.
minlenratio : float, optional
Minimum length ratio.
If set to 1.0 and the length of input is 10,
the minimum length of outputs will be 10 * 1 = 10.
maxlenratio : float, optional
Maximum length ratio.
If set to 10 and the length of input is 10,
the maximum length of outputs will be 10 * 10 = 100.
use_att_constraint : bool
Whether to apply attention constraint introduced in `Deep Voice 3`_.
backward_window : int
Backward window size in attention constraint.
forward_window : int
Forward window size in attention constraint.
Returns
----------
Tensor
Output sequence of features (L, odim).
Tensor
Output sequence of stop probabilities (L,).
Tensor
Attention weights (L, T).
Note
----------
This computation is performed in auto-regressive manner.
.. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
"""
# setup
assert len(paddle.shape(h)) == 2
hs = h.unsqueeze(0)
ilens = paddle.shape(h)[0]
maxlen = int(paddle.shape(h)[0] * maxlenratio)
minlen = int(paddle.shape(h)[0] * minlenratio)
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([1, self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# setup for attention constraint
if use_att_constraint:
last_attended_idx = 0
else:
last_attended_idx = None
# loop for an output sequence
idx = 0
outs, att_ws, probs = [], [], []
while True:
# updated index
idx += self.reduction_factor
# decoder calculation
if self.use_att_extra_inputs:
att_c, att_w = self.att(
hs,
ilens,
z_list[0],
prev_att_w,
prev_out,
last_attended_idx=last_attended_idx,
backward_window=backward_window,
forward_window=forward_window, )
else:
att_c, att_w = self.att(
hs,
ilens,
z_list[0],
prev_att_w,
last_attended_idx=last_attended_idx,
backward_window=backward_window,
forward_window=forward_window, )
att_ws += [att_w]
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
zcs = (paddle.concat([z_list[-1], att_c], axis=1)
if self.use_concate else z_list[-1])
# [(1, odim, r), ...]
outs += [self.feat_out(zcs).reshape([1, self.odim, -1])]
# [(r), ...]
probs += [F.sigmoid(self.prob_out(zcs))[0]]
if self.output_activation_fn is not None:
prev_out = self.output_activation_fn(
outs[-1][:, :, -1]) # (1, odim)
else:
prev_out = outs[-1][:, :, -1] # (1, odim)
if self.cumulate_att_w and prev_att_w is not None:
prev_att_w = prev_att_w + att_w # Note: error when use +=
else:
prev_att_w = att_w
if use_att_constraint:
last_attended_idx = int(att_w.argmax())
# check whether to finish generation
if sum(paddle.cast(probs[-1] >= threshold,
'int64')) > 0 or idx >= maxlen:
# check minimum length
if idx < minlen:
continue
# (1, odim, L)
outs = paddle.concat(outs, axis=2)
if self.postnet is not None:
# (1, odim, L)
outs = outs + self.postnet(outs)
# (L, odim)
outs = outs.transpose([0, 2, 1]).squeeze(0)
probs = paddle.concat(probs, axis=0)
att_ws = paddle.concat(att_ws, axis=0)
break
if self.output_activation_fn is not None:
outs = self.output_activation_fn(outs)
return outs, probs, att_ws
def calculate_all_attentions(self, hs, hlens, ys):
"""Calculate all of the attention weights.
Parameters
----------
hs : Tensor
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens : Tensor(int64)
Batch of lengths of each input batch (B,).
ys : Tensor
Batch of the sequences of padded target features (B, Lmax, odim).
Returns
----------
numpy.ndarray
Batch of attention weights (B, Lmax, Tmax).
Note
----------
This computation is performed in teacher-forcing manner.
"""
# thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
if self.reduction_factor > 1:
ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
# length list should be list of int
hlens = list(map(int, hlens))
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# loop for an output sequence
att_ws = []
for y in ys.transpose([1, 0, 2]):
if self.use_att_extra_inputs:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w,
prev_out)
else:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w)
att_ws += [att_w]
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
# teacher forcing
prev_out = y
if self.cumulate_att_w and prev_att_w is not None:
# Note: error when use +=
prev_att_w = prev_att_w + att_w
else:
prev_att_w = att_w
# (B, Lmax, Tmax)
att_ws = paddle.stack(att_ws, axis=1)
return att_ws

@ -145,16 +145,15 @@ class Encoder(nn.Layer):
Batch of the padded sequence. Either character ids (B, Tmax)
or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
Padded value should be 0.
ilens : LongTensor
ilens : Tensor(int64)
Batch of lengths of each input batch (B,).
Returns
----------
Tensor
Batch of the sequences of encoder states(B, Tmax, eunits).
LongTensor
Tensor(int64)
Batch of lengths of each sequence (B,)
"""
xs = self.embed(xs).transpose([0, 2, 1])
if self.convs is not None:
@ -170,8 +169,8 @@ class Encoder(nn.Layer):
xs = xs.transpose([0, 2, 1])
self.blstm.flatten_parameters()
# (B, Tmax, C)
xs, _ = self.blstm(xs)
# what is hlens? passing sequence_length here replaces torch.nn.utils.rnn.pack_padded_sequence, see
# https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi
xs, _ = self.blstm(xs, sequence_length=ilens)
hlens = ilens
return xs, hlens
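# Illustrative sketch (hypothetical sizes, not part of the original code): paddle's
# nn.LSTM accepts a sequence_length tensor so padded time steps do not affect the
# states, which plays the role of torch's pack_padded_sequence here.
def _example_blstm_with_lengths():
    import paddle
    blstm = paddle.nn.LSTM(512, 256, direction='bidirect')
    xs = paddle.randn([2, 100, 512])          # padded inputs (B, Tmax, C)
    ilens = paddle.to_tensor([100, 80])       # true lengths (B,)
    ys, _ = blstm(xs, sequence_length=ilens)  # ys: (B, Tmax, 2 * 256)
    return ys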

@ -26,10 +26,13 @@ optim_classes = dict(
sgd=paddle.optimizer.SGD, )
def build_optimizers(model: nn.Layer,
optim='adadelta',
max_grad_norm=None,
learning_rate=0.01) -> paddle.optimizer:
def build_optimizers(
model: nn.Layer,
optim='adadelta',
max_grad_norm=None,
learning_rate=0.01,
weight_decay=None,
epsilon=1.0e-6, ) -> paddle.optimizer:
optim_class = optim_classes.get(optim)
if optim_class is None:
raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer,
grad_clip = None
if max_grad_norm:
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
optim = optim_class(
parameters=model.parameters(),
learning_rate=learning_rate,
grad_clip=grad_clip)
optim_dict = {}
optim_dict['parameters'] = model.parameters()
optim_dict['learning_rate'] = learning_rate
optim_dict['grad_clip'] = grad_clip
optim_dict['weight_decay'] = weight_decay
if optim not in {'momentum', 'sgd'}:
optim_dict['epsilon'] = epsilon
optimizers = optim_class(**optim_dict)
optimizers = optim
return optimizers
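# Hypothetical usage sketch (argument values are illustrative; assumes 'adam' is
# one of the keys registered in optim_classes):
def _example_build_optimizers(model: nn.Layer):
    return build_optimizers(
        model,
        optim='adam',
        max_grad_norm=1.0,
        learning_rate=1.0e-3,
        weight_decay=1.0e-6,
        epsilon=1.0e-6)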
