Merge branch 'develop' of github.com:PaddlePaddle/PaddleSpeech into HEAD

commit bc0dd51149 (pull/1015/head)
Author: TianYuan, committed by root
@@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then

@@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
#                       DATA SETTING                      #
###########################################################
batch_size: 64
-num_workers: 4
+num_workers: 2
###########################################################
@@ -45,7 +45,6 @@ model:
    postnet_layers: 5                  # number of layers of postnet
    postnet_filts: 5                   # filter size of conv layers in postnet
    postnet_chans: 256                 # number of channels of conv layers in postnet
-    use_masking: True                  # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True           # whether to use scaled positional encoding
    encoder_normalize_before: True     # whether to perform layer normalization before the input
    decoder_normalize_before: True     # whether to perform layer normalization before the input

@@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

@@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path=$4

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    python3 ${BIN_DIR}/../../ge2e/inference.py \
+    python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
        --input=${input}/wav \
        --output=${preprocess_path}/embed \
        --checkpoint_path=${ge2e_ckpt_path}

@@ -45,7 +45,6 @@ model:
    postnet_layers: 5                  # number of layers of postnet
    postnet_filts: 5                   # filter size of conv layers in postnet
    postnet_chans: 256                 # number of channels of conv layers in postnet
-    use_masking: True                  # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True           # whether to use scaled positional encoding
    encoder_normalize_before: True     # whether to perform layer normalization before the input
    decoder_normalize_before: True     # whether to perform layer normalization before the input

@@ -0,0 +1,109 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 2048          # FFT size.
n_shift: 300         # Hop size.
win_length: 1200     # Window length.
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.

# Only used for feats_type != raw
fmin: 80             # Minimum frequency of Mel basis.
fmax: 7600           # Maximum frequency of Mel basis.
n_mels: 80           # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80            # Minimum f0 for pitch extraction.
f0max: 400           # Maximum f0 for pitch extraction.

###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 4

###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    adim: 384                                    # attention dimension
    aheads: 2                                    # number of attention heads
    elayers: 4                                   # number of encoder layers
    eunits: 1536                                 # number of encoder ff units
    dlayers: 4                                   # number of decoder layers
    dunits: 1536                                 # number of decoder ff units
    positionwise_layer_type: conv1d              # type of position-wise layer
    positionwise_conv_kernel_size: 3             # kernel size of position wise conv layer
    duration_predictor_layers: 2                 # number of layers of duration predictor
    duration_predictor_chans: 256                # number of channels of duration predictor
    duration_predictor_kernel_size: 3            # filter size of duration predictor
    postnet_layers: 5                            # number of layers of postnet
    postnet_filts: 5                             # filter size of conv layers in postnet
    postnet_chans: 256                           # number of channels of conv layers in postnet
    encoder_normalize_before: True               # whether to perform layer normalization before the input
    decoder_normalize_before: True               # whether to perform layer normalization before the input
    reduction_factor: 1                          # reduction factor
    encoder_type: conformer                      # encoder type
    decoder_type: conformer                      # decoder type
    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
    conformer_activation_type: swish             # conformer activation type
    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
    use_cnn_in_conformer: true                   # whether to use CNN in conformer
    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
    init_type: xavier_uniform                    # initialization type
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
    pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true     # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2                   # number of conv layers in energy predictor
    energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5                # dropout rate in energy predictor
    energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false   # whether to stop the gradient from energy predictor to encoder

###########################################################
#                      UPDATER SETTING                    #
###########################################################
updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation

###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
optimizer:
    optim: adam           # optimizer type
    learning_rate: 0.001  # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1000
num_snapshots: 5

###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 10086
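A short Python sketch (not part of the diff) of how a config in this shape is consumed; the loader and file name below are assumptions, but the keyword-argument expansion mirrors the FastSpeech2(..., **config["model"]) call that appears in the training-script changes further down.

    import yaml

    # assumption: the new file above is saved as conf/conformer.yaml and PyYAML is available
    with open("conf/conformer.yaml") as f:
        config = yaml.safe_load(f)

    # every key under `model:` becomes a keyword argument of the FastSpeech2 constructor
    model_kwargs = config["model"]
    assert model_kwargs["encoder_type"] == "conformer"
    assert config["updater"]["use_masking"] is True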

@@ -45,7 +45,6 @@ model:
    postnet_layers: 5                  # number of layers of postnet
    postnet_filts: 5                   # filter size of conv layers in postnet
    postnet_chans: 256                 # number of channels of conv layers in postnet
-    use_masking: True                  # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True           # whether to use scaled positional encoding
    encoder_normalize_before: True     # whether to perform layer normalization before the input
    decoder_normalize_before: True     # whether to perform layer normalization before the input

@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size: 8              # Batch size.
batch_max_steps: 25500     # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true           # Whether to pin memory in Pytorch DataLoader.
-num_workers: 4             # Number of workers in Pytorch DataLoader.
+num_workers: 2             # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true          # Whether to allow cache in dataset. If true, it requires cpu memory.

@@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    # test a single .wav file

@@ -1,6 +1,8 @@
# https://yaml.org/type/float.html
# network architecture
model:
+    cmvn_file:
+    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
    encoder_conf:

@@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-    # export ckpt avg_n
-    ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+#     # export ckpt avg_n
+#     ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
    ./local/cacu_perplexity.sh || exit -1

@@ -45,7 +45,6 @@ model:
    postnet_layers: 5                  # number of layers of postnet
    postnet_filts: 5                   # filter size of conv layers in postnet
    postnet_chans: 256                 # number of channels of conv layers in postnet
-    use_masking: True                  # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True           # whether to use scaled positional encoding
    encoder_normalize_before: True     # whether to perform layer normalization before the input
    decoder_normalize_before: True     # whether to perform layer normalization before the input

@@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=ge2e
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}

@@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

@@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

@@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi

@@ -45,7 +45,6 @@ model:
    postnet_layers: 5                  # number of layers of postnet
    postnet_filts: 5                   # filter size of conv layers in postnet
    postnet_chans: 256                 # number of channels of conv layers in postnet
-    use_masking: True                  # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True           # whether to use scaled positional encoding
    encoder_normalize_before: True     # whether to perform layer normalization before the input
    decoder_normalize_before: True     # whether to perform layer normalization before the input

@@ -126,8 +126,12 @@ decoders_module = [
]

setup(
-    name='swig_decoders',
-    version='1.1',
-    description="""CTC decoders""",
+    name='paddlespeech_ctcdecoders',
+    version='0.0.1a',
+    description="CTC decoders in paddlespeech",
+    author="PaddlePaddle Speech and Language Team",
+    author_email="paddlesl@baidu.com",
+    url="https://github.com/PaddlePaddle/PaddleSpeech",
+    license='Apache 2.0',
    ext_modules=decoders_module,
-    py_modules=['swig_decoders'], )
+    py_modules=['swig_decoders'])

@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
        """
        # cmvn
-        if configs['cmvn_file'] is not None:
+        if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['cmvn_file_type'])
            global_cmvn = GlobalCMVN(
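The added membership test keeps configs that never define a cmvn_file key from raising a KeyError, while a key that is present but empty (as in the transformer.yaml change above) still skips the GlobalCMVN layer. A minimal stand-alone sketch of the guard, with a plain dict standing in for the parsed config:

    # plain-dict stand-in for the parsed YAML config, not the real config object
    configs = {"cmvn_file": None, "cmvn_file_type": "json"}

    if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
        print("would build GlobalCMVN from", configs['cmvn_file'])
    else:
        print("no global CMVN layer")  # both a missing key and a null value end up here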

@@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):

def fastspeech2_multi_spk_batch_fn(examples):
-    # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
+    # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
    pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
@@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
    speech_lengths = [
        np.array(item["speech_lengths"], dtype=np.int64) for item in examples
    ]
-    spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]

    text = batch_sequences(text)
    pitch = batch_sequences(pitch)
@@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
    energy = paddle.to_tensor(energy)
    text_lengths = paddle.to_tensor(text_lengths)
    speech_lengths = paddle.to_tensor(speech_lengths)
-    spk_id = paddle.to_tensor(spk_id)

    batch = {
        "text": text,
@@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
        "speech": speech,
        "speech_lengths": speech_lengths,
        "pitch": pitch,
-        "energy": energy,
-        "spk_id": spk_id
+        "energy": energy
    }
+    # spk_emb has a higher priority than spk_id
+    if "spk_emb" in examples[0]:
+        spk_emb = [
+            np.array(item["spk_emb"], dtype=np.float32) for item in examples
+        ]
+        spk_emb = batch_sequences(spk_emb)
+        spk_emb = paddle.to_tensor(spk_emb)
+        batch["spk_emb"] = spk_emb
+    elif "spk_id" in examples[0]:
+        spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
+        spk_id = paddle.to_tensor(spk_id)
+        batch["spk_id"] = spk_id
    return batch
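With this change the collate function resolves speaker conditioning per batch: spk_emb wins when present, spk_id is used otherwise, and neither key is emitted for single-speaker data. A hedged usage sketch with one dummy item (field shapes are illustrative only, and the unchanged parts of the function are assumed to handle the other fields as shown):

    import numpy as np

    examples = [{
        "text": np.array([1, 2, 3]), "text_lengths": 3,
        "speech": np.zeros((40, 80), dtype=np.float32), "speech_lengths": 40,
        "durations": np.array([10, 10, 20]),
        "pitch": np.zeros((3, 1), dtype=np.float32),
        "energy": np.zeros((3, 1), dtype=np.float32),
        "spk_emb": np.zeros(256, dtype=np.float32),  # presence of spk_emb wins over spk_id
    }]
    batch = fastspeech2_multi_spk_batch_fn(examples)
    assert "spk_emb" in batch and "spk_id" not in batch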

@@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
    print("vocab_size:", vocab_size)
    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
-    num_speakers = len(spk_id)
-    print("num_speakers:", num_speakers)
+    spk_num = len(spk_id)
+    print("spk_num:", spk_num)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
        **fastspeech2_config["model"])
    model.set_state_dict(

@@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
    print("vocab_size:", vocab_size)
    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
-    num_speakers = len(spk_id)
-    print("num_speakers:", num_speakers)
+    spk_num = len(spk_id)
+    print("spk_num:", spk_num)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
        **fastspeech2_config["model"])
    model.set_state_dict(

@@ -167,6 +167,10 @@ def main():
            "pitch": str(pitch_path),
            "energy": str(energy_path)
        }
+        # add spk_emb for voice cloning
+        if "spk_emb" in item:
+            record["spk_emb"] = str(item["spk_emb"])
        output_metadata.append(record)
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"

@@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
                     mel_extractor=None,
                     pitch_extractor=None,
                     energy_extractor=None,
-                     cut_sil: bool=True):
+                     cut_sil: bool=True,
+                     spk_emb_dir: Path=None):
    utt_id = fp.stem
    # for vctk
    if utt_id.endswith("_mic2"):
@@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
            "energy": str(energy_path),
            "speaker": speaker
        }
+        if spk_emb_dir:
+            if speaker in os.listdir(spk_emb_dir):
+                embed_name = utt_id + ".npy"
+                embed_path = spk_emb_dir / speaker / embed_name
+                if embed_path.is_file():
+                    record["spk_emb"] = str(embed_path)
+                else:
+                    return None
    return record
@@ -127,13 +136,14 @@ def process_sentences(config,
                      pitch_extractor=None,
                      energy_extractor=None,
                      nprocs: int=1,
-                      cut_sil: bool=True):
+                      cut_sil: bool=True,
+                      spk_emb_dir: Path=None):
    if nprocs == 1:
        results = []
        for fp in fps:
            record = process_sentence(config, fp, sentences, output_dir,
                                      mel_extractor, pitch_extractor,
-                                      energy_extractor, cut_sil)
+                                      energy_extractor, cut_sil, spk_emb_dir)
            if record:
                results.append(record)
    else:
@@ -144,7 +154,7 @@ def process_sentences(config,
                future = pool.submit(process_sentence, config, fp,
                                     sentences, output_dir, mel_extractor,
                                     pitch_extractor, energy_extractor,
-                                     cut_sil)
+                                     cut_sil, spk_emb_dir)
                future.add_done_callback(lambda p: progress.update())
                futures.append(future)
@@ -202,6 +212,11 @@ def main():
        default=True,
        help="whether cut sil in the edge of audio")
+    parser.add_argument(
+        "--spk_emb_dir",
+        default=None,
+        type=str,
+        help="directory to speaker embedding files.")
    args = parser.parse_args()

    rootdir = Path(args.rootdir).expanduser()
@@ -211,6 +226,11 @@ def main():
    dumpdir.mkdir(parents=True, exist_ok=True)
    dur_file = Path(args.dur_file).expanduser()
+    if args.spk_emb_dir:
+        spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
+    else:
+        spk_emb_dir = None

    assert rootdir.is_dir()
    assert dur_file.is_file()
@@ -251,6 +271,7 @@ def main():
                test_wav_files += wav_files[-sub_num_dev:]
            else:
                train_wav_files += wav_files
    elif args.dataset == "ljspeech":
        wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
        # split data into 3 sections
@@ -317,7 +338,8 @@ def main():
            pitch_extractor,
            energy_extractor,
            nprocs=args.num_cpu,
-            cut_sil=args.cut_sil)
+            cut_sil=args.cut_sil,
+            spk_emb_dir=spk_emb_dir)
    if dev_wav_files:
        process_sentences(
            config,
@@ -327,7 +349,8 @@ def main():
            mel_extractor,
            pitch_extractor,
            energy_extractor,
-            cut_sil=args.cut_sil)
+            cut_sil=args.cut_sil,
+            spk_emb_dir=spk_emb_dir)
    if test_wav_files:
        process_sentences(
            config,
@@ -338,7 +361,8 @@ def main():
            pitch_extractor,
            energy_extractor,
            nprocs=args.num_cpu,
-            cut_sil=args.cut_sil)
+            cut_sil=args.cut_sil,
+            spk_emb_dir=spk_emb_dir)

if __name__ == "__main__":
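From the lookup added in process_sentence, --spk_emb_dir is expected to contain one .npy embedding per utterance grouped into per-speaker subdirectories, and utterances without an embedding are dropped. A small sketch of the assumed layout (directory and ids below are hypothetical):

    from pathlib import Path

    spk_emb_dir = Path("dump/vc_embeds")          # hypothetical directory
    speaker, utt_id = "SSB0005", "SSB00050001"    # hypothetical AISHELL-3 style ids
    embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
    # process_sentence stores str(embed_path) in record["spk_emb"] only if this file exists
    print(embed_path, embed_path.is_file())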

@@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
    fields = ["utt_id", "text"]
+    spk_num = None
    if args.speaker_dict is not None:
        print("multiple speaker fastspeech2!")
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
-        num_speakers = len(spk_id)
+        spk_num = len(spk_id)
        fields += ["spk_id"]
+    elif args.voice_cloning:
+        print("voice cloning!")
+        fields += ["spk_emb"]
    else:
        print("single speaker fastspeech2!")
-        num_speakers = None
-    print("num_speakers:", num_speakers)
+    print("spk_num:", spk_num)

    test_dataset = DataTable(data=test_metadata, fields=fields)
@@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    model = FastSpeech2(
        idim=vocab_size,
        odim=odim,
-        num_speakers=num_speakers,
+        spk_num=spk_num,
        **fastspeech2_config["model"])

    model.set_state_dict(
@@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
    for datum in test_dataset:
        utt_id = datum["utt_id"]
        text = paddle.to_tensor(datum["text"])
-        if "spk_id" in datum:
-            spk_id = paddle.to_tensor(datum["spk_id"])
-        else:
-            spk_id = None
+        spk_emb = None
+        spk_id = None
+        if args.voice_cloning and "spk_emb" in datum:
+            spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+        elif "spk_id" in datum:
+            spk_id = paddle.to_tensor(datum["spk_id"])
        with paddle.no_grad():
-            wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
+            wav = pwg_inference(
+                fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
        sf.write(
            str(output_dir / (utt_id + ".wav")),
            wav.numpy(),
@@ -142,6 +148,15 @@ def main():
        type=str,
        default=None,
        help="speaker id map file for multiple speaker model.")
+
+    def str2bool(str):
+        return True if str.lower() == 'true' else False
+
+    parser.add_argument(
+        "--voice-cloning",
+        type=str2bool,
+        default=False,
+        help="whether training voice cloning model.")
    parser.add_argument("--test-metadata", type=str, help="test metadata.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
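The local str2bool helper exists because argparse's type=bool treats any non-empty string, including "False", as truthy. A quick check of the behaviour the new --voice-cloning flag relies on:

    import argparse

    def str2bool(s):
        return s.lower() == 'true'

    parser = argparse.ArgumentParser()
    parser.add_argument("--voice-cloning", type=str2bool, default=False)
    print(parser.parse_args(["--voice-cloning", "False"]).voice_cloning)  # False
    print(parser.parse_args(["--voice-cloning", "True"]).voice_cloning)   # True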

@@ -61,18 +61,24 @@ def train_sp(args, config):
        "text", "text_lengths", "speech", "speech_lengths", "durations",
        "pitch", "energy"
    ]
+    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
+    spk_num = None
    if args.speaker_dict is not None:
        print("multiple speaker fastspeech2!")
        collate_fn = fastspeech2_multi_spk_batch_fn
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
-        num_speakers = len(spk_id)
+        spk_num = len(spk_id)
        fields += ["spk_id"]
+    elif args.voice_cloning:
+        print("Training voice cloning!")
+        collate_fn = fastspeech2_multi_spk_batch_fn
+        fields += ["spk_emb"]
+        converters["spk_emb"] = np.load
    else:
        print("single speaker fastspeech2!")
        collate_fn = fastspeech2_single_spk_batch_fn
-        num_speakers = None
-    print("num_speakers:", num_speakers)
+    print("spk_num:", spk_num)

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True
@@ -83,17 +89,13 @@ def train_sp(args, config):
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
-        converters={"speech": np.load,
-                    "pitch": np.load,
-                    "energy": np.load}, )
+        converters=converters, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
-        converters={"speech": np.load,
-                    "pitch": np.load,
-                    "energy": np.load}, )
+        converters=converters, )

    # collate function and dataloader
@@ -127,10 +129,7 @@ def train_sp(args, config):
    odim = config.n_mels
    model = FastSpeech2(
-        idim=vocab_size,
-        odim=odim,
-        num_speakers=num_speakers,
-        **config["model"])
+        idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
    if world_size > 1:
        model = DataParallel(model)
    print("model done!")
@@ -184,6 +183,15 @@ def main():
        default=None,
        help="speaker id map file for multiple speaker model.")
+
+    def str2bool(str):
+        return True if str.lower() == 'true' else False
+
+    parser.add_argument(
+        "--voice-cloning",
+        type=str2bool,
+        default=False,
+        help="whether training voice cloning model.")
    args = parser.parse_args()

    with open(args.config) as f:
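The two per-dataset converter dicts collapse into one shared converters mapping, which the voice-cloning branch simply extends; a compact sketch of the resulting mapping (np.load is assumed to be applied per field when DataTable reads a row):

    import numpy as np

    converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
    voice_cloning = True                     # i.e. args.voice_cloning
    if voice_cloning:
        converters["spk_emb"] = np.load      # spk_emb rows hold paths to .npy embedding files
    print(sorted(converters))                # ['energy', 'pitch', 'speech', 'spk_emb']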

@@ -1,38 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random


def cycle(iterable):
    # cycle('ABCD') --> A B C D A B C D A B C D ...
    saved = []
    for element in iterable:
        yield element
        saved.append(element)
    while saved:
        for element in saved:
            yield element


def random_cycle(iterable):
    # cycle('ABCD') --> A B C D B C D A A D B C ...
    saved = []
    for element in iterable:
        yield element
        saved.append(element)
    random.shuffle(saved)
    while saved:
        for element in saved:
            yield element
        random.shuffle(saved)
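The GE2E modules deleted in this part of the diff are presumably relocated rather than dropped, since the rest of the diff points the recipe at paddlespeech.vector instead of paddlespeech.t2s (see the path.sh and import changes). For reference, a short usage sketch of the removed random_cycle helper:

    from itertools import islice

    # endless iterator over a finite list: first pass in order, later passes reshuffled
    first_eight = list(islice(random_cycle("ABCD"), 8))
    print(first_eight)  # e.g. ['A', 'B', 'C', 'D', 'C', 'A', 'D', 'B']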

@@ -1,131 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path

import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset

from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle


class MultiSpeakerMelDataset(Dataset):
    """A 2 layer directory that contains mel spectrograms in *.npy format.
    An example file structure tree is shown below. We prefer to preprocess
    raw datasets and organize them like this.

    dataset_root/
      speaker1/
        utterance1.npy
        utterance2.npy
        utterance3.npy
      speaker2/
        utterance1.npy
        utterance2.npy
        utterance3.npy
    """

    def __init__(self, dataset_root: Path):
        self.root = Path(dataset_root).expanduser()
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]

        speaker_utterances = {
            speaker_dir: list(speaker_dir.glob("*.npy"))
            for speaker_dir in speaker_dirs
        }

        self.speaker_dirs = speaker_dirs
        self.speaker_to_utterances = speaker_utterances

        # meta data
        self.num_speakers = len(self.speaker_dirs)
        self.num_utterances = np.sum(
            len(utterances)
            for speaker, utterances in self.speaker_to_utterances.items())

    def get_example_by_index(self, speaker_index, utterance_index):
        speaker_dir = self.speaker_dirs[speaker_index]
        fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
        return self[fpath]

    def __getitem__(self, fpath):
        return np.load(fpath)

    def __len__(self):
        return int(self.num_utterances)


class MultiSpeakerSampler(BatchSampler):
    """A multi-stratal sampler designed for speaker verification task.
    First, N speakers from all speakers are sampled randomly. Then, for each
    speaker, randomly sample M utterances from their corresponding utterances.
    """

    def __init__(self,
                 dataset: MultiSpeakerMelDataset,
                 speakers_per_batch: int,
                 utterances_per_speaker: int):
        self._speakers = list(dataset.speaker_dirs)
        self._speaker_to_utterances = dataset.speaker_to_utterances

        self.speakers_per_batch = speakers_per_batch
        self.utterances_per_speaker = utterances_per_speaker

    def __iter__(self):
        # yield list of Paths
        speaker_generator = iter(random_cycle(self._speakers))
        speaker_utterances_generator = {
            s: iter(random_cycle(us))
            for s, us in self._speaker_to_utterances.items()
        }

        while True:
            speakers = []
            for _ in range(self.speakers_per_batch):
                speakers.append(next(speaker_generator))

            utterances = []
            for s in speakers:
                us = speaker_utterances_generator[s]
                for _ in range(self.utterances_per_speaker):
                    utterances.append(next(us))
            yield utterances


class RandomClip(object):
    def __init__(self, frames):
        self.frames = frames

    def __call__(self, spec):
        # spec [T, C]
        T = spec.shape[0]
        start = random.randint(0, T - self.frames)
        return spec[start:start + self.frames, :]


class Collate(object):
    def __init__(self, num_frames):
        self.random_crop = RandomClip(num_frames)

    def __call__(self, examples):
        frame_clips = [self.random_crop(mel) for mel in examples]
        batched_clips = np.stack(frame_clips)
        return batched_clips


if __name__ == "__main__":
    mydataset = MultiSpeakerMelDataset(
        Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
    print(mydataset.get_example_by_index(0, 10))

@@ -1,123 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam

from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase


class Ge2eExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
                                   config.model.hidden_size,
                                   config.model.embedding_size)
        optimizer = Adam(
            config.training.learning_rate_init,
            parameters=model.parameters(),
            grad_clip=ClipGradByGlobalNorm(3))
        self.model = DataParallel(model) if self.parallel else model
        self.model_core = model
        self.optimizer = optimizer

    def setup_dataloader(self):
        config = self.config
        train_dataset = MultiSpeakerMelDataset(self.args.data)
        sampler = MultiSpeakerSampler(train_dataset,
                                      config.training.speakers_per_batch,
                                      config.training.utterances_per_speaker)
        train_loader = DataLoader(
            train_dataset,
            batch_sampler=sampler,
            collate_fn=Collate(config.data.partial_n_frames),
            num_workers=16)

        self.train_dataset = train_dataset
        self.train_loader = train_loader

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        specs = batch
        loss, eer = self.model(specs, self.config.training.speakers_per_batch)
        loss.backward()
        self.model_core.do_gradient_ops()
        self.optimizer.step()
        iteration_time = time.time() - start

        # logging
        loss_value = float(loss)
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
        self.logger.info(msg)

        if dist.get_rank() == 0:
            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
            self.visualizer.add_scalar("train/eer", eer, self.iteration)
            self.visualizer.add_scalar("param/w",
                                       float(self.model_core.similarity_weight),
                                       self.iteration)
            self.visualizer.add_scalar("param/b",
                                       float(self.model_core.similarity_bias),
                                       self.iteration)

    def valid(self):
        pass


def main_sp(config, args):
    exp = Ge2eExperiment(config, args)
    exp.setup()
    exp.resume_or_load()
    exp.run()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)
    main(config, args)

@@ -20,14 +20,14 @@ import paddle
import soundfile as sf
from matplotlib import pyplot as plt

-from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
-from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
+from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
+from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder


def voice_cloning(args):

@@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
-from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
-from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
-from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder
+from paddlespeech.t2s.modules.transformer.encoder import Encoder


class FastSpeech2(nn.Layer):
@@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer):
            postnet_layers: int=5,
            postnet_chans: int=512,
            postnet_filts: int=5,
+            postnet_dropout_rate: float=0.5,
            positionwise_layer_type: str="conv1d",
            positionwise_conv_kernel_size: int=1,
            use_scaled_pos_enc: bool=True,
@@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer):
            reduction_factor: int=1,
            encoder_type: str="transformer",
            decoder_type: str="transformer",
+            # for transformer
+            transformer_enc_dropout_rate: float=0.1,
+            transformer_enc_positional_dropout_rate: float=0.1,
+            transformer_enc_attn_dropout_rate: float=0.1,
+            transformer_dec_dropout_rate: float=0.1,
+            transformer_dec_positional_dropout_rate: float=0.1,
+            transformer_dec_attn_dropout_rate: float=0.1,
+            # for conformer
+            conformer_pos_enc_layer_type: str="rel_pos",
+            conformer_self_attn_layer_type: str="rel_selfattn",
+            conformer_activation_type: str="swish",
+            use_macaron_style_in_conformer: bool=True,
+            use_cnn_in_conformer: bool=True,
+            zero_triu: bool=False,
+            conformer_enc_kernel_size: int=7,
+            conformer_dec_kernel_size: int=31,
            # duration predictor
            duration_predictor_layers: int=2,
            duration_predictor_chans: int=384,
            duration_predictor_kernel_size: int=3,
+            duration_predictor_dropout_rate: float=0.1,
            # energy predictor
            energy_predictor_layers: int=2,
            energy_predictor_chans: int=384,
@@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer):
            pitch_embed_dropout: float=0.5,
            stop_gradient_from_pitch_predictor: bool=False,
            # spk emb
-            num_speakers: int=None,
+            spk_num: int=None,
            spk_embed_dim: int=None,
            spk_embed_integration_type: str="add",
            # tone emb
-            num_tones: int=None,
+            tone_num: int=None,
            tone_embed_dim: int=None,
            tone_embed_integration_type: str="add",
            # training related
-            transformer_enc_dropout_rate: float=0.1,
-            transformer_enc_positional_dropout_rate: float=0.1,
-            transformer_enc_attn_dropout_rate: float=0.1,
-            transformer_dec_dropout_rate: float=0.1,
-            transformer_dec_positional_dropout_rate: float=0.1,
-            transformer_dec_attn_dropout_rate: float=0.1,
-            duration_predictor_dropout_rate: float=0.1,
-            postnet_dropout_rate: float=0.5,
            init_type: str="xavier_uniform",
            init_enc_alpha: float=1.0,
-            init_dec_alpha: float=1.0,
-            use_masking: bool=False,
-            use_weighted_masking: bool=False, ):
-        """Initialize FastSpeech2 module."""
+            init_dec_alpha: float=1.0, ):
+        """Initialize FastSpeech2 module.
+        Parameters
+        ----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
adim : int
Attention dimension.
aheads : int
Number of attention heads.
elayers : int
Number of encoder layers.
eunits : int
Number of encoder hidden units.
dlayers : int
Number of decoder layers.
dunits : int
Number of decoder hidden units.
postnet_layers : int
Number of postnet layers.
postnet_chans : int
Number of postnet channels.
postnet_filts : int
Kernel size of postnet.
postnet_dropout_rate : float
Dropout rate in postnet.
use_scaled_pos_enc : bool
Whether to use trainable scaled pos encoding.
use_batch_norm : bool
Whether to use batch normalization in encoder prenet.
encoder_normalize_before : bool
Whether to apply layernorm layer before encoder block.
decoder_normalize_before : bool
Whether to apply layernorm layer before
decoder block.
encoder_concat_after : bool
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after : bool
Whether to concatenate attention layer's input and output in decoder.
reduction_factor : int
Reduction factor.
encoder_type : str
Encoder type ("transformer" or "conformer").
decoder_type : str
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate : float
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate : float
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate : float
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate : float
Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate : float
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate : float
Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type : str
Pos encoding layer type in conformer.
conformer_self_attn_layer_type : str
Self-attention layer type in conformer
conformer_activation_type : str
Activation function type in conformer.
use_macaron_style_in_conformer : bool
Whether to use macaron style FFN.
use_cnn_in_conformer : bool
Whether to use CNN in conformer.
zero_triu : bool
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size : int
Kernel size of encoder conformer.
conformer_dec_kernel_size : int
Kernel size of decoder conformer.
duration_predictor_layers : int
Number of duration predictor layers.
duration_predictor_chans : int
Number of duration predictor channels.
duration_predictor_kernel_size : int
Kernel size of duration predictor.
duration_predictor_dropout_rate : float
Dropout rate in duration predictor.
pitch_predictor_layers : int
Number of pitch predictor layers.
pitch_predictor_chans : int
Number of pitch predictor channels.
pitch_predictor_kernel_size : int
Kernel size of pitch predictor.
pitch_predictor_dropout_rate : float
Dropout rate in pitch predictor.
pitch_embed_kernel_size : float
Kernel size of pitch embedding.
pitch_embed_dropout_rate : float
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor : bool
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers : int
Number of energy predictor layers.
energy_predictor_chans : int
Number of energy predictor channels.
energy_predictor_kernel_size : int
Kernel size of energy predictor.
energy_predictor_dropout_rate : float
Dropout rate in energy predictor.
energy_embed_kernel_size : float
Kernel size of energy embedding.
energy_embed_dropout_rate : float
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor : bool
Whether to stop gradient from energy predictor to encoder.
spk_num : Optional[int]
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type : str
How to integrate speaker embedding.
tone_num : Optional[int]
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim : Optional[int]
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type : str
How to integrate tone embedding.
init_type : str
How to initialize transformer parameters.
init_enc_alpha : float
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float
Initial value of alpha in scaled pos encoding of the decoder.
"""
        assert check_argument_types()
        super().__init__()
@@ -148,30 +286,32 @@ class FastSpeech2(nn.Layer):
        # initialize parameters
        initialize(self, init_type)

-        if self.spk_embed_dim is not None:
+        if spk_num and self.spk_embed_dim:
            self.spk_embedding_table = nn.Embedding(
-                num_embeddings=num_speakers,
+                num_embeddings=spk_num,
                embedding_dim=self.spk_embed_dim,
                padding_idx=self.padding_idx)

        if self.tone_embed_dim is not None:
            self.tone_embedding_table = nn.Embedding(
-                num_embeddings=num_tones,
+                num_embeddings=tone_num,
                embedding_dim=self.tone_embed_dim,
                padding_idx=self.padding_idx)

-        # get positional encoding class
-        pos_enc_class = (ScaledPositionalEncoding
-                         if self.use_scaled_pos_enc else PositionalEncoding)
+        # get positional encoding layer type
+        transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"

        # define encoder
        encoder_input_layer = nn.Embedding(
            num_embeddings=idim,
            embedding_dim=adim,
            padding_idx=self.padding_idx)

+        # add encoder type here
+        # check that the model still runs
+        # remember to update transformer tts accordingly
        if encoder_type == "transformer":
-            self.encoder = TransformerEncoder(
+            print("encoder_type is transformer")
+            self.encoder = Encoder(
                idim=idim,
                attention_dim=adim,
                attention_heads=aheads,
@@ -181,11 +321,36 @@ class FastSpeech2(nn.Layer):
                dropout_rate=transformer_enc_dropout_rate,
                positional_dropout_rate=transformer_enc_positional_dropout_rate,
                attention_dropout_rate=transformer_enc_attn_dropout_rate,
-                pos_enc_class=pos_enc_class,
+                pos_enc_layer_type=transformer_pos_enc_layer_type,
                normalize_before=encoder_normalize_before,
                concat_after=encoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
-                positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
+                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                encoder_type=encoder_type)
+        elif encoder_type == "conformer":
+            print("encoder_type is conformer")
+            self.encoder = Encoder(
+                idim=idim,
+                attention_dim=adim,
+                attention_heads=aheads,
+                linear_units=eunits,
+                num_blocks=elayers,
+                input_layer=encoder_input_layer,
+                dropout_rate=transformer_enc_dropout_rate,
+                positional_dropout_rate=transformer_enc_positional_dropout_rate,
+                attention_dropout_rate=transformer_enc_attn_dropout_rate,
+                normalize_before=encoder_normalize_before,
+                concat_after=encoder_concat_after,
+                positionwise_layer_type=positionwise_layer_type,
+                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                macaron_style=use_macaron_style_in_conformer,
+                pos_enc_layer_type=conformer_pos_enc_layer_type,
+                selfattention_layer_type=conformer_self_attn_layer_type,
+                activation_type=conformer_activation_type,
+                use_cnn_module=use_cnn_in_conformer,
+                cnn_module_kernel=conformer_enc_kernel_size,
+                zero_triu=zero_triu,
+                encoder_type=encoder_type)
        else:
            raise ValueError(f"{encoder_type} is not supported.")
@@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer):
        # NOTE: we use encoder as decoder
        # because fastspeech's decoder is the same as encoder
        if decoder_type == "transformer":
-            self.decoder = TransformerEncoder(
+            print("decoder_type is transformer")
+            self.decoder = Encoder(
                idim=0,
                attention_dim=adim,
                attention_heads=aheads,
@@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer):
                dropout_rate=transformer_dec_dropout_rate,
                positional_dropout_rate=transformer_dec_positional_dropout_rate,
                attention_dropout_rate=transformer_dec_attn_dropout_rate,
-                pos_enc_class=pos_enc_class,
+                pos_enc_layer_type=transformer_pos_enc_layer_type,
                normalize_before=decoder_normalize_before,
                concat_after=decoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
-                positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
+                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                encoder_type=decoder_type)
+        elif decoder_type == "conformer":
+            print("decoder_type is conformer")
+            self.decoder = Encoder(
+                idim=0,
+                attention_dim=adim,
+                attention_heads=aheads,
+                linear_units=dunits,
+                num_blocks=dlayers,
+                input_layer=None,
+                dropout_rate=transformer_dec_dropout_rate,
+                positional_dropout_rate=transformer_dec_positional_dropout_rate,
+                attention_dropout_rate=transformer_dec_attn_dropout_rate,
+                normalize_before=decoder_normalize_before,
+                concat_after=decoder_concat_after,
+                positionwise_layer_type=positionwise_layer_type,
+                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                macaron_style=use_macaron_style_in_conformer,
+                pos_enc_layer_type=conformer_pos_enc_layer_type,
+                selfattention_layer_type=conformer_self_attn_layer_type,
+                activation_type=conformer_activation_type,
+                use_cnn_module=use_cnn_in_conformer,
+                cnn_module_kernel=conformer_dec_kernel_size,
+                encoder_type=decoder_type)
        else:
            raise ValueError(f"{decoder_type} is not supported.")
@@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer):
                pitch: paddle.Tensor,
                energy: paddle.Tensor,
                tone_id: paddle.Tensor=None,
-                spembs: paddle.Tensor=None,
+                spk_emb: paddle.Tensor=None,
                spk_id: paddle.Tensor=None
                ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.
@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1). Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64) tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax). Batch of padded tone ids (B, Tmax).
spembs : Tensor, optional spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim). Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tensor, optional(int64) spk_id : Tensor, optional(int64)
Batch of speaker ids (B,) Batch of speaker ids (B,)
@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer):
ps, ps,
es, es,
is_inference=False, is_inference=False,
spembs=spembs, spk_emb=spk_emb,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id) tone_id=tone_id)
# modify mod part of groundtruth # modify mod part of groundtruth
@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer):
es: paddle.Tensor=None, es: paddle.Tensor=None,
is_inference: bool=False, is_inference: bool=False,
alpha: float=1.0, alpha: float=1.0,
spembs=None, spk_emb=None,
spk_id=None, spk_id=None,
tone_id=None) -> Sequence[paddle.Tensor]: tone_id=None) -> Sequence[paddle.Tensor]:
# forward encoder # forward encoder
@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding # integrate speaker embedding
if self.spk_embed_dim is not None: if self.spk_embed_dim is not None:
if spembs is not None: # spk_emb has a higher priority than spk_id
hs = self._integrate_with_spk_embed(hs, spembs) if spk_emb is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
elif spk_id is not None: elif spk_id is not None:
spembs = self.spk_embedding_table(spk_id) spk_emb = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spembs) hs = self._integrate_with_spk_embed(hs, spk_emb)
# integrate tone embedding # integrate tone embedding
if self.tone_embed_dim is not None: if self.tone_embed_dim is not None:
@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer):
energy: paddle.Tensor=None, energy: paddle.Tensor=None,
alpha: float=1.0, alpha: float=1.0,
use_teacher_forcing: bool=False, use_teacher_forcing: bool=False,
spembs=None, spk_emb=None,
spk_id=None, spk_id=None,
tone_id=None, tone_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional use_teacher_forcing : bool, optional
Whether to use teacher forcing. Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used. If true, groundtruth of duration, pitch and energy will be used.
spembs : Tensor, optional spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,). Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64) spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,). Batch of padded spk ids (1,).
@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer):
# input of embedding must be int64 # input of embedding must be int64
x = paddle.cast(text, 'int64') x = paddle.cast(text, 'int64')
y = speech y = speech
spemb = spembs
d, p, e = durations, pitch, energy d, p, e = durations, pitch, energy
# setup batch axis # setup batch axis
ilens = paddle.shape(x)[0] ilens = paddle.shape(x)[0]
@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer):
if y is not None: if y is not None:
ys = y.unsqueeze(0) ys = y.unsqueeze(0)
if spemb is not None: if spk_emb is not None:
spembs = spemb.unsqueeze(0) spk_emb = spk_emb.unsqueeze(0)
if tone_id is not None: if tone_id is not None:
tone_id = tone_id.unsqueeze(0) tone_id = tone_id.unsqueeze(0)
@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer):
ds = d.unsqueeze(0) if d is not None else None ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim) # (1, L, odim)
_, outs, d_outs, p_outs, e_outs = self._forward( _, outs, d_outs, p_outs, e_outs = self._forward(
xs, xs,
@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer):
ds=ds, ds=ds,
ps=ps, ps=ps,
es=es, es=es,
spembs=spembs, spk_emb=spk_emb,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id, tone_id=tone_id,
is_inference=True) is_inference=True)
@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer):
ys, ys,
is_inference=True, is_inference=True,
alpha=alpha, alpha=alpha,
spembs=spembs, spk_emb=spk_emb,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id) tone_id=tone_id)
return outs[0], d_outs[0], p_outs[0], e_outs[0] return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spembs): def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states. """Integrate speaker embedding with hidden states.
Parameters Parameters
---------- ----------
hs : Tensor hs : Tensor
Batch of hidden state sequences (B, Tmax, adim). Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim). Batch of speaker embeddings (B, spk_embed_dim).
Returns Returns
@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer):
""" """
if self.spk_embed_integration_type == "add": if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states # apply projection and then add to hidden states
spembs = self.spk_projection(F.normalize(spembs)) spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spembs.unsqueeze(1) hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat": elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection # concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand( spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, hs.shape[1], -1]) shape=[-1, hs.shape[1], -1])
hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else: else:
raise NotImplementedError("support only add or concat.") raise NotImplementedError("support only add or concat.")
@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer):
self.normalizer = normalizer self.normalizer = normalizer
self.acoustic_model = model self.acoustic_model = model
def forward(self, text, spk_id=None): def forward(self, text, spk_id=None, spk_emb=None):
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, spk_id=spk_id) text, spk_id=spk_id, spk_emb=spk_emb)
logmel = self.normalizer.inverse(normalized_mel) logmel = self.normalizer.inverse(normalized_mel)
return logmel return logmel
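A minimal usage sketch of the renamed inference interface above. Checkpoint and normalizer loading are omitted, and the phone ids, the speaker id and the 256-dim speaker embedding are placeholder assumptions, not values from this PR:

import paddle
# fastspeech2_inference: a FastSpeech2Inference wrapping a trained model and
# its output normalizer (construction omitted here)
phone_ids = paddle.to_tensor([12, 37, 5, 101, 8], dtype="int64")            # (T,)
# multi-speaker model: condition on a speaker id
mel = fastspeech2_inference(phone_ids, spk_id=paddle.to_tensor([3], dtype="int64"))
# voice cloning: condition on a speaker embedding; spk_emb has priority over spk_id
spk_emb = paddle.randn([256])   # e.g. a GE2E d-vector, spk_embed_dim assumed to be 256
mel_cloned = fastspeech2_inference(phone_ids, spk_emb=spk_emb)
print(mel.shape, mel_cloned.shape)    # (frames, odim) log-mel features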

@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict = {} losses_dict = {}
# spk_id!=None in multiple spk fastspeech2 # spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
# No explicit speaker identifier labels are used during voice cloning training.
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"], text=batch["text"],
@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations=batch["durations"], durations=batch["durations"],
pitch=batch["pitch"], pitch=batch["pitch"],
energy=batch["energy"], energy=batch["energy"],
spk_id=spk_id) spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs, after_outs=after_outs,
@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict = {} losses_dict = {}
# spk_id!=None in multiple spk fastspeech2 # spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"], text=batch["text"],
@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations=batch["durations"], durations=batch["durations"],
pitch=batch["pitch"], pitch=batch["pitch"],
energy=batch["energy"], energy=batch["energy"],
spk_id=spk_id) spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs, after_outs=after_outs,

@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self.padding_idx = 0 self.padding_idx = 0
# set_global_initializer affects the global state from here on, including create_parameter # set_global_initializer affects the global state from here on, including create_parameter
initialize(self, init_type) initialize(self, init_type)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding # get positional encoding layer type
if self.use_scaled_pos_enc else PositionalEncoding) transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define transformer encoder # define transformer encoder
if eprenet_conv_layers != 0: if eprenet_conv_layers != 0:
@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate=transformer_enc_dropout_rate, dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class, pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before, normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after, concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type, positionwise_layer_type=positionwise_layer_type,
@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn.Linear(dprenet_units, adim), ) nn.Linear(dprenet_units, adim), )
else: else:
decoder_input_layer = "linear" decoder_input_layer = "linear"
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
self.decoder = Decoder( self.decoder = Decoder(
odim=odim, # odim is needed when no prenet is used odim=odim, # odim is needed when no prenet is used
attention_dim=adim, attention_dim=adim,
@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
text_lengths: paddle.Tensor, text_lengths: paddle.Tensor,
speech: paddle.Tensor, speech: paddle.Tensor,
speech_lengths: paddle.Tensor, speech_lengths: paddle.Tensor,
spembs: paddle.Tensor=None, spk_emb: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation. """Calculate forward propagation.
@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim). Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64) speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,). Batch of the lengths of each target (B,).
spembs : Tensor, optional spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim). Batch of speaker embeddings (B, spk_embed_dim).
Returns Returns
@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs # calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
spembs) spk_emb)
# modify mod part of groundtruth # modify mod part of groundtruth
@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
ilens: paddle.Tensor, ilens: paddle.Tensor,
ys: paddle.Tensor, ys: paddle.Tensor,
olens: paddle.Tensor, olens: paddle.Tensor,
spembs: paddle.Tensor, spk_emb: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# forward encoder # forward encoder
x_masks = self._source_mask(ilens) x_masks = self._source_mask(ilens)
@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding # integrate speaker embedding
if self.spk_embed_dim is not None: if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spembs) hs = self._integrate_with_spk_embed(hs, spk_emb)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1: if self.reduction_factor > 1:
@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
self, self,
text: paddle.Tensor, text: paddle.Tensor,
speech: paddle.Tensor=None, speech: paddle.Tensor=None,
spembs: paddle.Tensor=None, spk_emb: paddle.Tensor=None,
threshold: float=0.5, threshold: float=0.5,
minlenratio: float=0.0, minlenratio: float=0.0,
maxlenratio: float=10.0, maxlenratio: float=10.0,
@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,). Input sequence of characters (T,).
speech : Tensor, optional speech : Tensor, optional
Feature sequence to extract style (N, idim). Feature sequence to extract style (N, idim).
spembs : Tensor, optional spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,). Speaker embedding vector (spk_embed_dim,).
threshold : float, optional threshold : float, optional
Threshold in inference. Threshold in inference.
@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
""" """
# input of embedding must be int64 # input of embedding must be int64
y = speech y = speech
spemb = spembs
# add eos at the last of sequence # add eos at the last of sequence
text = numpy.pad( text = numpy.pad(
@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs # get teacher forcing outputs
xs, ys = x.unsqueeze(0), y.unsqueeze(0) xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spembs = None if spemb is None else spemb.unsqueeze(0) spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.to_tensor( ilens = paddle.to_tensor(
[xs.shape[1]], dtype=paddle.int64, place=xs.place) [xs.shape[1]], dtype=paddle.int64, place=xs.place)
olens = paddle.to_tensor( olens = paddle.to_tensor(
[ys.shape[1]], dtype=paddle.int64, place=ys.place) [ys.shape[1]], dtype=paddle.int64, place=ys.place)
outs, *_ = self._forward(xs, ilens, ys, olens, spembs) outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
# get attention weights # get attention weights
att_ws = [] att_ws = []
@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
hs = hs + style_embs.unsqueeze(1) hs = hs + style_embs.unsqueeze(1)
# integrate speaker embedding # integrate speaker embedding
if self.spk_embed_dim is not None: if spk_emb is not None:
spembs = spemb.unsqueeze(0) spk_emb = spk_emb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spembs) hs = self._integrate_with_spk_embed(hs, spk_emb)
# set limits of length # set limits of length
maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
def _integrate_with_spk_embed(self, def _integrate_with_spk_embed(self,
hs: paddle.Tensor, hs: paddle.Tensor,
spembs: paddle.Tensor) -> paddle.Tensor: spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states. """Integrate speaker embedding with hidden states.
Parameters Parameters
---------- ----------
hs : Tensor hs : Tensor
Batch of hidden state sequences (B, Tmax, adim). Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim). Batch of speaker embeddings (B, spk_embed_dim).
Returns Returns
@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
""" """
if self.spk_embed_integration_type == "add": if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states # apply projection and then add to hidden states
spembs = self.projection(F.normalize(spembs)) spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spembs.unsqueeze(1) hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat": elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection # concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1], spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1],
-1) -1)
hs = self.projection(paddle.concat([hs, spembs], axis=-1)) hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
else: else:
raise NotImplementedError("support only add or concat.") raise NotImplementedError("support only add or concat.")
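FastSpeech2 and TransformerTTS integrate the speaker embedding with the same two modes renamed above. Below is a self-contained sketch of the shape logic; the dimensions 384/256 are illustrative assumptions, not values taken from the configs in this PR:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class SpkIntegration(nn.Layer):
    """Toy re-implementation of _integrate_with_spk_embed for shape checking."""

    def __init__(self, adim=384, spk_embed_dim=256, integration_type="concat"):
        super().__init__()
        self.integration_type = integration_type
        if integration_type == "add":
            # project the speaker embedding to adim, then add it to every frame
            self.projection = nn.Linear(spk_embed_dim, adim)
        else:
            # concatenate along the feature axis, then project back to adim
            self.projection = nn.Linear(adim + spk_embed_dim, adim)

    def forward(self, hs, spk_emb):
        if self.integration_type == "add":
            return hs + self.projection(F.normalize(spk_emb)).unsqueeze(1)
        spk_emb = F.normalize(spk_emb).unsqueeze(1).expand([-1, hs.shape[1], -1])
        return self.projection(paddle.concat([hs, spk_emb], axis=-1))

hs = paddle.randn([2, 50, 384])     # (B, Tmax, adim) encoder states
spk_emb = paddle.randn([2, 256])    # (B, spk_embed_dim)
print(SpkIntegration()(hs, spk_emb).shape)   # [2, 50, 384]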

@ -1,274 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(paddle.nn.Layer):
"""Conformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of decoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes return type
signature.)
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
macaron_style=False,
pos_enc_layer_type="abs_pos",
selfattention_layer_type="selfattn",
activation_type="swish",
use_cnn_module=False,
zero_triu=False,
cnn_module_kernel=31,
padding_idx=-1,
stochastic_depth_rate=0.0,
intermediate_layers=None, ):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
activation = get_activation(activation_type)
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "legacy_rel_pos":
pos_enc_class = LegacyRelPositionalEncoding
assert selfattention_layer_type == "legacy_rel_selfattn"
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = paddle.nn.Sequential(
paddle.nn.Linear(idim, attention_dim),
paddle.nn.LayerNorm(attention_dim),
paddle.nn.Dropout(dropout_rate),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
self.embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
self.embed = paddle.nn.Sequential(
paddle.nn.Embedding(
idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, paddle.nn.Layer):
self.embed = paddle.nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = paddle.nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
# self-attention module definition
if selfattention_layer_type == "selfattn":
logging.info("encoder self-attention layer type = self-attention")
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "legacy_rel_selfattn":
assert pos_enc_layer_type == "legacy_rel_pos"
encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
logging.info(
"encoder self-attention layer type = relative self-attention")
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
# feed-forward module definition
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation, )
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
self.intermediate_layers = intermediate_layers
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks

@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def __init__(self, n_head, n_feat, dropout_rate): def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an MultiHeadedAttention object.""" """Construct an MultiHeadedAttention object."""
super(MultiHeadedAttention, self).__init__() super().__init__()
assert n_feat % n_head == 0 assert n_feat % n_head == 0
# We assume d_v always equals d_k # We assume d_v always equals d_k
self.d_k = n_feat // n_head self.d_k = n_feat // n_head
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k). Transformed value tensor (#batch, n_head, time2, d_k).
""" """
n_batch = query.shape[0] n_batch = paddle.shape(query)[0]
q = paddle.reshape( q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k]) self.linear_q(query), [n_batch, -1, self.h, self.d_k])
@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model) Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2). weighted by the attention score (#batch, time1, time2).
""" """
n_batch = value.shape[0] n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1) softmax = paddle.nn.Softmax(axis=-1)
if mask is not None: if mask is not None:
mask = mask.unsqueeze(1) mask = mask.unsqueeze(1)
@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model) # (batch, time1, d_model)
x = (paddle.reshape( x = (paddle.reshape(
x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k)))
# (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model) return self.linear_out(x)
def forward(self, query, key, value, mask=None): def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention. """Compute scaled dot product attention.
@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(0, 1, 3, 2))) / math.sqrt(self.d_k) (0, 1, 3, 2))) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask) return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
self.zero_triu = zero_triu
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
self.pos_bias_v = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
def rel_shift(self, x):
"""Compute relative positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, head, time1, 2*time1-1).
time1 means the length of query vector.
Returns
----------
paddle.Tensor
Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
return x
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
pos_emb : paddle.Tensor
Positional embedding tensor
(#batch, 2*time1-1, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or
(#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
# (batch, head, 2*time1-1, d_k)
p = p.transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
# compute matrix b and matrix d
# (batch, head, time1, 2*time1-1)
matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
matrix_bd = self.rel_shift(matrix_bd)
# (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
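A small standalone sketch of the rel_shift realignment above, using toy sizes; the values are chosen so that each entry encodes its original column index, which makes the shift visible:

import paddle

B, H, T = 1, 1, 3
# fake matrix_bd scores: entry value = its relative-position column 0 .. 2*T-2
x = paddle.arange(2 * T - 1, dtype="float32").reshape([1, 1, 1, 2 * T - 1])
x = paddle.expand(x, [B, H, T, 2 * T - 1])                       # (1, 1, 3, 5)
zero_pad = paddle.zeros([B, H, T, 1])
x_padded = paddle.concat([zero_pad, x], axis=-1).reshape([B, H, 2 * T, T])
shifted = x_padded[:, :, 1:].reshape([B, H, T, 2 * T - 1])[:, :, :, :T]
print(shifted[0, 0].numpy())
# [[2. 3. 4.]
#  [1. 2. 3.]
#  [0. 1. 2.]]
# each query row now reads its own diagonal window of relative positions,
# which is the Transformer-XL style realignment applied to matrix_bd above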

@ -139,3 +139,76 @@ class ScaledPositionalEncoding(PositionalEncoding):
T = paddle.shape(x)[1] T = paddle.shape(x)[1]
x = x + self.alpha * self.pe[:, :T] x = x + self.alpha * self.pe[:, :T]
return self.dropout(x) return self.dropout(x)
class RelPositionalEncoding(paddle.nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
"""Construct an PositionalEncoding object."""
super(RelPositionalEncoding, self).__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = paddle.nn.Dropout(p=dropout_rate)
self.pe = None
self.dtype = dtype
self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len)))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1:
return
# Suppose `i` denotes the position of the query vector and `j` the
# position of the key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
x_shape = paddle.shape(x)
pe_positive = paddle.zeros([x_shape[1], self.d_model])
pe_negative = paddle.zeros([x_shape[1], self.d_model])
position = paddle.arange(0, x_shape[1], dtype=self.dtype).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
-(math.log(10000.0) / self.d_model))
pe_positive[:, 0::2] = paddle.sin(position * div_term)
pe_positive[:, 1::2] = paddle.cos(position * div_term)
pe_negative[:, 0::2] = paddle.sin(-1 * position * div_term)
pe_negative[:, 1::2] = paddle.cos(-1 * position * div_term)
# Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive = paddle.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = paddle.concat([pe_positive, pe_negative], axis=1)
self.pe = pe
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
return self.dropout(x), self.dropout(pos_emb)
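A quick shape check for the relative positional encoding added above (the import path matches the one used elsewhere in this diff):

import paddle
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding

pos_enc = RelPositionalEncoding(d_model=256, dropout_rate=0.0)
x = paddle.randn([2, 7, 256])           # (batch, time, d_model)
x_scaled, pos_emb = pos_enc(x)
# the embedding covers offsets -(T-1) .. T-1, hence 2*T - 1 positions
print(x_scaled.shape, pos_emb.shape)    # [2, 7, 256] [1, 13, 256]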

@ -12,15 +12,26 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union
from paddle import nn from paddle import nn
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(nn.Layer): class Encoder(nn.Layer):
@ -46,9 +57,6 @@ class Encoder(nn.Layer):
Dropout rate in attention. Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer] input_layer : Union[str, paddle.nn.Layer]
Input layer type. Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before : bool normalize_before : bool
Whether to use layer_norm before the first block. Whether to use layer_norm before the first block.
concat_after : bool concat_after : bool
@ -60,98 +68,137 @@ class Encoder(nn.Layer):
"linear", "conv1d", or "conv1d-linear". "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer. Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str selfattention_layer_type : str
Encoder attention layer type. Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int padding_idx : int
Padding idx for input_layer=embed. Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes return type
signature.)
encoder_type: str
"transformer", or "conformer".
""" """
def __init__( def __init__(self,
self, idim: int,
idim, attention_dim: int=256,
attention_dim=256, attention_heads: int=4,
attention_heads=4, linear_units: int=2048,
linear_units=2048, num_blocks: int=6,
num_blocks=6, dropout_rate: float=0.1,
dropout_rate=0.1, positional_dropout_rate: float=0.1,
positional_dropout_rate=0.1, attention_dropout_rate: float=0.0,
attention_dropout_rate=0.0, input_layer: str="conv2d",
input_layer="conv2d", normalize_before: bool=True,
pos_enc_class=PositionalEncoding, concat_after: bool=False,
normalize_before=True, positionwise_layer_type: str="linear",
concat_after=False, positionwise_conv_kernel_size: int=1,
positionwise_layer_type="linear", macaron_style: bool=False,
positionwise_conv_kernel_size=1, pos_enc_layer_type: str="abs_pos",
selfattention_layer_type="selfattn", selfattention_layer_type: str="selfattn",
padding_idx=-1, ): activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None,
encoder_type: str="transformer"):
"""Construct an Encoder object.""" """Construct an Encoder object."""
super(Encoder, self).__init__() super().__init__()
activation = get_activation(activation_type)
pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
selfattention_layer_type)
self.encoder_type = encoder_type
self.conv_subsampling_factor = 1 self.conv_subsampling_factor = 1
if input_layer == "linear": self.embed = self.get_embed(
self.embed = nn.Sequential( idim=idim,
nn.Linear(idim, attention_dim, bias_attr=True), input_layer=input_layer,
nn.LayerNorm(attention_dim), attention_dim=attention_dim,
nn.Dropout(dropout_rate), pos_enc_class=pos_enc_class,
nn.ReLU(), dropout_rate=dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), ) positional_dropout_rate=positional_dropout_rate,
elif input_layer == "embed": padding_idx=padding_idx)
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before self.normalize_before = normalize_before
# self-attention module definition
encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
selfattention_layer_type=selfattention_layer_type,
attention_heads=attention_heads,
attention_dim=attention_dim,
attention_dropout_rate=attention_dropout_rate,
zero_triu=zero_triu,
pos_enc_layer_type=pos_enc_layer_type)
# feed-forward module definition
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type, positionwise_layer_type, attention_dim, linear_units, dropout_rate,
attention_dim, positionwise_conv_kernel_size, activation)
linear_units,
dropout_rate,
positionwise_conv_kernel_size, )
if selfattention_layer_type in [
"selfattn",
"rel_selfattn",
"legacy_rel_selfattn",
]:
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )
] * num_blocks
else: # convolution module definition
raise NotImplementedError(selfattention_layer_type) convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
if self.encoder_type == "transformer":
self.encoders = repeat( self.encoders = repeat(
num_blocks, num_blocks,
lambda lnum: EncoderLayer( lambda lnum: EncoderLayer(
attention_dim, attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]), encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args),
dropout_rate, dropout_rate,
normalize_before, normalize_before,
concat_after, ), ) concat_after, ), )
elif self.encoder_type == "conformer":
self.encoders = repeat(
num_blocks,
lambda lnum: ConformerEncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.intermediate_layers = intermediate_layers
else:
raise NotImplementedError("Support only linear or conv1d.")
if self.normalize_before: if self.normalize_before:
self.after_norm = nn.LayerNorm(attention_dim) self.after_norm = LayerNorm(attention_dim)
def get_positionwise_layer( def get_positionwise_layer(self,
self, positionwise_layer_type: str="linear",
positionwise_layer_type="linear", attention_dim: int=256,
attention_dim=256, linear_units: int=2048,
linear_units=2048, dropout_rate: float=0.1,
dropout_rate=0.1, positionwise_conv_kernel_size: int=1,
positionwise_conv_kernel_size=1, ): activation: nn.Layer=nn.ReLU()):
"""Define positionwise layer.""" """Define positionwise layer."""
if positionwise_layer_type == "linear": if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units, positionwise_layer_args = (attention_dim, linear_units,
dropout_rate) dropout_rate, activation)
elif positionwise_layer_type == "conv1d": elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units, positionwise_layer_args = (attention_dim, linear_units,
@ -166,6 +213,81 @@ class Encoder(nn.Layer):
raise NotImplementedError("Support only linear or conv1d.") raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args return positionwise_layer, positionwise_layer_args
def get_encoder_selfattn_layer(self,
selfattention_layer_type: str="selfattn",
attention_heads: int=4,
attention_dim: int=256,
attention_dropout_rate: float=0.0,
zero_triu: bool=False,
pos_enc_layer_type: str="abs_pos"):
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
return encoder_selfattn_layer, encoder_selfattn_layer_args
def get_pos_enc_class(self,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn"):
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
return pos_enc_class
def get_embed(self,
idim,
input_layer="conv2d",
attention_dim: int=256,
pos_enc_class=PositionalEncoding,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
padding_idx: int=-1):
if input_layer == "linear":
embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
return embed
def forward(self, xs, masks): def forward(self, xs, masks):
"""Encode input sequence. """Encode input sequence.
@ -174,21 +296,55 @@ class Encoder(nn.Layer):
xs : paddle.Tensor xs : paddle.Tensor
Input tensor (#batch, time, idim). Input tensor (#batch, time, idim).
masks : paddle.Tensor masks : paddle.Tensor
Mask tensor (#batch, time). Mask tensor (#batch, 1, time).
Returns Returns
---------- ----------
paddle.Tensor paddle.Tensor
Output tensor (#batch, time, attention_dim). Output tensor (#batch, time, attention_dim).
paddle.Tensor paddle.Tensor
Mask tensor (#batch, time). Mask tensor (#batch, 1, time).
""" """
if self.encoder_type == "transformer":
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
elif self.encoder_type == "conformer":
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs) xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks) xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before: if self.normalize_before:
xs = self.after_norm(xs) xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks return xs, masks
else:
raise ValueError(f"{self.encoder_type} is not supported.")
def forward_one_step(self, xs, masks, cache=None): def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame. """Encode input frame.

@ -18,38 +18,6 @@ import paddle
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class TooShortUttError(Exception):
"""Raised when the utt is too short for subsampling.
Parameters
----------
message : str
Message for error catch
actual_size : int
the short size that cannot pass the subsampling
limit : int
the limit size for subsampling
"""
def __init__(self, message, actual_size, limit):
"""Construct a TooShortUttError for error handler."""
super().__init__(message)
self.actual_size = actual_size
self.limit = limit
def check_short_utt(ins, size):
"""Check if the utterance is too short for subsampling."""
if isinstance(ins, Conv2dSubsampling2) and size < 3:
return True, 3
if isinstance(ins, Conv2dSubsampling) and size < 7:
return True, 7
if isinstance(ins, Conv2dSubsampling6) and size < 11:
return True, 11
if isinstance(ins, Conv2dSubsampling8) and size < 15:
return True, 15
return False, -1
class Conv2dSubsampling(paddle.nn.Layer): class Conv2dSubsampling(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length). """Convolutional 2D subsampling (to 1/4 length).
Parameters Parameters
@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
raise NotImplementedError( raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).") "Support only `-1` (for `reset_parameters`).")
return self.out[key] return self.out[key]
class Conv2dSubsampling2(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/2 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling2 object."""
super(Conv2dSubsampling2, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 1),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 2.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 2.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:1]
def __getitem__(self, key):
"""Get item.
When reset_parameters() is called, if use_scaled_pos_enc is used,
return the positioning encoding.
"""
if key != -1:
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling6(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/6 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling6 object."""
super(Conv2dSubsampling6, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 5, 3),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 6.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 6.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-4:3]
class Conv2dSubsampling8(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/8 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling8 object."""
super(Conv2dSubsampling8, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 8.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 8.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
