commit bc0dd51149
Merge branch 'develop' of github.com:PaddlePaddle/PaddleSpeech into HEAD
Branch: pull/1015/head
Author: TianYuan (committed by root)

@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then

@ -24,7 +24,7 @@ f0max: 400 # Maximum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
num_workers: 2
###########################################################
@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=${input}/wav \
--output=${preprocess_path}/embed \
--checkpoint_path=${ge2e_ckpt_path}

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -0,0 +1,109 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate (Hz).
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
encoder_type: conformer # encoder type
decoder_type: conformer # decoder type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 1000
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure divisible by hop_size.
pin_memory: true # Whether to pin memory in Paddle DataLoader.
num_workers: 4 # Number of workers in Paddle DataLoader.
num_workers: 2 # Number of workers in Paddle DataLoader.
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true # Whether to allow caching the dataset. If true, it requires extra CPU memory.

@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file

@ -1,6 +1,8 @@
# https://yaml.org/type/float.html
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:

@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# export ckpt avg_n
./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# # export ckpt avg_n
# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/cacu_perplexity.sh || exit -1

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}

@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -126,8 +126,12 @@ decoders_module = [
]
setup(
name='swig_decoders',
version='1.1',
description="""CTC decoders""",
name='paddlespeech_ctcdecoders',
version='0.0.1a',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
url="https://github.com/PaddlePaddle/PaddleSpeech",
license='Apache 2.0',
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
py_modules=['swig_decoders'])

@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
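The tightened guard means configs may omit cmvn_file entirely (matching the transformer config above, which now declares it with an empty value). A standalone sketch of the before/after behavior, using hypothetical config dicts:

def has_cmvn(configs: dict) -> bool:
    # new check: tolerate a missing 'cmvn_file' key as well as an explicit null
    return 'cmvn_file' in configs and configs['cmvn_file'] is not None

assert has_cmvn({"cmvn_file": "data/mean_std.json", "cmvn_file_type": "json"})
assert not has_cmvn({"cmvn_file": None})
assert not has_cmvn({})  # the old check raised KeyError here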

@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def fastspeech2_multi_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
text = batch_sequences(text)
pitch = batch_sequences(pitch)
@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
energy = paddle.to_tensor(energy)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
spk_id = paddle.to_tensor(spk_id)
batch = {
"text": text,
@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
"speech": speech,
"speech_lengths": speech_lengths,
"pitch": pitch,
"energy": energy,
"spk_id": spk_id
"energy": energy
}
# spk_emb has a higher priority than spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = batch_sequences(spk_emb)
spk_emb = paddle.to_tensor(spk_emb)
batch["spk_emb"] = spk_emb
elif "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
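
A self-contained illustration of the new priority rule above (spk_emb wins over spk_id when both fields are present); this mini collate mirrors only the speaker-related tail of fastspeech2_multi_spk_batch_fn:

import numpy as np

def collate_spk(examples):
    # mirrors the spk_emb/spk_id branch added above
    batch = {}
    if "spk_emb" in examples[0]:  # spk_emb has a higher priority than spk_id
        batch["spk_emb"] = np.stack(
            [np.array(e["spk_emb"], dtype=np.float32) for e in examples])
    elif "spk_id" in examples[0]:
        batch["spk_id"] = np.array(
            [e["spk_id"] for e in examples], dtype=np.int64)
    return batch

# Both fields present -> only "spk_emb" ends up in the batch.
print(collate_spk([{"spk_emb": [0.1, 0.2], "spk_id": 3},
                   {"spk_emb": [0.3, 0.4], "spk_id": 5}]))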

@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -167,6 +167,10 @@ def main():
"pitch": str(pitch_path),
"energy": str(energy_path)
}
# add spk_emb for voice cloning
if "spk_emb" in item:
record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"

@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
"energy": str(energy_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
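The lookup above implies a per-speaker layout under spk_emb_dir (speaker subdirectories holding one .npy embedding per utterance). A hedged helper showing the same path construction; the directory and id names below are illustrative only:

from pathlib import Path

def lookup_spk_emb(spk_emb_dir: Path, speaker: str, utt_id: str):
    # mirrors the guard in process_sentence above: spk_emb_dir/<speaker>/<utt_id>.npy
    embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
    return str(embed_path) if embed_path.is_file() else None

# e.g. lookup_spk_emb(Path("dump/embed"), "SSB0005", "SSB00050001")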
@ -127,13 +136,14 @@ def process_sentences(config,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, pitch_extractor,
energy_extractor, cut_sil)
energy_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
@ -144,7 +154,7 @@ def process_sentences(config,
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil)
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -202,6 +212,11 @@ def main():
default=True,
help="whether to cut silence at the edges of the audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory containing speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
@ -211,6 +226,11 @@ def main():
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
@ -251,6 +271,7 @@ def main():
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
@ -317,7 +338,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
@ -327,7 +349,8 @@ def main():
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
@ -338,7 +361,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":

@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields = ["utt_id", "text"]
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
test_dataset = DataTable(data=test_metadata, fields=fields)
@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for datum in test_dataset:
utt_id = datum["utt_id"]
text = paddle.to_tensor(datum["text"])
if "spk_id" in datum:
spk_emb = None
spk_id = None
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
else:
spk_id = None
with paddle.no_grad():
wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
wav = pwg_inference(
fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
@ -142,6 +148,15 @@ def main():
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
parser.add_argument("--test-metadata", type=str, help="test metadata.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(

@ -61,18 +61,24 @@ def train_sp(args, config):
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
# the DataLoader logger is too verbose
logging.getLogger("DataLoader").disabled = True
@ -83,17 +89,13 @@ def train_sp(args, config):
train_dataset = DataTable(
data=train_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
# collate function and dataloader
@ -127,10 +129,7 @@ def train_sp(args, config):
odim = config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
**config["model"])
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
@ -184,6 +183,15 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
args = parser.parse_args()
with open(args.config) as f:

@ -1,38 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
def cycle(iterable):
# cycle('ABCD') --> A B C D A B C D A B C D ...
saved = []
for element in iterable:
yield element
saved.append(element)
while saved:
for element in saved:
yield element
def random_cycle(iterable):
# cycle('ABCD') --> A B C D B C D A A D B C ...
saved = []
for element in iterable:
yield element
saved.append(element)
random.shuffle(saved)
while saved:
for element in saved:
yield element
random.shuffle(saved)
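These helpers are deleted from t2s here (the GE2E code moves under paddlespeech.vector, per the BIN_DIR and import changes elsewhere in this diff). For reference, cycle matches itertools.cycle, while random_cycle yields one in-order pass and then reshuffled passes forever; a standalone check of the former:

import itertools

# cycle('ABCD') --> A B C D A B C D ... (identical to itertools.cycle)
assert list(itertools.islice(itertools.cycle("ABCD"), 8)) == list("ABCDABCD")
# random_cycle has no itertools equivalent: the order within each later pass is shuffled.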

@ -1,131 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset
from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):
"""A two-level directory that contains mel spectrograms in *.npy format.
An example file structure tree is shown below. We prefer to preprocess
raw datasets and organize them like this.
dataset_root/
speaker1/
utterance1.npy
utterance2.npy
utterance3.npy
speaker2/
utterance1.npy
utterance2.npy
utterance3.npy
"""
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
speaker_utterances = {
speaker_dir: list(speaker_dir.glob("*.npy"))
for speaker_dir in speaker_dirs
}
self.speaker_dirs = speaker_dirs
self.speaker_to_utterances = speaker_utterances
# meta data
self.num_speakers = len(self.speaker_dirs)
self.num_utterances = np.sum(
len(utterances)
for speaker, utterances in self.speaker_to_utterances.items())
def get_example_by_index(self, speaker_index, utterance_index):
speaker_dir = self.speaker_dirs[speaker_index]
fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
return self[fpath]
def __getitem__(self, fpath):
return np.load(fpath)
def __len__(self):
return int(self.num_utterances)
class MultiSpeakerSampler(BatchSampler):
"""A stratified sampler designed for the speaker verification task.
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def __init__(self,
dataset: MultiSpeakerMelDataset,
speakers_per_batch: int,
utterances_per_speaker: int):
self._speakers = list(dataset.speaker_dirs)
self._speaker_to_utterances = dataset.speaker_to_utterances
self.speakers_per_batch = speakers_per_batch
self.utterances_per_speaker = utterances_per_speaker
def __iter__(self):
# yield list of Paths
speaker_generator = iter(random_cycle(self._speakers))
speaker_utterances_generator = {
s: iter(random_cycle(us))
for s, us in self._speaker_to_utterances.items()
}
while True:
speakers = []
for _ in range(self.speakers_per_batch):
speakers.append(next(speaker_generator))
utterances = []
for s in speakers:
us = speaker_utterances_generator[s]
for _ in range(self.utterances_per_speaker):
utterances.append(next(us))
yield utterances
class RandomClip(object):
def __init__(self, frames):
self.frames = frames
def __call__(self, spec):
# spec [T, C]
T = spec.shape[0]
start = random.randint(0, T - self.frames)
return spec[start:start + self.frames, :]
class Collate(object):
def __init__(self, num_frames):
self.random_crop = RandomClip(num_frames)
def __call__(self, examples):
frame_clips = [self.random_crop(mel) for mel in examples]
batched_clips = np.stack(frame_clips)
return batched_clips
if __name__ == "__main__":
mydataset = MultiSpeakerMelDataset(
Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
print(mydataset.get_example_by_index(0, 10))

@ -1,123 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase
class Ge2eExperiment(ExperimentBase):
def setup_model(self):
config = self.config
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
config.model.hidden_size,
config.model.embedding_size)
optimizer = Adam(
config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
self.model = DataParallel(model) if self.parallel else model
self.model_core = model
self.optimizer = optimizer
def setup_dataloader(self):
config = self.config
train_dataset = MultiSpeakerMelDataset(self.args.data)
sampler = MultiSpeakerSampler(train_dataset,
config.training.speakers_per_batch,
config.training.utterances_per_speaker)
train_loader = DataLoader(
train_dataset,
batch_sampler=sampler,
collate_fn=Collate(config.data.partial_n_frames),
num_workers=16)
self.train_dataset = train_dataset
self.train_loader = train_loader
def train_batch(self):
start = time.time()
batch = self.read_batch()
data_loader_time = time.time() - start
self.optimizer.clear_grad()
self.model.train()
specs = batch
loss, eer = self.model(specs, self.config.training.speakers_per_batch)
loss.backward()
self.model_core.do_gradient_ops()
self.optimizer.step()
iteration_time = time.time() - start
# logging
loss_value = float(loss)
msg = "Rank: {}, ".format(dist.get_rank())
msg += "step: {}, ".format(self.iteration)
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
iteration_time)
msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar("param/w",
float(self.model_core.similarity_weight),
self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)
def valid(self):
pass
def main_sp(config, args):
exp = Ge2eExperiment(config, args)
exp.setup()
exp.resume_or_load()
exp.run()
def main(config, args):
if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else:
main_sp(config, args)
if __name__ == "__main__":
config = get_cfg_defaults()
parser = default_argument_parser()
args = parser.parse_args()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
print(args)
main(config, args)

@ -20,14 +20,14 @@ import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def voice_cloning(args):

@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder
from paddlespeech.t2s.modules.transformer.encoder import Encoder
class FastSpeech2(nn.Layer):
@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer):
postnet_layers: int=5,
postnet_chans: int=512,
postnet_filts: int=5,
postnet_dropout_rate: float=0.5,
positionwise_layer_type: str="conv1d",
positionwise_conv_kernel_size: int=1,
use_scaled_pos_enc: bool=True,
@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer):
reduction_factor: int=1,
encoder_type: str="transformer",
decoder_type: str="transformer",
# for transformer
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
# for conformer
conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn",
conformer_activation_type: str="swish",
use_macaron_style_in_conformer: bool=True,
use_cnn_in_conformer: bool=True,
zero_triu: bool=False,
conformer_enc_kernel_size: int=7,
conformer_dec_kernel_size: int=31,
# duration predictor
duration_predictor_layers: int=2,
duration_predictor_chans: int=384,
duration_predictor_kernel_size: int=3,
duration_predictor_dropout_rate: float=0.1,
# energy predictor
energy_predictor_layers: int=2,
energy_predictor_chans: int=384,
@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout: float=0.5,
stop_gradient_from_pitch_predictor: bool=False,
# spk emb
num_speakers: int=None,
spk_num: int=None,
spk_embed_dim: int=None,
spk_embed_integration_type: str="add",
# tone emb
num_tones: int=None,
# tone emb
tone_num: int=None,
tone_embed_dim: int=None,
tone_embed_integration_type: str="add",
# training related
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
duration_predictor_dropout_rate: float=0.1,
postnet_dropout_rate: float=0.5,
init_type: str="xavier_uniform",
init_enc_alpha: float=1.0,
init_dec_alpha: float=1.0,
use_masking: bool=False,
use_weighted_masking: bool=False, ):
"""Initialize FastSpeech2 module."""
init_dec_alpha: float=1.0, ):
"""Initialize FastSpeech2 module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
adim : int
Attention dimension.
aheads : int
Number of attention heads.
elayers : int
Number of encoder layers.
eunits : int
Number of encoder hidden units.
dlayers : int
Number of decoder layers.
dunits : int
Number of decoder hidden units.
postnet_layers : int
Number of postnet layers.
postnet_chans : int
Number of postnet channels.
postnet_filts : int
Kernel size of postnet.
postnet_dropout_rate : float
Dropout rate in postnet.
use_scaled_pos_enc : bool
Whether to use trainable scaled pos encoding.
use_batch_norm : bool
Whether to use batch normalization in encoder prenet.
encoder_normalize_before : bool
Whether to apply layernorm layer before encoder block.
decoder_normalize_before : bool
Whether to apply layernorm layer before
decoder block.
encoder_concat_after : bool
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after : bool
Whether to concatenate attention layer's input and output in decoder.
reduction_factor : int
Reduction factor.
encoder_type : str
Encoder type ("transformer" or "conformer").
decoder_type : str
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate : float
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate : float
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate : float
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate : float
Dropout rate in decoder except attention and positional encoding.
transformer_dec_positional_dropout_rate : float
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate : float
Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type : str
Pos encoding layer type in conformer.
conformer_self_attn_layer_type : str
Self-attention layer type in conformer.
conformer_activation_type : str
Activation function type in conformer.
use_macaron_style_in_conformer : bool
Whether to use macaron style FFN.
use_cnn_in_conformer : bool
Whether to use CNN in conformer.
zero_triu : bool
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size : int
Kernel size of encoder conformer.
conformer_dec_kernel_size : int
Kernel size of decoder conformer.
duration_predictor_layers : int
Number of duration predictor layers.
duration_predictor_chans : int
Number of duration predictor channels.
duration_predictor_kernel_size : int
Kernel size of duration predictor.
duration_predictor_dropout_rate : float
Dropout rate in duration predictor.
pitch_predictor_layers : int
Number of pitch predictor layers.
pitch_predictor_chans : int
Number of pitch predictor channels.
pitch_predictor_kernel_size : int
Kernel size of pitch predictor.
pitch_predictor_dropout : float
Dropout rate in pitch predictor.
pitch_embed_kernel_size : int
Kernel size of pitch embedding.
pitch_embed_dropout : float
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor : bool
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers : int
Number of energy predictor layers.
energy_predictor_chans : int
Number of energy predictor channels.
energy_predictor_kernel_size : int
Kernel size of energy predictor.
energy_predictor_dropout : float
Dropout rate in energy predictor.
energy_embed_kernel_size : int
Kernel size of energy embedding.
energy_embed_dropout : float
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor : bool
Whether to stop gradient from energy predictor to encoder.
spk_num : Optional[int]
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type : str
How to integrate speaker embedding.
tone_num : Optional[int]
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim : Optional[int]
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type : str
How to integrate tone embedding.
init_type : str
How to initialize transformer parameters.
init_enc_alpha : float
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float
Initial value of alpha in scaled pos encoding of the decoder.
"""
assert check_argument_types()
super().__init__()
@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer):
# initialize parameters
initialize(self, init_type)
if self.spk_embed_dim is not None:
if spk_num and self.spk_embed_dim:
self.spk_embedding_table = nn.Embedding(
num_embeddings=num_speakers,
num_embeddings=spk_num,
embedding_dim=self.spk_embed_dim,
padding_idx=self.padding_idx)
if self.tone_embed_dim is not None:
self.tone_embedding_table = nn.Embedding(
num_embeddings=num_tones,
num_embeddings=tone_num,
embedding_dim=self.tone_embed_dim,
padding_idx=self.padding_idx)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
# get positional encoding layer type
transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define encoder
encoder_input_layer = nn.Embedding(
num_embeddings=idim,
embedding_dim=adim,
padding_idx=self.padding_idx)
# add encoder type here
# check that the model still runs end to end
# remember to update transformer tts accordingly
if encoder_type == "transformer":
self.encoder = TransformerEncoder(
print("encoder_type is transformer")
self.encoder = Encoder(
idim=idim,
attention_dim=adim,
attention_heads=aheads,
linear_units=eunits,
num_blocks=elayers,
input_layer=encoder_input_layer,
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
encoder_type=encoder_type)
elif encoder_type == "conformer":
print("encoder_type is conformer")
self.encoder = Encoder(
idim=idim,
attention_dim=adim,
attention_heads=aheads,
@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_enc_kernel_size,
zero_triu=zero_triu,
encoder_type=encoder_type)
else:
raise ValueError(f"{encoder_type} is not supported.")
@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer):
# NOTE: we use encoder as decoder
# because fastspeech's decoder is the same as encoder
if decoder_type == "transformer":
self.decoder = TransformerEncoder(
print("decoder_type is transformer")
self.decoder = Encoder(
idim=0,
attention_dim=adim,
attention_heads=aheads,
@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate,
pos_enc_class=pos_enc_class,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
encoder_type=decoder_type)
elif decoder_type == "conformer":
print("decoder_type is conformer")
self.decoder = Encoder(
idim=0,
attention_dim=adim,
attention_heads=aheads,
linear_units=dunits,
num_blocks=dlayers,
input_layer=None,
dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate,
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_dec_kernel_size,
encoder_type=decoder_type)
else:
raise ValueError(f"{decoder_type} is not supported.")
@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer):
pitch: paddle.Tensor,
energy: paddle.Tensor,
tone_id: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
spk_id: paddle.Tensor=None
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tensor, optional(int64)
Batch of speaker ids (B,)
@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer):
ps,
es,
is_inference=False,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
# modify mod part of groundtruth
@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer):
es: paddle.Tensor=None,
is_inference: bool=False,
alpha: float=1.0,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None) -> Sequence[paddle.Tensor]:
# forward encoder
@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
if spembs is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
# spk_emb has a higher priority than spk_id
if spk_emb is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
elif spk_id is not None:
spembs = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spembs)
spk_emb = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# integrate tone embedding
if self.tone_embed_dim is not None:
@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer):
energy: paddle.Tensor=None,
alpha: float=1.0,
use_teacher_forcing: bool=False,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,).
@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer):
# input of embedding must be int64
x = paddle.cast(text, 'int64')
y = speech
spemb = spembs
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer):
if y is not None:
ys = y.unsqueeze(0)
if spemb is not None:
spembs = spemb.unsqueeze(0)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
if tone_id is not None:
tone_id = tone_id.unsqueeze(0)
@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer):
ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer):
ds=ds,
ps=ps,
es=es,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id,
is_inference=True)
@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer):
ys,
is_inference=True,
alpha=alpha,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spembs):
def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.spk_projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, hs.shape[1], -1])
hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1))
hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer):
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, spk_id=None):
def forward(self, text, spk_id=None, spk_emb=None):
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, spk_id=spk_id)
text, spk_id=spk_id, spk_emb=spk_emb)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
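
A hedged usage sketch of the widened inference entry point above (model loading omitted; the wrapper name and embedding path are illustrative, and spk_emb takes priority over spk_id inside _forward):

import numpy as np
import paddle

def synthesize(am_inference, text_ids, spk_id=None, spk_emb_path=None):
    # Thin wrapper over FastSpeech2Inference.forward above.
    spk_emb = None
    if spk_emb_path is not None:  # voice cloning: use the embedding, drop the id
        spk_emb = paddle.to_tensor(np.load(spk_emb_path))
        spk_id = None
    return am_inference(text_ids, spk_id=spk_id, spk_emb=spk_emb)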

@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
# No explicit speaker identifier labels are used during voice cloning training.
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,
@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,

@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self.padding_idx = 0
# set_global_initializer affects everything created afterwards globally, including create_parameter
initialize(self, init_type)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
# get positional encoding layer type
transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define transformer encoder
if eprenet_conv_layers != 0:
@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn.Linear(dprenet_units, adim), )
else:
decoder_input_layer = "linear"
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
self.decoder = Decoder(
odim=odim, # odim is needed when no prenet is used
attention_dim=adim,
@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
spembs)
spk_emb)
# modify mod part of groundtruth
@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spembs: paddle.Tensor,
spk_emb: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# forward encoder
x_masks = self._source_mask(ilens)
@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
self,
text: paddle.Tensor,
speech: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
Threshold in inference.
@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
"""
# input of embedding must be int64
y = speech
spemb = spembs
# add eos at the last of sequence
text = numpy.pad(
@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spembs = None if spemb is None else spemb.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.to_tensor(
[xs.shape[1]], dtype=paddle.int64, place=xs.place)
olens = paddle.to_tensor(
[ys.shape[1]], dtype=paddle.int64, place=ys.place)
outs, *_ = self._forward(xs, ilens, ys, olens, spembs)
outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
# get attention weights
att_ws = []
@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
hs = hs + style_embs.unsqueeze(1)
# integrate speaker embedding
if self.spk_embed_dim is not None:
spembs = spemb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spembs)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# set limits of length
maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spembs: paddle.Tensor) -> paddle.Tensor:
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spembs], axis=-1))
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")

@ -1,274 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(paddle.nn.Layer):
"""Conformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of encoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes the
return type signature).
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
macaron_style=False,
pos_enc_layer_type="abs_pos",
selfattention_layer_type="selfattn",
activation_type="swish",
use_cnn_module=False,
zero_triu=False,
cnn_module_kernel=31,
padding_idx=-1,
stochastic_depth_rate=0.0,
intermediate_layers=None, ):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
activation = get_activation(activation_type)
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "legacy_rel_pos":
pos_enc_class = LegacyRelPositionalEncoding
assert selfattention_layer_type == "legacy_rel_selfattn"
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = paddle.nn.Sequential(
paddle.nn.Linear(idim, attention_dim),
paddle.nn.LayerNorm(attention_dim),
paddle.nn.Dropout(dropout_rate),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
self.embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
self.embed = paddle.nn.Sequential(
paddle.nn.Embedding(
idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, paddle.nn.Layer):
self.embed = paddle.nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = paddle.nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
# self-attention module definition
if selfattention_layer_type == "selfattn":
logging.info("encoder self-attention layer type = self-attention")
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "legacy_rel_selfattn":
assert pos_enc_layer_type == "legacy_rel_pos"
encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
logging.info(
"encoder self-attention layer type = relative self-attention")
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
# feed-forward module definition
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation, )
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
self.intermediate_layers = intermediate_layers
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
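# Aside: the per-layer stochastic depth schedule used in the repeat() call above
# grows linearly with depth; a quick standalone check (values are assumptions):
num_blocks = 6
stochastic_depth_rate = 0.3
skip_probs = [stochastic_depth_rate * float(1 + lnum) / num_blocks
              for lnum in range(num_blocks)]
print(skip_probs)   # [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] -> deepest layer skipped most often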

@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an MultiHeadedAttention object."""
super(MultiHeadedAttention, self).__init__()
super().__init__()
assert n_feat % n_head == 0
# We assume d_v always equals d_k
self.d_k = n_feat // n_head
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = query.shape[0]
n_batch = paddle.shape(query)[0]
q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
"""
n_batch = value.shape[0]
n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1)
if mask is not None:
mask = mask.unsqueeze(1)
@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model)
x = (paddle.reshape(
x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k)))
return self.linear_out(x) # (batch, time1, d_model)
# (batch, time1, d_model)
return self.linear_out(x)
def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention.
@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(0, 1, 3, 2))) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
self.zero_triu = zero_triu
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable biases are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
self.pos_bias_v = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
def rel_shift(self, x):
"""Compute relative positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, head, time1, 2*time1-1).
time1 means the length of query vector.
Returns
----------
paddle.Tensor
Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
return x
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
pos_emb : paddle.Tensor
Positional embedding tensor
(#batch, 2*time1-1, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or
(#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
# (batch, head, 2*time1-1, d_k)
p = p.transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
# compute matrix b and matrix d
# (batch, head, time1, 2*time1-1)
matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
matrix_bd = self.rel_shift(matrix_bd)
# (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
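# A standalone sketch of the pad-reshape-slice trick in rel_shift above: scores
# laid out over 2*time1-1 relative offsets become the usual (time1, time2) grid.
# Sizes are toy assumptions.
import paddle

b, h, t = 1, 1, 3
x = paddle.arange(b * h * t * (2 * t - 1), dtype="float32")
x = x.reshape([b, h, t, 2 * t - 1])                 # (batch, head, time1, 2*time1-1)
zero_pad = paddle.zeros((b, h, t, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)    # prepend one zero column
x_padded = x_padded.reshape([b, h, 2 * t, t])       # reinterpret the memory layout
shifted = x_padded[:, :, 1:].reshape([b, h, t, 2 * t - 1])[:, :, :, :t]
print(shifted.shape)                                # [1, 1, 3, 3]: one score per (i, j)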

@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
dtype : str
dtype of param
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
T = paddle.shape(x)[1]
x = x + self.alpha * self.pe[:, :T]
return self.dropout(x)
class RelPositionalEncoding(paddle.nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
"""Construct an PositionalEncoding object."""
super(RelPositionalEncoding, self).__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = paddle.nn.Dropout(p=dropout_rate)
self.pe = None
self.dtype = dtype
self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len)))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1:
return
# Suppose `i` is the position of the query vector and `j` is the
# position of the key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
x_shape = paddle.shape(x)
pe_positive = paddle.zeros([x_shape[1], self.d_model])
pe_negative = paddle.zeros([x_shape[1], self.d_model])
position = paddle.arange(0, x_shape[1], dtype=self.dtype).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
-(math.log(10000.0) / self.d_model))
pe_positive[:, 0::2] = paddle.sin(position * div_term)
pe_positive[:, 1::2] = paddle.cos(position * div_term)
pe_negative[:, 0::2] = paddle.sin(-1 * position * div_term)
pe_negative[:, 1::2] = paddle.cos(-1 * position * div_term)
# Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive = paddle.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = paddle.concat([pe_positive, pe_negative], axis=1)
self.pe = pe
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
return self.dropout(x), self.dropout(pos_emb)
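# Usage sketch for RelPositionalEncoding above: it returns the scaled input plus
# a position embedding spanning 2*T-1 relative offsets (T-1 down to -(T-1)).
# Sizes are assumptions; the import path is taken from this diff's import list.
import paddle
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding

pos_enc = RelPositionalEncoding(d_model=8, dropout_rate=0.0)
x = paddle.randn([2, 5, 8])          # (batch, T, d_model)
y, pos_emb = pos_enc(x)
print(y.shape, pos_emb.shape)        # [2, 5, 8] and [1, 9, 8]; 2*T-1 = 9 offsets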

@ -12,15 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union
from paddle import nn
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(nn.Layer):
@ -46,9 +57,6 @@ class Encoder(nn.Layer):
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding` or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
@ -60,98 +68,137 @@ class Encoder(nn.Layer):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes the
return type signature).
encoder_type: str
"transformer", or "conformer".
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
pos_enc_class=PositionalEncoding,
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
selfattention_layer_type="selfattn",
padding_idx=-1, ):
def __init__(self,
idim: int,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None,
encoder_type: str="transformer"):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
super().__init__()
activation = get_activation(activation_type)
pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
selfattention_layer_type)
self.encoder_type = encoder_type
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = nn.Sequential(
nn.Linear(idim, attention_dim, bias_attr=True),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.embed = self.get_embed(
idim=idim,
input_layer=input_layer,
attention_dim=attention_dim,
pos_enc_class=pos_enc_class,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
padding_idx=padding_idx)
self.normalize_before = normalize_before
# self-attention module definition
encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
selfattention_layer_type=selfattention_layer_type,
attention_heads=attention_heads,
attention_dim=attention_dim,
attention_dropout_rate=attention_dropout_rate,
zero_triu=zero_triu,
pos_enc_layer_type=pos_enc_layer_type)
# feed-forward module definition
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type,
attention_dim,
linear_units,
dropout_rate,
positionwise_conv_kernel_size, )
if selfattention_layer_type in [
"selfattn",
"rel_selfattn",
"legacy_rel_selfattn",
]:
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )
] * num_blocks
positionwise_layer_type, attention_dim, linear_units, dropout_rate,
positionwise_conv_kernel_size, activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
if self.encoder_type == "transformer":
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
elif self.encoder_type == "conformer":
self.encoders = repeat(
num_blocks,
lambda lnum: ConformerEncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.intermediate_layers = intermediate_layers
else:
raise NotImplementedError(selfattention_layer_type)
raise NotImplementedError("Support only linear or conv1d.")
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
if self.normalize_before:
self.after_norm = nn.LayerNorm(attention_dim)
def get_positionwise_layer(
self,
positionwise_layer_type="linear",
attention_dim=256,
linear_units=2048,
dropout_rate=0.1,
positionwise_conv_kernel_size=1, ):
self.after_norm = LayerNorm(attention_dim)
def get_positionwise_layer(self,
positionwise_layer_type: str="linear",
attention_dim: int=256,
linear_units: int=2048,
dropout_rate: float=0.1,
positionwise_conv_kernel_size: int=1,
activation: nn.Layer=nn.ReLU()):
"""Define positionwise layer."""
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate)
dropout_rate, activation)
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
@ -166,6 +213,81 @@ class Encoder(nn.Layer):
raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args
def get_encoder_selfattn_layer(self,
selfattention_layer_type: str="selfattn",
attention_heads: int=4,
attention_dim: int=256,
attention_dropout_rate: float=0.0,
zero_triu: bool=False,
pos_enc_layer_type: str="abs_pos"):
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
return encoder_selfattn_layer, encoder_selfattn_layer_args
def get_pos_enc_class(self,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn"):
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
return pos_enc_class
def get_embed(self,
idim,
input_layer="conv2d",
attention_dim: int=256,
pos_enc_class=PositionalEncoding,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
padding_idx: int=-1):
if input_layer == "linear":
embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
return embed
def forward(self, xs, masks):
"""Encode input sequence.
@ -174,21 +296,55 @@ class Encoder(nn.Layer):
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
"""
if self.encoder_type == "transformer":
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
elif self.encoder_type == "conformer":
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
else:
raise ValueError(f"{self.encoder_type} is not supported.")
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.

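# A hedged construction sketch for the unified encoder above: one class, switched
# by encoder_type. The import path and every hyperparameter value here are
# assumptions for illustration only.
import paddle
from paddlespeech.t2s.modules.transformer.encoder import Encoder  # assumed path

enc = Encoder(
    idim=80,
    attention_dim=32,
    attention_heads=2,
    linear_units=64,
    num_blocks=2,
    input_layer="linear",
    encoder_type="conformer",
    pos_enc_layer_type="rel_pos",              # must pair with rel_selfattn
    selfattention_layer_type="rel_selfattn",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=7)
xs = paddle.randn([2, 20, 80])                 # (#batch, time, idim)
masks = paddle.ones([2, 1, 20], dtype="bool")  # (#batch, 1, time)
out, out_masks = enc(xs, masks)                # (#batch, time, attention_dim)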
@ -18,38 +18,6 @@ import paddle
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class TooShortUttError(Exception):
"""Raised when the utt is too short for subsampling.
Parameters
----------
message : str
Message for error catch
actual_size : int
the short size that cannot pass the subsampling
limit : int
the limit size for subsampling
"""
def __init__(self, message, actual_size, limit):
"""Construct a TooShortUttError for error handler."""
super().__init__(message)
self.actual_size = actual_size
self.limit = limit
def check_short_utt(ins, size):
"""Check if the utterance is too short for subsampling."""
if isinstance(ins, Conv2dSubsampling2) and size < 3:
return True, 3
if isinstance(ins, Conv2dSubsampling) and size < 7:
return True, 7
if isinstance(ins, Conv2dSubsampling6) and size < 11:
return True, 11
if isinstance(ins, Conv2dSubsampling8) and size < 15:
return True, 15
return False, -1
class Conv2dSubsampling(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
Parameters
@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling2(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/2 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling2 object."""
super(Conv2dSubsampling2, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 1),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 2.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 2.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:1]
def __getitem__(self, key):
"""Get item.
When reset_parameters() is called, if use_scaled_pos_enc is used,
return the positioning encoding.
"""
if key != -1:
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling6(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/6 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling6 object."""
super(Conv2dSubsampling6, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 5, 3),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 6.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 6.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-4:3]
class Conv2dSubsampling8(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/8 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling8 object."""
super(Conv2dSubsampling8, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 8.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 8.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
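# The strided mask slices above mirror the conv length reduction; emulating them
# with plain ranges makes the factors easy to verify (T = 100 is an assumed length):
T = 100
half = len(range(T)[:-2:2][:-2:1])             # Conv2dSubsampling2 -> ~T/2
sixth = len(range(T)[:-2:2][:-4:3])            # Conv2dSubsampling6 -> ~T/6
eighth = len(range(T)[:-2:2][:-2:2][:-2:2])    # Conv2dSubsampling8 -> ~T/8
print(half, sixth, eighth)                     # 47 15 11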
