parent 4a28751df0
commit 3d5e078c91
@@ -0,0 +1,139 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.

# This configuration is based on full-band MelGAN but the hop size and sampling
# rate are different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper, so we currently train for 1M iterations (not sure
# that is enough to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000                # Sampling rate.
n_fft: 2048              # FFT size. (in samples)
n_shift: 300             # Hop size. (in samples)
win_length: 1200         # Window length. (in samples)
                         # If set to null, it will be the same as n_fft.
window: "hann"           # Window function.
n_mels: 80               # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation. (Hz)
fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                  # Number of input channels.
    out_channels: 4                  # Number of output channels.
    kernel_size: 7                   # Kernel size of initial and final conv layers.
    channels: 384                    # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3]       # List of upsampling scales.
    stack_kernel_size: 3             # Kernel size of dilated conv layers in residual stack.
    stacks: 4                        # Number of stacks in a single residual stack module.
    use_weight_norm: True            # Whether to use weight normalization.
    use_causal_conv: False           # Whether to use causal convolution.
    use_final_nonlinear_activation: True

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        exclusive: True
    kernel_sizes: [5, 3]              # List of kernel sizes.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of the nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: true
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171]    # List of FFT sizes for STFT-based loss.
    hop_sizes: [30, 60, 10]       # List of hop sizes for STFT-based loss.
    win_lengths: [150, 300, 60]   # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#                ADVERSARIAL LOSS SETTING                 #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 64          # Batch size.
batch_max_steps: 16200  # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 2          # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-7         # Generator's epsilon.
    weight_decay: 0.0       # Generator's weight decay coefficient.

generator_grad_norm: -1     # Generator's gradient norm.
generator_scheduler_params:
    learning_rate: 1.0e-3   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_params:
    epsilon: 1.0e-7         # Discriminator's epsilon.
    weight_decay: 0.0       # Discriminator's weight decay coefficient.

discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
    learning_rate: 1.0e-3   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start training the discriminator.
train_max_steps: 1200000                # Number of training steps.
save_interval_steps: 1000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_snapshots: 10 # Max number of snapshots to keep while training.
seed: 42          # Random seed for paddle, random, and np.random.
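`batch_max_steps` must be divisible by the hop size (`n_shift`) so that every training clip corresponds to a whole number of mel frames. A minimal sketch of that arithmetic with the values above (illustrative, not part of the recipe):

```python
# Each training clip must cover a whole number of mel frames.
fs = 24000               # sampling rate (Hz)
n_shift = 300            # hop size (samples)
batch_max_steps = 16200  # samples per clip in a batch

assert batch_max_steps % n_shift == 0, "must be divisible by the hop size"
frames_per_clip = batch_max_steps // n_shift  # 54 mel frames per clip
clip_seconds = batch_max_steps / fs           # 0.675 s of audio per clip
print(frames_per_clip, clip_seconds)
```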
@@ -0,0 +1,63 @@
#!/bin/bash

source path.sh

gpus=0
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 local/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi
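In stage 3, dev and test are deliberately normalized with the training set's statistics so that all three splits share one feature scale. Since `feats_stats.npy` holds the per-dimension mean and standard deviation, the normalization amounts to a z-score:

```latex
z = \frac{x - \mu_{\text{train}}}{\sigma_{\text{train}}}
```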
@@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Link wave files from the original dump directory and rebuild metadata.")

    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory of the original dumped feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory of the finetune dumped feature files.")
    args = parser.parse_args()

    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()

    for sub in ["train", "dev", "test"]:
        # symlink the *-wave.npy files in old_dump_dir to the corresponding
        # locations in dump_dir
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # e.g. 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)

            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()
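Each line of the `metadata.jsonl` written above is a self-contained record for one utterance. A minimal sketch of reading it back (the path is illustrative):

```python
import jsonlines

# Each line is a dict like:
# {"utt_id": "003918", "num_samples": ..., "num_frames": ...,
#  "feats": ".../003918_feats.npy", "wave": ".../003918_wave.npy"}
with jsonlines.open("dump_finetune/train/raw/metadata.jsonl") as reader:
    for record in reader:
        print(record["utt_id"], record["num_frames"])
```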
@@ -0,0 +1,167 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# TODO: what to do when the length differs from the original mel?
import argparse
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config):

    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    phone_dict = {}
    for phn, idx in phn_id:
        phone_dict[phn] = int(idx)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        # trim the leading and trailing "sil"
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]
            # sentences[utt_id][0] = phones
            # sentences[utt_id][1] = durations

        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # the generated mel may differ from the real one by 1 or 2 frames,
        # but batch_fn will fix that
        # split data into 3 sections
        if args.dataset == "baker":
            num_train = 9800
            num_dev = 100
        if i in range(0, num_train):
            sub_output_dir = output_dir / ("train/raw")
        elif i in range(num_train, num_train + num_dev):
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)
        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Generate GTA mels with FastSpeech2 for finetuning.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, ljspeech, vctk} now")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )

    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")

    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut sil at the edges of the audio")

    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)

    evaluate(args, fastspeech2_config)


if __name__ == "__main__":
    main()
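The 1-2 frame mismatch mentioned in the loop comment comes from rounding between sample counts and frame counts: `num_samples` is only approximately `num_frames * n_shift`. A hypothetical helper (not the repo's `batch_fn`) illustrating the kind of repair applied at batching time:

```python
import numpy as np


def align_wave_and_mel(wav, mel, hop_size=300):
    """Trim so the wave covers exactly len(mel) frames (illustrative only)."""
    n_frames = min(len(mel), len(wav) // hop_size)
    return wav[:n_frames * hop_size], mel[:n_frames]


# A mel of 54 frames vs. a wave that is 2 frames short: both end up at 52 frames.
wav = np.zeros(52 * 300)
mel = np.zeros((54, 80))
wav_t, mel_t = align_wave_and_mel(wav, mel)
assert len(wav_t) == len(mel_t) * 300
```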
@@ -1,348 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

    Assume that q, k, v all have the same leading dimensions (denoted as * in
    descriptions below). Dropout is applied to attention weights before
    the weighted sum of values.

    Parameters
    -----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights


def drop_head(x, drop_n_heads, training=True):
    """Drop n context vectors from multiple ones.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
        The input, multiple context vectors.
    drop_n_heads : int [0 <= drop_n_heads <= num_heads]
        Number of vectors to drop.
    training : bool
        A flag indicating whether it is in training. If `False`, no dropout is
        applied.

    Returns
    -------
    Tensor
        The output.
    """
    if not training or (drop_n_heads == 0):
        return x

    batch_size, num_heads, _, _ = x.shape
    # drop all heads
    if num_heads == drop_n_heads:
        return paddle.zeros_like(x)

    mask = np.ones([batch_size, num_heads])
    mask[:, :drop_n_heads] = 0
    for subarray in mask:
        np.random.shuffle(subarray)
    scale = float(num_heads) / (num_heads - drop_n_heads)
    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
    out = x * paddle.to_tensor(mask)
    return out


def _split_heads(x, num_heads):
    batch_size, time_steps, _ = x.shape
    x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])
    x = paddle.transpose(x, [0, 2, 1, 3])
    return x


def _concat_heads(x):
    batch_size, _, time_steps, _ = x.shape
    x = paddle.transpose(x, [0, 2, 1, 3])
    x = paddle.reshape(x, [batch_size, time_steps, -1])
    return x


# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
    """Monohead Attention module.

    Parameters
    ----------
    model_dim : int
        Feature size of the query.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to `model_dim`. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to `model_dim`. Defaults to None.
    """

    def __init__(self,
                 model_dim: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MonoheadAttention, self).__init__()
        k_dim = k_dim or model_dim
        v_dim = v_dim or model_dim
        self.affine_q = nn.Linear(model_dim, k_dim)
        self.affine_k = nn.Linear(model_dim, k_dim)
        self.affine_v = nn.Linear(model_dim, v_dim)
        self.affine_o = nn.Linear(v_dim, model_dim)

        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = self.affine_q(q)  # (B, T, C)
        k = self.affine_k(k)
        v = self.affine_v(v)

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)

        out = self.affine_o(context_vectors)
        return out, attention_weights


class MultiheadAttention(nn.Layer):
    """Multihead Attention module.

    Parameters
    -----------
    model_dim: int
        The feature size of query.
    num_heads : int
        The number of attention heads.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.

    Raises
    ---------
    ValueError
        If ``model_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 model_dim: int,
                 num_heads: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MultiheadAttention, self).__init__()
        if model_dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        depth = model_dim // num_heads
        k_dim = k_dim or depth
        v_dim = v_dim or depth
        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)

        self.num_heads = num_heads
        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = _split_heads(self.affine_q(q), self.num_heads)  # (B, h, T, C)
        k = _split_heads(self.affine_k(k), self.num_heads)
        v = _split_heads(self.affine_v(v), self.num_heads)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for the h dim

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)
        # NOTE: there is a more sophisticated implementation: Scheduled DropHead
        context_vectors = _concat_heads(context_vectors)  # (B, T, h*C)
        out = self.affine_o(context_vectors)
        return out, attention_weights


class LocationSensitiveAttention(nn.Layer):
    """Location Sensitive Attention module.

    Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

    Parameters
    -----------
    d_query: int
        The feature size of query.
    d_key : int
        The feature size of key.
    d_attention : int
        The feature size of the attention representation.
    location_filters : int
        Filter size of attention convolution.
    location_kernel_size : int
        Kernel size of attention convolution.
    """

    def __init__(self,
                 d_query: int,
                 d_key: int,
                 d_attention: int,
                 location_filters: int,
                 location_kernel_size: int):
        super().__init__()

        self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
        self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
        self.value = nn.Linear(d_attention, 1, bias_attr=False)

        # Location Layer
        self.location_conv = nn.Conv1D(
            2,
            location_filters,
            kernel_size=location_kernel_size,
            padding=int((location_kernel_size - 1) / 2),
            bias_attr=False,
            data_format='NLC')
        self.location_layer = nn.Linear(
            location_filters, d_attention, bias_attr=False)

    def forward(self,
                query,
                processed_key,
                value,
                attention_weights_cat,
                mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        query : Tensor [shape=(batch_size, d_query)]
            The queries.
        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
            The keys after the linear layer.
        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
            The values.
        attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
            Concatenated attention weights.
        mask : Tensor, optional
            The mask. Shape should be (batch_size, time_steps_k, 1).
            Defaults to None.

        Returns
        ----------
        attention_context : Tensor [shape=(batch_size, d_key)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_k)]
            The attention weights.
        """

        processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
        processed_attention_weights = self.location_layer(
            self.location_conv(attention_weights_cat))
        # (B, T_enc, 1)
        alignment = self.value(
            paddle.tanh(processed_attention_weights + processed_key +
                        processed_query))

        if mask is not None:
            alignment = alignment + (1.0 - mask) * -1e9

        attention_weights = F.softmax(alignment, axis=1)
        attention_context = paddle.matmul(
            attention_weights, value, transpose_x=True)

        attention_weights = paddle.squeeze(attention_weights, axis=-1)
        attention_context = paddle.squeeze(attention_context, axis=1)

        return attention_context, attention_weights
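For reference, `scaled_dot_product_attention` in the file above implements the standard formula, with the mask folded in as a large negative bias on padded positions before the softmax (the hard-coded `-1e9`):

```latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d}} - 10^{9}\,(1 - M)\right) V
```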
@@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from paddle import nn


class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.

    Parameters
    ----------
    channels : int
        The number of channels of conv layers.
    kernel_size : int
        Kernel size of conv layers.
    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
        """Construct a ConvolutionModule object."""
        super().__init__()
        # kernel_size should be an odd number for 'SAME' padding
        assert (kernel_size - 1) % 2 == 0

        self.pointwise_conv1 = nn.Conv1D(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.depthwise_conv = nn.Conv1D(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=channels,
            bias_attr=bias, )
        self.norm = nn.BatchNorm1D(channels)
        self.pointwise_conv2 = nn.Conv1D(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.activation = activation

    def forward(self, x):
        """Compute convolution module.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, channels).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
        x = nn.functional.glu(x, axis=1)  # (batch, channels, time)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        x = self.activation(self.norm(x))

        x = self.pointwise_conv2(x)

        return x.transpose([0, 2, 1])
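A minimal usage sketch for `ConvolutionModule` (the module preserves the time axis; the import path assumes this file lands at `paddlespeech/t2s/modules/conformer/convolution.py`, which matches the encoder's import below):

```python
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule

# kernel_size must be odd for 'SAME' padding; 31 is the Conformer default below.
conv = ConvolutionModule(channels=256, kernel_size=31)
x = paddle.randn([2, 100, 256])  # (#batch, time, channels)
y = conv(x)
print(y.shape)  # [2, 100, 256]
```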
@@ -0,0 +1,274 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging

import paddle

from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


class Encoder(paddle.nn.Layer):
    """Conformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        If True, an additional linear layer will be applied,
        i.e. x -> x + linear(concat(x, att(x)));
        if False, no additional linear will be applied, i.e. x -> x + att(x).
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of the attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        Indices of intermediate CTC layers. Indices start from 1.
        If not None, intermediate outputs are returned (which changes
        the return type signature).
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            macaron_style=False,
            pos_enc_layer_type="abs_pos",
            selfattention_layer_type="selfattn",
            activation_type="swish",
            use_cnn_module=False,
            zero_triu=False,
            cnn_module_kernel=31,
            padding_idx=-1,
            stochastic_depth_rate=0.0,
            intermediate_layers=None, ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Linear(idim, attention_dim),
                paddle.nn.LayerNorm(attention_dim),
                paddle.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4

        elif input_layer == "embed":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Embedding(
                    idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, paddle.nn.Layer):
            self.embed = paddle.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = paddle.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info(
                "encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

        self.intermediate_layers = intermediate_layers

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(self.embed, Conv2dSubsampling):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                if (self.intermediate_layers is not None and
                        layer_idx + 1 in self.intermediate_layers):
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks
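A sketch of constructing and running the encoder above in a conformer-style configuration; the keyword values are illustrative, and the import path is assumed from the module imports in this file:

```python
import paddle
from paddlespeech.t2s.modules.conformer.encoder import Encoder

# "linear" input layer keeps the time axis (no conv2d subsampling).
encoder = Encoder(
    idim=80,
    attention_dim=256,
    attention_heads=4,
    num_blocks=2,
    input_layer="linear",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=31,
    pos_enc_layer_type="rel_pos",
    selfattention_layer_type="rel_selfattn", )
xs = paddle.randn([2, 50, 80])                 # (#batch, time, idim)
masks = paddle.ones([2, 1, 50], dtype="bool")  # (#batch, 1, time), no padding
out, out_masks = encoder(xs, masks)
print(out.shape)  # [2, 50, 256]
```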
@ -0,0 +1,196 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from espnet(https://github.com/espnet/espnet)
|
||||||
|
"""Encoder self-attention layer definition."""
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from paddlespeech.t2s.modules.layer_norm import LayerNorm
|
||||||
|
|
||||||
|
|
||||||
|
class EncoderLayer(nn.Layer):
|
||||||
|
"""Encoder layer module.
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
size : int
|
||||||
|
Input dimension.
|
||||||
|
self_attn : paddle.nn.Layer
|
||||||
|
Self-attention module instance.
|
||||||
|
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
||||||
|
can be used as the argument.
|
||||||
|
feed_forward : paddle.nn.Layer
|
||||||
|
Feed-forward module instance.
|
||||||
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||||
|
can be used as the argument.
|
||||||
|
feed_forward_macaron : paddle.nn.Layer
|
||||||
|
Additional feed-forward module instance.
|
||||||
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||||
|
can be used as the argument.
|
||||||
|
conv_module : paddle.nn.Layer
|
||||||
|
Convolution module instance.
|
||||||
|
`ConvlutionModule` instance can be used as the argument.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
normalize_before : bool
|
||||||
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
|
if True, additional linear will be applied.
|
||||||
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
|
stochastic_depth_rate : float
|
||||||
|
Proability to skip this layer.
|
||||||
|
During training, the layer may skip residual computation and return input
|
||||||
|
as-is with given probability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
size,
|
||||||
|
self_attn,
|
||||||
|
feed_forward,
|
||||||
|
feed_forward_macaron,
|
||||||
|
conv_module,
|
||||||
|
dropout_rate,
|
||||||
|
normalize_before=True,
|
||||||
|
concat_after=False,
|
||||||
|
stochastic_depth_rate=0.0, ):
|
||||||
|
"""Construct an EncoderLayer object."""
|
||||||
|
super(EncoderLayer, self).__init__()
|
||||||
|
self.self_attn = self_attn
|
||||||
|
self.feed_forward = feed_forward
|
||||||
|
self.feed_forward_macaron = feed_forward_macaron
|
||||||
|
self.conv_module = conv_module
|
||||||
|
self.norm_ff = LayerNorm(size) # for the FNN module
|
||||||
|
self.norm_mha = LayerNorm(size) # for the MHA module
|
||||||
|
if feed_forward_macaron is not None:
|
||||||
|
self.norm_ff_macaron = LayerNorm(size)
|
||||||
|
self.ff_scale = 0.5
|
||||||
|
else:
|
||||||
|
self.ff_scale = 1.0
|
||||||
|
if self.conv_module is not None:
|
||||||
|
self.norm_conv = LayerNorm(size) # for the CNN module
|
||||||
|
self.norm_final = LayerNorm(
|
||||||
|
size) # for the final output of the block
|
||||||
|
self.dropout = nn.Dropout(dropout_rate)
|
||||||
|
self.size = size
|
||||||
|
self.normalize_before = normalize_before
|
||||||
|
self.concat_after = concat_after
|
||||||
|
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)
        self.stochastic_depth_rate = stochastic_depth_rate

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Parameters
        ----------
        x_input : Union[Tuple, paddle.Tensor]
            Input tensor w/ or w/o pos emb.
            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
            - w/o pos emb: Tensor (#batch, time, size).
        mask : paddle.Tensor
            Mask tensor for the input (#batch, time).
        cache : paddle.Tensor
            Cache tensor of the input (#batch, time - 1, size).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, size).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
        else:
            x, pos_emb = x_input, None

        skip_layer = False
        # with stochastic depth, residual connection `x + f(x)` becomes
        # `x <- x + 1 / (1 - p) * f(x)` at training time.
        stoch_layer_coeff = 1.0
        if self.training and self.stochastic_depth_rate > 0:
            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)

        if skip_layer:
            if cache is not None:
                x = paddle.concat([cache, x], axis=1)
            if pos_emb is not None:
                return (x, pos_emb), mask
            return x, mask

        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)

        if cache is None:
            x_q = x
        else:
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if pos_emb is not None:
            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
        else:
            x_att = self.self_attn(x_q, x, x, mask)

        if self.concat_after:
            x_concat = paddle.concat((x, x_att), axis=-1)
            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
        else:
            x = residual + stoch_layer_coeff * self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
            self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        if cache is not None:
            x = paddle.concat([cache, x], axis=1)

        if pos_emb is not None:
            return (x, pos_emb), mask

        return x, mask
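For clarity, a minimal self-contained sketch of the stochastic-depth rule used in the forward pass above; the helper name `stochastic_residual` is hypothetical and `f` stands in for any residual branch (feed-forward, attention, or convolution):

    import paddle

    def stochastic_residual(x, f, p, training):
        # At training time, drop the whole branch with probability p; when
        # the branch is kept, rescale it by 1/(1-p) so the expected output
        # matches the inference-time computation x + f(x).
        if training and p > 0:
            if paddle.rand([1]).item() < p:
                return x
            return x + (1.0 / (1 - p)) * f(x)
        return x + f(x)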
@ -1,208 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F

from paddlespeech.t2s.modules import attention as attn

__all__ = [
    "PositionwiseFFN",
    "TransformerEncoderLayer",
    "TransformerDecoderLayer",
]


class PositionwiseFFN(nn.Layer):
    """A faithful implementation of the Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
    It is basically a 2-layer MLP, with relu activation and dropout in between.

    Parameters
    ----------
    input_size: int
        The feature size of the input. It is also the feature size of the
        output.
    hidden_size: int
        The hidden size.
    dropout: float
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.dropout = nn.Dropout(dropout)

        self.input_size = input_size
        self.hidden_size = hidden_size

    def forward(self, x):
        r"""Forward pass of positionwise feed forward network.

        Parameters
        ----------
        x : Tensor [shape=(\*, input_size)]
            The input tensor, where ``\*`` means arbitrary shape.

        Returns
        -------
        Tensor [shape=(\*, input_size)]
            The output tensor.
        """
        l1 = self.dropout(F.relu(self.linear1(x)))
        l2 = self.linear2(l1)
        return l2
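A minimal usage sketch for PositionwiseFFN (shapes are illustrative assumptions); the layer maps the trailing feature dimension through the two linear layers and back, so the output shape equals the input shape:

    import paddle

    ffn = PositionwiseFFN(input_size=256, hidden_size=1024, dropout=0.1)
    x = paddle.randn([8, 100, 256])  # any (*, input_size) shape
    y = ffn(x)                       # shape preserved: [8, 100, 256]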
class TransformerEncoderLayer(nn.Layer):
    """A faithful implementation of the Transformer encoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of self attention (a ``MultiheadAttention``
        layer).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, x, mask):
        """Forward pass of TransformerEncoderLayer.

        Parameters
        ----------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The input.
        mask : Tensor
            The padding mask. The shape is (batch_size, time_steps,
            time_steps) or a broadcastable shape.

        Returns
        -------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The encoded output.

        attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
            The attention weights of the self attention.
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
            F.dropout(x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
            F.dropout(x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights


class TransformerDecoderLayer(nn.Layer):
    """A faithful implementation of the Transformer decoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of attentions (``MultiheadAttention``
        layers).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerDecoderLayer.

        Parameters
        ----------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder input.
        k : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The values.
        encoder_mask : Tensor
            Encoder padding mask, shape is ``(batch_size, time_steps_k,
            time_steps_k)`` or a broadcastable shape.
        decoder_mask : Tensor
            Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
            or a broadcastable shape.

        Returns
        --------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder output.
        self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
            Decoder self attention.

        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
        q = self.layer_norm1(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
                                                            encoder_mask)
        q = self.layer_norm2(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
            F.dropout(q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
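A minimal usage sketch for the PostLN encoder layer above (batch size, sequence length, and model width are illustrative assumptions):

    import paddle

    layer = TransformerEncoderLayer(d_model=256, n_heads=4, d_ffn=1024, dropout=0.1)
    x = paddle.randn([8, 100, 256])    # (batch_size, time_steps, d_model)
    mask = paddle.ones([8, 100, 100])  # (batch_size, time_steps, time_steps)
    y, attn_weights = layer(x, mask)   # y: [8, 100, 256]; attn: [8, 4, 100, 100]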
@ -0,0 +1,291 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
# Conv2dSubsampling tests passed
"""Subsampling layer definition."""
import paddle

from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding


class TooShortUttError(Exception):
    """Raised when the utterance is too short for subsampling.

    Parameters
    ----------
    message : str
        Message for error catch
    actual_size : int
        The short size that cannot pass the subsampling
    limit : int
        The limit size for subsampling
    """

    def __init__(self, message, actual_size, limit):
        """Construct a TooShortUttError for the error handler."""
        super().__init__(message)
        self.actual_size = actual_size
        self.limit = limit


def check_short_utt(ins, size):
    """Check if the utterance is too short for subsampling."""
    if isinstance(ins, Conv2dSubsampling2) and size < 3:
        return True, 3
    if isinstance(ins, Conv2dSubsampling) and size < 7:
        return True, 7
    if isinstance(ins, Conv2dSubsampling6) and size < 11:
        return True, 11
    if isinstance(ins, Conv2dSubsampling8) and size < 15:
        return True, 15
    return False, -1
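A hedged sketch of the guard pattern these helpers support in a caller; `subsampler` and `feats` are assumed names used only for illustration:

    # feats: (#batch, time, idim). Reject inputs too short to survive the
    # strided convolutions before they produce empty tensors.
    short, limit = check_short_utt(subsampler, feats.shape[1])
    if short:
        raise TooShortUttError(
            f"input has {feats.shape[1]} frames, but subsampling needs "
            f"at least {limit}", feats.shape[1], limit)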
class Conv2dSubsampling(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling object."""
        super(Conv2dSubsampling, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 4.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 4.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        # x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:2]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positional encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]


class Conv2dSubsampling2(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/2 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling2 object."""
        super(Conv2dSubsampling2, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 1),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 2.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 2.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:1]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positional encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]


class Conv2dSubsampling6(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/6 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling6 object."""
        super(Conv2dSubsampling6, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 5, 3),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 6.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 6.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-4:3]


class Conv2dSubsampling8(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/8 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling8 object."""
        super(Conv2dSubsampling8, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 8.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 8.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
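A minimal usage sketch of the 1/4-length subsampler (shapes are illustrative assumptions): two stride-2, kernel-3 convolutions shrink 100 input frames to ((100 - 1) // 2 - 1) // 2 = 24, and the mask is decimated with the same arithmetic:

    import paddle

    sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
    x = paddle.randn([4, 100, 80])     # (#batch, time, idim)
    x_mask = paddle.ones([4, 1, 100])  # (#batch, 1, time)
    y, y_mask = sub(x, x_mask)
    print(y.shape, y_mask.shape)       # [4, 24, 256] [4, 1, 24]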
@ -0,0 +1,20 @@
# Install conda dependencies
conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes

# Install the python lib
pip install -r requirements.txt

# Install the auto_log
pushd tools/extras
bash install_autolog.sh
popd

# Install the ctcdecoder
pushd paddlespeech/s2t/decoders/ctcdecoder/swig
bash -e setup.sh
popd

# Install the python_speech_features
pushd third_party
bash -e install.sh
popd
@ -1,345 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import argparse
import json
import re
import traceback


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--filename", type=str, help="The name of the log to analyze.")
    parser.add_argument(
        "--log_with_profiler",
        type=str,
        help="The path of the train log with profiler")
    parser.add_argument(
        "--profiler_path", type=str, help="The path of the profiler timeline log.")
    parser.add_argument(
        "--keyword", type=str, help="Keyword to specify analysis data")
    parser.add_argument(
        "--separator",
        type=str,
        default=None,
        help="Separator of different fields in the log")
    parser.add_argument(
        '--position', type=int, default=None, help='The position of the data field')
    parser.add_argument(
        '--range',
        type=str,
        default="",
        help='The range of the data field to intercept')
    parser.add_argument(
        '--base_batch_size', type=int, help='base batch size on gpu')
    parser.add_argument(
        '--skip_steps',
        type=int,
        default=0,
        help='The number of steps to be skipped')
    parser.add_argument(
        '--model_mode',
        type=int,
        default=-1,
        help='Analysis mode, default value is -1')
    parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit')
    parser.add_argument(
        '--model_name',
        type=str,
        default=0,
        help='training model_name, e.g. transformer_base')
    parser.add_argument(
        '--mission_name', type=str, default=0, help='training mission name')
    parser.add_argument(
        '--direction_id', type=int, default=0, help='training direction_id')
    parser.add_argument(
        '--run_mode',
        type=str,
        default="sp",
        help='multi process or single process')
    parser.add_argument(
        '--index',
        type=int,
        default=1,
        help='{1: speed, 2: mem, 3: profiler, 6: max_batch_size}')
    parser.add_argument(
        '--gpu_num', type=int, default=1, help='number of training gpus')
    parser.add_argument(
        '--use_num', type=int, default=1, help='number of records to use')
    args = parser.parse_args()
    args.separator = None if args.separator == "None" else args.separator
    return args


def _is_number(num):
    pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
    result = pattern.match(num)
    if result:
        return True
    else:
        return False


class TimeAnalyzer(object):
    def __init__(self,
                 filename,
                 keyword=None,
                 separator=None,
                 position=None,
                 range="-1"):
        if filename is None:
            raise Exception("Please specify the filename!")

        if keyword is None:
            raise Exception("Please specify the keyword!")

        self.filename = filename
        self.keyword = keyword
        self.separator = separator
        self.position = position
        self.range = range
        self.records = None
        self._distil()

    def _distil(self):
        self.records = []
        with open(self.filename, "r") as f_object:
            lines = f_object.readlines()
            for line in lines:
                if self.keyword not in line:
                    continue
                try:
                    result = None

                    # Distil the string from a line.
                    line = line.strip()
                    line_words = line.split(
                        self.separator) if self.separator else line.split()
                    print("line_words", line_words)
                    if self.position:
                        result = line_words[self.position]
                    else:
                        # Distil the string following the keyword.
                        for i in range(len(line_words) - 1):
                            if line_words[i] == self.keyword:
                                result = line_words[i + 1]
                                break

                    # Distil the result from the picked string.
                    if not self.range:
                        result = result[0:]
                    elif _is_number(self.range):
                        result = result[0:int(self.range)]
                    else:
                        result = result[int(self.range.split(":")[0]):int(
                            self.range.split(":")[1])]
                    self.records.append(float(result))
                except Exception:
                    pass
                    #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position))
        self.records.sort()
        self.records = self.records[:args.use_num]
        print("records", self.records)
        print("Extract {} records: separator={}; position={}".format(
            len(self.records), self.separator, self.position))

    def _get_fps(self,
                 mode,
                 batch_size,
                 gpu_num,
                 avg_of_records,
                 run_mode,
                 unit=None):
        if mode == -1 and run_mode == 'sp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records
        elif mode == -1 and run_mode == 'mp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records  # temporarily, not used now
            print("------------this is mp")
        elif mode == 0:
            # s/step -> samples/s
            fps = (batch_size * gpu_num) / avg_of_records
            unit = "samples/s"
        elif mode == 1:
            # steps/s -> steps/s
            fps = avg_of_records
            unit = "steps/s"
        elif mode == 2:
            # s/step -> steps/s
            fps = 1 / avg_of_records
            unit = "steps/s"
        elif mode == 3:
            # steps/s -> samples/s
            fps = batch_size * gpu_num * avg_of_records
            unit = "samples/s"
        elif mode == 4:
            # s/epoch -> s/epoch
            fps = avg_of_records
            unit = "s/epoch"
        else:
            raise ValueError("Unsupported analysis mode.")

        return fps, unit

    def analysis(self,
                 batch_size,
                 gpu_num=1,
                 skip_steps=0,
                 mode=-1,
                 run_mode='sp',
                 unit=None):
        if batch_size <= 0:
            print("base_batch_size should be larger than 0.")
            return 0, ''

        if len(
                self.records
        ) <= skip_steps:  # to handle the case where the number of log items equals skip_steps
            print("no records")
            return 0, ''

        sum_of_records = 0
        sum_of_records_skipped = 0
        skip_min = self.records[skip_steps]
        skip_max = self.records[skip_steps]

        count = len(self.records)
        for i in range(count):
            sum_of_records += self.records[i]
            if i >= skip_steps:
                sum_of_records_skipped += self.records[i]
                if self.records[i] < skip_min:
                    skip_min = self.records[i]
                if self.records[i] > skip_max:
                    skip_max = self.records[i]

        avg_of_records = sum_of_records / float(count)
        avg_of_records_skipped = sum_of_records_skipped / float(count -
                                                                skip_steps)

        fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records,
                                      run_mode, unit)
        fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num,
                                       avg_of_records_skipped, run_mode, unit)
        if mode == -1:
            print("average ips of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average ips of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
                print("\tMin: %.3f %s" % (skip_min, fps_unit))
                print("\tMax: %.3f %s" % (skip_max, fps_unit))
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 1 or mode == 3:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f steps/s" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
                print("\tMin: %.3f steps/s" % skip_min)
                print("\tMax: %.3f steps/s" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 0 or mode == 2:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f s/step" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f s/step" % avg_of_records_skipped)
                print("\tMin: %.3f s/step" % skip_min)
                print("\tMax: %.3f s/step" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

        return round(fps_skipped, 3), fps_unit


if __name__ == "__main__":
    args = parse_args()
    run_info = dict()
    run_info["log_file"] = args.filename
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    try:
        if args.index == 1:
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator,
                                    args.position, args.range)
            run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit)
            # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0:
            #     run_info["JOB_FAIL_FLAG"] = 1
        elif args.index == 3:
            run_info["FINAL_RESULT"] = {}
            records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 3, '').records
            records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 5).records
            records_ct_total = TimeAnalyzer(args.filename, 'Computation time',
                                            None, 3, '').records
            records_gm_total = TimeAnalyzer(args.filename,
                                            'GpuMemcpy Calls',
                                            None, 4, '').records
            records_gm_ratio = TimeAnalyzer(args.filename,
                                            'GpuMemcpy Calls',
                                            None, 6).records
            records_gmas_total = TimeAnalyzer(args.filename,
                                              'GpuMemcpyAsync Calls',
                                              None, 4, '').records
            records_gms_total = TimeAnalyzer(args.filename,
                                             'GpuMemcpySync Calls',
                                             None, 4, '').records
            run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[
                0] if records_fo_total else 0
            run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[
                0] if records_fo_ratio else 0
            run_info["FINAL_RESULT"][
                "ComputationTime_Total"] = records_ct_total[
                    0] if records_ct_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[
                0] if records_gm_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[
                0] if records_gm_ratio else 0
            run_info["FINAL_RESULT"][
                "GpuMemcpyAsync_Total"] = records_gmas_total[
                    0] if records_gmas_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[
                0] if records_gms_total else 0
        else:
            print("Not supported!")
    except Exception:
        traceback.print_exc()
    print("{}".format(json.dumps(run_info))
          )  # required: the log file path is inserted into the database
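For reference, a plausible invocation of this (now removed) script; the log path and keyword are illustrative assumptions, while the flags come from parse_args() above:

    python analysis.py \
        --filename train.log \
        --keyword "ips:" \
        --base_batch_size 64 \
        --skip_steps 10 \
        --model_mode -1 \
        --ips_unit samples/s \
        --run_mode sp \
        --gpu_num 1 \
        --index 1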