add conformer

pull/1015/head
TianYuan 3 years ago
parent 4a28751df0
commit 3d5e078c91

@ -154,7 +154,7 @@ If you want to try more functions like training and tuning, please see [Speech-t
## Model List
PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_models.md) with available pretrained models.
PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models.
Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details:
@ -344,4 +344,4 @@ year={2021}
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.

@ -13,7 +13,6 @@
# limitations under the License.
import argparse
from pathlib import Path
from typing import Union
import numpy as np
import paddle
@ -23,129 +22,12 @@ from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
class StyleFastSpeech2Inference(FastSpeech2Inference):
    """FastSpeech2 inference wrapper with controllable duration, pitch and energy.

    A first inference pass collects the model's own duration/pitch/energy
    predictions; these are then replaced or rescaled according to the
    arguments of ``forward`` and fed back in a second, teacher-forced pass.
    """

    def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
        super().__init__(normalizer, model)
        # Each stats file is unpacked into a (mean, std) pair.
        p_mean, p_std = np.load(pitch_stats_path)
        self.pitch_mean = paddle.to_tensor(p_mean)
        self.pitch_std = paddle.to_tensor(p_std)
        e_mean, e_std = np.load(energy_stats_path)
        self.energy_mean = paddle.to_tensor(e_mean)
        self.energy_std = paddle.to_tensor(e_std)

    def denorm(self, data, mean, std):
        """Undo z-score normalization: ``data * std + mean``."""
        return data * std + mean

    def norm(self, data, mean, std):
        """Apply z-score normalization: ``(data - mean) / std``."""
        return (data - mean) / std

    def forward(self,
                text: paddle.Tensor,
                durations: Union[paddle.Tensor, np.ndarray]=None,
                durations_scale: Union[int, float]=None,
                durations_bias: Union[int, float]=None,
                pitch: Union[paddle.Tensor, np.ndarray]=None,
                pitch_scale: Union[int, float]=None,
                pitch_bias: Union[int, float]=None,
                energy: Union[paddle.Tensor, np.ndarray]=None,
                energy_scale: Union[int, float]=None,
                energy_bias: Union[int, float]=None,
                robot: bool=False):
        """
        Parameters
        ----------
        text : Tensor(int64)
            Input sequence of characters (T,).
        durations : paddle.Tensor/np.ndarray, optional (int64)
            Groundtruth of duration (T,); takes precedence over
            durations_scale and durations_bias.
        durations_scale : int/float, optional
            Factor applied to the predicted durations.
        durations_bias : int/float, optional
            Offset added to the (scaled) predicted durations.
        pitch : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged pitch (T, 1); takes precedence over
            pitch_scale and pitch_bias.
        pitch_scale : int/float, optional
            Applied in the denormed Hz domain.
        pitch_bias : int/float, optional
            Applied in the denormed Hz domain.
        energy : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged energy (T, 1); takes precedence over
            energy_scale and energy_bias.
        energy_scale : int/float, optional
            Applied in the denormed domain.
        energy_bias : int/float, optional
            Applied in the denormed domain.
        robot : bool, optional
            Whether to output robot style (normalized pitch forced to zero).

        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        # First pass: only the predicted durations / pitch / energy are used.
        _, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text, durations=None, pitch=None, energy=None)

        # priority: groundtruth > scale/bias > previous output
        # -- durations --
        if isinstance(durations, np.ndarray):
            durations = paddle.to_tensor(durations)
        elif not isinstance(durations, paddle.Tensor):
            if durations_scale or durations_bias:
                d_scale = 1 if durations_scale is None else durations_scale
                d_bias = 0 if durations_bias is None else durations_bias
                durations = d_scale * d_outs + d_bias
            else:
                durations = d_outs

        if robot:
            # Zero in the normalized domain equals the mean in the denormed one.
            # NOTE(review): this replaces any pitch passed by the caller, so
            # the branches below always see a Tensor when robot is set.
            pitch = paddle.zeros(p_outs.shape)

        # -- pitch --
        if isinstance(pitch, np.ndarray):
            pitch = paddle.to_tensor(pitch)
        elif not isinstance(pitch, paddle.Tensor):
            if pitch_scale or pitch_bias:
                p_scale = 1 if pitch_scale is None else pitch_scale
                p_bias = 0 if pitch_bias is None else pitch_bias
                # normalized log-pitch -> Hz, adjust, then back again
                hz = paddle.exp(
                    self.denorm(p_outs, self.pitch_mean, self.pitch_std))
                adjusted_hz = p_scale * hz + p_bias
                pitch = self.norm(
                    paddle.log(adjusted_hz), self.pitch_mean, self.pitch_std)
            else:
                pitch = p_outs

        # -- energy --
        if isinstance(energy, np.ndarray):
            energy = paddle.to_tensor(energy)
        elif not isinstance(energy, paddle.Tensor):
            if energy_scale or energy_bias:
                e_scale = 1 if energy_scale is None else energy_scale
                e_bias = 0 if energy_bias is None else energy_bias
                raw_energy = self.denorm(e_outs, self.energy_mean,
                                         self.energy_std)
                raw_energy = e_scale * raw_energy + e_bias
                energy = self.norm(raw_energy, self.energy_mean,
                                   self.energy_std)
            else:
                energy = e_outs

        # Second pass: synthesize with the resolved controls forced in.
        normalized_mel, _, _, _ = self.acoustic_model.inference(
            text,
            durations=durations,
            pitch=pitch,
            energy=energy,
            use_teacher_forcing=True)
        return self.normalizer.inverse(normalized_mel)
def evaluate(args, fastspeech2_config, pwg_config):
# construct dataset for evaluation

@ -23,7 +23,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Speech-To-Text
:caption: Speech-to-Text
asr/models_introduction
asr/data_preparation
@ -33,7 +33,7 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Text-To-Speech
:caption: Text-to-Speech
tts/basic_usage
tts/advanced_usage

@ -16,6 +16,22 @@ cd DeepSpeech
pip install -e .
```
For users who only need the basic functions of PaddleSpeech, installing with conda is recommended.
You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or you can use the script below to install the latest miniconda version.
```bash
pushd tools
bash extras/install_miniconda.sh
popd
bash
```
After installing conda, run `setup.sh` to complete the installation.
```bash
bash setup.sh
```
## Setup (Other Platform)
- Make sure the libraries and tools listed in [dependencies](./dependencies.md) are installed. For more information, see `setup.py` and `tools/Makefile`.

@ -1,11 +1,11 @@
# PaddleSpeech
## What is PaddleSpeech?
PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
## What can PaddleSpeech do?
### Speech-To-Text
### Speech-to-Text
PaddleSpeech ASR mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including:
- attention decoding (used in Transformer and Conformer)
- attention rescoring (used in Transformer and Conformer)
Speech-To-Text helps you training the ASR model very simply.
Speech-to-Text helps you train the ASR model very simply.
### Text-To-Speech
### Text-to-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Text-To-Speech helps you to train TTS models with simple commands.
Text-to-Speech helps you to train TTS models with simple commands.

@ -1,7 +1,7 @@
# Released Models
## Speech-To-Text Models
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
@ -27,7 +27,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
## Text-To-Speech Models
## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:

@ -0,0 +1,139 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
# This configuration is based on full-band MelGAN but the hop size and sampling
# rate are different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper so currently we train 1M iterations (not sure this is
# enough to converge). The optimizer setting is based on @dathudeptrai advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size. (in samples)
n_shift: 300 # Hop size. (in samples)
win_length: 1200 # Window length. (in samples)
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 80 # Number of input channels.
out_channels: 4 # Number of output channels.
kernel_size: 7 # Kernel size of initial and final conv layers.
channels: 384 # Initial number of channels for conv layers.
upsample_scales: [5, 5, 3] # List of Upsampling scales.
stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
stacks: 4 # Number of stacks in a single residual stack module.
use_weight_norm: True # Whether to use weight normalization.
use_causal_conv: False # Whether to use causal convolution.
use_final_nonlinear_activation: True
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
scales: 3 # Number of multi-scales.
downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling.
downsample_pooling_params: # Parameters of the above pooling function.
kernel_size: 4
stride: 2
padding: 1
exclusive: True
kernel_sizes: [5, 3] # List of kernel size.
channels: 16 # Number of channels of the initial conv layer.
max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
downsample_scales: [4, 4, 4] # List of downsampling scales.
nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
nonlinear_activation_params: # Parameters of nonlinear activation function.
negative_slope: 0.2
use_weight_norm: True # Whether to use weight norm.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 64 # Batch size.
batch_max_steps: 16200 # Length of each audio in batch. Make sure divisible by hop_size.
num_workers: 2 # Number of workers in DataLoader.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
epsilon: 1.0e-7 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_grad_norm: -1 # Generator's gradient norm.
generator_scheduler_params:
learning_rate: 1.0e-3 # Generator's learning rate.
gamma: 0.5 # Generator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 100000
- 200000
- 300000
- 400000
- 500000
- 600000
discriminator_optimizer_params:
epsilon: 1.0e-7 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
learning_rate: 1.0e-3 # Discriminator's learning rate.
gamma: 0.5 # Discriminator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 100000
- 200000
- 300000
- 400000
- 500000
- 600000
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
train_max_steps: 1200000 # Number of training steps.
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random

@ -0,0 +1,63 @@
#!/bin/bash
# Finetune recipe that trains on mels generated by FastSpeech2 instead of the
# mels extracted from the recordings.
#
# Stages:
#   0  generate mels with gen_gta_mel.py into dump_finetune/
#   1  link the original waveforms from dump/ next to the generated mels
#   2  reuse the feature statistics of the original training set
#   3  normalize train/dev/test with the training-set statistics
#   4  run train.py with conf/finetune.yaml

source path.sh || exit 1

gpus=0
stage=0
stop_stage=100

# NOTE(review): presumably lets command-line flags override the variables
# above - confirm against utils/parse_options.sh.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    python3 "${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py" \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    python3 local/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 "${BIN_DIR}/../normalize.py" \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 "${BIN_DIR}/../normalize.py" \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 "${BIN_DIR}/../normalize.py" \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    # Same interpreter name as the other stages (python3).
    CUDA_VISIBLE_DEVICES="${gpus}" \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python3 "${BIN_DIR}/train.py" \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi

@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path
import jsonlines
import numpy as np
def main():
    """Link original waveforms next to generated mels and write metadata.

    For each of the ``train``/``dev``/``test`` splits, every
    ``<utt_id>_feats.npy`` found in ``<dump-dir>/<split>/raw`` is paired with
    ``<old-dump-dir>/<split>/raw/<utt_id>_wave.npy``: the waveform is
    symlinked into the new raw directory and one record per utterance is
    written to ``<dump-dir>/<split>/raw/metadata.jsonl``, sorted by utt_id.

    Raises
    ----------
    NotADirectoryError
        If ``--old-dump-dir`` is not an existing directory.
    """
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features .")
    parser.add_argument(
        "--old-dump-dir",
        type=str,
        # Required: the script cannot do anything without the source dump.
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to finetune dump feature files.")
    args = parser.parse_args()

    # use absolute paths
    old_dump_dir = Path(args.old_dump_dir).expanduser().resolve()
    dump_dir = Path(args.dump_dir).expanduser().resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    if not old_dump_dir.is_dir():
        raise NotADirectoryError(
            f"--old-dump-dir is not a directory: {old_dump_dir}")

    feats_suffix = "_feats.npy"
    for sub in ["train", "dev", "test"]:
        # Symlink the *_wave.npy files of old_dump_dir into the matching
        # location under dump_dir.
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        raw_dir = output_dir / "raw"
        results = []
        for name in os.listdir(raw_dir):
            # Only feature files (e.g. 003918_feats.npy) are processed; on a
            # re-run the directory also holds the wave links and
            # metadata.jsonl written by a previous run.
            if not name.endswith(feats_suffix):
                continue
            # Strip the suffix instead of splitting on "_" so utterance ids
            # that contain underscores stay intact.
            utt_id = name[:-len(feats_suffix)]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            src_wave = old_dump_dir / sub / ("raw/" + wave_name)
            wav_path = output_dir / ("raw/" + wave_name)
            wav = np.load(src_wave)
            # Replace a link left by a previous run; a regular file at this
            # path is left alone and still makes os.symlink fail.
            if wav_path.is_symlink():
                wav_path.unlink()
            os.symlink(src_wave, wav_path)
            record = {
                "utt_id": utt_id,
                "num_samples": wav.shape[0],
                "num_frames": gen_mel.shape[0],
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()

@ -1,36 +1,6 @@
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
collator:
vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
unit_type: spm
spm_model_prefix: data/lang_char/train_960_unigram5000
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 30
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/augmentation.json
num_workers: 0
subsampling_factor: 1
num_encs: 1
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
@ -63,6 +33,33 @@ model:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
collator:
vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
unit_type: spm
spm_model_prefix: data/lang_char/train_960_unigram5000
feat_dim: 83
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 30
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
augmentation_config: conf/augmentation.json
num_workers: 0
subsampling_factor: 1
num_encs: 1
training:
n_epoch: 120

@ -110,10 +110,10 @@ class Clip(object):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
print(
f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
)
x = x[:c.shape[1] * self.hop_size]
# print(
# f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
# )
x = x[:c.shape[0] * self.hop_size]
# check the length is valid
assert len(x) == c.shape[

@ -0,0 +1,167 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# 长度和原本的 mel 不一致怎么办?
import argparse
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
def evaluate(args, fastspeech2_config):
    """Generate mels with FastSpeech2 using the durations in ``args.dur_file``.

    Every utterance is synthesized with its given durations forced in, and the
    resulting mel is saved as ``<utt_id>_feats.npy`` under
    ``<output_dir>/{train,dev,test}/raw``. The split is chosen by the
    utterance's position in the duration file.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``dataset``, ``phones_dict``, ``fastspeech2_checkpoint``,
        ``fastspeech2_stat``, ``output_dir``, ``dur_file`` and ``cut_sil``.
    fastspeech2_config : CfgNode
        Reads ``n_mels`` and the ``model`` section.

    Raises
    ----------
    ValueError
        If ``args.dataset`` has no train/dev/test split sizes defined here.
    """
    # Split sizes are only known for baker. Fail before loading the model
    # instead of hitting an undefined name inside the loop.
    if args.dataset != "baker":
        raise ValueError(
            f"no train/dev/test split is defined for dataset "
            f"'{args.dataset}'; only 'baker' is supported")
    num_train = 9800
    num_dev = 100

    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    phone_dict = {phn: int(idx) for phn, idx in phn_id}

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    # The stat file unpacks into the (mean, std) pair for the normalizer.
    mu, std = np.load(args.fastspeech2_stat)
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        # Trim the leading and trailing "sil" (never the only remaining entry).
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]

        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # The generated mel may differ from the real one by 1-2 frames;
        # per the original note, batch_fn fixes that later.

        # split data into 3 sections
        if i < num_train:
            sub_output_dir = output_dir / ("train/raw")
        elif i < num_train + num_dev:
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)

        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
def main():
    """Command-line entry point.

    Parses the options, selects the paddle device from ``--ngpu``, loads the
    FastSpeech2 YAML config, prints both, and hands over to ``evaluate``.
    """

    def str2bool(value):
        # Only the literal "true" (case-insensitive) counts as True.
        return value.lower() == 'true'

    # Options are declared as (flag, add_argument kwargs) pairs and registered
    # in this order.
    option_table = [
        ("--dataset", dict(
            default="baker",
            type=str,
            help="name of dataset, should in {baker, ljspeech, vctk} now")),
        ("--fastspeech2-config", dict(
            type=str, help="fastspeech2 config file.")),
        ("--fastspeech2-checkpoint", dict(
            type=str, help="fastspeech2 checkpoint to load.")),
        ("--fastspeech2-stat", dict(
            type=str,
            help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
        )),
        ("--phones-dict", dict(
            type=str,
            default="phone_id_map.txt",
            help="phone vocabulary file.")),
        ("--dur-file", dict(
            default=None, type=str, help="path to durations.txt.")),
        ("--output-dir", dict(type=str, help="output dir.")),
        ("--ngpu", dict(
            type=int, default=1, help="if ngpu == 0, use cpu.")),
        ("--verbose", dict(type=int, default=1, help="verbose.")),
        ("--cut-sil", dict(
            type=str2bool,
            default=True,
            help="whether cut sil in the edge of audio")),
    ]

    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    for flag, kwargs in option_table:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()

    # Device selection; a negative value only prints a message.
    if args.ngpu > 0:
        paddle.set_device("gpu")
    elif args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        print("ngpu should >= 0 !")

    with open(args.fastspeech2_config) as config_file:
        fastspeech2_config = CfgNode(yaml.safe_load(config_file))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)

    evaluate(args, fastspeech2_config)


if __name__ == "__main__":
    main()

@ -16,23 +16,25 @@
from typing import Dict
from typing import Sequence
from typing import Tuple
from typing import Union
import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor
from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder
class FastSpeech2(nn.Layer):
@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer):
return logmel
class StyleFastSpeech2Inference(FastSpeech2Inference):
    """FastSpeech2 inference wrapper with controllable duration, pitch and energy.

    A first inference pass collects the model's own duration/pitch/energy
    predictions; these are then replaced or rescaled according to the
    arguments of ``forward`` and fed back in a second, teacher-forced pass.

    The pitch/energy statistics are optional. They are only needed when
    ``pitch_scale``/``pitch_bias`` or ``energy_scale``/``energy_bias`` are
    passed to ``forward``.
    """

    def __init__(self,
                 normalizer,
                 model,
                 pitch_stats_path=None,
                 energy_stats_path=None):
        super().__init__(normalizer, model)
        # Each stats file is unpacked into a (mean, std) pair.
        if pitch_stats_path:
            pitch_mean, pitch_std = np.load(pitch_stats_path)
            self.pitch_mean = paddle.to_tensor(pitch_mean)
            self.pitch_std = paddle.to_tensor(pitch_std)
        if energy_stats_path:
            energy_mean, energy_std = np.load(energy_stats_path)
            self.energy_mean = paddle.to_tensor(energy_mean)
            self.energy_std = paddle.to_tensor(energy_std)

    def denorm(self, data, mean, std):
        """Undo z-score normalization: ``data * std + mean``."""
        return data * std + mean

    def norm(self, data, mean, std):
        """Apply z-score normalization: ``(data - mean) / std``."""
        return (data - mean) / std

    def _require_stats(self, kind):
        """Return ``(mean, std)`` for ``kind`` ("pitch" or "energy").

        Raises AttributeError (the same type a missing attribute lookup
        raises) with an actionable message when the statistics were not
        loaded in ``__init__``.
        """
        mean = getattr(self, kind + "_mean", None)
        std = getattr(self, kind + "_std", None)
        if mean is None or std is None:
            raise AttributeError(
                f"{kind}_scale/{kind}_bias need {kind} statistics, but no "
                f"{kind}_stats_path was given to StyleFastSpeech2Inference")
        return mean, std

    def forward(self,
                text: paddle.Tensor,
                durations: Union[paddle.Tensor, np.ndarray]=None,
                durations_scale: Union[int, float]=None,
                durations_bias: Union[int, float]=None,
                pitch: Union[paddle.Tensor, np.ndarray]=None,
                pitch_scale: Union[int, float]=None,
                pitch_bias: Union[int, float]=None,
                energy: Union[paddle.Tensor, np.ndarray]=None,
                energy_scale: Union[int, float]=None,
                energy_bias: Union[int, float]=None,
                robot: bool=False):
        """
        Parameters
        ----------
        text : Tensor(int64)
            Input sequence of characters (T,).
        durations : paddle.Tensor/np.ndarray, optional (int64)
            Groundtruth of duration (T,); takes precedence over
            durations_scale and durations_bias.
        durations_scale : int/float, optional
            Factor applied to the predicted durations.
        durations_bias : int/float, optional
            Offset added to the (scaled) predicted durations.
        pitch : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged pitch (T, 1); takes precedence over
            pitch_scale and pitch_bias.
        pitch_scale : int/float, optional
            Applied in the denormed Hz domain.
        pitch_bias : int/float, optional
            Applied in the denormed Hz domain.
        energy : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged energy (T, 1); takes precedence over
            energy_scale and energy_bias.
        energy_scale : int/float, optional
            Applied in the denormed domain.
        energy_bias : int/float, optional
            Applied in the denormed domain.
        robot : bool, optional
            Whether to output robot style (normalized pitch forced to zero).

        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).

        Raises
        ----------
        AttributeError
            If scale/bias is requested for pitch or energy but the matching
            statistics file was not given at construction time.
        """
        # First pass: only the predicted durations / pitch / energy are used.
        _, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text, durations=None, pitch=None, energy=None)

        # priority: groundtruth > scale/bias > previous output
        # set durations
        if isinstance(durations, np.ndarray):
            durations = paddle.to_tensor(durations)
        elif not isinstance(durations, paddle.Tensor):
            if durations_scale or durations_bias:
                durations_scale = durations_scale if durations_scale is not None else 1
                durations_bias = durations_bias if durations_bias is not None else 0
                durations = durations_scale * d_outs + durations_bias
            else:
                durations = d_outs

        if robot:
            # Zero in the normalized domain equals the mean in the denormed one.
            # NOTE(review): this replaces any pitch passed by the caller, so
            # the branches below always see a Tensor when robot is set.
            pitch = paddle.zeros(p_outs.shape)

        # set pitch
        if isinstance(pitch, np.ndarray):
            pitch = paddle.to_tensor(pitch)
        elif not isinstance(pitch, paddle.Tensor):
            if pitch_scale or pitch_bias:
                pitch_mean, pitch_std = self._require_stats("pitch")
                pitch_scale = pitch_scale if pitch_scale is not None else 1
                pitch_bias = pitch_bias if pitch_bias is not None else 0
                # normalized log-pitch -> Hz, adjust, then back again
                predicted_hz = paddle.exp(
                    self.denorm(p_outs, pitch_mean, pitch_std))
                adjusted_hz = pitch_scale * predicted_hz + pitch_bias
                pitch = self.norm(
                    paddle.log(adjusted_hz), pitch_mean, pitch_std)
            else:
                pitch = p_outs

        # set energy
        if isinstance(energy, np.ndarray):
            energy = paddle.to_tensor(energy)
        elif not isinstance(energy, paddle.Tensor):
            if energy_scale or energy_bias:
                energy_mean, energy_std = self._require_stats("energy")
                energy_scale = energy_scale if energy_scale is not None else 1
                energy_bias = energy_bias if energy_bias is not None else 0
                e_dnorm = self.denorm(e_outs, energy_mean, energy_std)
                e_dnorm = energy_scale * e_dnorm + energy_bias
                energy = self.norm(e_dnorm, energy_mean, energy_std)
            else:
                energy = e_outs

        # Second pass: synthesize with the resolved controls forced in.
        normalized_mel, _, _, _ = self.acoustic_model.inference(
            text,
            durations=durations,
            pitch=pitch,
            energy=energy,
            use_teacher_forcing=True)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel
class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2."""

@ -23,12 +23,6 @@ import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder
from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.decoder import Decoder
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder import Encoder
from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
class TransformerTTS(nn.Layer):

@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .attention import *
from .conv import *
from .geometry import *
from .losses import *
from .masking import *
from .positional_encoding import *
from .transformer import *

@ -1,348 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F
def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Compute masked scaled dot product attention.

    q, k and v are expected to share the same leading dimensions (written as
    ``*`` below). Dropout is applied to the attention weights before they are
    used to average the values.

    Parameters
    -----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.
    dropout : float, optional
        dropout probability applied to the attention weights.
        Defaults to 0.0.
    training : bool, optional
        forwarded to ``F.dropout`` as its ``training`` flag.
        Defaults to True.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights (after dropout).
    """
    # The feature size is read as a Python number, so only imperative
    # execution is supported.
    feature_size = q.shape[-1]
    inv_sqrt_d = 1.0 / math.sqrt(feature_size)
    logits = paddle.scale(paddle.matmul(q, k, transpose_y=True), inv_sqrt_d)

    if mask is not None:
        # Positions where mask == 0 receive a large negative offset
        # (hard-coded -1e9) so that softmax gives them ~zero weight.
        penalty = paddle.scale((1.0 - mask), -1e9)
        logits += penalty

    attn_weights = F.dropout(
        F.softmax(logits, axis=-1), dropout, training=training)
    return paddle.matmul(attn_weights, v), attn_weights
def drop_head(x, drop_n_heads, training=True):
    """Drop n context vectors from multiple ones.

    For every sample, ``drop_n_heads`` randomly chosen heads are zeroed and
    the remaining heads are scaled by ``num_heads / (num_heads - drop_n_heads)``.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
        The input, multiple context vectors.
    drop_n_heads : int [0<= drop_n_heads <= num_heads]
        Number of vectors to drop.
    training : bool
        A flag indicating whether it is in training. If `False`, no dropout is
        applied.

    Returns
    -------
    Tensor
        The output. ``x`` itself is returned when nothing is dropped.

    Raises
    ------
    ValueError
        If ``drop_n_heads`` is outside ``[0, num_heads]`` while training.
    """
    if not training or (drop_n_heads == 0):
        return x

    batch_size, num_heads, _, _ = x.shape
    if drop_n_heads < 0 or drop_n_heads > num_heads:
        # Out-of-range values would otherwise yield a silently wrong scale.
        raise ValueError(
            "drop_n_heads must be in [0, num_heads], got {} with num_heads={}".
            format(drop_n_heads, num_heads))

    # drop all heads
    if num_heads == drop_n_heads:
        return paddle.zeros_like(x)

    mask = np.ones([batch_size, num_heads])
    mask[:, :drop_n_heads] = 0
    # Shuffle each row independently so every sample drops a different,
    # random set of heads.
    for subarray in mask:
        np.random.shuffle(subarray)
    scale = float(num_heads) / (num_heads - drop_n_heads)
    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
    # numpy builds a float64 mask; cast it to the input dtype so the product
    # does not mix dtypes.
    out = x * paddle.to_tensor(mask, dtype=x.dtype)
    return out
def _split_heads(x, num_heads):
    """Split the last axis of a 3-D tensor into heads.

    Reshapes ``(batch, time, num_heads * depth)`` to
    ``(batch, time, num_heads, depth)`` and then swaps the time and head axes,
    giving ``(batch, num_heads, time, depth)``.
    """
    n_batch, n_steps, _ = x.shape
    per_head = paddle.reshape(x, [n_batch, n_steps, num_heads, -1])
    return paddle.transpose(per_head, [0, 2, 1, 3])
def _concat_heads(x):
    """Merge the head axis of a 4-D tensor back into the feature axis.

    Swaps axes 1 and 2 of ``(batch, num_heads, time, depth)`` and flattens the
    last two axes, giving ``(batch, time, num_heads * depth)``.
    """
    n_batch, _, n_steps, _ = x.shape
    time_major = paddle.transpose(x, [0, 2, 1, 3])
    return paddle.reshape(time_major, [n_batch, n_steps, -1])
# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
    """Single-head attention: linear projections around scaled dot product
    attention.

    Parameters
    ----------
    model_dim : int
        Feature size of the query.
    dropout : float, optional
        Dropout probability passed to the scaled dot product attention.
        Defaults to 0.0.
    k_dim : int, optional
        Feature size of the projected queries and keys. If not provided (or
        falsy), it is set to `model_dim`. Defaults to None.
    v_dim : int, optional
        Feature size of the projected values. If not provided (or falsy), it
        is set to `model_dim`. Defaults to None.
    """

    def __init__(self,
                 model_dim: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super().__init__()
        if not k_dim:
            k_dim = model_dim
        if not v_dim:
            v_dim = model_dim

        # Input projections for query/key/value and the output projection.
        self.affine_q = nn.Linear(model_dim, k_dim)
        self.affine_k = nn.Linear(model_dim, k_dim)
        self.affine_v = nn.Linear(model_dim, v_dim)
        self.affine_o = nn.Linear(v_dim, model_dim)

        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
            The attention weights.
        """
        query = self.affine_q(q)  # (B, T, C)
        key = self.affine_k(k)
        value = self.affine_v(v)

        context, attention_weights = scaled_dot_product_attention(
            query, key, value, mask, self.dropout, self.training)
        return self.affine_o(context), attention_weights
class MultiheadAttention(nn.Layer):
    """Multihead Attention module.

    Parameters
    -----------
    model_dim: int
        The feature size of query.
    num_heads : int
        The number of attention heads.
    dropout : float, optional
        Dropout probability passed to the scaled dot product attention.
        Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to ``model_dim // num_heads``. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to ``model_dim // num_heads``. Defaults to None.

    Raises
    ---------
    ValueError
        If ``model_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 model_dim: int,
                 num_heads: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MultiheadAttention, self).__init__()
        if model_dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        depth = model_dim // num_heads
        k_dim = k_dim or depth
        v_dim = v_dim or depth

        # All heads are projected at once; they are split apart in forward().
        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)

        self.num_heads = num_heads
        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
            The mask. ``None`` disables masking.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, num_heads, times_steps_q, time_steps_k)]
            The per-head attention weights.
        """
        q = _split_heads(self.affine_q(q), self.num_heads)  # (B, h, T, C)
        k = _split_heads(self.affine_k(k), self.num_heads)
        v = _split_heads(self.affine_v(v), self.num_heads)
        if mask is not None:
            # unsqueeze for the h dim; a None mask is forwarded untouched
            # because scaled_dot_product_attention accepts it.
            mask = paddle.unsqueeze(mask, 1)

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)
        # NOTE: there is more sophisticated implementation: Scheduled DropHead
        context_vectors = _concat_heads(context_vectors)  # (B, T, h*C)
        out = self.affine_o(context_vectors)
        return out, attention_weights
class LocationSensitiveAttention(nn.Layer):
    """Location Sensitive Attention module.

    Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

    Parameters
    -----------
    d_query: int
        The feature size of query.
    d_key : int
        The feature size of key.
    d_attention : int
        The feature size of the attention space that query, key and location
        features are projected into.
    location_filters : int
        Number of output channels of the attention convolution.
    location_kernel_size : int
        Kernel size of attention convolution.
    """

    def __init__(self,
                 d_query: int,
                 d_key: int,
                 d_attention: int,
                 location_filters: int,
                 location_kernel_size: int):
        super().__init__()

        self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
        self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
        # Maps every attention-space vector to a scalar alignment score.
        self.value = nn.Linear(d_attention, 1, bias_attr=False)

        # Location Layer: convolution over the (2-channel) attention weights,
        # followed by a projection into the attention space.
        conv_padding = int((location_kernel_size - 1) / 2)
        self.location_conv = nn.Conv1D(
            2,
            location_filters,
            kernel_size=location_kernel_size,
            padding=conv_padding,
            bias_attr=False,
            data_format='NLC')
        self.location_layer = nn.Linear(
            location_filters, d_attention, bias_attr=False)

    def forward(self,
                query,
                processed_key,
                value,
                attention_weights_cat,
                mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        query : Tensor [shape=(batch_size, d_query)]
            The queries.
        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
            The keys after linear layer.
        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
            The values.
        attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
            Attention weights concat.
        mask : Tensor, optional
            The mask. Shape should be (batch_size, times_steps_k, 1).
            Defaults to None.

        Returns
        ----------
        attention_context : Tensor
            The context vector; with the input shapes documented above this
            is (batch_size, d_key).
        attention_weights : Tensor [shape=(batch_size, time_steps_k)]
            The attention weights.
        """
        # (B, 1, d_attention): a time axis is added so it broadcasts over keys
        query_term = self.query_layer(paddle.unsqueeze(query, axis=[1]))
        location_features = self.location_conv(attention_weights_cat)
        location_term = self.location_layer(location_features)

        energies = paddle.tanh(location_term + processed_key + query_term)
        # (B, T_enc, 1)
        alignment = self.value(energies)

        if mask is not None:
            # push masked (mask == 0) positions towards -inf before softmax
            alignment = alignment + (1.0 - mask) * -1e9

        attention_weights = F.softmax(alignment, axis=1)
        attention_context = paddle.matmul(
            attention_weights, value, transpose_x=True)

        return (paddle.squeeze(attention_context, axis=1),
                paddle.squeeze(attention_weights, axis=-1))

@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from paddle import nn
class ConvolutionModule(nn.Layer):
    """Convolution block of the Conformer model.

    The block is: pointwise conv (doubling the channels) -> GLU -> depthwise
    conv -> batch norm -> activation -> pointwise conv.

    Parameters
    ----------
    channels : int
        The number of channels of conv layers.
    kernel_size : int
        Kernel size of the depthwise conv layer; must be odd.
    activation : nn.Layer
        Activation applied after batch norm. The default instance is created
        once, when the signature is evaluated, and shared by every module
        built without an explicit activation.
    bias : bool
        Passed as ``bias_attr`` to all three conv layers.
    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
        """Construct a ConvolutionModule object."""
        super().__init__()
        # kernel_size should be an odd number for 'SAME' padding
        assert (kernel_size - 1) % 2 == 0
        same_padding = (kernel_size - 1) // 2

        # Expands to 2 * channels; the GLU in forward() halves it again.
        self.pointwise_conv1 = nn.Conv1D(
            in_channels=channels,
            out_channels=2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias)
        # groups == channels: every channel is filtered independently.
        self.depthwise_conv = nn.Conv1D(
            in_channels=channels,
            out_channels=channels,
            kernel_size=kernel_size,
            stride=1,
            padding=same_padding,
            groups=channels,
            bias_attr=bias)
        self.norm = nn.BatchNorm1D(channels)
        self.pointwise_conv2 = nn.Conv1D(
            in_channels=channels,
            out_channels=channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias)
        self.activation = activation

    def forward(self, x):
        """Compute convolution module.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, channels).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, channels).
        """
        # Conv1D works on (batch, channels, time): swap the last two axes.
        feats = x.transpose([0, 2, 1])

        # GLU mechanism: (batch, 2*channels, time) -> (batch, channels, time)
        gated = nn.functional.glu(self.pointwise_conv1(feats), axis=1)

        # 1D depthwise conv, then norm + activation and the output projection
        filtered = self.depthwise_conv(gated)
        activated = self.activation(self.norm(filtered))
        projected = self.pointwise_conv2(activated)

        # back to (batch, time, channels)
        return projected.transpose([0, 2, 1])

@ -0,0 +1,274 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(paddle.nn.Layer):
    """Conformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type: "linear", "conv2d", "embed", a layer instance,
        or None (positional encoding only).
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type: "abs_pos", "scaled_abs_pos",
        "rel_pos" or "legacy_rel_pos".
    selfattention_layer_type : str
        Encoder attention layer type: "selfattn", "rel_selfattn" or
        "legacy_rel_selfattn".
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        indices of intermediate CTC layer.
        indices start from 1.
        if not None, intermediate outputs are returned (which changes return type
        signature.)

    Raises
    ----------
    ValueError
        If ``pos_enc_layer_type``, ``input_layer`` or
        ``selfattention_layer_type`` is not one of the supported values.
    NotImplementedError
        If ``positionwise_layer_type`` is not supported.
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            macaron_style=False,
            pos_enc_layer_type="abs_pos",
            selfattention_layer_type="selfattn",
            activation_type="swish",
            use_cnn_module=False,
            zero_triu=False,
            cnn_module_kernel=31,
            padding_idx=-1,
            stochastic_depth_rate=0.0,
            intermediate_layers=None, ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)

        # positional encoding class; the relative variants must be paired
        # with the matching self-attention type.
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            # str() keeps this a ValueError even for non-string arguments.
            raise ValueError("unknown pos_enc_layer: " +
                             str(pos_enc_layer_type))

        # input (embedding) layer definition
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Linear(idim, attention_dim),
                paddle.nn.LayerNorm(attention_dim),
                paddle.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Embedding(
                    idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, paddle.nn.Layer):
            self.embed = paddle.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = paddle.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            # This branch is also reached for non-string values, where plain
            # string concatenation would raise TypeError instead.
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info(
                "encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             str(selfattention_layer_type))

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError(
                "Support only linear, conv1d or conv1d-linear.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        # The skip probability grows linearly with depth, reaching
        # stochastic_depth_rate at the last block.
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        self.intermediate_layers = intermediate_layers

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor, as returned by the embedding / encoder layers.
        List[paddle.Tensor]
            Outputs of the layers listed in ``intermediate_layers``; only
            returned when ``intermediate_layers`` is not None.
        """
        # Only the subsampling front-end takes (and returns) the mask.
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                # intermediate_layers uses 1-based layer indices
                if layer_idx + 1 in self.intermediate_layers:
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        # Layers may hand back (x, pos_emb); keep only the features.
        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks

@ -0,0 +1,196 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder self-attention layer definition."""
import paddle
from paddle import nn
from paddlespeech.t2s.modules.layer_norm import LayerNorm
class EncoderLayer(nn.Layer):
    """Encoder layer module.

    Parameters
    ----------
    size : int
        Input dimension.
    self_attn : paddle.nn.Layer
        Self-attention module instance.
        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
        can be used as the argument.
    feed_forward : paddle.nn.Layer
        Feed-forward module instance.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    feed_forward_macaron : paddle.nn.Layer
        Additional feed-forward module instance, or None to disable the
        macaron-style block.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    conv_module : paddle.nn.Layer
        Convolution module instance, or None to disable the convolution block.
        `ConvolutionModule` instance can be used as the argument.
    dropout_rate : float
        Dropout rate.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    stochastic_depth_rate : float
        Probability to skip this layer.
        During training, the layer may skip residual computation and return input
        as-is with given probability.
    """

    def __init__(
            self,
            size,
            self_attn,
            feed_forward,
            feed_forward_macaron,
            conv_module,
            dropout_rate,
            normalize_before=True,
            concat_after=False,
            stochastic_depth_rate=0.0, ):
        """Construct an EncoderLayer object."""
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module
        self.norm_ff = LayerNorm(size)  # for the FNN module
        self.norm_mha = LayerNorm(size)  # for the MHA module
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = LayerNorm(size)
            # each of the two feed-forward blocks contributes half a residual
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0
        if self.conv_module is not None:
            self.norm_conv = LayerNorm(size)  # for the CNN module
            self.norm_final = LayerNorm(
                size)  # for the final output of the block
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)
        self.stochastic_depth_rate = stochastic_depth_rate

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Parameters
        ----------
        x_input : Union[Tuple, paddle.Tensor]
            Input tensor w/ or w/o pos emb.
            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
            - w/o pos emb: Tensor (#batch, time, size).
        mask : paddle.Tensor
            Mask tensor for the input; it is indexed as a 3-D tensor on the
            cache path.
        cache : paddle.Tensor
            Cache tensor of the input (#batch, time - 1, size).

        Returns
        ----------
        Union[Tuple, paddle.Tensor]
            Output tensor (#batch, time, size); returned as ``(x, pos_emb)``
            when the input carried a positional embedding.
        paddle.Tensor
            Mask tensor.
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
        else:
            x, pos_emb = x_input, None

        skip_layer = False
        # with stochastic depth, residual connection `x + f(x)` becomes
        # `x <- x + 1 / (1 - p) * f(x)` at training time.
        stoch_layer_coeff = 1.0
        if self.training and self.stochastic_depth_rate > 0:
            # paddle.rand documents its shape argument as a list/tuple/Tensor
            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)

        if skip_layer:
            if cache is not None:
                x = paddle.concat([cache, x], axis=1)
            if pos_emb is not None:
                return (x, pos_emb), mask
            return x, mask

        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)

        if cache is None:
            x_q = x
        else:
            # Compare as lists: a Paddle tensor's shape is a list, which never
            # equals a tuple, so a tuple comparison here would always fail.
            assert list(cache.shape) == [
                x.shape[0], x.shape[1] - 1, self.size
            ]
            # only the newest frame is used as query
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if pos_emb is not None:
            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
        else:
            x_att = self.self_attn(x_q, x, x, mask)

        if self.concat_after:
            x_concat = paddle.concat((x, x_att), axis=-1)
            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
        else:
            x = residual + stoch_layer_coeff * self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
            self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        if cache is not None:
            x = paddle.concat([cache, x], axis=1)

        if pos_emb is not None:
            return (x, pos_emb), mask

        return x, mask

@ -17,6 +17,14 @@ from paddle import nn
from typeguard import check_argument_types
class Swish(paddle.nn.Layer):
    """Swish activation layer, computing ``x * sigmoid(x)``."""

    def forward(self, x):
        """Return Swish activation function.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor.

        Returns
        ----------
        paddle.Tensor
            ``x * sigmoid(x)``, same shape as ``x``.
        """
        # Use the functional sigmoid: `paddle.nn.Sigmoid` is a Layer class, so
        # calling it with `x` would construct a layer instead of applying it.
        return x * paddle.nn.functional.sigmoid(x)
def pad_list(xs, pad_value):
"""Perform padding for the list of tensors.
@ -150,3 +158,17 @@ def initialize(model: nn.Layer, init: str):
nn.initializer.Constant())
else:
raise ValueError("Unknown initialization: " + init)
def get_activation(act):
    """Build a new activation layer from its name.

    Parameters
    ----------
    act : str
        One of "hardtanh", "tanh", "relu", "selu" or "swish".

    Returns
    ----------
    paddle.nn.Layer
        A freshly constructed activation layer. An unknown name raises
        ``KeyError`` from the table lookup.
    """
    # name -> layer class; the class is instantiated only after the lookup
    layer_classes = {
        "hardtanh": paddle.nn.Hardtanh,
        "tanh": paddle.nn.Tanh,
        "relu": paddle.nn.ReLU,
        "selu": paddle.nn.SELU,
        "swish": Swish,
    }
    layer_class = layer_classes[act]
    return layer_class()

@ -19,7 +19,7 @@ import paddle
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
class StyleEncoder(nn.Layer):

@ -1,208 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F
from paddlespeech.t2s.modules import attention as attn
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
"TransformerDecoderLayer",
]
class PositionwiseFFN(nn.Layer):
    """A faithful implementation of Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
    It is basically a 2-layer MLP, with relu activation and dropout in between.

    Parameters
    ----------
    input_size: int
        The feature size of the input. It is also the feature size of the
        output.
    hidden_size: int
        The hidden size.
    dropout: float
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.dropout = nn.Dropout(dropout)

        self.input_size = input_size
        self.hidden_size = hidden_size
        # Misspelled name kept so code that reads the old attribute still works.
        self.hidden_szie = hidden_size

    def forward(self, x):
        r"""Forward pass of positionwise feed forward network.

        Parameters
        ----------
        x : Tensor [shape=(\*, input_size)]
            The input tensor, where ``\*`` means arbitrary shape.

        Returns
        -------
        Tensor [shape=(\*, input_size)]
            The output tensor.
        """
        # linear -> relu -> dropout -> linear
        l1 = self.dropout(F.relu(self.linear1(x)))
        l2 = self.linear2(l1)
        return l2
class TransformerEncoderLayer(nn.Layer):
    """Transformer encoder layer following
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    The layer is a self-attention sub-layer followed by a position-wise
    feed-forward sub-layer. Each sub-layer adds its output to its input,
    applies dropout to the sum and then layer-normalizes it (PostLN).

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of self attention (a ``MultiheadAttention``
        layer).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super().__init__()
        # Self-attention sub-layer and its layer norm.
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        # Feed-forward sub-layer and its layer norm.
        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        # Dropout probability applied to each residual sum in ``forward``.
        self.dropout = dropout

    def forward(self, x, mask):
        """Encode ``x`` with self attention and a feed-forward network.

        Parameters
        ----------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The input.
        mask : Tensor
            The padding mask. The shape is (batch_size, time_steps,
            time_steps) or broadcastable shape.

        Returns
        -------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The encoded output.
        attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
            The attention weights of the self attention.
        """
        attended, attn_weights = self.self_mha(x, x, x, mask)
        attn_residual = x + attended
        x = self.layer_norm1(
            F.dropout(attn_residual, self.dropout, training=self.training))

        ffn_residual = x + self.ffn(x)
        x = self.layer_norm2(
            F.dropout(ffn_residual, self.dropout, training=self.training))
        return x, attn_weights
class TransformerDecoderLayer(nn.Layer):
    """Transformer decoder layer following
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    The layer chains three sub-layers: decoder self attention, cross
    attention over the supplied keys/values, and a position-wise
    feed-forward network. Each sub-layer adds its output to its input,
    applies dropout to the sum and then layer-normalizes it (PostLN).

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of attentions (``MultiheadAttention``
        layers).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super().__init__()
        # Decoder self-attention sub-layer.
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        # Cross-attention sub-layer.
        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        # Feed-forward sub-layer.
        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

        # Dropout probability applied to each residual sum in ``forward``.
        self.dropout = dropout

    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerDecoderLayer.

        Parameters
        ----------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder input.
        k : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The values
        encoder_mask : Tensor
            Encoder padding mask, passed to the cross attention. Shape is
            ``(batch_size, time_steps_k, time_steps_k)`` or broadcastable
            shape.
        decoder_mask : Tensor
            Decoder mask, passed to the self attention. Shape is
            ``(batch_size, time_steps_q, time_steps_k)`` or broadcastable
            shape.

        Returns
        --------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder output.
        self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
            Decoder self attention.
        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
        # 1) self attention over the decoder input
        self_context, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
        self_residual = q + self_context
        q = self.layer_norm1(
            F.dropout(self_residual, self.dropout, training=self.training))

        # 2) cross attention over the provided keys and values
        cross_context, cross_attn_weights = self.cross_mha(q, k, v,
                                                           encoder_mask)
        cross_residual = q + cross_context
        q = self.layer_norm2(
            F.dropout(cross_residual, self.dropout, training=self.training))

        # 3) position-wise feed-forward network
        ffn_residual = q + self.ffn(q)
        q = self.layer_norm3(
            F.dropout(ffn_residual, self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights

@ -23,14 +23,14 @@ import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution
from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution
from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
class Decoder(nn.Layer):

@ -14,13 +14,13 @@
# Modified from espnet(https://github.com/espnet/espnet)
from paddle import nn
from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
class Encoder(nn.Layer):

@ -0,0 +1,291 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
# Conv2dSubsampling has passed testing
"""Subsampling layer definition."""
import paddle
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class TooShortUttError(Exception):
    """Error signalling that an utterance is too short to be subsampled.

    Parameters
    ----------
    message : str
        Message for error catch
    actual_size : int
        the short size that cannot pass the subsampling
    limit : int
        the limit size for subsampling
    """

    def __init__(self, message, actual_size, limit):
        """Store the offending size and the limit next to the message."""
        super().__init__(message)
        self.limit = limit
        self.actual_size = actual_size
def check_short_utt(ins, size):
    """Check if the utterance is too short for subsampling.

    Parameters
    ----------
    ins : object
        The subsampling layer instance to check against.
    size : int
        The length of the utterance.

    Returns
    ----------
    tuple
        ``(True, limit)`` when ``ins`` is one of the known subsampling
        layers and ``size`` is below that layer's limit, otherwise
        ``(False, -1)``.
    """
    # (layer class, smallest accepted size), checked in this order.
    size_limits = (
        (Conv2dSubsampling2, 3),
        (Conv2dSubsampling, 7),
        (Conv2dSubsampling6, 11),
        (Conv2dSubsampling8, 15),
    )
    for layer_cls, limit in size_limits:
        if isinstance(ins, layer_cls) and size < limit:
            return True, limit
    return False, -1
class Conv2dSubsampling(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).

    Two ``Conv2D`` + ``ReLU`` stages are followed by a linear projection to
    ``odim`` and a positional encoding layer.

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer. When None, a
        ``PositionalEncoding(odim, dropout_rate)`` is created.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling object."""
        super().__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU())

        # Feature bins left after the two convolutions; they are flattened
        # together with the channels before the projection.
        reduced_dim = ((idim - 1) // 2 - 1) // 2
        projection = paddle.nn.Linear(odim * reduced_dim, odim)
        if pos_enc is not None:
            positional = pos_enc
        else:
            positional = PositionalEncoding(odim, dropout_rate)
        self.out = paddle.nn.Sequential(projection, positional)

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 4.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 4. None when ``x_mask`` is None.
        """
        # add a channel axis: (b, c, t, f)
        feats = self.conv(x.unsqueeze(1))
        b, c, t, f = feats.shape
        # (b, c, t, f) -> (b, t, c, f) -> (b, t, c * f)
        flattened = feats.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        out = self.out(flattened)
        if x_mask is None:
            return out, None
        # One slice per convolution stage.
        return out, x_mask[:, :, :-2:2][:, :, :-2:2]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positioning encoding.
        """
        if key == -1:
            return self.out[key]
        raise NotImplementedError(
            "Support only `-1` (for `reset_parameters`).")
class Conv2dSubsampling2(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/2 length).

    Two ``Conv2D`` + ``ReLU`` stages are followed by a linear projection to
    ``odim`` and a positional encoding layer.

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer. When None, a
        ``PositionalEncoding(odim, dropout_rate)`` is created.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling2 object."""
        super().__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 1),
            paddle.nn.ReLU())

        # Feature bins left after the two convolutions; they are flattened
        # together with the channels before the projection.
        reduced_dim = (idim - 1) // 2 - 2
        projection = paddle.nn.Linear(odim * reduced_dim, odim)
        if pos_enc is not None:
            positional = pos_enc
        else:
            positional = PositionalEncoding(odim, dropout_rate)
        self.out = paddle.nn.Sequential(projection, positional)

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 2.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 2. None when ``x_mask`` is None.
        """
        # add a channel axis: (b, c, t, f)
        feats = self.conv(x.unsqueeze(1))
        b, c, t, f = feats.shape
        # (b, c, t, f) -> (b, t, c, f) -> (b, t, c * f)
        flattened = feats.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        out = self.out(flattened)
        if x_mask is None:
            return out, None
        # One slice per convolution stage.
        return out, x_mask[:, :, :-2:2][:, :, :-2:1]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positioning encoding.
        """
        if key == -1:
            return self.out[key]
        raise NotImplementedError(
            "Support only `-1` (for `reset_parameters`).")
class Conv2dSubsampling6(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/6 length).

    Two ``Conv2D`` + ``ReLU`` stages are followed by a linear projection to
    ``odim`` and a positional encoding layer.

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer. When None, a
        ``PositionalEncoding(odim, dropout_rate)`` is created.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling6 object."""
        super().__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 5, 3),
            paddle.nn.ReLU())

        # Feature bins left after the two convolutions; they are flattened
        # together with the channels before the projection.
        reduced_dim = ((idim - 1) // 2 - 2) // 3
        projection = paddle.nn.Linear(odim * reduced_dim, odim)
        if pos_enc is not None:
            positional = pos_enc
        else:
            positional = PositionalEncoding(odim, dropout_rate)
        self.out = paddle.nn.Sequential(projection, positional)

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 6.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 6. None when ``x_mask`` is None.
        """
        # add a channel axis: (b, c, t, f)
        feats = self.conv(x.unsqueeze(1))
        b, c, t, f = feats.shape
        # (b, c, t, f) -> (b, t, c, f) -> (b, t, c * f)
        flattened = feats.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        out = self.out(flattened)
        if x_mask is None:
            return out, None
        # One slice per convolution stage.
        return out, x_mask[:, :, :-2:2][:, :, :-4:3]
class Conv2dSubsampling8(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/8 length).

    Three ``Conv2D`` + ``ReLU`` stages are followed by a linear projection
    to ``odim`` and a positional encoding layer.

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer. When None, a
        ``PositionalEncoding(odim, dropout_rate)`` is created.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling8 object."""
        super().__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU())

        # Feature bins left after the three convolutions; they are flattened
        # together with the channels before the projection.
        reduced_dim = (((idim - 1) // 2 - 1) // 2 - 1) // 2
        projection = paddle.nn.Linear(odim * reduced_dim, odim)
        if pos_enc is not None:
            positional = pos_enc
        else:
            positional = PositionalEncoding(odim, dropout_rate)
        self.out = paddle.nn.Sequential(projection, positional)

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 8.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 8. None when ``x_mask`` is None.
        """
        # add a channel axis: (b, c, t, f)
        feats = self.conv(x.unsqueeze(1))
        b, c, t, f = feats.shape
        # (b, c, t, f) -> (b, t, c, f) -> (b, t, c * f)
        flattened = feats.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        out = self.out(flattened)
        if x_mask is None:
            return out, None
        # One slice per convolution stage.
        return out, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]

@ -28,7 +28,7 @@ python-dateutil
pyworld
resampy==0.2.2
sacrebleu
scipy==1.2.1
scipy
sentencepiece
snakeviz
soundfile~=0.10
@ -44,3 +44,9 @@ visualdl==2.2.0
webrtcvad
yacs
yq
pypi-kenlm
GPUtil
psutil
pynvml
distro

@ -0,0 +1,20 @@
# Environment setup: installs conda packages and the Python requirements, then
# runs three in-repo install scripts (auto_log, ctcdecoder, third_party).
# NOTE(review): every path below is relative, so this presumably has to be run
# from the directory that contains requirements.txt -- confirm.
# Install conda dependencies
conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes
# Install the python lib
pip install -r requirements.txt
# Install the auto_log
pushd tools/extras
# NOTE(review): unlike the two scripts below, this one is run without `-e`.
bash install_autolog.sh
popd
# Install the ctcdecoder
pushd paddlespeech/s2t/decoders/ctcdecoder/swig
bash -e setup.sh
popd
# Install the python_speech_features
pushd third_party
bash -e install.sh
popd

@ -43,16 +43,6 @@ bash prepare.sh
bash run.sh
```
### Analyse the sp
```
bash run_analysis_sp.sh
```
### Analyse the mp
```
bash run_analysis_mp.sh
```
### The log
```
{"log_file": "recoder_sp_bs16_fp32_ngpu1.txt",

@ -1,345 +0,0 @@
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import json
import re
import traceback
def parse_args():
    """Parse the command line options of the log analysis script.

    Returns
    -------
    argparse.Namespace
        The parsed options. A ``--separator`` given as the literal text
        ``"None"`` is converted to ``None``.
    """
    # (flag, add_argument keyword options), registered in this order.
    option_table = [
        ("--filename",
         dict(type=str, help="The name of log which need to analysis.")),
        ("--log_with_profiler",
         dict(type=str, help="The path of train log with profiler")),
        ("--profiler_path",
         dict(type=str, help="The path of profiler timeline log.")),
        ("--keyword",
         dict(type=str, help="Keyword to specify analysis data")),
        ("--separator",
         dict(type=str,
              default=None,
              help="Separator of different field in log")),
        ('--position',
         dict(type=int, default=None, help='The position of data field')),
        ('--range',
         dict(type=str,
              default="",
              help='The range of data field to intercept')),
        ('--base_batch_size',
         dict(type=int, help='base_batch size on gpu')),
        ('--skip_steps',
         dict(type=int,
              default=0,
              help='The number of steps to be skipped')),
        ('--model_mode',
         dict(type=int,
              default=-1,
              help='Analysis mode, default value is -1')),
        ('--ips_unit', dict(type=str, default=None, help='IPS unit')),
        ('--model_name',
         dict(type=str,
              default=0,
              help='training model_name, transformer_base')),
        ('--mission_name',
         dict(type=str, default=0, help='training mission name')),
        ('--direction_id',
         dict(type=int, default=0, help='training direction_id')),
        ('--run_mode',
         dict(type=str,
              default="sp",
              help='multi process or single process')),
        ('--index',
         dict(type=int,
              default=1,
              help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}')),
        ('--gpu_num',
         dict(type=int, default=1, help='nums of training gpus')),
        ('--use_num',
         dict(type=int, default=1, help='nums of used recoders')),
    ]

    parser = argparse.ArgumentParser(description=__doc__)
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    parsed = parser.parse_args()
    # The separator may arrive as the text "None"; treat it as "not set".
    if parsed.separator == "None":
        parsed.separator = None
    return parsed
def _is_number(num):
pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
result = pattern.match(num)
if result:
return True
else:
return False
class TimeAnalyzer(object):
    """Extract numeric records from a log file and summarise them.

    On construction the file is scanned for lines containing ``keyword``;
    one float is extracted from each such line and stored, sorted in
    ascending order, in ``self.records``.

    Parameters
    ----------
    filename : str
        Path of the log file to read.
    keyword : str
        Only lines containing this text are considered.
    separator : str, optional
        Field separator used to split a line. When empty or None the line
        is split on whitespace.
    position : int, optional
        Index of the field holding the value. When None, the field that
        follows the field equal to ``keyword`` is used.
    range : str, optional
        How to slice the picked field before converting it to float:
        empty keeps the whole field, a single number ``"n"`` keeps
        ``field[0:n]``, and ``"a:b"`` keeps ``field[a:b]``.
    use_num : int, optional
        Keep only the first ``use_num`` records after sorting. When None,
        the value is taken from a module-level ``args.use_num`` if the
        script entry point defined one, otherwise all records are kept.

    Raises
    ------
    Exception
        If ``filename`` or ``keyword`` is None.
    """

    def __init__(self,
                 filename,
                 keyword=None,
                 separator=None,
                 position=None,
                 range="-1",
                 use_num=None):
        if filename is None:
            raise Exception("Please specify the filename!")
        if keyword is None:
            raise Exception("Please specify the keyword!")
        self.filename = filename
        self.keyword = keyword
        self.separator = separator
        self.position = position
        self.range = range
        self.use_num = use_num
        self.records = None
        self._distil()

    def _distil(self):
        """Fill ``self.records`` with the sorted values found in the log."""
        self.records = []
        with open(self.filename, "r") as f_object:
            for line in f_object:
                if self.keyword not in line:
                    continue
                try:
                    self.records.append(self._parse_line(line))
                except (IndexError, TypeError, ValueError):
                    # Best effort: a line that mentions the keyword but
                    # carries no parsable value is skipped.
                    continue
        self.records.sort()
        limit = self._record_limit()
        if limit is not None:
            self.records = self.records[:limit]
        print("records", self.records)
        print("Extract {} records: separator={}; position={}".format(
            len(self.records), self.separator, self.position))

    def _parse_line(self, line):
        """Return the float stored on one log line, or raise if there is none."""
        line_words = line.strip().split(self.separator or None)
        print("line_words", line_words)

        result = None
        if self.position is not None:
            # Use this instance's position rather than the CLI global, so
            # callers that pass an explicit position are honoured.
            result = line_words[self.position]
        else:
            # Pick the field that follows the keyword.
            for word, following in zip(line_words, line_words[1:]):
                if word == self.keyword:
                    result = following
                    break

        # Slice the picked field; a missing field (None) raises TypeError.
        if not self.range:
            result = result[0:]
        elif _is_number(self.range):
            result = result[0:int(self.range)]
        else:
            bounds = self.range.split(":")
            result = result[int(bounds[0]):int(bounds[1])]
        return float(result)

    def _record_limit(self):
        """Return how many sorted records to keep, or None to keep all."""
        if self.use_num is not None:
            return self.use_num
        # Backward compatibility: the script entry point used to hand this
        # limit over through the module-level ``args`` namespace.
        return getattr(globals().get("args"), "use_num", None)

    def _get_fps(self,
                 mode,
                 batch_size,
                 gpu_num,
                 avg_of_records,
                 run_mode,
                 unit=None):
        """Convert an averaged record into a throughput figure.

        Returns
        -------
        tuple
            ``(fps, unit)``.

        Raises
        ------
        ValueError
            If the ``mode`` / ``run_mode`` combination is not supported.
        """
        if mode == -1 and run_mode in ('sp', 'mp'):
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records
            if run_mode == 'mp':
                # temporarily, not used now
                print("------------this is mp")
        elif mode == 0:
            # s/step -> samples/s
            fps = (batch_size * gpu_num) / avg_of_records
            unit = "samples/s"
        elif mode == 1:
            # steps/s -> steps/s
            fps = avg_of_records
            unit = "steps/s"
        elif mode == 2:
            # s/step -> steps/s
            fps = 1 / avg_of_records
            unit = "steps/s"
        elif mode == 3:
            # steps/s -> samples/s
            fps = batch_size * gpu_num * avg_of_records
            unit = "samples/s"
        elif mode == 4:
            # s/epoch -> s/epoch
            fps = avg_of_records
            unit = "s/epoch"
        else:
            # The exception used to be constructed but never raised.
            raise ValueError("Unsupported analysis mode.")
        return fps, unit

    def analysis(self,
                 batch_size,
                 gpu_num=1,
                 skip_steps=0,
                 mode=-1,
                 run_mode='sp',
                 unit=None):
        """Print a summary of the records and return the throughput.

        Parameters
        ----------
        batch_size : int
            Base batch size; must be larger than 0.
        gpu_num : int
            Number of GPUs used by ``_get_fps``.
        skip_steps : int
            Number of leading (sorted) records excluded from the
            "skipped" statistics.
        mode : int
            Analysis mode, see ``_get_fps``.
        run_mode : str
            ``'sp'`` or ``'mp'``; only consulted when ``mode`` is -1.
        unit : str, optional
            Unit of the records; required when ``mode`` is -1.

        Returns
        -------
        tuple
            ``(fps, unit)`` computed from the records after skipping, with
            ``fps`` rounded to 3 decimals; ``(0, '')`` when ``batch_size``
            is not positive or there are no records beyond ``skip_steps``.
        """
        if batch_size <= 0:
            print("base_batch_size should larger than 0.")
            return 0, ''

        # Also covers the case where the log holds exactly skip_steps items.
        if len(self.records) <= skip_steps:
            print("no records")
            return 0, ''

        count = len(self.records)
        kept = self.records[max(skip_steps, 0):]
        skip_min = min(kept)
        skip_max = max(kept)
        avg_of_records = sum(self.records) / float(count)
        avg_of_records_skipped = sum(kept) / float(count - skip_steps)

        fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records,
                                      run_mode, unit)
        fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num,
                                       avg_of_records_skipped, run_mode, unit)

        # What the raw records measure, and the unit they are printed with.
        if mode == -1:
            label, record_unit = "ips", fps_unit
        elif mode == 1 or mode == 3:
            label, record_unit = "latency", "steps/s"
        elif mode == 0 or mode == 2:
            label, record_unit = "latency", "s/step"
        else:
            label, record_unit = None, None

        if label is not None:
            print("average %s of %d steps, skip 0 step:" % (label, count))
            print("\tAvg: %.3f %s" % (avg_of_records, record_unit))
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average %s of %d steps, skip %d steps:" %
                      (label, count, skip_steps))
                print("\tAvg: %.3f %s" % (avg_of_records_skipped, record_unit))
                print("\tMin: %.3f %s" % (skip_min, record_unit))
                print("\tMax: %.3f %s" % (skip_max, record_unit))
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

        return round(fps_skipped, 3), fps_unit
if __name__ == "__main__":
    # Script entry point: analyse one log file and print a JSON summary.
    args = parse_args()
    run_info = {
        "log_file": args.filename,
        "model_name": args.model_name,
        "mission_name": args.mission_name,
        "direction_id": args.direction_id,
        "run_mode": args.run_mode,
        "index": args.index,
        "gpu_num": args.gpu_num,
        "FINAL_RESULT": 0,
        # Initialised to 0 and never changed below.
        "JOB_FAIL_FLAG": 0,
    }

    try:
        if args.index == 1:
            # Speed analysis of the training log.
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator,
                                    args.position, args.range)
            final_result, result_unit = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit)
            run_info["FINAL_RESULT"] = final_result
            run_info["UNIT"] = result_unit
        elif args.index == 3:
            # Profiler analysis: (result key, keyword, position, extra
            # positional arguments for TimeAnalyzer).
            profiler_fields = [
                ("Framework_Total", 'Framework overhead', 3, ('', )),
                ("Framework_Ratio", 'Framework overhead', 5, ()),
                ("ComputationTime_Total", 'Computation time', 3, ('', )),
                ("GpuMemcpy_Total", 'GpuMemcpy Calls', 4, ('', )),
                ("GpuMemcpy_Ratio", 'GpuMemcpy Calls', 6, ()),
                ("GpuMemcpyAsync_Total", 'GpuMemcpyAsync Calls', 4, ('', )),
                ("GpuMemcpySync_Total", 'GpuMemcpySync Calls', 4, ('', )),
            ]
            run_info["FINAL_RESULT"] = {}
            # Extract every field first, then fill in the results.
            extracted = [(result_key, TimeAnalyzer(args.filename, field_keyword,
                                                   None, field_position,
                                                   *extra).records)
                         for result_key, field_keyword, field_position, extra in
                         profiler_fields]
            for result_key, field_records in extracted:
                run_info["FINAL_RESULT"][result_key] = (
                    field_records[0] if field_records else 0)
        else:
            print("Not support!")
    except Exception:
        traceback.print_exc()

    # it's required, for the log file path insert to the database
    print("{}".format(json.dumps(run_info)))

@ -1,5 +1,6 @@
source ../../../tools/venv/bin/activate
cd ../../../
pip install -e . # 安装pdspeech
cd -
#Enter the example dir
pushd ../../../examples/aishell/s1

@ -1,8 +1,12 @@
# 提供可稳定复现性能的脚本默认在标准docker环境内py37执行 paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
# 执行目录:需说明
CUR_DIR=${PWD}
source ../../../tools/venv/bin/activate
CUR_DIR=${PWD} # PaddleSpeech/tests/benchmark/conformer
cd ../../../
log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # benchmark系统指定该参数,不需要跑profile时,log_path指向存speed的目录
cd ${CUR_DIR}
sed -i '/set\ -xe/d' run_benchmark.sh
#cd **
pushd ../../../examples/aishell/s1
# 1 安装该模型需要的依赖 (如需开启优化策略请注明)
@ -11,26 +15,33 @@ pushd ../../../examples/aishell/s1
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
mkdir -p conf/benchmark
#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml
cp conf/conformer.yaml conf/benchmark/conformer.yaml
sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
fp_item_list=(fp32)
bs_item=(16 30)
config_path=conf/conformer.yaml
config_path=conf/benchmark/conformer.yaml
seed=0
output=exp/conformer
profiler_options=None
model_item=conformer
for fp_item in ${fp_item_list[@]}; do
for batch_size in ${bs_item[@]}
for bs_item in ${bs_item[@]}
do
rm exp -rf
log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8
echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
run_mode=mp
ngpu=8
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
rm exp -rf
echo "index is speed, 1gpus, begin, conformer"
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
sleep 60
log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8
echo "index is speed, 1gpus, begin, ${log_name}"
run_mode=sp
ngpu=1
CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
sleep 60
done
done

@ -12,17 +12,24 @@ function _set_params(){
profiler_options=${6:-"None"}
batch_size=${7:-"32"}
fp_item=${8:-"fp32"}
TRAIN_LOG_DIR=${9:-$(pwd)}
model_item=${9:-"conformer"}
benchmark_max_step=0
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
# 添加日志解析需要的参数
base_batch_size=${batch_size}
mission_name="语音识别"
direction_id="1"
ips_unit="sent./sec"
skip_steps=10 # 解析日志有些模型前几个step耗时长需要跳过 (必填)
keyword="ips:" # 解析日志,筛选出数据所在行的关键字 (必填)
index="1"
model_name=${model_item}_bs${batch_size}_${fp_item}
# 以下不用修改
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt
log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}
}
function _train(){
@ -36,11 +43,9 @@ function _train(){
--benchmark-batch-size ${batch_size}
--benchmark-max-step ${benchmark_max_step} "
echo "run_mode "${run_mode}
case ${run_mode} in
sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
echo ${train_cmd}
@ -61,5 +66,8 @@ function _train(){
fi
}
source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
_set_params $@
_train
# _train # 如果只想产出训练log,不解析,可取消注释
_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开

@ -13,6 +13,8 @@ else
fi
bash Miniconda3-latest-Linux-x86_64.sh -b
$HOME/miniconda3/bin/conda init
$HOME/miniconda3/bin/python -m pip install --user tqdm
$HOME/miniconda3/bin/python -m pip install --user scikit-learn
$HOME/miniconda3/bin/python -m pip install --user librosa

Loading…
Cancel
Save