[TTS]Add slim for TTS (#2729)

pull/2736/head
TianYuan 2 years ago committed by GitHub
parent 6f927d55db
commit 3f6afc4834

@ -0,0 +1 @@
../../tts3/local/PTQ_static.sh

@ -72,3 +72,8 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi

@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
weight_bits=$3
python3 ${BIN_DIR}/../PTQ_dynamic.py \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--weight_bits ${weight_bits}

@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
python3 ${BIN_DIR}/../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True

@ -76,3 +76,16 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_dynamic
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
./local/PTQ_dynamic.sh ${train_output_path} fastspeech2_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} pwgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} mb_melgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} hifigan_csmsc 8
fi
# PTQ_static
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi

@ -122,3 +122,8 @@ fi
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi

@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
python3 ${BIN_DIR}/../../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True

@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi

@ -0,0 +1 @@
../../voc1/local/PTQ_static.sh

@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} mb_melgan_csmsc || exit -1
fi

@ -0,0 +1 @@
../../voc1/local/PTQ_static.sh

@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_csmsc || exit -1
fi

@ -538,3 +538,70 @@ def vits_multi_spk_batch_fn(examples):
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
# no batch axis is needed at inference time
text = text[0]
batch = {
"text": text,
}
return batch
def fastspeech2_multi_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
text = text[0]
batch = {
"text": text,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = np.array(spk_emb)
spk_emb = spk_emb[0]
batch["spk_emb"] = spk_emb
return batch
def speedyspeech_single_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
return batch
def speedyspeech_multi_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
return batch
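
For illustration, a minimal sketch of what these static collate functions feed to the calibration loop, assuming the module is importable after this change; the toy phone ids are made up, and a single-utterance batch is used because PTQ_static.py runs with batch_size=1 by default.

import numpy as np

from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn_static

# one toy utterance; the phone ids are illustrative only
examples = [{"text": np.array([3, 7, 12, 5], dtype=np.int64)}]

batch = fastspeech2_single_spk_batch_fn_static(examples)
# the batch axis is dropped, so the feed is a single 1-D phone-id sequence
print(batch["text"].shape)  # (4,)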

@ -55,13 +55,12 @@ class Clip(object):
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Tensor:
Target signal batch (B, 1, T).
Tensor:
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
"""
# check length
batch = [
@ -106,11 +105,7 @@ class Clip(object):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
# print(
# f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
# )
x = x[:c.shape[0] * self.hop_size]
# check the length is valid
assert len(x) == c.shape[
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
@ -218,3 +213,47 @@ class WaveRNNClip(Clip):
y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
return x, y, mels
# for paddleslim
class Clip_static(Clip):
"""Collate functor for training vocoders.
"""
def __call__(self, batch):
"""Convert into batch tensors.
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Dict[str, np.array]:
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
"""
# check length
batch = [
self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
xs, cs = [b[0] for b in batch], [b[1] for b in batch]
# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
start_frames = np.array([
np.random.randint(self.start_offset, cl + self.end_offset)
for cl in c_lengths
])
c_starts = start_frames - self.aux_context_window
c_ends = start_frames + self.batch_max_frames + self.aux_context_window
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
# the infer axis (T', C) differs from the train axis (B, C, T')
# c_batch = c_batch.transpose([0, 2, 1]) # (B, C, T')
# no batch axis is needed at inference time
c_batch = c_batch[0]
batch = {"logmel": c_batch}
return batch
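
For illustration, a minimal usage sketch of Clip_static on toy data, assuming the parent Clip defaults; the shapes and the mb_melgan-style settings mirror the branch in get_dev_dataloader further down and are illustrative only.

import numpy as np

from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static

# toy data: 200 mel frames with 80 bins and a waveform of 200 * hop_size samples
hop_size = 300
feats = np.random.randn(200, 80).astype(np.float32)
wave = np.random.randn(200 * hop_size).astype(np.float32)

# settings mirror the mb_melgan case of get_dev_dataloader (illustrative)
collate = Clip_static(
    batch_max_steps=16200, hop_size=hop_size, aux_context_window=0)

batch = collate([{"wave": wave, "feats": feats}])
# a single (T', C) mel crop with the batch axis already dropped
print(batch["logmel"].shape)  # (54, 80), i.e. 16200 // 300 frames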

@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle
from paddleslim.quant import quant_post_dynamic
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Slim Dynamic with acoustic model & vocoder.")
# acoustic model
parser.add_argument(
'--model_name',
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'tacotron2_csmsc',
'fastspeech2_mix',
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
'wavernn_csmsc',
],
help='Choose model type of tts task.')
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
"--weight_bits",
type=int,
default=8,
choices=[8, 16],
help="The bits for the quantized weight, and it should be 8 or 16. Default is 8.",
)
args, _ = parser.parse_known_args()
return args
# for now, only inference with models trained on csmsc is supported
def main():
args = parse_args()
paddle.enable_static()
quant_post_dynamic(
model_dir=args.inference_dir,
save_model_dir=args.inference_dir,
model_filename=args.model_name + ".pdmodel",
params_filename=args.model_name + ".pdiparams",
save_model_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdmodel",
save_params_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdiparams",
weight_bits=args.weight_bits, )
if __name__ == "__main__":
main()
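
As a sanity check, the dynamically quantized model can be loaded back with the standard Paddle inference API; a minimal sketch follows, assuming an 8-bit run of the script above and the usual exp/default/inference layout. The paths and the toy phone ids are placeholders.

import numpy as np
from paddle import inference

# placeholder paths following the "<model_name>_<weight_bits>bits" naming above
model_file = "exp/default/inference/fastspeech2_csmsc_8bits.pdmodel"
params_file = "exp/default/inference/fastspeech2_csmsc_8bits.pdiparams"

config = inference.Config(model_file, params_file)
config.disable_gpu()  # run the check on CPU
predictor = inference.create_predictor(config)

input_names = predictor.get_input_names()
text_handle = predictor.get_input_handle(input_names[0])
# toy phone-id sequence, illustrative only
text_handle.copy_from_cpu(np.array([3, 7, 12, 5], dtype=np.int64))
predictor.run()

output_names = predictor.get_output_names()
mel = predictor.get_output_handle(output_names[0]).copy_to_cpu()
print(mel.shape)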

@ -0,0 +1,156 @@
import argparse
import random
import jsonlines
import numpy as np
import paddle
from paddleslim.quant import quant_post_static
from paddlespeech.t2s.exps.syn_utils import get_dev_dataloader
from paddlespeech.t2s.utils import str2bool
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Slim Static with acoustic model & vocoder.")
parser.add_argument(
"--batch_size", type=int, default=1, help="Minibatch size.")
parser.add_argument("--batch_num", type=int, default=1, help="Batch number")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
# model_path save_path
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
'--model_name',
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'fastspeech2_mix',
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
],
help='Choose model type of tts task.')
parser.add_argument(
"--algo", type=str, default='avg', help="calibration algorithm.")
parser.add_argument(
"--round_type",
type=str,
default='round',
help="The method of converting the quantized weights.")
parser.add_argument(
"--hist_percent",
type=float,
default=0.9999,
help="The percentile of algo:hist.")
parser.add_argument(
"--is_full_quantize",
type=str2bool,
default=False,
help="Whether is full quantization or not.")
parser.add_argument(
"--bias_correction",
type=str2bool,
default=False,
help="Whether to use bias correction.")
parser.add_argument(
"--ce_test", type=str2bool, default=False, help="Whether to CE test.")
parser.add_argument(
"--onnx_format",
type=str2bool,
default=False,
help="Whether to export the quantized model with format of ONNX.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict",
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument(
"--quantizable_op_type",
type=list,
nargs='+',
default=[
"conv2d_transpose", "conv2d", "depthwise_conv2d", "mul", "matmul",
"matmul_v2"
],
help="The list of op types that will be quantized.")
args = parser.parse_args()
return args
def quantize(args):
shuffle = True
if args.ce_test:
# set seed
seed = 111
np.random.seed(seed)
paddle.seed(seed)
random.seed(seed)
shuffle = False
place = paddle.CUDAPlace(0) if args.ngpu > 0 else paddle.CPUPlace()
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dataloader = get_dev_dataloader(
dev_metadata=dev_metadata,
am=args.model_name,
batch_size=args.batch_size,
speaker_dict=args.speaker_dict,
shuffle=shuffle)
exe = paddle.static.Executor(place)
exe.run()
print("onnx_format:", args.onnx_format)
quant_post_static(
executor=exe,
model_dir=args.inference_dir,
quantize_model_path=args.inference_dir + "/" + args.model_name +
"_quant",
data_loader=dataloader,
model_filename=args.model_name + ".pdmodel",
params_filename=args.model_name + ".pdiparams",
save_model_filename=args.model_name + ".pdmodel",
save_params_filename=args.model_name + ".pdiparams",
batch_size=args.batch_size,
algo=args.algo,
round_type=args.round_type,
hist_percent=args.hist_percent,
is_full_quantize=args.is_full_quantize,
bias_correction=args.bias_correction,
onnx_format=args.onnx_format,
quantizable_op_type=args.quantizable_op_type)
def main():
args = parse_args()
new_quantizable_op_type = []
for item in args.quantizable_op_type:
new_quantizable_op_type.append(''.join(item))
args.quantizable_op_type = new_quantizable_op_type
paddle.enable_static()
quantize(args)
if __name__ == '__main__':
main()
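
A side note on main(): with type=list and nargs='+', argparse converts each --quantizable_op_type token on the command line into a list of characters, which is why every entry is re-joined before use. A minimal standalone demonstration of that behavior:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--quantizable_op_type", type=list, nargs='+', default=["conv2d", "mul"])

args = parser.parse_args(["--quantizable_op_type", "conv2d", "mul"])
print(args.quantizable_op_type)  # [['c', 'o', 'n', 'v', '2', 'd'], ['m', 'u', 'l']]
print([''.join(item) for item in args.quantizable_op_type])  # ['conv2d', 'mul']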

@ -25,10 +25,13 @@ import onnxruntime as ort
import paddle
from paddle import inference
from paddle import jit
from paddle.io import DataLoader
from paddle.static import InputSpec
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@ -118,6 +121,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
return sentences
# am only
def get_test_dataset(test_metadata: List[Dict[str, Any]],
am: str,
speaker_dict: Optional[os.PathLike]=None,
@ -158,6 +162,100 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
return test_dataset
# am and voc, for PTQ_static
def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
am: str,
batch_size: int=1,
speaker_dict: Optional[os.PathLike]=None,
voice_cloning: bool=False,
n_shift: int=300,
batch_max_steps: int=16200,
shuffle: bool=True):
# model: {model_name}_{dataset}
am_name = am[:am.rindex('_')]
am_dataset = am[am.rindex('_') + 1:]
converters = {}
if am_name == 'fastspeech2':
fields = ["utt_id", "text"]
if am_dataset in {"aishell3", "vctk",
"mix"} and speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn_static
fields += ["spk_id"]
elif voice_cloning:
print("voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn_static
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn_static
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
if am_dataset in {"aishell3", "vctk",
"mix"} and speaker_dict is not None:
print("multiple speaker speedyspeech!")
collate_fn = speedyspeech_multi_spk_batch_fn_static
fields += ["spk_id"]
else:
print("single speaker speedyspeech!")
collate_fn = speedyspeech_single_spk_batch_fn_static
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
fields = ["utt_id", "text"]
if voice_cloning:
print("voice cloning!")
collate_fn = tacotron2_multi_spk_batch_fn_static
fields += ["spk_emb"]
else:
print("single speaker tacotron2!")
collate_fn = tacotron2_single_spk_batch_fn_static
else:
print("voc dataloader")
# am
if am_name not in {'pwgan', 'mb_melgan', 'hifigan'}:
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters=converters, )
dev_dataloader = DataLoader(
dev_dataset,
shuffle=shuffle,
drop_last=False,
batch_size=batch_size,
collate_fn=collate_fn)
# vocoder
else:
# pwgan: batch_max_steps: 25500 aux_context_window: 2
# mb_melgan: batch_max_steps: 16200 aux_context_window 0
# hifigan: batch_max_steps: 8400 aux_context_window 0
aux_context_window = 0
if am_name == 'pwgan':
aux_context_window = 2
train_batch_fn = Clip_static(
batch_max_steps=batch_max_steps,
hop_size=n_shift,
aux_context_window=aux_context_window)
dev_dataset = DataTable(
data=dev_metadata,
fields=["wave", "feats"],
converters={
"wave": np.load,
"feats": np.load,
}, )
dev_dataloader = DataLoader(
dev_dataset,
shuffle=shuffle,
drop_last=False,
batch_size=batch_size,
collate_fn=train_batch_fn)
return dev_dataloader
# frontend
def get_frontend(lang: str='zh',
phones_dict: Optional[os.PathLike]=None,
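
For illustration, a minimal sketch of consuming get_dev_dataloader the way PTQ_static.py above does; the metadata path and model name are placeholders, and the jsonl entries are assumed to carry the fields listed in the branches above.

import jsonlines

from paddlespeech.t2s.exps.syn_utils import get_dev_dataloader

# placeholder metadata path, produced by the preprocessing stage of the example
with jsonlines.open("dump/dev/norm/metadata.jsonl", 'r') as reader:
    dev_metadata = list(reader)

dataloader = get_dev_dataloader(
    dev_metadata=dev_metadata, am="fastspeech2_csmsc", batch_size=1)

# each batch maps the static model's feed names to arrays/tensors, which is
# what paddleslim.quant.quant_post_static consumes via its data_loader argument
for batch in dataloader:
    print({k: v.shape for k, v in batch.items()})
    break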
