[TTS] Add slim for TTS (#2729)

pull/2736/head
TianYuan 3 years ago committed by GitHub
parent 6f927d55db
commit 3f6afc4834

@@ -0,0 +1 @@
../../tts3/local/PTQ_static.sh

@@ -72,3 +72,8 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi

@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
weight_bits=$3
python3 ${BIN_DIR}/../PTQ_dynamic.py \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--weight_bits ${weight_bits}

@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
python3 ${BIN_DIR}/../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True

@@ -76,3 +76,16 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi
# PTQ_dynamic
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
./local/PTQ_dynamic.sh ${train_output_path} fastspeech2_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} pwgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} mb_melgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} hifigan_csmsc 8
fi
# PTQ_static
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi

@@ -122,3 +122,8 @@ fi
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
fi
# PTQ_static
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi

@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
python3 ${BIN_DIR}/../../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True

@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi

@@ -0,0 +1 @@
../../voc1/local/PTQ_static.sh

@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} mb_melgan_csmsc || exit -1
fi

@@ -0,0 +1 @@
../../voc1/local/PTQ_static.sh

@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_csmsc || exit -1
fi

@@ -538,3 +538,70 @@ def vits_multi_spk_batch_fn(examples):
        spk_id = paddle.to_tensor(spk_id)
        batch["spk_id"] = spk_id
    return batch


# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    text = np.array(text)
    # do not need batch axis in infer
    text = text[0]
    batch = {
        "text": text,
    }
    return batch


def fastspeech2_multi_spk_batch_fn_static(examples):
    text = [np.array(item["text"], dtype=np.int64) for item in examples]
    text = np.array(text)
    text = text[0]
    batch = {
        "text": text,
    }
    if "spk_id" in examples[0]:
        spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
        spk_id = np.array(spk_id)
        spk_id = spk_id[0]
        batch["spk_id"] = spk_id
    if "spk_emb" in examples[0]:
        spk_emb = [
            np.array(item["spk_emb"], dtype=np.float32) for item in examples
        ]
        spk_emb = np.array(spk_emb)
        # take the first example's embedding (no batch axis in static inference)
        spk_emb = spk_emb[0]
        batch["spk_emb"] = spk_emb
    return batch


def speedyspeech_single_spk_batch_fn_static(examples):
    phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    phones = np.array(phones)
    tones = np.array(tones)
    phones = phones[0]
    tones = tones[0]
    batch = {
        "phones": phones,
        "tones": tones,
    }
    return batch


def speedyspeech_multi_spk_batch_fn_static(examples):
    phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
    tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
    phones = np.array(phones)
    tones = np.array(tones)
    phones = phones[0]
    tones = tones[0]
    batch = {
        "phones": phones,
        "tones": tones,
    }
    if "spk_id" in examples[0]:
        spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
        spk_id = np.array(spk_id)
        spk_id = spk_id[0]
        batch["spk_id"] = spk_id
    return batch
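
These *_static collate functions drop the batch axis because the exported static-graph models consume one utterance per calibration feed. A minimal sketch of the resulting batch dict, assuming a toy example (the utt_id and phone ids below are made up):

import numpy as np

# hypothetical single example, shaped like what the FastSpeech2 DataTable yields
examples = [{"utt_id": "009901", "text": np.array([2, 15, 37, 8], dtype=np.int64)}]
batch = fastspeech2_single_spk_batch_fn_static(examples)
print(batch["text"].shape)  # (4,) -- 1-D int64 phone ids, no leading batch axis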

@@ -55,13 +55,12 @@ class Clip(object):
        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
        Returns:
            Tensor:
                Target signal batch (B, 1, T).
            Tensor:
                Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
            Tensor:
                Target signal batch (B, 1, T).
        """
        # check length
        batch = [
@@ -106,11 +105,7 @@ class Clip(object):
        if len(x) < c.shape[0] * self.hop_size:
            x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
        elif len(x) > c.shape[0] * self.hop_size:
            # print(
            #     f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
            # )
            x = x[:c.shape[0] * self.hop_size]
        # check the length is valid
        assert len(x) == c.shape[
            0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
@@ -218,3 +213,47 @@ class WaveRNNClip(Clip):
        y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
        return x, y, mels


# for paddleslim
class Clip_static(Clip):
    """Collate functor for training vocoders.
    """

    def __call__(self, batch):
        """Convert into batch tensors.
        Args:
            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
        Returns:
            Dict[str, np.array]:
                Auxiliary feature batch (B, C, T'), where
                T = (T' - 2 * aux_context_window) * hop_size.
        """
        # check length
        batch = [
            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
        xs, cs = [b[0] for b in batch], [b[1] for b in batch]

        # make batch with random cut
        c_lengths = [c.shape[0] for c in cs]
        start_frames = np.array([
            np.random.randint(self.start_offset, cl + self.end_offset)
            for cl in c_lengths
        ])
        c_starts = start_frames - self.aux_context_window
        c_ends = start_frames + self.batch_max_frames + self.aux_context_window
        c_batch = np.stack(
            [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
        # infer axis (T', C) is different with train axis (B, C, T')
        # c_batch = c_batch.transpose([0, 2, 1])  # (B, C, T')
        # do not need batch axis in infer
        c_batch = c_batch[0]
        batch = {"logmel": c_batch}
        return batch
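
Clip_static reuses Clip's length check and random crop but returns a single log-mel window, without the batch axis or the (B, C, T') transpose used in training. A rough usage sketch, assuming the constructor keywords shown later in get_dev_dataloader; the array sizes are made up:

import numpy as np

feats = np.random.randn(200, 80).astype(np.float32)   # (T', C): 200 mel frames
wave = np.random.randn(200 * 300).astype(np.float32)  # (T,): matching hop_size=300
clip = Clip_static(batch_max_steps=16200, hop_size=300, aux_context_window=0)
batch = clip([{"wave": wave, "feats": feats}])
print(batch["logmel"].shape)  # (54, 80) == (batch_max_steps // hop_size, C)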

@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import paddle
from paddleslim.quant import quant_post_dynamic


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Slim Dynamic with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--model_name',
        type=str,
        default='fastspeech2_csmsc',
        choices=[
            'speedyspeech_csmsc',
            'fastspeech2_csmsc',
            'fastspeech2_aishell3',
            'fastspeech2_ljspeech',
            'fastspeech2_vctk',
            'tacotron2_csmsc',
            'fastspeech2_mix',
            'pwgan_csmsc',
            'pwgan_aishell3',
            'pwgan_ljspeech',
            'pwgan_vctk',
            'mb_melgan_csmsc',
            'hifigan_csmsc',
            'hifigan_aishell3',
            'hifigan_ljspeech',
            'hifigan_vctk',
            'wavernn_csmsc',
        ],
        help='Choose model type of tts task.')
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--weight_bits",
        type=int,
        default=8,
        choices=[8, 16],
        help="The bits for the quantized weight, and it should be 8 or 16. Default is 8.",
    )
    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()
    paddle.enable_static()
    quant_post_dynamic(
        model_dir=args.inference_dir,
        save_model_dir=args.inference_dir,
        model_filename=args.model_name + ".pdmodel",
        params_filename=args.model_name + ".pdiparams",
        save_model_filename=args.model_name + "_" + str(args.weight_bits) +
        "bits.pdmodel",
        save_params_filename=args.model_name + "_" + str(args.weight_bits) +
        "bits.pdiparams",
        weight_bits=args.weight_bits, )


if __name__ == "__main__":
    main()
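
The dynamically quantized model is written next to the original with a "_{weight_bits}bits" suffix, e.g. fastspeech2_csmsc_8bits.pdmodel / .pdiparams. A hedged sketch of loading such a model with the Paddle Inference Python API as a sanity check; the directory and phone ids are illustrative, and the single input of the exported FastSpeech2 model is assumed to be the phone-id tensor:

import numpy as np
from paddle import inference

model_dir = "exp/default/inference"  # hypothetical inference_dir
config = inference.Config(model_dir + "/fastspeech2_csmsc_8bits.pdmodel",
                          model_dir + "/fastspeech2_csmsc_8bits.pdiparams")
predictor = inference.create_predictor(config)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(np.array([2, 15, 37, 8], dtype=np.int64))  # made-up phone ids
predictor.run()
mel = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()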

@@ -0,0 +1,156 @@
import argparse
import random

import jsonlines
import numpy as np
import paddle
from paddleslim.quant import quant_post_static

from paddlespeech.t2s.exps.syn_utils import get_dev_dataloader
from paddlespeech.t2s.utils import str2bool


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Slim Static with acoustic model & vocoder.")
    parser.add_argument(
        "--batch_size", type=int, default=1, help="Minibatch size.")
    parser.add_argument("--batch_num", type=int, default=1, help="Batch number")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
    # model_path save_path
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument(
        '--model_name',
        type=str,
        default='fastspeech2_csmsc',
        choices=[
            'speedyspeech_csmsc',
            'fastspeech2_csmsc',
            'fastspeech2_aishell3',
            'fastspeech2_ljspeech',
            'fastspeech2_vctk',
            'fastspeech2_mix',
            'pwgan_csmsc',
            'pwgan_aishell3',
            'pwgan_ljspeech',
            'pwgan_vctk',
            'mb_melgan_csmsc',
            'hifigan_csmsc',
            'hifigan_aishell3',
            'hifigan_ljspeech',
            'hifigan_vctk',
        ],
        help='Choose model type of tts task.')
    parser.add_argument(
        "--algo", type=str, default='avg', help="calibration algorithm.")
    parser.add_argument(
        "--round_type",
        type=str,
        default='round',
        help="The method of converting the quantized weights.")
    parser.add_argument(
        "--hist_percent",
        type=float,
        default=0.9999,
        help="The percentile of algo:hist.")
    parser.add_argument(
        "--is_full_quantize",
        type=str2bool,
        default=False,
        help="Whether is full quantization or not.")
    parser.add_argument(
        "--bias_correction",
        type=str2bool,
        default=False,
        help="Whether to use bias correction.")
    parser.add_argument(
        "--ce_test", type=str2bool, default=False, help="Whether to CE test.")
    parser.add_argument(
        "--onnx_format",
        type=str2bool,
        default=False,
        help="Whether to export the quantized model with format of ONNX.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict",
        type=str,
        default=None,
        help="speaker id map file for multiple speaker model.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument(
        "--quantizable_op_type",
        type=list,
        nargs='+',
        default=[
            "conv2d_transpose", "conv2d", "depthwise_conv2d", "mul", "matmul",
            "matmul_v2"
        ],
        help="The list of op types that will be quantized.")
    args = parser.parse_args()
    return args


def quantize(args):
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        np.random.seed(seed)
        paddle.seed(seed)
        random.seed(seed)
        shuffle = False

    place = paddle.CUDAPlace(0) if args.ngpu > 0 else paddle.CPUPlace()

    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)

    dataloader = get_dev_dataloader(
        dev_metadata=dev_metadata,
        am=args.model_name,
        batch_size=args.batch_size,
        speaker_dict=args.speaker_dict,
        shuffle=shuffle)

    exe = paddle.static.Executor(place)
    exe.run()
    print("onnx_format:", args.onnx_format)

    quant_post_static(
        executor=exe,
        model_dir=args.inference_dir,
        quantize_model_path=args.inference_dir + "/" + args.model_name +
        "_quant",
        data_loader=dataloader,
        model_filename=args.model_name + ".pdmodel",
        params_filename=args.model_name + ".pdiparams",
        save_model_filename=args.model_name + ".pdmodel",
        save_params_filename=args.model_name + ".pdiparams",
        batch_size=args.batch_size,
        algo=args.algo,
        round_type=args.round_type,
        hist_percent=args.hist_percent,
        is_full_quantize=args.is_full_quantize,
        bias_correction=args.bias_correction,
        onnx_format=args.onnx_format,
        quantizable_op_type=args.quantizable_op_type)


def main():
    args = parse_args()
    new_quantizable_op_type = []
    for item in args.quantizable_op_type:
        new_quantizable_op_type.append(''.join(item))
    args.quantizable_op_type = new_quantizable_op_type
    paddle.enable_static()
    quantize(args)


if __name__ == '__main__':
    main()
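
The re-joining loop in main() works around an argparse quirk: with type=list, argparse applies list() to every token passed on the command line and splits it into characters, while the default value (a list of plain strings) is unaffected, since ''.join of a string returns the string itself. A small self-contained demonstration of the quirk (not part of this file):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--quantizable_op_type", type=list, nargs='+', default=["conv2d", "mul"])
args = parser.parse_args(["--quantizable_op_type", "conv2d", "mul"])
print(args.quantizable_op_type)  # [['c', 'o', 'n', 'v', '2', 'd'], ['m', 'u', 'l']]
print([''.join(item) for item in args.quantizable_op_type])  # ['conv2d', 'mul']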

@@ -25,10 +25,13 @@ import onnxruntime as ort
import paddle
from paddle import inference
from paddle import jit
from paddle.io import DataLoader
from paddle.static import InputSpec
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -118,6 +121,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
    return sentences


# am only
def get_test_dataset(test_metadata: List[Dict[str, Any]],
                     am: str,
                     speaker_dict: Optional[os.PathLike]=None,
@@ -158,6 +162,100 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
    return test_dataset


# am and voc, for PTQ_static
def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
                       am: str,
                       batch_size: int=1,
                       speaker_dict: Optional[os.PathLike]=None,
                       voice_cloning: bool=False,
                       n_shift: int=300,
                       batch_max_steps: int=16200,
                       shuffle: bool=True):
    # model: {model_name}_{dataset}
    am_name = am[:am.rindex('_')]
    am_dataset = am[am.rindex('_') + 1:]
    converters = {}
    if am_name == 'fastspeech2':
        fields = ["utt_id", "text"]
        if am_dataset in {"aishell3", "vctk",
                          "mix"} and speaker_dict is not None:
            print("multiple speaker fastspeech2!")
            collate_fn = fastspeech2_multi_spk_batch_fn_static
            fields += ["spk_id"]
        elif voice_cloning:
            print("voice cloning!")
            collate_fn = fastspeech2_multi_spk_batch_fn_static
            fields += ["spk_emb"]
        else:
            print("single speaker fastspeech2!")
            collate_fn = fastspeech2_single_spk_batch_fn_static
    elif am_name == 'speedyspeech':
        fields = ["utt_id", "phones", "tones"]
        if am_dataset in {"aishell3", "vctk",
                          "mix"} and speaker_dict is not None:
            print("multiple speaker speedyspeech!")
            collate_fn = speedyspeech_multi_spk_batch_fn_static
            fields += ["spk_id"]
        else:
            print("single speaker speedyspeech!")
            collate_fn = speedyspeech_single_spk_batch_fn_static
            fields = ["utt_id", "phones", "tones"]
    elif am_name == 'tacotron2':
        fields = ["utt_id", "text"]
        if voice_cloning:
            print("voice cloning!")
            collate_fn = tacotron2_multi_spk_batch_fn_static
            fields += ["spk_emb"]
        else:
            print("single speaker tacotron2!")
            collate_fn = tacotron2_single_spk_batch_fn_static
    else:
        print("voc dataloader")

    # am
    if am_name not in {'pwgan', 'mb_melgan', 'hifigan'}:
        dev_dataset = DataTable(
            data=dev_metadata,
            fields=fields,
            converters=converters, )
        dev_dataloader = DataLoader(
            dev_dataset,
            shuffle=shuffle,
            drop_last=False,
            batch_size=batch_size,
            collate_fn=collate_fn)
    # vocoder
    else:
        # pwgan: batch_max_steps: 25500 aux_context_window: 2
        # mb_melgan: batch_max_steps: 16200 aux_context_window 0
        # hifigan: batch_max_steps: 8400 aux_context_window 0
        aux_context_window = 0
        if am_name == 'pwgan':
            aux_context_window = 2
        train_batch_fn = Clip_static(
            batch_max_steps=batch_max_steps,
            hop_size=n_shift,
            aux_context_window=aux_context_window)
        dev_dataset = DataTable(
            data=dev_metadata,
            fields=["wave", "feats"],
            converters={
                "wave": np.load,
                "feats": np.load,
            }, )
        dev_dataloader = DataLoader(
            dev_dataset,
            shuffle=shuffle,
            drop_last=False,
            batch_size=batch_size,
            collate_fn=train_batch_fn)
    return dev_dataloader
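
The am argument follows the "{model_name}_{dataset}" naming convention used throughout the exps and is split on the last underscore, so multi-word model names such as mb_melgan stay intact. A tiny illustration of the split:

for am in ["fastspeech2_csmsc", "mb_melgan_csmsc", "pwgan_aishell3"]:
    am_name = am[:am.rindex('_')]
    am_dataset = am[am.rindex('_') + 1:]
    print(am_name, am_dataset)
# fastspeech2 csmsc
# mb_melgan csmsc
# pwgan aishell3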


# frontend
def get_frontend(lang: str='zh',
                 phones_dict: Optional[os.PathLike]=None,
