speedyspeech code adapt for mlu (#3828)

* speedyspeech code adapt for mlu

* fix inference

* fix help message
pull/3841/head
zhuyipin 1 month ago committed by GitHub
parent 4be005858b
commit a9ece28ba6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,33 @@
#!/bin/bash
# Run static-model inference for speedyspeech_csmsc on an MLU device.
#
# Usage: <this script> <train_output_path>
#   $1  train_output_path - directory holding `inference/`; results are written
#                           to `<train_output_path>/pd_infer_out`.
# Env:   BIN_DIR is not assigned here; it is read from the environment to
#        locate inference.py and the sentences file.
# The stages to run are selected by editing `stage` / `stop_stage` below.

# Abort on the first failing command. Without this, a failed stage is masked
# by the later `if` blocks: an `if` whose condition is false returns 0, so the
# script would exit 0 and the caller could not detect the failure.
set -e

train_output_path=$1

stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    python3 "${BIN_DIR}/../inference.py" \
        --inference_dir="${train_output_path}/inference" \
        --am=speedyspeech_csmsc \
        --voc=mb_melgan_csmsc \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/pd_infer_out" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device mlu
fi

# hifigan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    python3 "${BIN_DIR}/../inference.py" \
        --inference_dir="${train_output_path}/inference" \
        --am=speedyspeech_csmsc \
        --voc=hifigan_csmsc \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/pd_infer_out" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --device mlu
fi

@ -0,0 +1,99 @@
#!/bin/bash
# End-to-end (text -> wav) synthesis with speedyspeech_csmsc on an MLU device,
# using one of several vocoders depending on the selected stage.
#
# Usage: <this script> <config_path> <train_output_path> <ckpt_name>
#   $1  config_path       - acoustic model config passed as --am_config
#   $2  train_output_path - directory holding `checkpoints/`; output goes to
#                           `<train_output_path>/test_e2e`
#   $3  ckpt_name         - checkpoint file name under `checkpoints/`
# Env:   BIN_DIR is not assigned here; it is read from the environment to
#        locate synthesize_e2e.py and the sentences file.
# The stages to run are selected by editing `stage` / `stop_stage` below.

# Abort on the first failing command. Without this, a failed stage is masked
# by the later `if` blocks: an `if` whose condition is false returns 0, so the
# script would exit 0 and the caller could not detect the failure.
set -e

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # NOTE: every continuation backslash must be preceded by a space, otherwise
    # the next option can be glued onto the previous argument.
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
fi

# the pretrained models have not been released yet
# style melgan
# style melgan's Dygraph to Static Graph is not ready now
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
        # --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --lang=zh \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir="${train_output_path}/inference" \
        --ngpu=0 \
        --nmlu=1
fi

# wavernn
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --lang=zh \
        --text="${BIN_DIR}/../../assets/sentences.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --inference_dir="${train_output_path}/inference" \
        --ngpu=0 \
        --nmlu=1
fi

@ -0,0 +1,90 @@
#!/bin/bash
# Synthesize wavs from the test metadata with speedyspeech_csmsc on an MLU
# device, using one of several vocoders depending on the selected stage.
#
# Usage: <this script> <config_path> <train_output_path> <ckpt_name>
#   $1  config_path       - acoustic model config passed as --am_config
#   $2  train_output_path - directory holding `checkpoints/`; output goes to
#                           `<train_output_path>/test`
#   $3  ckpt_name         - checkpoint file name under `checkpoints/`
# Env:   BIN_DIR is not assigned here; it is read from the environment to
#        locate synthesize.py.
# The stages to run are selected by editing `stage` / `stop_stage` below.

# Abort on the first failing command. Without this, a failed stage is masked
# by the later `if` blocks: an `if` whose condition is false returns 0, so the
# script would exit 0 and the caller could not detect the failure.
set -e

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# for more GAN Vocoders
# multi band melgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # NOTE: every continuation backslash must be preceded by a space, otherwise
    # the next option can be glued onto the previous argument.
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=mb_melgan_csmsc \
        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
fi

# style melgan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=style_melgan_csmsc \
        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
fi

# hifigan
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "in hifigan syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=hifigan_csmsc \
        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --phones_dict=dump/phone_id_map.txt \
        --tones_dict=dump/tone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
fi

# wavernn
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "in wavernn syn"
    FLAGS_allocator_strategy=naive_best_fit \
    python3 "${BIN_DIR}/../synthesize.py" \
        --am=speedyspeech_csmsc \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/feats_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
        --test_metadata=dump/test/norm/metadata.jsonl \
        --output_dir="${train_output_path}/test" \
        --tones_dict=dump/tone_id_map.txt \
        --phones_dict=dump/phone_id_map.txt \
        --ngpu=0 \
        --nmlu=1
fi

@ -0,0 +1,16 @@
#!/bin/bash
# Launch speedyspeech training with the MLU option enabled (--ngpu=0, --nmlu=2).
#
# Usage: <this script> <config_path> <train_output_path>
#   $1  config_path       - training config passed as --config
#   $2  train_output_path - directory passed as --output-dir
# Env:   BIN_DIR is not assigned here; it is read from the environment to
#        locate train.py.
# The script's exit status is that of the training command (its only command).

config_path=$1
train_output_path=$2

# export MLU_VISIBLE_DEVICES=8

# Expansions are quoted so that paths containing whitespace or glob characters
# are passed through as single arguments.
python "${BIN_DIR}/train.py" \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=0 \
    --nmlu=2 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True

@ -0,0 +1,76 @@
#!/bin/bash
# Top-level driver for the speedyspeech recipe on MLU: runs the stages in
# [stage, stop_stage] (preprocess, train, synthesize, synthesize_e2e, static
# inference, paddle2onnx, onnxruntime, Paddle-Lite export/predict, PTQ).

set -e

source path.sh

export CUSTOM_DEVICE_BLACK_LIST=elementwise_max

# Device selection. `mlus` is exported to the MLU stages as FLAGS_selected_mlus;
# `gpus` is exported as CUDA_VISIBLE_DEVICES for stages 8 and 9.
# `gpus` was previously referenced without being defined, which set
# CUDA_VISIBLE_DEVICES to an empty string.
mlus=0
gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_30600.pdz

# With the following command, you can choose the stage range you want to run,
# such as `./run.sh --stage 0 --stop-stage 0`.
# This cannot be mixed with positional arguments (`$1`, `$2`, ...).
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh "${conf_path}" || exit -1
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # synthesize
    FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # synthesize_e2e
    FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    # inference with static model
    FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh "${train_output_path}" || exit -1
fi

# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
    # install paddle2onnx
    pip install paddle2onnx --upgrade
    ./local/paddle2onnx.sh "${train_output_path}" inference inference_onnx speedyspeech_csmsc
    # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
    ./local/paddle2onnx.sh "${train_output_path}" inference inference_onnx pwgan_csmsc
    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi

# inference with onnxruntime
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
    ./local/ort_predict.sh "${train_output_path}"
fi

# must run after stage 3 (the stage that generates the static models)
if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
    ./local/export2lite.sh "${train_output_path}" inference pdlite speedyspeech_csmsc x86
    ./local/export2lite.sh "${train_output_path}" inference pdlite pwgan_csmsc x86
    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
fi

if [ "${stage}" -le 8 ] && [ "${stop_stage}" -ge 8 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh "${train_output_path}" || exit -1
fi

# PTQ_static
if [ "${stage}" -le 9 ] && [ "${stop_stage}" -ge 9 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh "${train_output_path}" speedyspeech_csmsc || exit -1
fi

@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu", "xpu", "npu"],
choices=["gpu", "cpu", "xpu", "npu", "mlu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)

@ -55,6 +55,8 @@ def train_sp(args, config):
paddle.device.set_device("npu")
if world_size > 1:
paddle.distributed.init_parallel_env()
elif args.nmlu > 0:
paddle.device.set_device("mlu")
else:
paddle.set_device("cpu")
@ -194,13 +196,19 @@ def main():
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
# --nmlu > 0 requests the MLU device; the help text names "mlu" (it previously
# said "npu").
# NOTE(review): the default is 1 here, while the other --nmlu options in this
# change default to 0 - confirm this is intended.
parser.add_argument(
    "--nmlu",
    type=int,
    default=1,
    help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--ngpu",

@ -222,18 +222,25 @@ def parse_args():
"--ngpu",
type=int,
default=1,
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument(
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
# --nmlu > 0 requests the MLU device; the help text names "mlu" (it previously
# said "xpu").
parser.add_argument(
    "--nmlu",
    type=int,
    default=0,
    help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument("--test_metadata", type=str, help="test metadata.")
parser.add_argument("--output_dir", type=str, help="output dir.")
@ -256,10 +263,14 @@ def main():
paddle.set_device("xpu")
elif args.nnpu > 0:
paddle.set_device("npu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu")
else:
print("ngpu, nxpu and nnpu should be >= 0")
print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0"
)
evaluate(args)

@ -302,18 +302,25 @@ def parse_args():
"--ngpu",
type=int,
default=1,
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu."
)
parser.add_argument(
"--nxpu",
type=int,
default=0,
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu."
)
parser.add_argument(
"--nnpu",
type=int,
default=0,
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu."
)
# --nmlu > 0 requests the MLU device; the help text names "mlu" (it previously
# said "xpu").
parser.add_argument(
    "--nmlu",
    type=int,
    default=0,
    help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu."
)
parser.add_argument(
"--text",
@ -350,10 +357,14 @@ def main():
paddle.set_device("xpu")
elif args.nnpu > 0:
paddle.set_device("npu")
elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0:
elif args.nmlu > 0:
paddle.set_device("mlu")
elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0:
paddle.set_device("cpu")
else:
print("ngpu, nxpu and nnpu should be >= 0")
print(
"one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0"
)
evaluate(args)

Loading…
Cancel
Save