speedyspeech code adapt for mlu (#3828)
* speedyspeech code adapt for mlu * fix inference * fix help messagepull/3841/head
parent
4be005858b
commit
a9ece28ba6
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
train_output_path=$1
|
||||
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=speedyspeech_csmsc \
|
||||
--voc=mb_melgan_csmsc \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--device mlu
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
python3 ${BIN_DIR}/../inference.py \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--am=speedyspeech_csmsc \
|
||||
--voc=hifigan_csmsc \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/pd_infer_out \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--device mlu
|
||||
fi
|
@ -0,0 +1,99 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=mb_melgan_csmsc \
|
||||
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
|
||||
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
||||
|
||||
# the pretrained models haven't release now
|
||||
# style melgan
|
||||
# style melgan's Dygraph to Static Graph is not ready now
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=style_melgan_csmsc \
|
||||
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
|
||||
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
# --inference_dir=${train_output_path}/inference
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=hifigan_csmsc \
|
||||
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
|
||||
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
||||
|
||||
# wavernn
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "in wavernn syn_e2e"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize_e2e.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=wavernn_csmsc \
|
||||
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
|
||||
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
|
||||
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
|
||||
--lang=zh \
|
||||
--text=${BIN_DIR}/../../assets/sentences.txt \
|
||||
--output_dir=${train_output_path}/test_e2e \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--inference_dir=${train_output_path}/inference \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
@ -0,0 +1,90 @@
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
ckpt_name=$3
|
||||
stage=0
|
||||
stop_stage=0
|
||||
|
||||
# for more GAN Vocoders
|
||||
# multi band melgan
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=mb_melgan_csmsc \
|
||||
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
|
||||
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
||||
|
||||
# style melgan
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=style_melgan_csmsc \
|
||||
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
|
||||
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
||||
|
||||
# hifigan
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
echo "in hifigan syn"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=hifigan_csmsc \
|
||||
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
|
||||
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
|
||||
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
||||
|
||||
# wavernn
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
echo "in wavernn syn"
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
python3 ${BIN_DIR}/../synthesize.py \
|
||||
--am=speedyspeech_csmsc \
|
||||
--am_config=${config_path} \
|
||||
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
|
||||
--am_stat=dump/train/feats_stats.npy \
|
||||
--voc=wavernn_csmsc \
|
||||
--voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
|
||||
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
|
||||
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
|
||||
--test_metadata=dump/test/norm/metadata.jsonl \
|
||||
--output_dir=${train_output_path}/test \
|
||||
--tones_dict=dump/tone_id_map.txt \
|
||||
--phones_dict=dump/phone_id_map.txt \
|
||||
--ngpu=0 \
|
||||
--nmlu=1
|
||||
fi
|
@ -0,0 +1,16 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
config_path=$1
|
||||
train_output_path=$2
|
||||
# export MLU_VISIBLE_DEVICES=8
|
||||
python ${BIN_DIR}/train.py \
|
||||
--train-metadata=dump/train/norm/metadata.jsonl \
|
||||
--dev-metadata=dump/dev/norm/metadata.jsonl \
|
||||
--config=${config_path} \
|
||||
--output-dir=${train_output_path} \
|
||||
--ngpu=0 \
|
||||
--nmlu=2 \
|
||||
--phones-dict=dump/phone_id_map.txt \
|
||||
--tones-dict=dump/tone_id_map.txt \
|
||||
--use-relative-path=True
|
@ -0,0 +1,76 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
source path.sh
|
||||
export CUSTOM_DEVICE_BLACK_LIST=elementwise_max
|
||||
mlus=0
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
conf_path=conf/default.yaml
|
||||
train_output_path=exp/default
|
||||
ckpt_name=snapshot_iter_30600.pdz
|
||||
|
||||
# with the following command, you can choose the stage range you want to run
|
||||
# such as `./run.sh --stage 0 --stop-stage 0`
|
||||
# this can not be mixed use with `$1`, `$2` ...
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
./local/preprocess.sh ${conf_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
|
||||
FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# synthesize, vocoder is pwgan by default
|
||||
FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# synthesize_e2e, vocoder is pwgan by default
|
||||
FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
||||
# inference with static model
|
||||
FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
|
||||
# we have only tested the following models so far
|
||||
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
# install paddle2onnx
|
||||
pip install paddle2onnx --upgrade
|
||||
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
|
||||
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
|
||||
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
|
||||
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
|
||||
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
|
||||
fi
|
||||
|
||||
# inference with onnxruntime
|
||||
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
|
||||
./local/ort_predict.sh ${train_output_path}
|
||||
fi
|
||||
|
||||
# must run after stage 3 (which stage generated static models)
|
||||
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
|
||||
./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
|
||||
./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
|
||||
# ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
|
||||
# ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
|
||||
fi
|
||||
|
||||
# PTQ_static
|
||||
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
|
||||
fi
|
Loading…
Reference in new issue