From a9ece28ba63bc0841ef5488b3dba46e8d5aa180b Mon Sep 17 00:00:00 2001 From: zhuyipin Date: Thu, 29 Aug 2024 10:56:01 +0800 Subject: [PATCH 1/3] speedyspeech code adapt for mlu (#3828) * speedyspeech code adapt for mlu * fix inference * fix help message --- examples/csmsc/tts2/local/inference_mlu.sh | 33 +++++++ .../csmsc/tts2/local/synthesize_e2e_mlu.sh | 99 +++++++++++++++++++ examples/csmsc/tts2/local/synthesize_mlu.sh | 90 +++++++++++++++++ examples/csmsc/tts2/local/train_mlu.sh | 16 +++ examples/csmsc/tts2/run_mlu.sh | 76 ++++++++++++++ paddlespeech/t2s/exps/inference.py | 2 +- paddlespeech/t2s/exps/speedyspeech/train.py | 12 ++- paddlespeech/t2s/exps/synthesize.py | 21 +++- paddlespeech/t2s/exps/synthesize_e2e.py | 21 +++- 9 files changed, 357 insertions(+), 13 deletions(-) create mode 100755 examples/csmsc/tts2/local/inference_mlu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_mlu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_mlu.sh create mode 100755 examples/csmsc/tts2/local/train_mlu.sh create mode 100755 examples/csmsc/tts2/run_mlu.sh diff --git a/examples/csmsc/tts2/local/inference_mlu.sh b/examples/csmsc/tts2/local/inference_mlu.sh new file mode 100755 index 000000000..d1bade84d --- /dev/null +++ b/examples/csmsc/tts2/local/inference_mlu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + 
--am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh new file mode 100755 index 000000000..7ad2024ff --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + 
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_mlu.sh 
b/examples/csmsc/tts2/local/synthesize_mlu.sh new file mode 100755 index 000000000..6c0b0b650 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_mlu.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# style melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + 
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/train_mlu.sh b/examples/csmsc/tts2/local/train_mlu.sh new file mode 100755 index 000000000..4c1486434 --- /dev/null +++ b/examples/csmsc/tts2/local/train_mlu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 +# export MLU_VISIBLE_DEVICES=8 +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nmlu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_mlu.sh b/examples/csmsc/tts2/run_mlu.sh new file mode 100755 index 000000000..848e54077 --- 
/dev/null +++ b/examples/csmsc/tts2/run_mlu.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +source path.sh +export CUSTOM_DEVICE_BLACK_LIST=elementwise_max +mlus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_30600.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + pip install paddle2onnx --upgrade + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + ./local/paddle2onnx.sh ${train_output_path} 
inference inference_onnx pwgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi + +# must run after stage 3 (which stage generated static models) +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86 + ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86 +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1 +fi diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 21d105ade..e8ddd3bef 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu", "xpu", "npu"], + choices=["gpu", "cpu", "xpu", "npu", "mlu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index b82d68802..b1916fbc4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -55,6 +55,8 @@ def train_sp(args, config): paddle.device.set_device("npu") if world_size > 1: paddle.distributed.init_parallel_env() + elif 
args.nmlu > 0: + paddle.device.set_device("mlu") else: paddle.set_device("cpu") @@ -194,13 +196,19 @@ def main(): "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=1, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." ) parser.add_argument( "--ngpu", diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 9eb459894..b159725e2 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -222,18 +222,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." 
) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") @@ -256,10 +263,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index b9073124b..08a14b315 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -302,18 +302,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." 
) parser.add_argument( "--text", @@ -350,10 +357,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) From d9eb82a6324bdc3ab7bfd9d38ced92ae7e9693c5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 29 Aug 2024 19:35:26 +0800 Subject: [PATCH 2/3] fix unit test (#3835) --- tests/unit/asr/deepspeech2_model_test.py | 10 ++++---- .../unit/asr/deepspeech2_online_model_test.py | 24 +++++++++---------- .../unit/server/offline/test_server_client.sh | 2 ++ tests/unit/tts/test_data_table.py | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py index 5835445d2..fd42192ea 100644 --- a/tests/unit/asr/deepspeech2_model_test.py +++ b/tests/unit/asr/deepspeech2_model_test.py @@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) 
self.assertEqual(loss.numel(), 1) @@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index f23c49263..f7ea87b12 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -19,11 +19,11 @@ import numpy as np import paddle from paddle import inference -from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline +from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -class TestDeepSpeech2ModelOnline(unittest.TestCase): +class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') @@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.text_len = paddle.to_tensor(text_len, dtype='int64') def test_ds2_1(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_2(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_3(self): - model = 
DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_4(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_5(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_6(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_7(self): use_gru = False - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_8(self): use_gru = True - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): export_prefix = "exp/deepspeech2_online/checkpoints/test_export" if not os.path.exists(os.path.dirname(export_prefix)): os.makedirs(os.path.dirname(export_prefix), mode=0o755) - infer_model = DeepSpeech2InferModelOnline( + infer_model = DeepSpeech2InferModel( feat_size=161, dict_size=4233, num_conv_layers=2, diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh index dc52609c5..29bdd4032 100644 --- a/tests/unit/server/offline/test_server_client.sh +++ b/tests/unit/server/offline/test_server_client.sh @@ -1,5 +1,7 @@ #!/bin/bash # bash 
test_server_client.sh +## require lsof to get server pid +## apt-get install -y lsof StartService(){ # Start service diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py index 3ff5bc1af..773942a2e 100644 --- a/tests/unit/tts/test_data_table.py +++ b/tests/unit/tts/test_data_table.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddlespeech.t2s.datasets.data_tabel import DataTable +from paddlespeech.t2s.datasets.data_table import DataTable def test_audio_dataset(): From 7e52aaed74f87b02af6d03098ff9f65e3224f5ce Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 30 Aug 2024 13:09:29 +0800 Subject: [PATCH 3/3] Add tests (#3836) * Add tests * fix * Fix * Fix * disable deepspeech2_online_model_test * disable test_data_table * Fix --- tests/unit/ci.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/unit/ci.sh diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh new file mode 100644 index 000000000..9342a2685 --- /dev/null +++ b/tests/unit/ci.sh @@ -0,0 +1,31 @@ +function main(){ + set -ex + speech_ci_path=`pwd` + + echo "Start asr" + cd ${speech_ci_path}/asr + bash deepspeech2_online_model_test.sh + python error_rate_test.py + python mask_test.py + python reverse_pad_list.py + echo "End asr" + + echo "Start TTS" + cd ${speech_ci_path}/tts + python test_data_table.py + python test_enfrontend.py + python test_mixfrontend.py + echo "End TTS" + + echo "Start Vector" + cd ${speech_ci_path}/vector + python test_augment.py + echo "End Vector" + + echo "Start cli" + cd ${speech_ci_path}/cli + bash test_cli.sh + echo "End cli" +} + +main