From a9ece28ba63bc0841ef5488b3dba46e8d5aa180b Mon Sep 17 00:00:00 2001 From: zhuyipin Date: Thu, 29 Aug 2024 10:56:01 +0800 Subject: [PATCH 1/3] speedyspeech code adapt for mlu (#3828) * speedyspeech code adapt for mlu * fix inference * fix help message --- examples/csmsc/tts2/local/inference_mlu.sh | 33 +++++++ .../csmsc/tts2/local/synthesize_e2e_mlu.sh | 99 +++++++++++++++++++ examples/csmsc/tts2/local/synthesize_mlu.sh | 90 +++++++++++++++++ examples/csmsc/tts2/local/train_mlu.sh | 16 +++ examples/csmsc/tts2/run_mlu.sh | 76 ++++++++++++++ paddlespeech/t2s/exps/inference.py | 2 +- paddlespeech/t2s/exps/speedyspeech/train.py | 12 ++- paddlespeech/t2s/exps/synthesize.py | 21 +++- paddlespeech/t2s/exps/synthesize_e2e.py | 21 +++- 9 files changed, 357 insertions(+), 13 deletions(-) create mode 100755 examples/csmsc/tts2/local/inference_mlu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_mlu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_mlu.sh create mode 100755 examples/csmsc/tts2/local/train_mlu.sh create mode 100755 examples/csmsc/tts2/run_mlu.sh diff --git a/examples/csmsc/tts2/local/inference_mlu.sh b/examples/csmsc/tts2/local/inference_mlu.sh new file mode 100755 index 000000000..d1bade84d --- /dev/null +++ b/examples/csmsc/tts2/local/inference_mlu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + 
--am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh new file mode 100755 index 000000000..7ad2024ff --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + 
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_mlu.sh 
b/examples/csmsc/tts2/local/synthesize_mlu.sh new file mode 100755 index 000000000..6c0b0b650 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_mlu.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# style melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + 
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/train_mlu.sh b/examples/csmsc/tts2/local/train_mlu.sh new file mode 100755 index 000000000..4c1486434 --- /dev/null +++ b/examples/csmsc/tts2/local/train_mlu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 +# export MLU_VISIBLE_DEVICES=8 +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nmlu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_mlu.sh b/examples/csmsc/tts2/run_mlu.sh new file mode 100755 index 000000000..848e54077 --- 
/dev/null +++ b/examples/csmsc/tts2/run_mlu.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +source path.sh +export CUSTOM_DEVICE_BLACK_LIST=elementwise_max +mlus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_30600.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + pip install paddle2onnx --upgrade + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + ./local/paddle2onnx.sh ${train_output_path} 
inference inference_onnx pwgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi + +# must run after stage 3 (which stage generated static models) +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86 + ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86 +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1 +fi diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 21d105ade..e8ddd3bef 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu", "xpu", "npu"], + choices=["gpu", "cpu", "xpu", "npu", "mlu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index b82d68802..b1916fbc4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -55,6 +55,8 @@ def train_sp(args, config): paddle.device.set_device("npu") if world_size > 1: paddle.distributed.init_parallel_env() + elif 
args.nmlu > 0: + paddle.device.set_device("mlu") else: paddle.set_device("cpu") @@ -194,13 +196,19 @@ def main(): "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=1, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." ) parser.add_argument( "--ngpu", diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 9eb459894..b159725e2 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -222,18 +222,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." 
) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") @@ -256,10 +263,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index b9073124b..08a14b315 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -302,18 +302,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use mlu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." 
) parser.add_argument( "--text", @@ -350,10 +357,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) From d9eb82a6324bdc3ab7bfd9d38ced92ae7e9693c5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 29 Aug 2024 19:35:26 +0800 Subject: [PATCH 2/3] fix unit test (#3835) --- tests/unit/asr/deepspeech2_model_test.py | 10 ++++---- .../unit/asr/deepspeech2_online_model_test.py | 24 +++++++++---------- .../unit/server/offline/test_server_client.sh | 2 ++ tests/unit/tts/test_data_table.py | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py index 5835445d2..fd42192ea 100644 --- a/tests/unit/asr/deepspeech2_model_test.py +++ b/tests/unit/asr/deepspeech2_model_test.py @@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) 
self.assertEqual(loss.numel(), 1) @@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index f23c49263..f7ea87b12 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -19,11 +19,11 @@ import numpy as np import paddle from paddle import inference -from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline +from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -class TestDeepSpeech2ModelOnline(unittest.TestCase): +class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') @@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.text_len = paddle.to_tensor(text_len, dtype='int64') def test_ds2_1(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_2(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_3(self): - model = 
DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_4(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_5(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_6(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_7(self): use_gru = False - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_8(self): use_gru = True - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): export_prefix = "exp/deepspeech2_online/checkpoints/test_export" if not os.path.exists(os.path.dirname(export_prefix)): os.makedirs(os.path.dirname(export_prefix), mode=0o755) - infer_model = DeepSpeech2InferModelOnline( + infer_model = DeepSpeech2InferModel( feat_size=161, dict_size=4233, num_conv_layers=2, diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh index dc52609c5..29bdd4032 100644 --- a/tests/unit/server/offline/test_server_client.sh +++ b/tests/unit/server/offline/test_server_client.sh @@ -1,5 +1,7 @@ #!/bin/bash # bash 
test_server_client.sh +## require lsof to get server pid +## apt-get install -y lsof StartService(){ # Start service diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py index 3ff5bc1af..773942a2e 100644 --- a/tests/unit/tts/test_data_table.py +++ b/tests/unit/tts/test_data_table.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddlespeech.t2s.datasets.data_tabel import DataTable +from paddlespeech.t2s.datasets.data_table import DataTable def test_audio_dataset(): From 7e52aaed74f87b02af6d03098ff9f65e3224f5ce Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 30 Aug 2024 13:09:29 +0800 Subject: [PATCH 3/3] Add tests (#3836) * Add tests * fix * Fix * Fix * disable deepspeech2_online_model_test * disable test_data_table * Fix --- tests/unit/ci.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/unit/ci.sh diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh new file mode 100644 index 000000000..9342a2685 --- /dev/null +++ b/tests/unit/ci.sh @@ -0,0 +1,31 @@ +function main(){ + set -ex + speech_ci_path=`pwd` + + echo "Start asr" + cd ${speech_ci_path}/asr + bash deepspeech2_online_model_test.sh + python error_rate_test.py + python mask_test.py + python reverse_pad_list.py + echo "End asr" + + echo "Start TTS" + cd ${speech_ci_path}/tts + python test_data_table.py + python test_enfrontend.py + python test_mixfrontend.py + echo "End TTS" + + echo "Start Vector" + cd ${speech_ci_path}/vector + python test_augment.py + echo "End Vector" + + echo "Start cli" + cd ${speech_ci_path}/cli + bash test_cli.sh + echo "End cli" +} + +main