speedyspeech code adapt for npu (#3804)

* speedyspeech code adapt for npu * fix npu inference * fix e2e synthesize * add paddle version control for memory optim config * fix code style * fix code style * fix help message * fix code style * fix help message
1 year ago · 0b568136d9
parent d615fc33de
commit 0b568136d9
10 changed files with 392 additions and 18 deletions
--- a/examples/csmsc/tts2/local/inference_npu.sh
+++ b/examples/csmsc/tts2/local/inference_npu.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device npu
+fi
--- a/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh
@ -0,0 +1,124 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+
+
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# the pretrained models have not been released yet
+# style melgan
+# style melgan's dygraph-to-static-graph conversion is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+        # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --lang=zh \
+        --text=${BIN_DIR}/../../assets/sentences.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference \
+        --ngpu=0 \
+        --nnpu=1
+fi
--- a/examples/csmsc/tts2/local/synthesize_npu.sh
+++ b/examples/csmsc/tts2/local/synthesize_npu.sh
@ -0,0 +1,110 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=pwgan_csmsc \
+        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/feats_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --ngpu=0 \
+        --nnpu=1
+fi
--- a/examples/csmsc/tts2/local/train_npu.sh
+++ b/examples/csmsc/tts2/local/train_npu.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+# Launch ${BIN_DIR}/train.py on one NPU; usage: train_npu.sh <config_path> <train_output_path>
+
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --ngpu=0 \
+    --nnpu=1 \
+    --phones-dict=dump/phone_id_map.txt \
+    --tones-dict=dump/tone_id_map.txt \
+    --use-relative-path=True
--- a/examples/csmsc/tts2/run_npu.sh
+++ b/examples/csmsc/tts2/run_npu.sh
@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+npus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_npu.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan by default
+    FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model
+    FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1
+fi
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@ -112,7 +112,7 @@ def parse_args():
    parser.add_argument(
        "--device",
        default="gpu",
-        choices=["gpu", "cpu", "xpu"],
+        choices=["gpu", "cpu", "xpu", "npu"],
        help="Device selected for inference.", )
    parser.add_argument('--cpu_threads', type=int, default=1)

--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@ -45,15 +45,18 @@ def train_sp(args, config):
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
-    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
-        if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device("xpu")
-    else:
+    if paddle.is_compiled_with_cuda() and args.ngpu > 0:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()
+    elif paddle.is_compiled_with_xpu() and args.nxpu > 0:
+        paddle.set_device("xpu")
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+        if world_size > 1:
+            paddle.distributed.init_parallel_env()
+    else:
+        paddle.set_device("cpu")

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)
@ -191,9 +194,19 @@ def main():
        "--nxpu",
        type=int,
        default=0,
-        help="if nxpu == 0 and ngpu == 0, use cpu.")
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
+    )
    parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")

    parser.add_argument(
        "--use-relative-path",
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@ -591,7 +591,8 @@ def get_predictor(

    config = inference.Config(
        str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
-    config.enable_memory_optim()
+    if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0":  # NOTE(review): lexicographic str compare ("2.10.0" <= "2.5.2" is True) - confirm intended
+        config.enable_memory_optim()
    config.switch_ir_optim(True)
    if device == "gpu":
        config.enable_use_gpu(100, device_id)
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@ -219,12 +219,21 @@ def parse_args():
    )
    # other
    parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
    parser.add_argument(
        "--nxpu",
        type=int,
        default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
    )
    parser.add_argument("--test_metadata", type=str, help="test metadata.")
    parser.add_argument("--output_dir", type=str, help="output dir.")
@ -245,10 +254,12 @@ def main():
        paddle.set_device("gpu")
    elif args.nxpu > 0:
        paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
        paddle.set_device("cpu")
    else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

    evaluate(args)

--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -299,12 +299,21 @@ def parse_args():
        default=None,
        help="dir to save inference models")
    parser.add_argument(
-        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+        "--ngpu",
+        type=int,
+        default=1,
+        help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.")
    parser.add_argument(
        "--nxpu",
        type=int,
        default=0,
-        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+        help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu."
+    )
+    parser.add_argument(
+        "--nnpu",
+        type=int,
+        default=0,
+        help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu."
    )
    parser.add_argument(
        "--text",
@ -339,10 +348,12 @@ def main():
        paddle.set_device("gpu")
    elif args.nxpu > 0:
        paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:  # all counts zero -> CPU
        paddle.set_device("cpu")
    else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")

    evaluate(args)