From caaa5cd50202a37a81133bd015d8c07a91456041 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 12 Jul 2022 08:24:39 +0000 Subject: [PATCH 1/2] more cli for speech demos --- demos/custom_streaming_asr/setup_docker.sh | 0 demos/keyword_spotting/run.sh | 0 demos/speaker_verification/run.sh | 0 demos/speech_recognition/run.sh | 18 +++++++++++++++++- demos/speech_server/asr_client.sh | 0 demos/speech_server/cls_client.sh | 0 demos/speech_server/server.sh | 2 +- demos/speech_server/sid_client.sh | 10 ++++++++++ demos/speech_server/text_client.sh | 4 ++++ demos/speech_server/tts_client.sh | 0 .../{ => local}/punc_server.py | 0 .../{ => local}/streaming_asr_server.py | 0 demos/streaming_asr_server/run.sh | 0 demos/streaming_asr_server/server.sh | 7 +++---- demos/streaming_asr_server/test.sh | 2 +- demos/streaming_tts_server/start_server.sh | 3 --- demos/streaming_tts_server/test_client.sh | 9 --------- demos/text_to_speech/run.sh | 8 +++++++- paddlespeech/server/bin/paddlespeech_client.py | 1 + 19 files changed, 44 insertions(+), 20 deletions(-) mode change 100644 => 100755 demos/custom_streaming_asr/setup_docker.sh mode change 100644 => 100755 demos/keyword_spotting/run.sh mode change 100644 => 100755 demos/speaker_verification/run.sh mode change 100644 => 100755 demos/speech_recognition/run.sh mode change 100644 => 100755 demos/speech_server/asr_client.sh mode change 100644 => 100755 demos/speech_server/cls_client.sh mode change 100644 => 100755 demos/speech_server/server.sh create mode 100755 demos/speech_server/sid_client.sh create mode 100755 demos/speech_server/text_client.sh mode change 100644 => 100755 demos/speech_server/tts_client.sh rename demos/streaming_asr_server/{ => local}/punc_server.py (100%) rename demos/streaming_asr_server/{ => local}/streaming_asr_server.py (100%) mode change 100644 => 100755 demos/streaming_asr_server/run.sh delete mode 100644 demos/streaming_tts_server/start_server.sh delete mode 100644 demos/streaming_tts_server/test_client.sh diff --git a/demos/custom_streaming_asr/setup_docker.sh b/demos/custom_streaming_asr/setup_docker.sh old mode 100644 new mode 100755 diff --git a/demos/keyword_spotting/run.sh b/demos/keyword_spotting/run.sh old mode 100644 new mode 100755 diff --git a/demos/speaker_verification/run.sh b/demos/speaker_verification/run.sh old mode 100644 new mode 100755 diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh old mode 100644 new mode 100755 index 19ce0ebb3..e48ff3e96 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -1,6 +1,7 @@ #!/bin/bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav # asr paddlespeech asr --input ./zh.wav @@ -8,3 +9,18 @@ paddlespeech asr --input ./zh.wav # asr + punc paddlespeech asr --input ./zh.wav | paddlespeech text --task punc + + +# asr help +paddlespeech asr --help + + +# english asr +paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# model stats +paddlespeech stats --task asr + + +# paddlespeech help +paddlespeech --help diff --git a/demos/speech_server/asr_client.sh b/demos/speech_server/asr_client.sh old mode 100644 new mode 100755 diff --git a/demos/speech_server/cls_client.sh b/demos/speech_server/cls_client.sh old mode 100644 new mode 100755 diff --git a/demos/speech_server/server.sh b/demos/speech_server/server.sh old mode 100644 new mode 100755 index e5961286b..fd719ffc1 --- a/demos/speech_server/server.sh +++ b/demos/speech_server/server.sh @@ -1,3 +1,3 @@ #!/bin/bash -paddlespeech_server start --config_file ./conf/application.yaml +paddlespeech_server start --config_file ./conf/application.yaml &> server.log & diff --git a/demos/speech_server/sid_client.sh b/demos/speech_server/sid_client.sh new file mode 100755 index 000000000..99bab21ae --- /dev/null +++ b/demos/speech_server/sid_client.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav + +# sid extract +paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task spk --input ./85236145389.wav + +# sid score +paddlespeech_client vector --server_ip 127.0.0.1 --port 8090 --task score --enroll ./85236145389.wav --test ./123456789.wav diff --git a/demos/speech_server/text_client.sh b/demos/speech_server/text_client.sh new file mode 100755 index 000000000..098f159fb --- /dev/null +++ b/demos/speech_server/text_client.sh @@ -0,0 +1,4 @@ +#!/bin/bash + + +paddlespeech_client text --server_ip 127.0.0.1 --port 8090 --input 今天的天气真好啊你下午有空吗我想约你一起去吃饭 diff --git a/demos/speech_server/tts_client.sh b/demos/speech_server/tts_client.sh old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/punc_server.py b/demos/streaming_asr_server/local/punc_server.py similarity index 100% rename from demos/streaming_asr_server/punc_server.py rename to demos/streaming_asr_server/local/punc_server.py diff --git a/demos/streaming_asr_server/streaming_asr_server.py b/demos/streaming_asr_server/local/streaming_asr_server.py similarity index 100% rename from demos/streaming_asr_server/streaming_asr_server.py rename to demos/streaming_asr_server/local/streaming_asr_server.py diff --git a/demos/streaming_asr_server/run.sh b/demos/streaming_asr_server/run.sh old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/server.sh b/demos/streaming_asr_server/server.sh index f532546e7..961cb046a 100755 --- a/demos/streaming_asr_server/server.sh +++ b/demos/streaming_asr_server/server.sh @@ -1,9 +1,8 @@ -export CUDA_VISIBLE_DEVICE=0,1,2,3 - export CUDA_VISIBLE_DEVICE=0,1,2,3 +#export CUDA_VISIBLE_DEVICE=0,1,2,3 -# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & +# nohup python3 local/punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log & -# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 & +# nohup python3 local/streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 & paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml &> streaming_asr.log & diff --git a/demos/streaming_asr_server/test.sh b/demos/streaming_asr_server/test.sh index 67a5ec4c5..386c7f894 100755 --- a/demos/streaming_asr_server/test.sh +++ b/demos/streaming_asr_server/test.sh @@ -7,5 +7,5 @@ paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wa # read the wav and call streaming and punc service # If `127.0.0.1` is not accessible, you need to use the actual service IP address. -paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav diff --git a/demos/streaming_tts_server/start_server.sh b/demos/streaming_tts_server/start_server.sh deleted file mode 100644 index 9c71f2fe2..000000000 --- a/demos/streaming_tts_server/start_server.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# start server -paddlespeech_server start --config_file ./conf/tts_online_application.yaml \ No newline at end of file diff --git a/demos/streaming_tts_server/test_client.sh b/demos/streaming_tts_server/test_client.sh deleted file mode 100644 index bd88f20b1..000000000 --- a/demos/streaming_tts_server/test_client.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# http client test -# If `127.0.0.1` is not accessible, you need to use the actual service IP address. -paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav - -# websocket client test -# If `127.0.0.1` is not accessible, you need to use the actual service IP address. -# paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh index b1340241b..2b588be55 100755 --- a/demos/text_to_speech/run.sh +++ b/demos/text_to_speech/run.sh @@ -4,4 +4,10 @@ paddlespeech tts --input 今天的天气不错啊 # Batch process -echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts \ No newline at end of file +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + +# Text Frontend +paddlespeech tts --input 今天是2022/10/29,最低温度是-3℃. + + + diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index e8e57fff0..96368c0f3 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -718,6 +718,7 @@ class VectorClientExecutor(BaseExecutor): logger.info(f"the input audio: {input}") handler = VectorHttpHandler(server_ip=server_ip, port=port) res = handler.run(input, audio_format, sample_rate) + logger.info(f"The spk embedding is: {res}") return res elif task == "score": from paddlespeech.server.utils.audio_handler import VectorScoreHttpHandler From 74245cc115122e2f1774d720be3c068ee4b52525 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 12 Jul 2022 09:40:48 +0000 Subject: [PATCH 2/2] add streaming tts scripts --- demos/streaming_tts_server/client.sh | 9 ++ .../conf/tts_online_ws_application.yaml | 103 ++++++++++++++++++ demos/streaming_tts_server/server.sh | 10 ++ 3 files changed, 122 insertions(+) create mode 100755 demos/streaming_tts_server/client.sh create mode 100644 demos/streaming_tts_server/conf/tts_online_ws_application.yaml create mode 100755 demos/streaming_tts_server/server.sh diff --git a/demos/streaming_tts_server/client.sh b/demos/streaming_tts_server/client.sh new file mode 100755 index 000000000..e93da58a8 --- /dev/null +++ b/demos/streaming_tts_server/client.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# http client test +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.http.wav + +# websocket client test +# If `127.0.0.1` is not accessible, you need to use the actual service IP address. +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8192 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.ws.wav diff --git a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml new file mode 100644 index 000000000..146f06f15 --- /dev/null +++ b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml @@ -0,0 +1,103 @@ +# This is the parameter configuration file for streaming tts server. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8192 + +# The task format in the engin_list is: _ +# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online. +# protocol choices = ['websocket', 'http'] +protocol: 'websocket' +engine_list: ['tts_online-onnx'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### TTS ######################################### +################### speech task: tts; engine_type: online ####################### +tts_online: + # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # fastspeech2_cnndecoder_csmsc support streaming am infer. + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] + # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference + voc: 'mb_melgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: 'cpu' # set 'gpu:id' or 'cpu' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio + am_block: 72 + am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal + voc_block: 36 + voc_pad: 14 + + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### TTS ######################################### +################### speech task: tts; engine_type: online-onnx ####################### +tts_online-onnx: + # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer. + am: 'fastspeech2_cnndecoder_csmsc_onnx' + # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model]; + # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model]; + am_ckpt: # list + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + am_sample_rate: 24000 + am_sess_conf: + device: "cpu" # set 'gpu:id' or 'cpu' + use_trt: False + cpu_threads: 4 + + # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx'] + # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference + voc: 'hifigan_csmsc_onnx' + voc_ckpt: + voc_sample_rate: 24000 + voc_sess_conf: + device: "cpu" # set 'gpu:id' or 'cpu' + use_trt: False + cpu_threads: 4 + + # others + lang: 'zh' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio + am_block: 72 + am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc_onnx, voc_pad set 19, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal + voc_block: 36 + voc_pad: 14 + # voc_upsample should be same as n_shift on voc config. + voc_upsample: 300 + diff --git a/demos/streaming_tts_server/server.sh b/demos/streaming_tts_server/server.sh new file mode 100755 index 000000000..d34ddba02 --- /dev/null +++ b/demos/streaming_tts_server/server.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# http server +paddlespeech_server start --config_file ./conf/tts_online_application.yaml &> tts.http.log & + + +# websocket server +paddlespeech_server start --config_file ./conf/tts_online_ws_application.yaml &> tts.ws.log & + +