diff --git a/demos/asr_hub/README.md b/demos/asr_hub/README.md
new file mode 100644
index 00000000..19e83f9a
--- /dev/null
+++ b/demos/asr_hub/README.md
@@ -0,0 +1,5 @@
+# ASR
+
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh
+```
diff --git a/demos/asr_hub/hub_infer.py b/demos/asr_hub/hub_infer.py
new file mode 100644
index 00000000..b540be1d
--- /dev/null
+++ b/demos/asr_hub/hub_infer.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+import paddlehub as hub
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+parser.add_argument("--wav_en", type=str)
+parser.add_argument("--wav_zh", type=str)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    s2t_en_model = hub.Module(name='u2_conformer_librispeech')
+    s2t_zh_model = hub.Module(name='u2_conformer_aishell')
+
+    args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
+    args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))
+
+    assert os.path.isfile(args.wav_en) and os.path.isfile(
+        args.wav_zh), 'Wav files do not exist.'
+
+    print('[S2T][en]Wav: {}'.format(args.wav_en))
+    text_en = s2t_en_model.speech_recognize(args.wav_en)
+    print('[S2T][en]Text: {}'.format(text_en))
+
+    print('[S2T][zh]Wav: {}'.format(args.wav_zh))
+    text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
+    print('[S2T][zh]Text: {}'.format(text_zh))
diff --git a/demos/asr_hub/run.sh b/demos/asr_hub/run.sh
new file mode 100755
index 00000000..e60f239b
--- /dev/null
+++ b/demos/asr_hub/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+mkdir -p data
+wav_en=data/en.wav
+wav_zh=data/zh.wav
+test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
+test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+if [ ${ngpu} == 0 ];then
+    device=cpu
+else
+    device=gpu
+fi
+
+echo "using ${device}..."
+
+python3 -u hub_infer.py \
+--device ${device} \
+--wav_en ${wav_en} \
+--wav_zh ${wav_zh}
+
+exit 0
diff --git a/demos/echo_hub/README.md b/demos/echo_hub/README.md
index 10e443f2..3248f517 100644
--- a/demos/echo_hub/README.md
+++ b/demos/echo_hub/README.md
@@ -1,3 +1,13 @@
 # echo system
 
 ASR + TTS
+
+Chinese:
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
+```
+
+English:
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
+```
diff --git a/demos/echo_hub/hub_infer.py b/demos/echo_hub/hub_infer.py
new file mode 100644
index 00000000..abeb409d
--- /dev/null
+++ b/demos/echo_hub/hub_infer.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import librosa
+import paddle
+import paddlehub as hub
+import soundfile as sf
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+parser.add_argument("--text", type=str, nargs='+')
+parser.add_argument("--output_dir", type=str)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
+    if args.lang == 'zh':
+        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
+        s2t_model = hub.Module(name='u2_conformer_aishell')
+    else:
+        t2s_model = hub.Module(
+            name='fastspeech2_ljspeech', output_dir=output_dir)
+        s2t_model = hub.Module(name='u2_conformer_librispeech')
+
+    if isinstance(args.text, list):
+        args.text = ' '.join(args.text)
+
+    wavs = t2s_model.generate([args.text], device=args.device)
+    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
+    # resample to 16 kHz, the sample rate the ASR models expect
+    x, sr = librosa.load(wavs[0])
+    y = librosa.resample(x, sr, 16000)  # librosa>=0.10 needs orig_sr=/target_sr= keywords
+    wav_16k = wavs[0].replace('.wav', '_16k.wav')
+    sf.write(wav_16k, y, 16000)
+    print('[S2T]Resample to 16k: {}'.format(wav_16k))
+    text = s2t_model.speech_recognize(wav_16k)
+    print('[S2T]Text recognized from wav file: {}'.format(text))
diff --git a/demos/echo_hub/run.sh b/demos/echo_hub/run.sh
index a52d300f..8348af81 100755
--- a/demos/echo_hub/run.sh
+++ b/demos/echo_hub/run.sh
@@ -1,17 +1,35 @@
 #!/bin/bash
 
-mkdir -p data
+if [ $# != 2 -a $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
+    exit -1
+fi
 
-wav_en=data/en.wav
-wav_zh=data/zh.wav
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+if [ ${ngpu} == 0 ];then
+    device=cpu
+else
+    device=gpu
+fi
 
-test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
-test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
+echo "using ${device}..."
 
-pip install paddlehub
+text=$1
+output_dir=$2
+if [ $# == 3 ];then
+    lang=$3
+else
+    lang=zh
+fi
 
-asr_en_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_librispeech'); print(model.speech_recognize(\"${wav_en}\", device='gpu'))"
-asr_zh_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_aishell'); print(model.speech_recognize(\"${wav_zh}\", device='gpu'))"
+if [ ! -d $output_dir ];then
+    mkdir -p $output_dir
+fi
 
-python -c "${asr_en_cmd}"
-python -c "${asr_zh_cmd}"
\ No newline at end of file
+python3 -u hub_infer.py \
+--lang ${lang} \
+--device ${device} \
+--text "${text}" \
+--output_dir ${output_dir}
+
+exit 0
diff --git a/demos/tts_hub/README.md b/demos/tts_hub/README.md
new file mode 100644
index 00000000..f5fa599a
--- /dev/null
+++ b/demos/tts_hub/README.md
@@ -0,0 +1,11 @@
+# TTS
+
+Chinese:
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
+```
+
+English:
+```shell
+CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
+```
diff --git a/demos/tts_hub/hub_infer.py b/demos/tts_hub/hub_infer.py
new file mode 100644
index 00000000..2430400e
--- /dev/null
+++ b/demos/tts_hub/hub_infer.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+import paddlehub as hub
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
+parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
+parser.add_argument("--text", type=str, nargs='+')
+parser.add_argument("--output_dir", type=str)
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
+    if args.lang == 'zh':
+        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
+    else:
+        t2s_model = hub.Module(
+            name='fastspeech2_ljspeech', output_dir=output_dir)
+
+    if isinstance(args.text, list):
+        args.text = ' '.join(args.text)
+
+    wavs = t2s_model.generate([args.text], device=args.device)
+    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
diff --git a/demos/tts_hub/run.sh b/demos/tts_hub/run.sh
new file mode 100755
index 00000000..8348af81
--- /dev/null
+++ b/demos/tts_hub/run.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+if [ $# != 2 -a $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+if [ ${ngpu} == 0 ];then
+    device=cpu
+else
+    device=gpu
+fi
+
+echo "using ${device}..."
+
+text=$1
+output_dir=$2
+if [ $# == 3 ];then
+    lang=$3
+else
+    lang=zh
+fi
+
+if [ ! -d $output_dir ];then
+    mkdir -p $output_dir
+fi
+
+python3 -u hub_infer.py \
+--lang ${lang} \
+--device ${device} \
+--text "${text}" \
+--output_dir ${output_dir}
+
+exit 0
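
Beyond the `run.sh` wrappers, the same PaddleHub modules can be driven directly from Python. The snippet below is a minimal sketch that only reuses calls already shown in the `hub_infer.py` scripts above (`hub.Module(name=...)`, `speech_recognize`, `generate`); the wav path assumes the file downloaded by `demos/asr_hub/run.sh`, and the `output_dir` value is a hypothetical placeholder.

```python
import paddle
import paddlehub as hub

# Select the compute device the same way the demo scripts do.
paddle.set_device('gpu')  # or 'cpu'

# ASR: transcribe a 16 kHz wav with the Mandarin conformer module.
asr = hub.Module(name='u2_conformer_aishell')
print(asr.speech_recognize('data/zh.wav'))  # wav fetched by demos/asr_hub/run.sh

# TTS: synthesize a Chinese sentence with FastSpeech2 (Baker).
tts = hub.Module(name='fastspeech2_baker', output_dir='./wavs')  # './wavs' is a placeholder
wavs = tts.generate(['用科技让复杂的世界更简单'], device='gpu')
print(wavs[0])  # path of the generated wav file
```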