From 35414ee58d67e109cfb590c20e70b7fb32f64490 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 8 Dec 2021 14:59:14 +0800 Subject: [PATCH] Update asr and audio tagging demo. --- demos/asr_hub/README.md | 5 -- demos/asr_hub/hub_infer.py | 46 ----------------- demos/asr_hub/run.sh | 30 ------------ demos/audio_tagging/README.md | 79 ++++++++++++++++++++++++++++++ demos/audio_tagging/tag.py | 37 ++++++++++++++ demos/echo_hub/.gitignore | 1 - demos/echo_hub/README.md | 13 ----- demos/echo_hub/hub_infer.py | 55 --------------------- demos/echo_hub/run.sh | 42 ---------------- demos/speech_recognition/README.md | 59 ++++++++++++++++++++++ demos/speech_recognition/asr.py | 37 ++++++++++++++ demos/tts_hub/README.md | 11 ----- demos/tts_hub/hub_infer.py | 43 ---------------- demos/tts_hub/run.sh | 42 ---------------- 14 files changed, 212 insertions(+), 288 deletions(-) delete mode 100644 demos/asr_hub/README.md delete mode 100644 demos/asr_hub/hub_infer.py delete mode 100755 demos/asr_hub/run.sh create mode 100644 demos/audio_tagging/README.md create mode 100644 demos/audio_tagging/tag.py delete mode 100644 demos/echo_hub/.gitignore delete mode 100644 demos/echo_hub/README.md delete mode 100644 demos/echo_hub/hub_infer.py delete mode 100755 demos/echo_hub/run.sh create mode 100644 demos/speech_recognition/README.md create mode 100644 demos/speech_recognition/asr.py delete mode 100644 demos/tts_hub/README.md delete mode 100644 demos/tts_hub/hub_infer.py delete mode 100755 demos/tts_hub/run.sh diff --git a/demos/asr_hub/README.md b/demos/asr_hub/README.md deleted file mode 100644 index 19e83f9a1..000000000 --- a/demos/asr_hub/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# ASR - -```shell -CUDA_VISIBLE_DEVICES=0 ./run.sh -``` diff --git a/demos/asr_hub/hub_infer.py b/demos/asr_hub/hub_infer.py deleted file mode 100644 index b540be1d5..000000000 --- a/demos/asr_hub/hub_infer.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os - -import paddle -import paddlehub as hub - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) -parser.add_argument("--wav_en", type=str) -parser.add_argument("--wav_zh", type=str) -args = parser.parse_args() -# yapf: enable - -if __name__ == '__main__': - paddle.set_device(args.device) - - s2t_en_model = hub.Module(name='u2_conformer_librispeech') - s2t_zh_model = hub.Module(name='u2_conformer_aishell') - - args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en)) - args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh)) - - assert os.path.isfile(args.wav_en) and os.path.isfile( - args.wav_zh), 'Wav files not exist.' - - print('[S2T][en]Wav: {}'.format(args.wav_en)) - text_en = s2t_en_model.speech_recognize(args.wav_en) - print('[S2T][en]Text: {}'.format(text_en)) - - print('[S2T][zh]Wav: {}'.format(args.wav_zh)) - text_zh = s2t_zh_model.speech_recognize(args.wav_zh) - print('[S2T][zh]Text: {}'.format(text_zh)) diff --git a/demos/asr_hub/run.sh b/demos/asr_hub/run.sh deleted file mode 100755 index 040fc4939..000000000 --- a/demos/asr_hub/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -if python -c "import paddlehub" &> /dev/null; then - echo 'PaddleHub has already been installed.' -else - echo 'Installing PaddleHub...' 
- pip install paddlehub -U -fi - -mkdir -p data -wav_en=data/en.wav -wav_zh=data/zh.wav -test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data -test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -if [ ${ngpu} == 0 ];then - device=cpu -else - device=gpu -fi - -echo "using ${device}..." - -python3 -u hub_infer.py \ ---device ${device} \ ---wav_en ${wav_en} \ ---wav_zh ${wav_zh} - -exit 0 diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md new file mode 100644 index 000000000..eba3619e4 --- /dev/null +++ b/demos/audio_tagging/README.md @@ -0,0 +1,79 @@ +# Audio Tagging + +## Introduction +Audio tagging is the task of labelling an audio clip with one or more labels or tags, including music tagging, acoustic scene classification, audio event classification, etc. + +This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done by a single command line or a few lines in python using `PaddleSpeech`. + +## Usage +### 1. Installation +```sh +pip install paddlespeech +``` + +### 2. Prepare Input File +Input of this demo should be a WAV file(`.wav`). + +Here are sample files for this demo that can be downloaded: +```sh +!wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +``` + +### 3. Usage +- Command Line(Recommended) + ```sh + paddlespeech cls --input ~/cat.wav --topk 10 + ``` + Command usage: + - `input`(required): Audio file to tag. + - `model`: Model type of tagging task. Default: `panns_cnn10`. + - `config`: Config of tagging task. Use pretrained model when it is None. Default: `None`. + - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. + - `label_file`: Label file of tagging task. Use audioset labels when it is None. Default: `None`.
+ - `topk`: Show topk tagging labels of result. Default: `1`. + - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. + + Output: + ```sh + [2021-12-08 14:49:40,671] [ INFO] [utils.py] [L225] - CLS Result: + Cat: 0.8991316556930542 + Domestic animals, pets: 0.8806838393211365 + Meow: 0.8784668445587158 + Animal: 0.8776564598083496 + Caterwaul: 0.2232048511505127 + Speech: 0.03101264126598835 + Music: 0.02870696596801281 + Inside, small room: 0.016673989593982697 + Purr: 0.008387474343180656 + Bird: 0.006304860580712557 + ``` + +- Python API + ```sh + python tag.py --input ~/cat.wav + ``` + Output: + ```sh + CLS Result: + Cat: 0.8991316556930542 + Domestic animals, pets: 0.8806838393211365 + Meow: 0.8784668445587158 + Animal: 0.8776564598083496 + Caterwaul: 0.2232048511505127 + Speech: 0.03101264126598835 + Music: 0.02870696596801281 + Inside, small room: 0.016673989593982697 + Purr: 0.008387474343180656 + Bird: 0.006304860580712557 + ``` + + +### 4. Pretrained Models + +Here is a list of pretrained models released by PaddleSpeech that can be used by command line and Python API: + +| Model | Sample Rate +| :--- | :---: +| panns_cnn6| 32000 +| panns_cnn10| 32000 +| panns_cnn14| 32000 diff --git a/demos/audio_tagging/tag.py b/demos/audio_tagging/tag.py new file mode 100644 index 000000000..cda3c5ad4 --- /dev/null +++ b/demos/audio_tagging/tag.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import paddle + +from paddlespeech.cli import CLSExecutor + +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument( + '--input', type=str, required=True, help='Audio file to recognize.') +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + cls_executor = CLSExecutor() + result = cls_executor( + model_type='panns_cnn10', + cfg_path=None, # Set `cfg_path` and `ckpt_path` to None to use pretrained model. + label_file=None, + ckpt_path=None, + audio_file=args.input, + topk=10, + device=paddle.get_device(), ) + print('CLS Result: \n{}'.format(result)) diff --git a/demos/echo_hub/.gitignore b/demos/echo_hub/.gitignore deleted file mode 100644 index 1269488f7..000000000 --- a/demos/echo_hub/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data diff --git a/demos/echo_hub/README.md b/demos/echo_hub/README.md deleted file mode 100644 index 3248f5179..000000000 --- a/demos/echo_hub/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# echo system - -ASR + TTS - -中文: -```shell -CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh -``` - -英文: -```shell -CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en -``` diff --git a/demos/echo_hub/hub_infer.py b/demos/echo_hub/hub_infer.py deleted file mode 100644 index abeb409dd..000000000 --- a/demos/echo_hub/hub_infer.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os - -import librosa -import paddle -import paddlehub as hub -import soundfile as sf - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en']) -parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) -parser.add_argument("--text", type=str, nargs='+') -parser.add_argument("--output_dir", type=str) -args = parser.parse_args() -# yapf: enable - -if __name__ == '__main__': - paddle.set_device(args.device) - - output_dir = os.path.abspath(os.path.expanduser(args.output_dir)) - if args.lang == 'zh': - t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir) - s2t_model = hub.Module(name='u2_conformer_aishell') - else: - t2s_model = hub.Module( - name='fastspeech2_ljspeech', output_dir=output_dir) - s2t_model = hub.Module(name='u2_conformer_librispeech') - - if isinstance(args.text, list): - args.text = ' '.join(args.text) - - wavs = t2s_model.generate([args.text], device=args.device) - print('[T2S]Wav file has been generated: {}'.format(wavs[0])) - # convert sr to 16k - x, sr = librosa.load(wavs[0]) - y = librosa.resample(x, sr, 16000) - wav_16k = wavs[0].replace('.wav', '_16k.wav') - sf.write(wav_16k, y, 16000) - print('[S2T]Resample to 16k: {}'.format(wav_16k)) - text = s2t_model.speech_recognize(wav_16k) - print('[S2T]Text recognized from wav file: {}'.format(text)) diff --git a/demos/echo_hub/run.sh b/demos/echo_hub/run.sh deleted file mode 100755 index f3e87f2e0..000000000 --- a/demos/echo_hub/run.sh +++ /dev/null 
@@ -1,42 +0,0 @@ -#!/bin/bash - -if python -c "import paddlehub" &> /dev/null; then - echo 'PaddleHub has already been installed.' -else - echo 'Installing PaddleHub...' - pip install paddlehub -U -fi - -if [ $# != 2 -a $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -if [ ${ngpu} == 0 ];then - device=cpu -else - device=gpu -fi - -echo "using ${device}..." - -text=$1 -output_dir=$2 -if [ $# == 3 ];then - lang=$3 -else - lang=zh -fi - -if [ ! -d $output_dir ];then - mkdir -p $output_dir -fi - -python3 -u hub_infer.py \ ---lang ${lang} \ ---device ${device} \ ---text \"${text}\" \ ---output_dir ${output_dir} - -exit 0 diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md new file mode 100644 index 000000000..86bde037a --- /dev/null +++ b/demos/speech_recognition/README.md @@ -0,0 +1,59 @@ +# ASR(Automatic Speech Recognition) + +## Introduction +ASR, or Automatic Speech Recognition, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text). + +This demo is an implementation to recognize text from a specific audio file. It can be done by a single command line or a few lines in python using `PaddleSpeech`. + +## Usage +### 1. Installation +```sh +pip install paddlespeech +``` + +### 2. Prepare Input File +Input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model's. + +Here are sample files for this demo that can be downloaded: +```sh +!wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +``` + +### 3. Usage +- Command Line(Recommended) + ```sh + paddlespeech asr --input ~/zh.wav + ``` + Command usage: + - `input`(required): Audio file to recognize. + - `model`: Model type of asr task. Default: `conformer_wenetspeech`. + - `lang`: Model language. Default: `zh`.
+ - `sr`: Sample rate of the model. Default: `16000`. + - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. + - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. + - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. + + Output: + ```sh + [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + ``` + +- Python API + ```sh + python asr.py --input ~/zh.wav + ``` + Output: + ```sh + ASR Result: + 我认为跑步最重要的就是给我带来了身体健康 + ``` + + +### 4. Pretrained Models + +Here is a list of pretrained models released by PaddleSpeech that can be used by command line and Python API: + +| Model | Language | Sample Rate +| :--- | :---: | :---: | +| conformer_wenetspeech| zh| 16000 +| transformer_aishell| zh| 16000 diff --git a/demos/speech_recognition/asr.py b/demos/speech_recognition/asr.py new file mode 100644 index 000000000..3ac8b91df --- /dev/null +++ b/demos/speech_recognition/asr.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import argparse + +import paddle + +from paddlespeech.cli import ASRExecutor + +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument( + '--input', type=str, required=True, help='Audio file to recognize.') +args = parser.parse_args() +# yapf: enable + +if __name__ == '__main__': + asr_executor = ASRExecutor() + text = asr_executor( + model='conformer_wenetspeech', + lang='zh', + sample_rate=16000, + config=None, # Set `config` and `ckpt_path` to None to use pretrained model. + ckpt_path=None, + audio_file=args.input, + device=paddle.get_device(), ) + print('ASR Result: \n{}'.format(text)) diff --git a/demos/tts_hub/README.md b/demos/tts_hub/README.md deleted file mode 100644 index f5fa599a0..000000000 --- a/demos/tts_hub/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# TTS - -中文: -```shell -CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh -``` - -英文: -```shell -CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en -``` diff --git a/demos/tts_hub/hub_infer.py b/demos/tts_hub/hub_infer.py deleted file mode 100644 index 2430400ed..000000000 --- a/demos/tts_hub/hub_infer.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-import argparse -import os - -import paddle -import paddlehub as hub - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en']) -parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu']) -parser.add_argument("--text", type=str, nargs='+') -parser.add_argument("--output_dir", type=str) -args = parser.parse_args() -# yapf: enable - -if __name__ == '__main__': - paddle.set_device(args.device) - - output_dir = os.path.abspath(os.path.expanduser(args.output_dir)) - if args.lang == 'zh': - t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir) - else: - t2s_model = hub.Module( - name='fastspeech2_ljspeech', output_dir=output_dir) - - if isinstance(args.text, list): - args.text = ' '.join(args.text) - - wavs = t2s_model.generate([args.text], device=args.device) - print('[T2S]Wav file has been generated: {}'.format(wavs[0])) diff --git a/demos/tts_hub/run.sh b/demos/tts_hub/run.sh deleted file mode 100755 index f3e87f2e0..000000000 --- a/demos/tts_hub/run.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -if python -c "import paddlehub" &> /dev/null; then - echo 'PaddleHub has already been installed.' -else - echo 'Installing PaddleHub...' - pip install paddlehub -U -fi - -if [ $# != 2 -a $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -if [ ${ngpu} == 0 ];then - device=cpu -else - device=gpu -fi - -echo "using ${device}..." - -text=$1 -output_dir=$2 -if [ $# == 3 ];then - lang=$3 -else - lang=zh -fi - -if [ ! -d $output_dir ];then - mkdir -p $output_dir -fi - -python3 -u hub_infer.py \ ---lang ${lang} \ ---device ${device} \ ---text \"${text}\" \ ---output_dir ${output_dir} - -exit 0