Merge pull request #1086 from KPatr1ck/demo

[Demo]Update asr and audio tagging demo.
pull/1089/head
Hui Zhang 4 years ago committed by GitHub
commit 3e780dfe1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,5 +0,0 @@
# ASR
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh
```

@ -1,46 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run English and Chinese speech recognition (S2T) on two given wav files."""
import argparse
import os

import paddle
import paddlehub as hub

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--wav_en", type=str, required=True)
parser.add_argument("--wav_zh", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)

    # Separate pretrained models per language.
    s2t_en_model = hub.Module(name='u2_conformer_librispeech')
    s2t_zh_model = hub.Module(name='u2_conformer_aishell')

    # Normalize user-supplied paths (handles '~' and relative paths).
    args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
    args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))

    # Explicit validation instead of `assert`: asserts are stripped
    # under `python -O`, silently skipping the check.
    for wav in (args.wav_en, args.wav_zh):
        if not os.path.isfile(wav):
            parser.error('Wav file not exist: {}'.format(wav))

    print('[S2T][en]Wav: {}'.format(args.wav_en))
    text_en = s2t_en_model.speech_recognize(args.wav_en)
    print('[S2T][en]Text: {}'.format(text_en))

    print('[S2T][zh]Wav: {}'.format(args.wav_zh))
    text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
    print('[S2T][zh]Text: {}'.format(text_zh))

@ -1,30 +0,0 @@
#!/bin/bash
# Download demo wav files and run English/Chinese ASR inference via PaddleHub.

# Install PaddleHub on demand.
if python -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    pip install paddlehub -U
fi

mkdir -p data
wav_en=data/en.wav
wav_zh=data/zh.wav
test -e "${wav_en}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
test -e "${wav_zh}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data

# Count visible GPUs; fall back to CPU when none are exposed.
# Quote expansions and use the numeric `-eq` test (`==` inside `[ ]`
# is a non-portable bashism).
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" -eq 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

python3 -u hub_infer.py \
    --device "${device}" \
    --wav_en "${wav_en}" \
    --wav_zh "${wav_zh}"

exit 0

@ -0,0 +1,95 @@
# Audio Tagging
## Introduction
Audio tagging is the task of labelling an audio clip with one or more labels or tags, including music tagging, acoustic scene classification, audio event classification, etc.
This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done by a single command line or a few lines in python using `PaddleSpeech`.
## Usage
### 1. Installation
```bash
pip install paddlespeech
```
### 2. Prepare Input File
Input of this demo should be a WAV file (`.wav`).
Here are sample files for this demo that can be downloaded:
```bash
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
```
### 3. Usage
- Command Line(Recommended)
```bash
paddlespeech cls --input ~/cat.wav --topk 10
```
Usage:
```bash
paddlespeech cls --help
```
Arguments:
- `input`(required): Audio file to tag.
- `model`: Model type of tagging task. Default: `panns_cnn14`.
- `config`: Config of tagging task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
- `label_file`: Label file of tagging task. Use audioset labels when it is None. Default: `None`.
- `topk`: Show topk tagging labels of result. Default: `1`.
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
Output:
```bash
[2021-12-08 14:49:40,671] [ INFO] [utils.py] [L225] - CLS Result:
Cat: 0.8991316556930542
Domestic animals, pets: 0.8806838393211365
Meow: 0.8784668445587158
Animal: 0.8776564598083496
Caterwaul: 0.2232048511505127
Speech: 0.03101264126598835
Music: 0.02870696596801281
Inside, small room: 0.016673989593982697
Purr: 0.008387474343180656
Bird: 0.006304860580712557
```
- Python API
```python
import paddle
from paddlespeech.cli import CLSExecutor
cls_executor = CLSExecutor()
result = cls_executor(
model='panns_cnn14',
config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
label_file=None,
ckpt_path=None,
audio_file='./cat.wav',
topk=10,
device=paddle.get_device())
print('CLS Result: \n{}'.format(result))
```
Output:
```bash
CLS Result:
Cat: 0.8991316556930542
Domestic animals, pets: 0.8806838393211365
Meow: 0.8784668445587158
Animal: 0.8776564598083496
Caterwaul: 0.2232048511505127
Speech: 0.03101264126598835
Music: 0.02870696596801281
Inside, small room: 0.016673989593982697
Purr: 0.008387474343180656
Bird: 0.006304860580712557
```
### 4. Pretrained Models
Here is a list of pretrained models released by PaddleSpeech and can be used by command and python api:
| Model | Sample Rate |
| :--- | :---: |
| panns_cnn6 | 32000 |
| panns_cnn10 | 32000 |
| panns_cnn14 | 32000 |

@ -1 +0,0 @@
data

@ -1,13 +0,0 @@
# echo system
ASR + TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -1,55 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Echo-system demo: synthesize speech from text (T2S), then recognize it back (S2T)."""
import argparse
import os

import librosa
import paddle
import paddlehub as hub
import soundfile as sf

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+', required=True)
parser.add_argument("--output_dir", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)
    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))

    # Pick matched TTS/ASR pretrained models for the requested language.
    if args.lang == 'zh':
        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_aishell')
    else:
        t2s_model = hub.Module(
            name='fastspeech2_ljspeech', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_librispeech')

    # nargs='+' yields a list; join the words back into a single sentence.
    if isinstance(args.text, list):
        args.text = ' '.join(args.text)

    wavs = t2s_model.generate([args.text], device=args.device)
    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))

    # The ASR model expects 16 kHz input; resample the synthesized wav.
    # Keyword arguments keep this working on librosa>=0.10, where the
    # old positional (y, orig_sr, target_sr) signature was removed.
    x, sr = librosa.load(wavs[0])
    y = librosa.resample(x, orig_sr=sr, target_sr=16000)
    wav_16k = wavs[0].replace('.wav', '_16k.wav')
    sf.write(wav_16k, y, 16000)
    print('[S2T]Resample to 16k: {}'.format(wav_16k))

    text = s2t_model.speech_recognize(wav_16k)
    print('[S2T]Text recognized from wav file: {}'.format(text))
@ -1,42 +0,0 @@
#!/bin/bash
# Echo-system demo driver: synthesize speech from text, then recognize it back.
# usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]

# Install PaddleHub on demand.
if python -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    pip install paddlehub -U
fi

if [ $# != 2 -a $# != 3 ]; then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
    exit 1
fi

# Count visible GPUs; fall back to CPU when none are exposed.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" -eq 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

text=$1
output_dir=$2
if [ $# == 3 ]; then
    lang=$3
else
    lang=zh
fi

mkdir -p "${output_dir}"

# Quote (do not backslash-escape) the text: `\"${text}\"` would pass
# literal '"' characters through to the Python script as part of the text.
python3 -u hub_infer.py \
    --lang "${lang}" \
    --device "${device}" \
    --text "${text}" \
    --output_dir "${output_dir}"

exit 0

@ -0,0 +1,76 @@
# ASR(Automatic Speech Recognition)
## Introduction
ASR, or Automatic Speech Recognition, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text).
This demo is an implementation to recognize text from a specific audio file. It can be done by a single command line or a few lines in python using `PaddleSpeech`.
## Usage
### 1. Installation
```bash
pip install paddlespeech
```
### 2. Prepare Input File
Input of this demo should be a WAV file (`.wav`), and the sample rate must be the same as the model's.
Here are sample files for this demo that can be downloaded:
```bash
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
### 3. Usage
- Command Line(Recommended)
```bash
paddlespeech asr --input ~/zh.wav
```
Usage:
```bash
paddlespeech asr --help
```
Arguments:
- `input`(required): Audio file to recognize.
- `model`: Model type of asr task. Default: `conformer_wenetspeech`.
- `lang`: Model language. Default: `zh`.
- `sr`: Sample rate of the model. Default: `16000`.
- `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
Output:
```bash
[2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康
```
- Python API
```python
import paddle
from paddlespeech.cli import ASRExecutor
asr_executor = ASRExecutor()
text = asr_executor(
model='conformer_wenetspeech',
lang='zh',
sample_rate=16000,
config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
ckpt_path=None,
audio_file='./zh.wav',
device=paddle.get_device())
print('ASR Result: \n{}'.format(text))
```
Output:
```bash
ASR Result:
我认为跑步最重要的就是给我带来了身体健康
```
### 4. Pretrained Models
Here is a list of pretrained models released by PaddleSpeech and can be used by command and python api:
| Model | Language | Sample Rate |
| :--- | :---: | :---: |
| conformer_wenetspeech | zh | 16000 |
| transformer_aishell | zh | 16000 |

@ -1,11 +0,0 @@
# TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -1,43 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Text-to-speech demo: synthesize a wav file from input text."""
import argparse
import os

import paddle
import paddlehub as hub

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+', required=True)
parser.add_argument("--output_dir", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)
    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))

    # Pick the pretrained TTS model matching the requested language.
    if args.lang == 'zh':
        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
    else:
        t2s_model = hub.Module(
            name='fastspeech2_ljspeech', output_dir=output_dir)

    # nargs='+' yields a list; join the words back into a single sentence.
    if isinstance(args.text, list):
        args.text = ' '.join(args.text)

    wavs = t2s_model.generate([args.text], device=args.device)
    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))

@ -1,42 +0,0 @@
#!/bin/bash
# TTS demo driver: synthesize a wav file from the given text.
# usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]

# Install PaddleHub on demand.
if python -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    pip install paddlehub -U
fi

if [ $# != 2 -a $# != 3 ]; then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
    exit 1
fi

# Count visible GPUs; fall back to CPU when none are exposed.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" -eq 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

text=$1
output_dir=$2
if [ $# == 3 ]; then
    lang=$3
else
    lang=zh
fi

mkdir -p "${output_dir}"

# Quote (do not backslash-escape) the text: `\"${text}\"` would pass
# literal '"' characters through to the Python script as part of the text.
python3 -u hub_infer.py \
    --lang "${lang}" \
    --device "${device}" \
    --text "${text}" \
    --output_dir "${output_dir}"

exit 0
Loading…
Cancel
Save