Update asr and audio tagging demo.

pull/1086/head
KP 4 years ago
parent 08efbab2d4
commit 35414ee58d

@ -1,5 +0,0 @@
# ASR
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh
```

@ -1,46 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
import paddlehub as hub
# Command-line interface of the PaddleHub speech-to-text demo.
# Both wav paths are required: the script dereferences them unconditionally.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--wav_en", type=str, required=True)
parser.add_argument("--wav_zh", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)

    # Normalize the paths and check them before loading any model, so a bad
    # path is reported right away. An explicit exception is used instead of
    # `assert`, which is removed when Python runs with -O.
    args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
    args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))
    for wav_path in (args.wav_en, args.wav_zh):
        if not os.path.isfile(wav_path):
            raise FileNotFoundError(
                'Wav file does not exist: {}'.format(wav_path))

    # One PaddleHub module per input file.
    s2t_en_model = hub.Module(name='u2_conformer_librispeech')
    s2t_zh_model = hub.Module(name='u2_conformer_aishell')

    print('[S2T][en]Wav: {}'.format(args.wav_en))
    text_en = s2t_en_model.speech_recognize(args.wav_en)
    print('[S2T][en]Text: {}'.format(text_en))

    print('[S2T][zh]Wav: {}'.format(args.wav_zh))
    text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
    print('[S2T][zh]Text: {}'.format(text_zh))

@ -1,30 +0,0 @@
#!/bin/bash
# Launcher for the ASR demo: makes sure PaddleHub is importable, fetches the
# two sample wav files, picks cpu/gpu from CUDA_VISIBLE_DEVICES and runs
# hub_infer.py.

# Probe, install and run with the same interpreter (python3) so the package is
# installed where the demo will actually import it.
if python3 -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    python3 -m pip install paddlehub -U
fi

mkdir -p data
wav_en=data/en.wav
wav_zh=data/zh.wav
# Download each sample only when it is not already present.
test -e "${wav_en}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
test -e "${wav_zh}" || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data

# awk prints the number of comma-separated fields; an unset or empty
# CUDA_VISIBLE_DEVICES yields 0, which selects the cpu.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" == 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

python3 -u hub_infer.py \
    --device "${device}" \
    --wav_en "${wav_en}" \
    --wav_zh "${wav_zh}"

# Report the inference status to the caller instead of always returning 0.
exit $?

@ -0,0 +1,79 @@
# Audio Tagging
## Introduction
Audio tagging is the task of labelling an audio clip with one or more labels or tags, including music tagging, acoustic scene classification, audio event classification, etc.
This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done by a single command line or a few lines in python using `PaddleSpeech`.
## Usage
### 1. Installation
```sh
pip install paddlespeech
```
### 2. Prepare Input File
The input of this demo should be a WAV file (`.wav`).
Here are sample files for this demo that can be downloaded:
```sh
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
```
### 3. Usage
- Command Line(Recommended)
```sh
paddlespeech cls --input ~/cat.wav --topk 10
```
Command usage:
- `input`(required): Audio file to tag.
- `model`: Model type of tagging task. Default: `panns_cnn10`.
- `config`: Config of tagging task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
- `label_file`: Label file of tagging task. Use audioset labels when it is None. Default: `None`.
- `topk`: Show topk tagging labels of result. Default: `1`.
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
Output:
```sh
[2021-12-08 14:49:40,671] [ INFO] [utils.py] [L225] - CLS Result:
Cat: 0.8991316556930542
Domestic animals, pets: 0.8806838393211365
Meow: 0.8784668445587158
Animal: 0.8776564598083496
Caterwaul: 0.2232048511505127
Speech: 0.03101264126598835
Music: 0.02870696596801281
Inside, small room: 0.016673989593982697
Purr: 0.008387474343180656
Bird: 0.006304860580712557
```
- Python API
```sh
python tag.py --input ~/cat.wav
```
Output:
```sh
CLS Result:
Cat: 0.8991316556930542
Domestic animals, pets: 0.8806838393211365
Meow: 0.8784668445587158
Animal: 0.8776564598083496
Caterwaul: 0.2232048511505127
Speech: 0.03101264126598835
Music: 0.02870696596801281
Inside, small room: 0.016673989593982697
Purr: 0.008387474343180656
Bird: 0.006304860580712557
```
### 4. Pretrained Models
Here is a list of pretrained models released by PaddleSpeech that can be used from both the command line and the Python API:
| Model | Sample Rate |
| :--- | :---: |
| panns_cnn6 | 32000 |
| panns_cnn10 | 32000 |
| panns_cnn14 | 32000 |

@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle
from paddlespeech.cli import CLSExecutor
# Command-line interface of the audio tagging demo: a single required wav path.
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input', type=str, required=True, help='Audio file to tag.')
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    cls_executor = CLSExecutor()
    # Tag the input file and keep the 10 best-scoring labels.
    result = cls_executor(
        model_type='panns_cnn10',
        cfg_path=None,  # Set `cfg_path` and `ckpt_path` to None to use pretrained model.
        label_file=None,
        ckpt_path=None,
        audio_file=args.input,
        topk=10,
        device=paddle.get_device(), )
    print('CLS Result: \n{}'.format(result))

@ -1 +0,0 @@
data

@ -1,13 +0,0 @@
# echo system
ASR + TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -1,55 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import librosa
import paddle
import paddlehub as hub
import soundfile as sf
# Command-line interface of the echo demo (text -> speech -> text).
# `--text` and `--output_dir` are required: both are used unconditionally.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+', required=True)
parser.add_argument("--output_dir", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)

    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    # Pick the synthesis / recognition module pair for the requested language.
    if args.lang == 'zh':
        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_aishell')
    else:
        t2s_model = hub.Module(
            name='fastspeech2_ljspeech', output_dir=output_dir)
        s2t_model = hub.Module(name='u2_conformer_librispeech')

    # `nargs='+'` yields a list of words; rebuild the sentence.
    if isinstance(args.text, list):
        args.text = ' '.join(args.text)

    wavs = t2s_model.generate([args.text], device=args.device)
    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))

    # convert sr to 16k
    x, sr = librosa.load(wavs[0])
    # Pass the rates by keyword: newer librosa releases no longer accept them
    # positionally.
    y = librosa.resample(x, orig_sr=sr, target_sr=16000)
    # Change only the file extension. `str.replace` would also rewrite any
    # '.wav' in a directory name, and would return the source path unchanged
    # (so it would be overwritten) if the suffix were missing.
    wav_16k = os.path.splitext(wavs[0])[0] + '_16k.wav'
    sf.write(wav_16k, y, 16000)
    print('[S2T]Resample to 16k: {}'.format(wav_16k))

    text = s2t_model.speech_recognize(wav_16k)
    print('[S2T]Text recognized from wav file: {}'.format(text))

@ -1,42 +0,0 @@
#!/bin/bash
# Launcher for the echo demo.
# usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]

# Probe, install and run with the same interpreter (python3) so the package is
# installed where the demo will actually import it.
if python3 -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    python3 -m pip install paddlehub -U
fi

if [ $# -ne 2 ] && [ $# -ne 3 ]; then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
    # Exit statuses are limited to 0-255; use a plain non-zero status.
    exit 1
fi

# awk prints the number of comma-separated fields; an unset or empty
# CUDA_VISIBLE_DEVICES yields 0, which selects the cpu.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" == 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

text=$1
output_dir=$2
if [ $# == 3 ]; then
    lang=$3
else
    lang=zh
fi

if [ ! -d "${output_dir}" ]; then
    mkdir -p "${output_dir}"
fi

# Quote the text so it reaches the script as one argument, without the literal
# quote characters that an escaped \" would add to the sentence.
python3 -u hub_infer.py \
    --lang "${lang}" \
    --device "${device}" \
    --text "${text}" \
    --output_dir "${output_dir}"

# Report the inference status to the caller instead of always returning 0.
exit $?

@ -0,0 +1,59 @@
# ASR(Automatic Speech Recognition)
## Introduction
ASR, or Automatic Speech Recognition, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text).
This demo is an implementation to recognize text from a specific audio file. It can be done by a single command line or a few lines in python using `PaddleSpeech`.
## Usage
### 1. Installation
```sh
pip install paddlespeech
```
### 2. Prepare Input File
The input of this demo should be a WAV file (`.wav`), and its sample rate must be the same as the model's.
Here are sample files for this demo that can be downloaded:
```sh
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
### 3. Usage
- Command Line(Recommended)
```sh
paddlespeech asr --input ~/zh.wav
```
Command usage:
- `input`(required): Audio file to recognize.
- `model`: Model type of asr task. Default: `conformer_wenetspeech`.
- `lang`: Model language. Default: `zh`.
- `sr`: Sample rate of the model. Default: `16000`.
- `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
Output:
```sh
[2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康
```
- Python API
```sh
python asr.py --input ~/zh.wav
```
Output:
```sh
ASR Result:
我认为跑步最重要的就是给我带来了身体健康
```
### 4. Pretrained Models
Here is a list of pretrained models released by PaddleSpeech that can be used from both the command line and the Python API:
| Model | Language | Sample Rate |
| :--- | :---: | :---: |
| conformer_wenetspeech | zh | 16000 |
| transformer_aishell | zh | 16000 |

@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle
from paddlespeech.cli import ASRExecutor
# Command-line interface of the speech recognition demo: one required wav path.
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--input', help='Audio file to recognize.', required=True, type=str)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    asr_executor = ASRExecutor()

    # Keyword arguments for the executor call. Leaving both `config` and
    # `ckpt_path` as None selects the pretrained model.
    asr_options = dict(
        model='conformer_wenetspeech',
        lang='zh',
        sample_rate=16000,
        config=None,
        ckpt_path=None,
        audio_file=args.input,
        device=paddle.get_device())

    text = asr_executor(**asr_options)
    print('ASR Result: \n{}'.format(text))

@ -1,11 +0,0 @@
# TTS
中文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
```
英文:
```shell
CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
```

@ -1,43 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
import paddlehub as hub
# Command-line interface of the PaddleHub text-to-speech demo.
# `--text` and `--output_dir` are required: both are used unconditionally.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
parser.add_argument("--text", type=str, nargs='+', required=True)
parser.add_argument("--output_dir", type=str, required=True)
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)

    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
    # Pick the synthesis module for the requested language.
    if args.lang == 'zh':
        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
    else:
        t2s_model = hub.Module(
            name='fastspeech2_ljspeech', output_dir=output_dir)

    # `nargs='+'` yields a list of words; rebuild the sentence.
    if isinstance(args.text, list):
        args.text = ' '.join(args.text)

    wavs = t2s_model.generate([args.text], device=args.device)
    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))

@ -1,42 +0,0 @@
#!/bin/bash
# Launcher for the TTS demo.
# usage: CUDA_VISIBLE_DEVICES=0 ./run.sh text output_dir [lang]

# Probe, install and run with the same interpreter (python3) so the package is
# installed where the demo will actually import it.
if python3 -c "import paddlehub" &> /dev/null; then
    echo 'PaddleHub has already been installed.'
else
    echo 'Installing PaddleHub...'
    python3 -m pip install paddlehub -U
fi

if [ $# -ne 2 ] && [ $# -ne 3 ]; then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
    # Exit statuses are limited to 0-255; use a plain non-zero status.
    exit 1
fi

# awk prints the number of comma-separated fields; an unset or empty
# CUDA_VISIBLE_DEVICES yields 0, which selects the cpu.
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
if [ "${ngpu}" == 0 ]; then
    device=cpu
else
    device=gpu
fi
echo "using ${device}..."

text=$1
output_dir=$2
if [ $# == 3 ]; then
    lang=$3
else
    lang=zh
fi

if [ ! -d "${output_dir}" ]; then
    mkdir -p "${output_dir}"
fi

# Quote the text so it reaches the script as one argument, without the literal
# quote characters that an escaped \" would add to the sentence.
python3 -u hub_infer.py \
    --lang "${lang}" \
    --device "${device}" \
    --text "${text}" \
    --output_dir "${output_dir}"

# Report the synthesis status to the caller instead of always returning 0.
exit $?
Loading…
Cancel
Save