From 35414ee58d67e109cfb590c20e70b7fb32f64490 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Wed, 8 Dec 2021 14:59:14 +0800
Subject: [PATCH] Update asr and audio tagging demo.

---
 demos/asr_hub/README.md            |  5 --
 demos/asr_hub/hub_infer.py         | 46 -----------------
 demos/asr_hub/run.sh               | 30 ------------
 demos/audio_tagging/README.md      | 79 ++++++++++++++++++++++++++++++
 demos/audio_tagging/tag.py         | 37 ++++++++++++++
 demos/echo_hub/.gitignore          |  1 -
 demos/echo_hub/README.md           | 13 -----
 demos/echo_hub/hub_infer.py        | 55 ---------------------
 demos/echo_hub/run.sh              | 42 ----------------
 demos/speech_recognition/README.md | 59 ++++++++++++++++++++++
 demos/speech_recognition/asr.py    | 37 ++++++++++++++
 demos/tts_hub/README.md            | 11 -----
 demos/tts_hub/hub_infer.py         | 43 ----------------
 demos/tts_hub/run.sh               | 42 ----------------
 14 files changed, 212 insertions(+), 288 deletions(-)
 delete mode 100644 demos/asr_hub/README.md
 delete mode 100644 demos/asr_hub/hub_infer.py
 delete mode 100755 demos/asr_hub/run.sh
 create mode 100644 demos/audio_tagging/README.md
 create mode 100644 demos/audio_tagging/tag.py
 delete mode 100644 demos/echo_hub/.gitignore
 delete mode 100644 demos/echo_hub/README.md
 delete mode 100644 demos/echo_hub/hub_infer.py
 delete mode 100755 demos/echo_hub/run.sh
 create mode 100644 demos/speech_recognition/README.md
 create mode 100644 demos/speech_recognition/asr.py
 delete mode 100644 demos/tts_hub/README.md
 delete mode 100644 demos/tts_hub/hub_infer.py
 delete mode 100755 demos/tts_hub/run.sh

diff --git a/demos/asr_hub/README.md b/demos/asr_hub/README.md
deleted file mode 100644
index 19e83f9a1..000000000
--- a/demos/asr_hub/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# ASR
-
-```shell
-CUDA_VISIBLE_DEVICES=0 ./run.sh
-```
diff --git a/demos/asr_hub/hub_infer.py b/demos/asr_hub/hub_infer.py
deleted file mode 100644
index b540be1d5..000000000
--- a/demos/asr_hub/hub_infer.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-
-import paddle
-import paddlehub as hub
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
-parser.add_argument("--wav_en", type=str)
-parser.add_argument("--wav_zh", type=str)
-args = parser.parse_args()
-# yapf: enable
-
-if __name__ == '__main__':
-    paddle.set_device(args.device)
-
-    s2t_en_model = hub.Module(name='u2_conformer_librispeech')
-    s2t_zh_model = hub.Module(name='u2_conformer_aishell')
-
-    args.wav_en = os.path.abspath(os.path.expanduser(args.wav_en))
-    args.wav_zh = os.path.abspath(os.path.expanduser(args.wav_zh))
-
-    assert os.path.isfile(args.wav_en) and os.path.isfile(
-        args.wav_zh), 'Wav files not exist.'
-
-    print('[S2T][en]Wav: {}'.format(args.wav_en))
-    text_en = s2t_en_model.speech_recognize(args.wav_en)
-    print('[S2T][en]Text: {}'.format(text_en))
-
-    print('[S2T][zh]Wav: {}'.format(args.wav_zh))
-    text_zh = s2t_zh_model.speech_recognize(args.wav_zh)
-    print('[S2T][zh]Text: {}'.format(text_zh))
diff --git a/demos/asr_hub/run.sh b/demos/asr_hub/run.sh
deleted file mode 100755
index 040fc4939..000000000
--- a/demos/asr_hub/run.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-if python -c "import paddlehub" &> /dev/null; then
-    echo 'PaddleHub has already been installed.'
-else
-    echo 'Installing PaddleHub...'
-    pip install paddlehub -U
-fi
-
-mkdir -p data
-wav_en=data/en.wav
-wav_zh=data/zh.wav
-test -e ${wav_en}  || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
-test -e ${wav_zh}  || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-if [ ${ngpu} == 0 ];then
-    device=cpu
-else
-    device=gpu
-fi
-
-echo "using ${device}..."
-
-python3 -u hub_infer.py \
---device ${device} \
---wav_en ${wav_en} \
---wav_zh ${wav_zh}
-
-exit 0
diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md
new file mode 100644
index 000000000..eba3619e4
--- /dev/null
+++ b/demos/audio_tagging/README.md
@@ -0,0 +1,79 @@
+# Audio Tagging
+
+## Introduction
+Audio tagging is the task of labelling an audio clip with one or more labels or tags, including music tagging, acoustic scene classification, audio event classification, etc.
+
+This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```sh
+pip install paddlespeech
+```
+
+### 2. Prepare Input File
+The input of this demo should be a WAV file (`.wav`).
+
+Here are sample files for this demo that can be downloaded:
+```sh
+wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+```
+
+### 3. Usage
+- Command Line (Recommended)
+  ```sh
+  paddlespeech cls --input ~/cat.wav --topk 10
+  ```
+  Command usage:
+  - `input`(required): Audio file to tag.
+  - `model`: Model type of tagging task. Default: `panns_cnn10`.
+  - `config`: Config of tagging task. Use pretrained model when it is None. Default: `None`.
+  - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `label_file`: Label file of tagging task. Use audioset labels when it is None. Default: `None`.
+  - `topk`: Show topk tagging labels of result. Default: `1`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+
+  Output:
+  ```sh
+  [2021-12-08 14:49:40,671] [    INFO] [utils.py] [L225] - CLS Result:
+  Cat: 0.8991316556930542
+  Domestic animals, pets: 0.8806838393211365
+  Meow: 0.8784668445587158
+  Animal: 0.8776564598083496
+  Caterwaul: 0.2232048511505127
+  Speech: 0.03101264126598835
+  Music: 0.02870696596801281
+  Inside, small room: 0.016673989593982697
+  Purr: 0.008387474343180656
+  Bird: 0.006304860580712557
+  ```
+
+- Python API
+  ```sh
+  python tag.py --input ~/cat.wav
+  ```
+  Output:
+  ```sh
+  CLS Result:
+  Cat: 0.8991316556930542
+  Domestic animals, pets: 0.8806838393211365
+  Meow: 0.8784668445587158
+  Animal: 0.8776564598083496
+  Caterwaul: 0.2232048511505127
+  Speech: 0.03101264126598835
+  Music: 0.02870696596801281
+  Inside, small room: 0.016673989593982697
+  Purr: 0.008387474343180656
+  Bird: 0.006304860580712557
+  ```
+
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used via the command line and the Python API:
+
+| Model | Sample Rate |
+| :--- | :---: |
+| panns_cnn6 | 32000 |
+| panns_cnn10 | 32000 |
+| panns_cnn14 | 32000 |
diff --git a/demos/audio_tagging/tag.py b/demos/audio_tagging/tag.py
new file mode 100644
index 000000000..cda3c5ad4
--- /dev/null
+++ b/demos/audio_tagging/tag.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import paddle
+
+from paddlespeech.cli import CLSExecutor
+
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--input', type=str, required=True, help='Audio file to tag.')
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':  # Demo: tag the `--input` audio file with the pretrained `panns_cnn10` model.
+    cls_executor = CLSExecutor()
+    result = cls_executor(
+        model_type='panns_cnn10',
+        cfg_path=None,  # Set `cfg_path` and `ckpt_path` to None to use pretrained model.
+        label_file=None,  # None: use the AudioSet labels.
+        ckpt_path=None,
+        audio_file=args.input,
+        topk=10,  # Show the top 10 tagging labels of the result.
+        device=paddle.get_device(), )  # Default device of paddle in the current environment.
+    print('CLS Result: \n{}'.format(result))
diff --git a/demos/echo_hub/.gitignore b/demos/echo_hub/.gitignore
deleted file mode 100644
index 1269488f7..000000000
--- a/demos/echo_hub/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data
diff --git a/demos/echo_hub/README.md b/demos/echo_hub/README.md
deleted file mode 100644
index 3248f5179..000000000
--- a/demos/echo_hub/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# echo system
-
-ASR + TTS
-
-中文：
-```shell
-CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
-```
-
-英文：
-```shell
-CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
-```
diff --git a/demos/echo_hub/hub_infer.py b/demos/echo_hub/hub_infer.py
deleted file mode 100644
index abeb409dd..000000000
--- a/demos/echo_hub/hub_infer.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-
-import librosa
-import paddle
-import paddlehub as hub
-import soundfile as sf
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
-parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
-parser.add_argument("--text", type=str, nargs='+')
-parser.add_argument("--output_dir", type=str)
-args = parser.parse_args()
-# yapf: enable
-
-if __name__ == '__main__':
-    paddle.set_device(args.device)
-
-    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
-    if args.lang == 'zh':
-        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
-        s2t_model = hub.Module(name='u2_conformer_aishell')
-    else:
-        t2s_model = hub.Module(
-            name='fastspeech2_ljspeech', output_dir=output_dir)
-        s2t_model = hub.Module(name='u2_conformer_librispeech')
-
-    if isinstance(args.text, list):
-        args.text = ' '.join(args.text)
-
-    wavs = t2s_model.generate([args.text], device=args.device)
-    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
-    # convert sr to 16k
-    x, sr = librosa.load(wavs[0])
-    y = librosa.resample(x, sr, 16000)
-    wav_16k = wavs[0].replace('.wav', '_16k.wav')
-    sf.write(wav_16k, y, 16000)
-    print('[S2T]Resample to 16k: {}'.format(wav_16k))
-    text = s2t_model.speech_recognize(wav_16k)
-    print('[S2T]Text recognized from wav file: {}'.format(text))
diff --git a/demos/echo_hub/run.sh b/demos/echo_hub/run.sh
deleted file mode 100755
index f3e87f2e0..000000000
--- a/demos/echo_hub/run.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-if python -c "import paddlehub" &> /dev/null; then
-    echo 'PaddleHub has already been installed.'
-else
-    echo 'Installing PaddleHub...'
-    pip install paddlehub -U
-fi
-
-if [ $# != 2 -a $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
-    exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-if [ ${ngpu} == 0 ];then
-    device=cpu
-else
-    device=gpu
-fi
-
-echo "using ${device}..."
-
-text=$1
-output_dir=$2
-if [ $# == 3 ];then
-    lang=$3
-else
-    lang=zh
-fi
-
-if [ ! -d $output_dir ];then
-    mkdir -p $output_dir
-fi
-
-python3 -u hub_infer.py \
---lang ${lang} \
---device ${device} \
---text \"${text}\" \
---output_dir ${output_dir}
-
-exit 0
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
new file mode 100644
index 000000000..86bde037a
--- /dev/null
+++ b/demos/speech_recognition/README.md
@@ -0,0 +1,59 @@
+# ASR (Automatic Speech Recognition)
+
+## Introduction
+ASR, or Automatic Speech Recognition, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text). 
+
+This demo is an implementation to recognize text from a specific audio file. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```sh
+pip install paddlespeech
+```
+
+### 2. Prepare Input File
+The input of this demo should be a WAV file (`.wav`), and its sample rate must be the same as the model's.
+
+Here are sample files for this demo that can be downloaded:
+```sh
+wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+```
+
+### 3. Usage
+- Command Line (Recommended)
+  ```sh
+  paddlespeech asr --input ~/zh.wav
+  ```
+  Command usage:
+  - `input`(required): Audio file to recognize.
+  - `model`: Model type of asr task. Default: `conformer_wenetspeech`.
+  - `lang`: Model language. Default: `zh`.
+  - `sr`: Sample rate of the model. Default: `16000`.
+  - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
+  - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+
+  Output:
+  ```sh
+  [2021-12-08 13:12:34,063] [    INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康
+  ```
+
+- Python API
+  ```sh
+  python asr.py --input ~/zh.wav
+  ```
+  Output:
+  ```sh
+  ASR Result:
+  我认为跑步最重要的就是给我带来了身体健康
+  ```
+
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used via the command line and the Python API:
+
+| Model | Language | Sample Rate |
+| :--- | :---: | :---: |
+| conformer_wenetspeech | zh | 16000 |
+| transformer_aishell | zh | 16000 |
diff --git a/demos/speech_recognition/asr.py b/demos/speech_recognition/asr.py
new file mode 100644
index 000000000..3ac8b91df
--- /dev/null
+++ b/demos/speech_recognition/asr.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import paddle
+
+from paddlespeech.cli import ASRExecutor
+
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--input', type=str, required=True, help='Audio file to recognize.')
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':  # Demo: recognize text from the `--input` audio file.
+    asr_executor = ASRExecutor()
+    text = asr_executor(
+        model='conformer_wenetspeech',
+        lang='zh',
+        sample_rate=16000,  # The input WAV must have the same sample rate as the model.
+        config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
+        ckpt_path=None,
+        audio_file=args.input,
+        device=paddle.get_device(), )  # Default device of paddle in the current environment.
+    print('ASR Result: \n{}'.format(text))
diff --git a/demos/tts_hub/README.md b/demos/tts_hub/README.md
deleted file mode 100644
index f5fa599a0..000000000
--- a/demos/tts_hub/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# TTS
-
-中文：
-```shell
-CUDA_VISIBLE_DEVICES=0 ./run.sh 用科技让复杂的世界更简单 . zh
-```
-
-英文：
-```shell
-CUDA_VISIBLE_DEVICES=0 ./run.sh "Text to speech system converts normal language text into speech." . en
-```
diff --git a/demos/tts_hub/hub_infer.py b/demos/tts_hub/hub_infer.py
deleted file mode 100644
index 2430400ed..000000000
--- a/demos/tts_hub/hub_infer.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-
-import paddle
-import paddlehub as hub
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument("--lang", type=str, default='zh', choices=['zh', 'en'])
-parser.add_argument("--device", type=str, default='gpu', choices=['cpu', 'gpu'])
-parser.add_argument("--text", type=str, nargs='+')
-parser.add_argument("--output_dir", type=str)
-args = parser.parse_args()
-# yapf: enable
-
-if __name__ == '__main__':
-    paddle.set_device(args.device)
-
-    output_dir = os.path.abspath(os.path.expanduser(args.output_dir))
-    if args.lang == 'zh':
-        t2s_model = hub.Module(name='fastspeech2_baker', output_dir=output_dir)
-    else:
-        t2s_model = hub.Module(
-            name='fastspeech2_ljspeech', output_dir=output_dir)
-
-    if isinstance(args.text, list):
-        args.text = ' '.join(args.text)
-
-    wavs = t2s_model.generate([args.text], device=args.device)
-    print('[T2S]Wav file has been generated: {}'.format(wavs[0]))
diff --git a/demos/tts_hub/run.sh b/demos/tts_hub/run.sh
deleted file mode 100755
index f3e87f2e0..000000000
--- a/demos/tts_hub/run.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-if python -c "import paddlehub" &> /dev/null; then
-    echo 'PaddleHub has already been installed.'
-else
-    echo 'Installing PaddleHub...'
-    pip install paddlehub -U
-fi
-
-if [ $# != 2 -a $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} text output_dir [lang]"
-    exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-if [ ${ngpu} == 0 ];then
-    device=cpu
-else
-    device=gpu
-fi
-
-echo "using ${device}..."
-
-text=$1
-output_dir=$2
-if [ $# == 3 ];then
-    lang=$3
-else
-    lang=zh
-fi
-
-if [ ! -d $output_dir ];then
-    mkdir -p $output_dir
-fi
-
-python3 -u hub_infer.py \
---lang ${lang} \
---device ${device} \
---text \"${text}\" \
---output_dir ${output_dir}
-
-exit 0