From 7394a187327a42447e174ff06084620abfaadf26 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 15 Dec 2021 10:57:49 +0800 Subject: [PATCH 1/3] Add default arguments in cls python api. --- paddlespeech/cli/asr/infer.py | 12 +++++++++--- paddlespeech/cli/cls/infer.py | 12 +++++++++--- paddlespeech/cli/st/infer.py | 15 +++++++++++---- paddlespeech/cli/text/infer.py | 6 +++--- paddlespeech/cli/tts/infer.py | 2 +- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index e020b501..89a9fcfa 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -421,7 +421,7 @@ class ASRExecutor(BaseExecutor): device = parser_args.device try: - res = self(model, lang, sample_rate, config, ckpt_path, audio_file, + res = self(audio_file, model, lang, sample_rate, config, ckpt_path, device) logger.info('ASR Result: {}'.format(res)) return True @@ -429,8 +429,14 @@ class ASRExecutor(BaseExecutor): logger.exception(e) return False - def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file, - device): + def __call__(self, + audio_file: os.PathLike, + model: str='conformer_wenetspeech', + lang: str='zh', + sample_rate: int=16000, + config: os.PathLike=None, + ckpt_path: os.PathLike=None, + device=paddle.get_device()): """ Python API to call an executor. """ diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 37f2a9d2..c31ad361 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -237,7 +237,7 @@ class CLSExecutor(BaseExecutor): device = parser_args.device try: - res = self(model_type, cfg_path, label_file, ckpt_path, audio_file, + res = self(audio_file, model_type, cfg_path, ckpt_path, label_file, topk, device) logger.info('CLS Result:\n{}'.format(res)) return True @@ -245,8 +245,14 @@ class CLSExecutor(BaseExecutor): logger.exception(e) return False - def __call__(self, model, config, ckpt_path, label_file, audio_file, topk, - device): + def __call__(self, + audio_file: os.PathLike, + model: str='panns_cnn14', + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None, + topk: int=1, + device: str=paddle.get_device()): """ Python API to call an executor. """ diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index fd32e3b4..553b025f 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -326,16 +326,23 @@ class STExecutor(BaseExecutor): device = parser_args.device try: - res = self(model, src_lang, tgt_lang, sample_rate, config, - ckpt_path, audio_file, device) + res = self(audio_file, model, src_lang, tgt_lang, sample_rate, + config, ckpt_path, device) logger.info("ST Result: {}".format(res)) return True except Exception as e: logger.exception(e) return False - def __call__(self, model, src_lang, tgt_lang, sample_rate, config, - ckpt_path, audio_file, device): + def __call__(self, + audio_file: os.PathLike, + model: str='fat_st_ted', + src_lang: str='en', + tgt_lang: str='zh', + sample_rate: int=16000, + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + device: str=paddle.get_device()): """ Python API to call an executor. """ diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 0d8dbbb8..13d170f5 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -265,9 +265,9 @@ class TextExecutor(BaseExecutor): task: str='punc', model: str='ernie_linear_wudao', lang: str='zh', - config: os.PathLike=None, - ckpt_path: os.PathLike=None, - punc_vocab: os.PathLike=None, + config: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + punc_vocab: Optional[os.PathLike]=None, device: str=paddle.get_device(), ): """ Python API to call an executor. diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index b3733e05..f60f4224 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -616,7 +616,7 @@ class TTSExecutor(BaseExecutor): voc_ckpt: Optional[os.PathLike]=None, voc_stat: Optional[os.PathLike]=None, lang: str='zh', - device: str='gpu', + device: str=paddle.get_device(), output: str='output.wav'): """ Python API to call an executor. From 16d6ed3842fc1fd5a6bbdb598b9fbd0c45590716 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 15 Dec 2021 16:08:38 +0800 Subject: [PATCH 2/3] Add automatic_video_subtitiles demo. --- demos/audio_tagging/README.md | 2 +- demos/automatic_video_subtitiles/README.md | 52 +++++++++++++++++++ demos/automatic_video_subtitiles/recognize.py | 43 +++++++++++++++ demos/automatic_video_subtitiles/run.sh | 20 +++++++ demos/punctuation_restoration/README.md | 12 +++-- demos/speech_recognition/README.md | 2 +- demos/speech_translation/README.md | 2 +- paddlespeech/cli/text/infer.py | 29 ++++++++--- 8 files changed, 146 insertions(+), 16 deletions(-) create mode 100644 demos/automatic_video_subtitiles/README.md create mode 100644 demos/automatic_video_subtitiles/recognize.py create mode 100755 demos/automatic_video_subtitiles/run.sh diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index 1144cbb1..9031c2fe 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`). Here are sample files for this demo that can be downloaded: ```bash -wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav ``` ### 3. Usage diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md new file mode 100644 index 00000000..cb900a6d --- /dev/null +++ b/demos/automatic_video_subtitiles/README.md @@ -0,0 +1,52 @@ +# Automatic Video Subtitiles + +## Introduction +Automatic video subtitiles can generate subtitiles from a specific video by using Automatic Speech Recognition (ASR) system. + +This demo is an implementation to automatic video subtitiles from a video file. It can be done by a single command or a few lines in python using `PaddleSpeech`. + +## Usage +### 1. Installation +```bash +pip install paddlespeech +``` + +### 2. Prepare Input +Get a video file with speech of the specific language: +```bash +wget -c https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 +``` + +Extract `.wav` with one channel and 16000 sample rate from the video: +```bash +ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav +``` + + +### 3. Usage + +- Python API + ```python + import paddle + from paddlespeech.cli import ASRExecutor, TextExecutor + + asr_executor = ASRExecutor() + text_executor = TextExecutor() + + text = asr_executor( + audio_file='input.wav', + device=paddle.get_device()) + result = text_executor( + text=text, + task='punc', + model='ernie_linear_p3_wudao', + device=paddle.get_device()) + print('Text Result: \n{}'.format(result)) + ``` + Output: + ```bash + Text Result: + 当我说我可以把三十年的经验变成一个准确的算法,他们说不可能。当我说我们十个人就能实现对十九个城市变电站七乘二十四小时的实时监管,他们说不可能。 + ``` + +automatic_video_subtitiles diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py new file mode 100644 index 00000000..72e3c3a8 --- /dev/null +++ b/demos/automatic_video_subtitiles/recognize.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +import paddle + +from paddlespeech.cli import ASRExecutor +from paddlespeech.cli import TextExecutor + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--input", type=str, required=True) +parser.add_argument("--device", type=str, default=paddle.get_device()) +args = parser.parse_args() +# yapf: enable + +if __name__ == "__main__": + asr_executor = ASRExecutor() + text_executor = TextExecutor() + + text = asr_executor( + audio_file=os.path.abspath(os.path.expanduser(args.input)), + device=args.device) + result = text_executor( + text=text, + task='punc', + model='ernie_linear_p3_wudao', + device=args.device) + + print('ASR Result: \n{}'.format(text)) + print('Text Result: \n{}'.format(result)) diff --git a/demos/automatic_video_subtitiles/run.sh b/demos/automatic_video_subtitiles/run.sh new file mode 100755 index 00000000..9b9fd2cc --- /dev/null +++ b/demos/automatic_video_subtitiles/run.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +video_url=https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4 +video_file=$(basename ${video_url}) +audio_file=$(echo ${video_file} | awk -F'.' '{print $1}').wav +num_channels=1 +sr=16000 + +# Download video +if [ ! -f ${video_file} ]; then + wget -c ${video_url} +fi + +# Extract audio from video +if [ ! -f ${audio_file} ]; then + ffmpeg -i ${video_file} -ac ${num_channels} -ar ${sr} -vn ${audio_file} +fi + +python -u recognize.py --input ${audio_file} +exit 0 diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md index 18d462d4..d55d069a 100644 --- a/demos/punctuation_restoration/README.md +++ b/demos/punctuation_restoration/README.md @@ -27,7 +27,7 @@ Input of this demo should be a text of the specific language that can be passed Arguments: - `input`(required): Input raw text. - `task`: Choose subtask. Default: `punc`. - - `model`: Model type of text task. Default: `ernie_linear_wudao`. + - `model`: Model type of text task. Default: `ernie_linear_p7_wudao`. - `lang`: Choose model language.. Default: `zh`. - `config`: Config of text task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. @@ -49,7 +49,7 @@ Input of this demo should be a text of the specific language that can be passed result = text_executor( text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭', task='punc', - model='ernie_linear_wudao', + model='ernie_linear_p7_wudao', lang='zh', config=None, ckpt_path=None, @@ -68,6 +68,8 @@ Input of this demo should be a text of the specific language that can be passed Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api: -| Model | Task | Language -| :--- | :---: | :---: -| ernie_linear_wudao| punc(Punctuation Restoration) | zh +- Punctuation Restoration + | Model | Language | Number of Punctuation Characters + | :--- | :---: | :---: + | ernie_linear_p3_wudao| zh | 3(,。?) + | ernie_linear_p7_wudao| zh | 7(,。!?、:;) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c9116531..e1343464 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`), and the sample rate must be sam Here are sample files for this demo that can be downloaded: ```bash -wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav ``` ### 3. Usage diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index caca05dd..e3fa18c6 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`). Here are sample files for this demo that can be downloaded: ```bash -wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav ``` ### 3. Usage (not support for Windows now) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 13d170f5..da9c5fe0 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -34,9 +34,9 @@ pretrained_models = { # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" - "ernie_linear_wudao-punc-zh": { + "ernie_linear_p7_wudao-punc-zh": { 'url': - 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_wudao-punc-zh.tar.gz', + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz', 'md5': '12283e2ddde1797c5d1e57036b512746', 'cfg_path': @@ -46,14 +46,28 @@ pretrained_models = { 'vocab_file': 'punc_vocab.txt', }, + "ernie_linear_p3_wudao-punc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz', + 'md5': + '448eb2fdf85b6a997e7e652e80c51dd2', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + }, } model_alias = { - "ernie_linear": "paddlespeech.text.models:ErnieLinear", + "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear", + "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear", } tokenizer_alias = { - "ernie_linear": "paddlenlp.transformers:ErnieTokenizer", + "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer", + "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer", } @@ -75,7 +89,7 @@ class TextExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='ernie_linear_wudao', + default='ernie_linear_p7_wudao', choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], help='Choose model type of text task.') self.parser.add_argument( @@ -123,7 +137,7 @@ class TextExecutor(BaseExecutor): def _init_from_path(self, task: str='punc', - model_type: str='ernie_linear_wudao', + model_type: str='ernie_linear_p7_wudao', lang: str='zh', cfg_path: Optional[os.PathLike]=None, ckpt_path: Optional[os.PathLike]=None, @@ -182,7 +196,6 @@ class TextExecutor(BaseExecutor): Input preprocess and return paddle.Tensor stored in self.input. Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). """ - logger.info("Preprocessing input text: " + text) if self.task == 'punc': clean_text = self._clean_text(text) assert len(clean_text) > 0, f'Invalid input string: {text}' @@ -263,7 +276,7 @@ class TextExecutor(BaseExecutor): self, text: str, task: str='punc', - model: str='ernie_linear_wudao', + model: str='ernie_linear_p7_wudao', lang: str='zh', config: Optional[os.PathLike]=None, ckpt_path: Optional[os.PathLike]=None, From b9c86fb79f2d2637a8b2e8dc533432a449554f49 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 15 Dec 2021 16:12:57 +0800 Subject: [PATCH 3/3] Add automatic_video_subtitiles demo. --- demos/automatic_video_subtitiles/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index cb900a6d..b95c18ed 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -48,5 +48,3 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav Text Result: 当我说我可以把三十年的经验变成一个准确的算法,他们说不可能。当我说我们十个人就能实现对十九个城市变电站七乘二十四小时的实时监管,他们说不可能。 ``` - -automatic_video_subtitiles