Merge pull request #1131 from KPatr1ck/video_demo

[Demo]Add automatic_video_subtitiles demo.
pull/1139/head
Hui Zhang 3 years ago committed by GitHub
commit 5b650b9792
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`).
Here are sample files for this demo that can be downloaded:
```bash
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
```
### 3. Usage

@ -0,0 +1,50 @@
# Automatic Video Subtitles
## Introduction
Automatic video subtitles are generated from a given video by an Automatic Speech Recognition (ASR) system.
This demo shows how to generate subtitles from a video file. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
## Usage
### 1. Installation
```bash
pip install paddlespeech
```
### 2. Prepare Input
Get a video file with speech of the specific language:
```bash
wget -c https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4
```
Extract a single-channel `.wav` file with a 16000 Hz sample rate from the video:
```bash
ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
```
### 3. Usage
- Python API
```python
import paddle
from paddlespeech.cli import ASRExecutor, TextExecutor
asr_executor = ASRExecutor()
text_executor = TextExecutor()
text = asr_executor(
audio_file='input.wav',
device=paddle.get_device())
result = text_executor(
text=text,
task='punc',
model='ernie_linear_p3_wudao',
device=paddle.get_device())
print('Text Result: \n{}'.format(result))
```
Output:
```bash
Text Result:
当我说我可以把三十年的经验变成一个准确的算法,他们说不可能。当我说我们十个人就能实现对十九个城市变电站七乘二十四小时的实时监管,他们说不可能。
```

@ -0,0 +1,43 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from paddlespeech.cli import ASRExecutor
from paddlespeech.cli import TextExecutor
def parse_args(argv=None):
    """Parse the command-line options of the demo.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``input`` (path of the audio file) and
        ``device`` (defaults to ``paddle.get_device()``).
    """
    # yapf: disable
    # `description=` is spelled out because the first positional parameter of
    # ArgumentParser is `prog`, not the description.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--device", type=str, default=paddle.get_device())
    # yapf: enable
    return parser.parse_args(argv)


def main():
    """Run ASR on the input audio, then the 'punc' text task on the result."""
    # Parsing happens here, not at import time, so importing this module
    # never consumes or rejects the importer's command line.
    args = parse_args()

    asr_executor = ASRExecutor()
    text_executor = TextExecutor()

    text = asr_executor(
        audio_file=os.path.abspath(os.path.expanduser(args.input)),
        device=args.device)
    result = text_executor(
        text=text,
        task='punc',
        model='ernie_linear_p3_wudao',
        device=args.device)

    print('ASR Result: \n{}'.format(text))
    print('Text Result: \n{}'.format(result))


if __name__ == "__main__":
    main()

@ -0,0 +1,20 @@
#!/bin/bash
# Download the demo video, extract its audio track with ffmpeg and run
# recognize.py on the extracted audio.

# Stop at the first failing step instead of handing a missing file to the next one.
set -e

video_url=https://paddlespeech.bj.bcebos.com/demos/asr_demos/subtitle_demo1.mp4
video_file=$(basename "${video_url}")
# Audio file name: everything before the first dot of the video name, plus .wav.
audio_file="${video_file%%.*}.wav"
num_channels=1
sr=16000

# Download video
if [ ! -f "${video_file}" ]; then
    wget -c "${video_url}"
fi

# Extract audio from video
if [ ! -f "${audio_file}" ]; then
    ffmpeg -i "${video_file}" -ac "${num_channels}" -ar "${sr}" -vn "${audio_file}"
fi

# Last command: its exit status becomes the script's exit status, so a failed
# recognition is no longer reported as success.
python -u recognize.py --input "${audio_file}"

@ -27,7 +27,7 @@ Input of this demo should be a text of the specific language that can be passed
Arguments:
- `input`(required): Input raw text.
- `task`: Choose subtask. Default: `punc`.
- `model`: Model type of text task. Default: `ernie_linear_wudao`.
- `model`: Model type of text task. Default: `ernie_linear_p7_wudao`.
- `lang`: Choose model language. Default: `zh`.
- `config`: Config of text task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
@ -49,7 +49,7 @@ Input of this demo should be a text of the specific language that can be passed
result = text_executor(
text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
task='punc',
model='ernie_linear_wudao',
model='ernie_linear_p7_wudao',
lang='zh',
config=None,
ckpt_path=None,
@ -68,6 +68,8 @@ Input of this demo should be a text of the specific language that can be passed
Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api:
| Model | Task | Language
- Punctuation Restoration
| Model | Language | Number of Punctuation Characters
| :--- | :---: | :---:
| ernie_linear_wudao| punc(Punctuation Restoration) | zh
| ernie_linear_p3_wudao| zh | 3(,。?)
| ernie_linear_p7_wudao| zh | 7(,。!?、:;)

@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`), and the sample rate must be sam
Here are sample files for this demo that can be downloaded:
```bash
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
### 3. Usage

@ -16,7 +16,7 @@ Input of this demo should be a WAV file(`.wav`).
Here are sample files for this demo that can be downloaded:
```bash
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
### 3. Usage (not supported on Windows yet)

@ -421,7 +421,7 @@ class ASRExecutor(BaseExecutor):
device = parser_args.device
try:
res = self(model, lang, sample_rate, config, ckpt_path, audio_file,
res = self(audio_file, model, lang, sample_rate, config, ckpt_path,
device)
logger.info('ASR Result: {}'.format(res))
return True
@ -429,8 +429,14 @@ class ASRExecutor(BaseExecutor):
logger.exception(e)
return False
def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file,
device):
def __call__(self,
audio_file: os.PathLike,
model: str='conformer_wenetspeech',
lang: str='zh',
sample_rate: int=16000,
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
device=paddle.get_device()):
"""
Python API to call an executor.
"""

@ -237,7 +237,7 @@ class CLSExecutor(BaseExecutor):
device = parser_args.device
try:
res = self(model_type, cfg_path, label_file, ckpt_path, audio_file,
res = self(audio_file, model_type, cfg_path, ckpt_path, label_file,
topk, device)
logger.info('CLS Result:\n{}'.format(res))
return True
@ -245,8 +245,14 @@ class CLSExecutor(BaseExecutor):
logger.exception(e)
return False
def __call__(self, model, config, ckpt_path, label_file, audio_file, topk,
device):
def __call__(self,
audio_file: os.PathLike,
model: str='panns_cnn14',
config: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None,
label_file: Optional[os.PathLike]=None,
topk: int=1,
device: str=paddle.get_device()):
"""
Python API to call an executor.
"""

@ -326,16 +326,23 @@ class STExecutor(BaseExecutor):
device = parser_args.device
try:
res = self(model, src_lang, tgt_lang, sample_rate, config,
ckpt_path, audio_file, device)
res = self(audio_file, model, src_lang, tgt_lang, sample_rate,
config, ckpt_path, device)
logger.info("ST Result: {}".format(res))
return True
except Exception as e:
logger.exception(e)
return False
def __call__(self, model, src_lang, tgt_lang, sample_rate, config,
ckpt_path, audio_file, device):
def __call__(self,
audio_file: os.PathLike,
model: str='fat_st_ted',
src_lang: str='en',
tgt_lang: str='zh',
sample_rate: int=16000,
config: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None,
device: str=paddle.get_device()):
"""
Python API to call an executor.
"""

@ -34,9 +34,9 @@ pretrained_models = {
# e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
# Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
"ernie_linear_wudao-punc-zh": {
"ernie_linear_p7_wudao-punc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/text/ernie_linear_wudao-punc-zh.tar.gz',
'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
'md5':
'12283e2ddde1797c5d1e57036b512746',
'cfg_path':
@ -46,14 +46,28 @@ pretrained_models = {
'vocab_file':
'punc_vocab.txt',
},
"ernie_linear_p3_wudao-punc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
'md5':
'448eb2fdf85b6a997e7e652e80c51dd2',
'cfg_path':
'ckpt/model_config.json',
'ckpt_path':
'ckpt/model_state.pdparams',
'vocab_file':
'punc_vocab.txt',
},
}
model_alias = {
"ernie_linear": "paddlespeech.text.models:ErnieLinear",
"ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
"ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
}
tokenizer_alias = {
"ernie_linear": "paddlenlp.transformers:ErnieTokenizer",
"ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
"ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
}
@ -75,7 +89,7 @@ class TextExecutor(BaseExecutor):
self.parser.add_argument(
'--model',
type=str,
default='ernie_linear_wudao',
default='ernie_linear_p7_wudao',
choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
help='Choose model type of text task.')
self.parser.add_argument(
@ -123,7 +137,7 @@ class TextExecutor(BaseExecutor):
def _init_from_path(self,
task: str='punc',
model_type: str='ernie_linear_wudao',
model_type: str='ernie_linear_p7_wudao',
lang: str='zh',
cfg_path: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None,
@ -182,7 +196,6 @@ class TextExecutor(BaseExecutor):
Input preprocess and return paddle.Tensor stored in self.input.
Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
"""
logger.info("Preprocessing input text: " + text)
if self.task == 'punc':
clean_text = self._clean_text(text)
assert len(clean_text) > 0, f'Invalid input string: {text}'
@ -263,11 +276,11 @@ class TextExecutor(BaseExecutor):
self,
text: str,
task: str='punc',
model: str='ernie_linear_wudao',
model: str='ernie_linear_p7_wudao',
lang: str='zh',
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
punc_vocab: os.PathLike=None,
config: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None,
punc_vocab: Optional[os.PathLike]=None,
device: str=paddle.get_device(), ):
"""
Python API to call an executor.

@ -616,7 +616,7 @@ class TTSExecutor(BaseExecutor):
voc_ckpt: Optional[os.PathLike]=None,
voc_stat: Optional[os.PathLike]=None,
lang: str='zh',
device: str='gpu',
device: str=paddle.get_device(),
output: str='output.wav'):
"""
Python API to call an executor.

Loading…
Cancel
Save