From 70a8a75476bbca944e6995c69948bb61f482c471 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 9 Dec 2021 12:12:08 +0800 Subject: [PATCH] Add st demo. --- demos/audio_tagging/README.md | 4 +- demos/speech_recognition/README.md | 6 +-- demos/speech_translation/README.md | 77 ++++++++++++++++++++++++++++++ paddlespeech/cli/asr/infer.py | 11 +++-- paddlespeech/cli/cls/infer.py | 2 +- paddlespeech/cli/st/infer.py | 42 ++++++++-------- 6 files changed, 111 insertions(+), 31 deletions(-) create mode 100644 demos/speech_translation/README.md diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index d954ddfb6..5073393d4 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -3,7 +3,7 @@ ## Introduction Audio tagging is the task of labelling an audio clip with one or more labels or tags, includeing music tagging, acoustic scene classification, audio event classification, etc. -This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done by a single command line or a few lines in python using `PaddleSpeech`. +This demo is an implementation to tag an audio file with 527 [AudioSet](https://research.google.com/audioset/) labels. It can be done by a single command or a few lines in python using `PaddleSpeech`. ## Usage ### 1. 
Installation @@ -86,7 +86,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech ### 4.Pretrained Models -Here is a list of pretrained models released by PaddleSpeech and can be used by command and python api: +Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api: | Model | Sample Rate | :--- | :---: diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 891c7b9f6..60ee8e4d4 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -3,7 +3,7 @@ ## Introduction ASR, or Automatic Speech Recognition, refers to the problem of getting a program to automatically transcribe spoken language (speech-to-text). -This demo is an implementation to recognize text from a specific audio file. It can be done by a single command line or a few lines in python using `PaddleSpeech`. +This demo is an implementation to recognize text from a specific audio file. It can be done by a single command or a few lines in python using `PaddleSpeech`. ## Usage ### 1. Installation @@ -32,7 +32,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech. - `input`(required): Audio file to recognize. - `model`: Model type of asr task. Default: `conformer_wenetspeech`. - `lang`: Model language. Default: `zh`. - - `sr`: Sample rate of the model. Default: `16000`. + - `sample_rate`: Sample rate of the model. Default: `16000`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. @@ -68,7 +68,7 @@ wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech. 
### 4.Pretrained Models -Here is a list of pretrained models released by PaddleSpeech and can be used by command and python api: +Here is a list of pretrained models released by PaddleSpeech that can be used by command and python api: | Model | Language | Sample Rate | :--- | :---: | :---: | diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md new file mode 100644 index 000000000..b2f29168a --- /dev/null +++ b/demos/speech_translation/README.md @@ -0,0 +1,77 @@ +# Speech Translation + +## Introduction +Speech translation is the process by which conversational spoken phrases are instantly translated and spoken aloud in a second language. + +This demo is an implementation to recognize text from a specific audio file and translate it to the target language. It can be done by a single command or a few lines in Python using `PaddleSpeech`. + +## Usage +### 1. Installation +```bash +pip install paddlespeech +``` + +### 2. Prepare Input File +The input of this demo should be a WAV file (`.wav`). + +Here are sample files for this demo that can be downloaded: +```bash +wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +``` + +### 3. Usage +- Command Line (Recommended) + ```bash + paddlespeech st --input ./en.wav + ``` + Usage: + ```bash + paddlespeech st --help + ``` + Arguments: + - `input` (required): Audio file to recognize and translate. + - `model`: Model type of st task. Default: `fat_st_ted`. + - `src_lang`: Source language. Default: `en`. + - `tgt_lang`: Target language. Default: `zh`. + - `sample_rate`: Sample rate of the model. Default: `16000`. + - `config`: Config of st task. Use pretrained model when it is None. Default: `None`. + - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. + - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. 
+ + Output: + ```bash + [2021-12-09 11:13:03,178] [ INFO] [utils.py] [L225] - ST Result: ['我 在 这栋 建筑 的 古老 门上 敲门 。'] + ``` + +- Python API + ```python + import paddle + from paddlespeech.cli import STExecutor + + st_executor = STExecutor() + text = st_executor( + model='fat_st_ted', + src_lang='en', + tgt_lang='zh', + sample_rate=16000, + config=None, # Set `config` and `ckpt_path` to None to use pretrained model. + ckpt_path=None, + audio_file='./en.wav', + device=paddle.get_device()) + print('ST Result: \n{}'.format(text)) + ``` + + Output: + ```bash + ST Result: + ['我 在 这栋 建筑 的 古老 门上 敲门 。'] + ``` + + +### 4. Pretrained Models + +Here is a list of pretrained models released by PaddleSpeech that can be used from the command line and the Python API: + +| Model | Source Language | Target Language | +| :--- | :---: | :---: | +| fat_st_ted | en | zh | diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 1d235201d..1e59f015a 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -88,6 +88,7 @@ class ASRExecutor(BaseExecutor): '--model', type=str, default='conformer_wenetspeech', + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], help='Choose model type of asr task.') self.parser.add_argument( '--lang', @@ -95,7 +96,7 @@ class ASRExecutor(BaseExecutor): default='zh', help='Choose model language. 
zh or en') self.parser.add_argument( - "--sr", + "--sample_rate", type=int, default=16000, choices=[8000, 16000], @@ -200,8 +201,8 @@ class ASRExecutor(BaseExecutor): raise Exception("wrong type") # Enter the path of model root - model_name = ''.join( - model_type.split('_')[:-1]) # model_type: {model_name}_{dataset} + model_name = model_type[:model_type.rindex( + '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) model_conf = self.config.model logger.info(model_conf) @@ -314,7 +315,7 @@ class ASRExecutor(BaseExecutor): num_processes=cfg.num_proc_bsearch) self._outputs["result"] = result_transcripts[0] - elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + elif "conformer" in model_type or "transformer" in model_type: result_transcripts = self.model.decode( audio, audio_len, @@ -419,7 +420,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang - sample_rate = parser_args.sr + sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 0b4982d15..b73d16679 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -81,6 +81,7 @@ class CLSExecutor(BaseExecutor): '--model', type=str, default='panns_cnn14', + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], help='Choose model type of cls task.') self.parser.add_argument( '--config', @@ -250,7 +251,6 @@ class CLSExecutor(BaseExecutor): Python API to call an executor. 
""" audio_file = os.path.abspath(audio_file) - # self._check(audio_file, sample_rate) paddle.set_device(device) self._init_from_path(model, config, ckpt_path, label_file) self.preprocess(audio_file) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 534b9e3b9..d7b53a072 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -23,9 +23,6 @@ import numpy as np import paddle import soundfile from kaldiio import WriteHelper -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ..executor import BaseExecutor @@ -33,11 +30,14 @@ from ..utils import cli_register from ..utils import download_and_decompress from ..utils import logger from ..utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["STExecutor"] pretrained_models = { - "fat_st_ted_en-zh": { + "fat_st_ted-en-zh": { "url": "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz", "md5": @@ -49,7 +49,7 @@ pretrained_models = { } } -model_alias = {"fat_st_ted": "paddlespeech.s2t.models.u2_st:U2STModel"} +model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"} kaldi_bins = { "url": @@ -70,9 +70,10 @@ class STExecutor(BaseExecutor): self.parser.add_argument( "--input", type=str, required=True, help="Audio file to translate.") self.parser.add_argument( - "--model_type", + "--model", type=str, default="fat_st_ted", + choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], help="Choose model type of st task.") self.parser.add_argument( "--src_lang", @@ -91,7 +92,7 @@ class STExecutor(BaseExecutor): choices=[16000], help='Choose the audio sample rate of the 
model. 8000 or 16000') self.parser.add_argument( - "--cfg_path", + "--config", type=str, default=None, help="Config of st task. Use deault config when it is None.") @@ -150,7 +151,7 @@ class STExecutor(BaseExecutor): return if cfg_path is None or ckpt_path is None: - tag = model_type + "_" + src_lang + "-" + tgt_lang + tag = model_type + "-" + src_lang + "-" + tgt_lang res_path = self._get_pretrained_path(tag) self.cfg_path = os.path.join(res_path, pretrained_models[tag]["cfg_path"]) @@ -186,7 +187,9 @@ class STExecutor(BaseExecutor): model_conf = self.config.model logger.info(model_conf) - model_class = dynamic_import(model_type, model_alias) + model_name = model_type[:model_type.rindex( + '_')] # model_type: {model_name}_{dataset} + model_class = dynamic_import(model_name, model_alias) self.model = model_class.from_config(model_conf) self.model.eval() @@ -213,7 +216,7 @@ class STExecutor(BaseExecutor): audio_file = os.path.abspath(wav_file) logger.info("Preprocess audio_file:" + audio_file) - if model_type == "fat_st_ted": + if "fat_st" in model_type: cmvn = self.config.collator.cmvn_path utt_name = "_tmp" @@ -321,25 +324,25 @@ class STExecutor(BaseExecutor): """ parser_args = self.parser.parse_args(argv) - model_type = parser_args.model_type + model = parser_args.model src_lang = parser_args.src_lang tgt_lang = parser_args.tgt_lang sample_rate = parser_args.sample_rate - cfg_path = parser_args.cfg_path + config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input device = parser_args.device try: - res = self(model_type, src_lang, tgt_lang, sample_rate, cfg_path, + res = self(model, src_lang, tgt_lang, sample_rate, config, ckpt_path, audio_file, device) logger.info("ST Result: {}".format(res)) return True except Exception as e: - print(e) + logger.exception(e) return False - def __call__(self, model_type, src_lang, tgt_lang, sample_rate, cfg_path, + def __call__(self, model, src_lang, tgt_lang, sample_rate, config, ckpt_path, 
audio_file, device): """ Python API to call an executor. @@ -347,10 +350,9 @@ class STExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate) paddle.set_device(device) - self._init_from_path(model_type, src_lang, tgt_lang, cfg_path, - ckpt_path) - self.preprocess(audio_file, model_type) - self.infer(model_type) - res = self.postprocess(model_type) + self._init_from_path(model, src_lang, tgt_lang, config, ckpt_path) + self.preprocess(audio_file, model) + self.infer(model) + res = self.postprocess(model) return res