refactor: update params/input/output/namestyle

5 years ago · 20d88ec673
parent f5c61ced28 cca681f6d3
commit 20d88ec673
14 changed files with 479 additions and 71 deletions
--- a/docs/source/tts/models_introduction.md
+++ b/docs/source/tts/models_introduction.md
@ -251,8 +251,10 @@ Vocoders based on neural networks usually is speech synthesis, which learns the
 - GAN
    - WaveGAN
    - **Parallel WaveGAN**
-    - MelGAN
-    - HiFi-GAN
+    - **MelGAN**
+    - **Style MelGAN**
+    - **Multi Band MelGAN**
+    - **HiFi GAN**
 - VAE
    - Wave-VAE
 - Diffusion
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@ -203,7 +203,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ```

 ## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip).
+Pretrained FastSpeech2 models with no silence at the edges of the audio:
+- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
+- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)

 Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).

--- a/paddlespeech/cli/README.md
+++ b/paddlespeech/cli/README.md
@ -7,3 +7,6 @@

 ## ASR
 `paddlespeech asr --input ./test_audio.wav`
+
+ ## Multi-label Classification
+ `paddlespeech cls --input ./test_audio.wav`
--- a/paddlespeech/cli/init.py
+++ b/paddlespeech/cli/init.py
@ -14,4 +14,5 @@
 from .asr import ASRExecutor
 from .base_commands import BaseCommand
 from .base_commands import HelpCommand
+from .cls import CLSExecutor
 from .st import STExecutor
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@ -39,7 +39,11 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 __all__ = ['ASRExecutor']

 pretrained_models = {
-    "wenetspeech_zh_16k": {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
+    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+    "conformer_wenetspeech-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz',
        'md5':
@ -49,7 +53,7 @@ pretrained_models = {
        'ckpt_path':
        'exp/conformer/checkpoints/wenetspeech',
    },
-    "transformer_zh_16k": {
+    "transformer_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz',
        'md5':
@ -83,7 +87,7 @@ class ASRExecutor(BaseExecutor):
        self.parser.add_argument(
            '--model',
            type=str,
-            default='wenetspeech',
+            default='conformer_wenetspeech',
            help='Choose model type of asr task.')
        self.parser.add_argument(
            '--lang',
@ -137,9 +141,13 @@ class ASRExecutor(BaseExecutor):
        """
            Init model and other resources from a specific path.
        """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
        if cfg_path is None or ckpt_path is None:
            sample_rate_str = '16k' if sample_rate == 16000 else '8k'
-            tag = model_type + '_' + lang + '_' + sample_rate_str
+            tag = model_type + '-' + lang + '-' + sample_rate_str
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
            self.cfg_path = os.path.join(res_path,
@ -161,7 +169,7 @@ class ASRExecutor(BaseExecutor):
        self.config.decoding.decoding_method = "attention_rescoring"

        with UpdateConfig(self.config):
-            if model_type == "ds2_online" or model_type == "ds2_offline":
+            if "ds2_online" in model_type or "ds2_offline" in model_type:
                from paddlespeech.s2t.io.collator import SpeechCollator
                self.config.collator.vocab_filepath = os.path.join(
                    res_path, self.config.collator.vocab_filepath)
@ -174,7 +182,7 @@ class ASRExecutor(BaseExecutor):
                    spm_model_prefix=self.config.collator.spm_model_prefix)
                self.config.model.input_dim = self.collate_fn_test.feature_size
                self.config.model.output_dim = text_feature.vocab_size
-            elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
                self.config.collator.vocab_filepath = os.path.join(
                    res_path, self.config.collator.vocab_filepath)
                self.config.collator.augmentation_config = os.path.join(
@ -192,7 +200,9 @@ class ASRExecutor(BaseExecutor):
                raise Exception("wrong type")
        # Enter the path of model root

-        model_class = dynamic_import(model_type, model_alias)
+        model_name = ''.join(
+            model_type.split('_')[:-1])  # model_type: {model_name}_{dataset}
+        model_class = dynamic_import(model_name, model_alias)
        model_conf = self.config.model
        logger.info(model_conf)
        model = model_class.from_config(model_conf)
@ -213,7 +223,7 @@ class ASRExecutor(BaseExecutor):
        logger.info("Preprocess audio_file:" + audio_file)

        # Get the object for feature extraction
-        if model_type == "ds2_online" or model_type == "ds2_offline":
+        if "ds2_online" in model_type or "ds2_offline" in model_type:
            audio, _ = self.collate_fn_test.process_utterance(
                audio_file=audio_file, transcript=" ")
            audio_len = audio.shape[0]
@ -225,7 +235,7 @@ class ASRExecutor(BaseExecutor):
            self._inputs["audio_len"] = audio_len
            logger.info(f"audio feat shape: {audio.shape}")

-        elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+        elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
            logger.info("get the preprocess conf")
            preprocess_conf_file = self.config.collator.augmentation_config
            # redirect the cmvn path
@ -289,7 +299,7 @@ class ASRExecutor(BaseExecutor):
        cfg = self.config.decoding
        audio = self._inputs["audio"]
        audio_len = self._inputs["audio_len"]
-        if model_type == "ds2_online" or model_type == "ds2_offline":
+        if "ds2_online" in model_type or "ds2_offline" in model_type:
            result_transcripts = self.model.decode(
                audio,
                audio_len,
@ -304,7 +314,7 @@ class ASRExecutor(BaseExecutor):
                num_processes=cfg.num_proc_bsearch)
            self._outputs["result"] = result_transcripts[0]

-        elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech":
+        elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
            result_transcripts = self.model.decode(
                audio,
                audio_len,
@ -361,7 +371,7 @@ class ASRExecutor(BaseExecutor):
            audio, audio_sample_rate = soundfile.read(
                audio_file, dtype="int16", always_2d=True)
        except Exception as e:
-            logger.error(str(e))
+            logger.exception(e)
            logger.error(
                "can not open the audio file, please check the audio file format is 'wav'. \n \
                 you can try to use sox to change the file format.\n \
@ -421,7 +431,7 @@ class ASRExecutor(BaseExecutor):
            logger.info('ASR Result: {}'.format(res))
            return True
        except Exception as e:
-            print(e)
+            logger.exception(e)
            return False

    def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file,
--- a/paddlespeech/cli/cls/init.py
+++ b/paddlespeech/cli/cls/init.py
--- a/paddlespeech/cli/cls/init.py
+++ b/paddlespeech/cli/cls/init.py
@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import CLSExecutor
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@ -0,0 +1,260 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import paddle
+import yaml
+
+from ..executor import BaseExecutor
+from ..utils import cli_register
+from ..utils import download_and_decompress
+from ..utils import logger
+from ..utils import MODEL_HOME
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
+
+__all__ = ['CLSExecutor']
+
+# Pretrained resources keyed by tag. Each entry gives the archive URL, its md5,
+# and the config / checkpoint / label file names that are joined onto the
+# decompressed archive directory when the model is initialized.
+pretrained_models = {
+    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
+    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
+    # Command line and python api use "{model_name}[_{dataset}]" as --model; for this
+    # task the "-32k" suffix is appended when the tag is built, usage:
+    # "paddlespeech cls --model panns_cnn14 --input ./input.wav"
+    "panns_cnn6-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
+        'md5': '4cf09194a95df024fd12f84712cf0f9c',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn6.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn10-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
+        'md5': 'cb8427b22176cc2116367d14847f5413',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn10.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+    "panns_cnn14-32k": {
+        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
+        'md5': 'e3b9b5614a1595001161d0ab95edee97',
+        'cfg_path': 'panns.yaml',
+        'ckpt_path': 'cnn14.pdparams',
+        'label_file': 'audioset_labels.txt',
+    },
+}
+
+# Lookup table handed to dynamic_import together with the --model value; the
+# values look like "module.path:ClassName" import targets.
+model_alias = {
+    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
+    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
+    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
+}
+
+
+@cli_register(
+    name='paddlespeech.cls', description='Audio classification infer command.')
+class CLSExecutor(BaseExecutor):
+    """
+        Audio classification executor.
+
+        Usable from the command line through `execute` or as a Python
+        callable through `__call__`. A call loads the model resources,
+        extracts log-mel features from the audio file, runs the model and
+        formats the top-k labels as text.
+    """
+
+    def __init__(self):
+        super(CLSExecutor, self).__init__()
+
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech.cls', add_help=True)
+        self.parser.add_argument(
+            '--input', type=str, required=True, help='Audio file to classify.')
+        self.parser.add_argument(
+            '--model',
+            type=str,
+            default='panns_cnn14',
+            help='Choose model type of cls task.')
+        self.parser.add_argument(
+            '--config',
+            type=str,
+            default=None,
+            help='Config of cls task. Use deault config when it is None.')
+        self.parser.add_argument(
+            '--ckpt_path',
+            type=str,
+            default=None,
+            help='Checkpoint file of model.')
+        self.parser.add_argument(
+            '--label_file',
+            type=str,
+            default=None,
+            help='Label file of cls task.')
+        self.parser.add_argument(
+            '--topk',
+            type=int,
+            default=1,
+            help='Return topk scores of classification result.')
+        self.parser.add_argument(
+            '--device',
+            type=str,
+            default=paddle.get_device(),
+            help='Choose device to execute model inference.')
+
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        """
+            Download and returns pretrained resources path of current task.
+
+            `tag` must be a key of `pretrained_models`; the archive is stored
+            under MODEL_HOME/<tag> and the absolute decompressed path is returned.
+        """
+        assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
+            tag)
+
+        res_path = os.path.join(MODEL_HOME, tag)
+        decompressed_path = download_and_decompress(pretrained_models[tag],
+                                                    res_path)
+        decompressed_path = os.path.abspath(decompressed_path)
+        logger.info(
+            'Use pretrained model stored in: {}'.format(decompressed_path))
+
+        return decompressed_path
+
+    def _init_from_path(self,
+                        model_type: str='panns_cnn14',
+                        cfg_path: Optional[os.PathLike]=None,
+                        ckpt_path: Optional[os.PathLike]=None,
+                        label_file: Optional[os.PathLike]=None):
+        """
+            Init model and other resources from a specific path.
+
+            The pretrained resources of `model_type` are used unless
+            `cfg_path`, `ckpt_path` and `label_file` are all given.
+            Returns immediately when a model has already been loaded.
+        """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
+        # The user-supplied branch needs all three files, so a missing config
+        # must also fall back to the pretrained resources instead of reaching
+        # os.path.abspath(None).
+        if cfg_path is None or ckpt_path is None or label_file is None:
+            tag = model_type + '-' + '32k'  # panns_cnn14-32k
+            self.res_path = self._get_pretrained_path(tag)
+            resources = pretrained_models[tag]
+            self.cfg_path = os.path.join(self.res_path, resources['cfg_path'])
+            self.label_file = os.path.join(self.res_path,
+                                           resources['label_file'])
+            self.ckpt_path = os.path.join(self.res_path,
+                                          resources['ckpt_path'])
+        else:
+            self.cfg_path = os.path.abspath(cfg_path)
+            self.label_file = os.path.abspath(label_file)
+            self.ckpt_path = os.path.abspath(ckpt_path)
+
+        # config
+        with open(self.cfg_path, 'r') as f:
+            self._conf = yaml.safe_load(f)
+
+        # labels: one label per line, index in the list is the class index
+        with open(self.label_file, 'r') as f:
+            self._label_list = [line.strip() for line in f]
+
+        # model
+        model_class = dynamic_import(model_type, model_alias)
+        model_dict = paddle.load(self.ckpt_path)
+        self.model = model_class(extract_embedding=False)
+        self.model.set_state_dict(model_dict)
+        self.model.eval()
+
+    def preprocess(self, audio_file: Union[str, os.PathLike]):
+        """
+            Input preprocess and return paddle.Tensor stored in self.input.
+            Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
+
+            Loads `audio_file` as mono float32 at the configured sample rate,
+            computes log-mel features from the 'feature' section of the config
+            and stores them in self._inputs['feats'].
+        """
+        feat_conf = self._conf['feature']
+        logger.info(feat_conf)
+        waveform, _ = load(
+            file=audio_file,
+            sr=feat_conf['sample_rate'],
+            mono=True,
+            dtype='float32')
+        logger.info("Preprocessing audio_file:" + audio_file)
+
+        # Feature extraction
+        feature_extractor = LogMelSpectrogram(
+            sr=feat_conf['sample_rate'],
+            n_fft=feat_conf['n_fft'],
+            hop_length=feat_conf['hop_length'],
+            window=feat_conf['window'],
+            win_length=feat_conf['window_length'],
+            f_min=feat_conf['f_min'],
+            f_max=feat_conf['f_max'],
+            n_mels=feat_conf['n_mels'], )
+        # A single tensor conversion is enough; unsqueeze adds the batch axis.
+        batch = paddle.to_tensor(waveform).unsqueeze(0)
+        feats = feature_extractor(batch)
+        self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze(
+            1)  # [B, N, T] -> [B, 1, T, N]
+
+    @paddle.no_grad()
+    def infer(self):
+        """
+            Model inference and result stored in self.output.
+
+            Reads self._inputs['feats'] and writes self._outputs['logits'].
+        """
+        self._outputs['logits'] = self.model(self._inputs['feats'])
+
+    def _generate_topk_label(self, result: np.ndarray, topk: int) -> str:
+        """
+            Format the `topk` highest scores of `result` as "label: score"
+            lines, best score first. Index i of `result` is matched with
+            entry i of the loaded label list.
+        """
+        assert topk <= len(
+            self._label_list), 'Value of topk is larger than number of labels.'
+
+        # Negating the scores makes the ascending argsort return the best first.
+        topk_idx = (-result).argsort()[:topk]
+        return ''.join(f'{self._label_list[idx]}: {result[idx]}\n'
+                       for idx in topk_idx)
+
+    def postprocess(self, topk: int) -> Union[str, os.PathLike]:
+        """
+            Output postprocess and return human-readable results such as texts and audio files.
+        """
+        return self._generate_topk_label(
+            result=self._outputs['logits'].squeeze(0).numpy(), topk=topk)
+
+    def execute(self, argv: List[str]) -> bool:
+        """
+            Command line entry.
+
+            Parses `argv`, runs the classification and logs the result.
+            Returns True on success and False when any exception is raised
+            (the exception is logged, not propagated).
+        """
+        parser_args = self.parser.parse_args(argv)
+
+        try:
+            # Keyword arguments keep ckpt_path and label_file from being
+            # swapped on their way into __call__.
+            res = self(
+                model=parser_args.model,
+                config=parser_args.config,
+                ckpt_path=parser_args.ckpt_path,
+                label_file=parser_args.label_file,
+                audio_file=parser_args.input,
+                topk=parser_args.topk,
+                device=parser_args.device)
+            logger.info('CLS Result:\n{}'.format(res))
+            return True
+        except Exception as e:
+            logger.exception(e)
+            return False
+
+    def __call__(self, model, config, ckpt_path, label_file, audio_file, topk,
+                 device):
+        """
+            Python API to call an executor.
+
+            Runs init -> preprocess -> infer -> postprocess on `audio_file`
+            using `device`, and returns the formatted top-k result string.
+        """
+        audio_file = os.path.abspath(audio_file)
+        paddle.set_device(device)
+        self._init_from_path(model, config, ckpt_path, label_file)
+        self.preprocess(audio_file)
+        self.infer()
+        res = self.postprocess(topk)  # Retrieve result of cls.
+
+        return res
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@ -21,6 +21,7 @@ from typing import Union
 import kaldi_io
 import numpy as np
 import paddle
+import soundfile
 from kaldiio import WriteHelper
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@ -36,19 +37,19 @@ from ..utils import MODEL_HOME
 __all__ = ["STExecutor"]

 pretrained_models = {
-    "fat_st_ted_en_zh": {
+    "fat_st_ted_en-zh": {
        "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_mtl.model.tar.gz",
+        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz",
        "md5":
-        "210b8eacc390d9965334fa8e96c49a13",
+        "fa0a7425b91b4f8d259c70b2aca5ae67",
        "cfg_path":
        "conf/transformer_mtl_noam.yaml",
        "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted_en_zh",
+        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
    }
 }

-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
+model_alias = {"fat_st_ted": "paddlespeech.s2t.models.u2_st:U2STModel"}

 kaldi_bins = {
    "url":
@ -69,17 +70,28 @@ class STExecutor(BaseExecutor):
        self.parser.add_argument(
            "--input", type=str, required=True, help="Audio file to translate.")
        self.parser.add_argument(
-            "--model",
+            "--model_type",
            type=str,
-            default="fat_st",
+            default="fat_st_ted",
            help="Choose model type of st task.")
        self.parser.add_argument(
-            "--lang",
+            "--src_lang",
            type=str,
-            default="ted_en_zh",
-            help="Choose model language.")
+            default="en",
+            help="Choose model source language.")
        self.parser.add_argument(
-            "--config",
+            "--tgt_lang",
+            type=str,
+            default="zh",
+            help="Choose model target language.")
+        self.parser.add_argument(
+            "--sample_rate",
+            type=int,
+            default=16000,
+            choices=[16000],
+            help='Choose the audio sample rate of the model. 8000 or 16000')
+        self.parser.add_argument(
+            "--cfg_path",
            type=str,
            default=None,
            help="Config of st task. Use deault config when it is None.")
@ -117,20 +129,28 @@ class STExecutor(BaseExecutor):
        decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info("Kaldi_bins stored in: {}".format(decompressed_path))
-        os.environ['LD_LIBRARY_PATH'] += f':{decompressed_path}'
+        if "LD_LIBRARY_PATH" in os.environ:
+            os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}"
+        else:
+            os.environ["LD_LIBRARY_PATH"] = f"{decompressed_path}"
        os.environ["PATH"] += f":{decompressed_path}"
        return decompressed_path

    def _init_from_path(self,
-                        model_type: str="fat_st",
-                        lang: str="zh",
+                        model_type: str="fat_st_ted",
+                        src_lang: str="en",
+                        tgt_lang: str="zh",
                        cfg_path: Optional[os.PathLike]=None,
                        ckpt_path: Optional[os.PathLike]=None):
        """
            Init model and other resources from a specific path.
        """
+        if hasattr(self, 'model'):
+            logger.info('Model had been initialized.')
+            return
+
        if cfg_path is None or ckpt_path is None:
-            tag = model_type + "_" + lang
+            tag = model_type + "_" + src_lang + "-" + tgt_lang
            res_path = self._get_pretrained_path(tag)
            self.cfg_path = os.path.join(res_path,
                                         pretrained_models[tag]["cfg_path"])
@ -171,13 +191,20 @@ class STExecutor(BaseExecutor):
        self.model.eval()

        # load model
-        params_path = self.ckpt_path + ".pdparams"
+        params_path = self.ckpt_path
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)

        # set kaldi bins
        self._set_kaldi_bins()

+    def _check(self, audio_file: str, sample_rate: int):
+        """
+            Verify that `audio_file` was recorded at `sample_rate`.
+
+            Raises an Exception("invalid sample rate") when the sample rate
+            reported by soundfile differs from the expected one.
+        """
+        _, audio_sample_rate = soundfile.read(
+            audio_file, dtype="int16", always_2d=True)
+        if audio_sample_rate != sample_rate:
+            # Raising is the only exit path here; nothing after it would run.
+            raise Exception("invalid sample rate")
+
    def preprocess(self, wav_file: Union[str, os.PathLike], model_type: str):
        """
            Input preprocess and return paddle.Tensor stored in self.input.
@ -186,7 +213,7 @@ class STExecutor(BaseExecutor):
        audio_file = os.path.abspath(wav_file)
        logger.info("Preprocess audio_file:" + audio_file)

-        if model_type == "fat_st":
+        if model_type == "fat_st_ted":
            cmvn = self.config.collator.cmvn_path
            utt_name = "_tmp"

@ -198,7 +225,8 @@ class STExecutor(BaseExecutor):
            fbank_extract_process = subprocess.Popen(
                fbank_extract_command,
                stdin=subprocess.PIPE,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
            fbank_extract_process.stdin.write(
                f"{utt_name} {wav_file}".encode("utf8"))
            fbank_extract_process.stdin.close()
@ -207,14 +235,18 @@ class STExecutor(BaseExecutor):

            extract_command = ["compute-kaldi-pitch-feats", "scp:-", "ark:-"]
            pitch_extract_process = subprocess.Popen(
-                extract_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+                extract_command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
            pitch_extract_process.stdin.write(
                f"{utt_name} {wav_file}".encode("utf8"))
            process_command = ["process-kaldi-pitch-feats", "ark:", "ark:-"]
            pitch_process = subprocess.Popen(
                process_command,
                stdin=pitch_extract_process.stdout,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
            pitch_extract_process.stdin.close()
            pitch_feat = dict(
                kaldi_io.read_mat_ark(pitch_process.stdout))[utt_name]
@ -228,19 +260,19 @@ class STExecutor(BaseExecutor):
                "ark:-"
            ]
            cmvn_process = subprocess.Popen(
-                cmvn_command, stdout=subprocess.PIPE)
+                cmvn_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            process_command = [
                "copy-feats", "--compress=true", "ark:-", "ark:-"
            ]
            process = subprocess.Popen(
                process_command,
                stdin=cmvn_process.stdout,
-                stdout=subprocess.PIPE)
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE)
            norm_feat = dict(kaldi_io.read_mat_ark(process.stdout))[utt_name]
-            self.audio = paddle.to_tensor(norm_feat).unsqueeze(0)
-            self.audio_len = paddle.to_tensor(
-                self.audio.shape[1], dtype="int64")
-            logger.info(f"audio feat shape: {self.audio.shape}")
+            self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
+            self._inputs["audio_len"] = paddle.to_tensor(
+                self._inputs["audio"].shape[1], dtype="int64")
        else:
            raise ValueError("Wrong model type.")

@ -250,9 +282,9 @@ class STExecutor(BaseExecutor):
            Model inference and result stored in self.output.
        """
        cfg = self.config.decoding
-        audio = self.audio
-        audio_len = self.audio_len
-        if model_type == "fat_st":
+        audio = self._inputs["audio"]
+        audio_len = self._inputs["audio_len"]
+        if model_type == "fat_st_ted":
            hyps = self.model.decode(
                audio,
                audio_len,
@ -270,7 +302,7 @@ class STExecutor(BaseExecutor):
                decoding_chunk_size=cfg.decoding_chunk_size,
                num_decoding_left_chunks=cfg.num_decoding_left_chunks,
                simulate_streaming=cfg.simulate_streaming)
-            self.result_transcripts = hyps
+            self._outputs["result"] = hyps
        else:
            raise ValueError("Wrong model type.")

@ -278,8 +310,8 @@ class STExecutor(BaseExecutor):
        """
            Output postprocess and return human-readable results such as texts and audio files.
        """
-        if model_type == "fat_st":
-            return self.result_transcripts
+        if model_type == "fat_st_ted":
+            return self._outputs["result"]
        else:
            raise ValueError("Wrong model type.")

@ -289,30 +321,36 @@ class STExecutor(BaseExecutor):
        """
        parser_args = self.parser.parse_args(argv)

-        model = parser_args.model
-        lang = parser_args.lang
-        config = parser_args.config
+        model_type = parser_args.model_type
+        src_lang = parser_args.src_lang
+        tgt_lang = parser_args.tgt_lang
+        sample_rate = parser_args.sample_rate
+        cfg_path = parser_args.cfg_path
        ckpt_path = parser_args.ckpt_path
        audio_file = parser_args.input
        device = parser_args.device

        try:
-            res = self(model, lang, config, ckpt_path, audio_file, device)
-            logger.info('ST Result: {}'.format(res))
+            res = self(model_type, src_lang, tgt_lang, sample_rate, cfg_path,
+                       ckpt_path, audio_file, device)
+            logger.info("ST Result: {}".format(res))
            return True
        except Exception as e:
            print(e)
            return False

-    def __call__(self, model, lang, config, ckpt_path, audio_file, device):
+    def __call__(self, model_type, src_lang, tgt_lang, sample_rate, cfg_path,
+                 ckpt_path, audio_file, device):
        """
            Python API to call an executor.
        """
        audio_file = os.path.abspath(audio_file)
+        self._check(audio_file, sample_rate)
        paddle.set_device(device)
-        self._init_from_path(model, lang, config, ckpt_path)
-        self.preprocess(audio_file, model)
-        self.infer(model)
-        res = self.postprocess(model)
+        self._init_from_path(model_type, src_lang, tgt_lang, cfg_path,
+                             ckpt_path)
+        self.preprocess(audio_file, model_type)
+        self.infer(model_type)
+        res = self.postprocess(model_type)

        return res
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
+import hashlib
 import logging
 import os
+import tarfile
+import zipfile
 from typing import Any
 from typing import Dict
+from typing import List

 from paddle.framework import load
 from paddle.utils import download
@ -55,12 +59,69 @@ def get_command(name: str) -> Any:
    return com['_entry']


-def decompress(file: str) -> os.PathLike:
-    """
-    Extracts all files from a compressed file.
-    """
-    assert os.path.isfile(file), "File: {} not exists.".format(file)
-    return download._decompress(file)
+def _md5check(filepath: os.PathLike, md5sum: str) -> bool:
+    """
+    Compare the md5 digest of the file at `filepath` with `md5sum`.
+
+    The file is hashed in 4096-byte blocks. Returns True when the hex
+    digests match and False otherwise; both outcomes are logged.
+    """
+    logger.info("File {} md5 checking...".format(filepath))
+    digest = hashlib.md5()
+    with open(filepath, 'rb') as stream:
+        while True:
+            block = stream.read(4096)
+            if block == b"":
+                break
+            digest.update(block)
+    calc_md5sum = digest.hexdigest()
+
+    matched = calc_md5sum == md5sum
+    if matched:
+        logger.info("File {} md5 check passed.".format(filepath))
+    else:
+        logger.info("File {} md5 check failed, {}(calc) != "
+                    "{}(base)".format(filepath, calc_md5sum, md5sum))
+    return matched
+
+
+def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
+    """
+    Work out where the archive at `filepath` ends up after decompression.
+
+    The member names of a tar or zip archive are listed without extracting
+    anything. The returned path is located next to the archive:
+      - a single root-level file: that file's name,
+      - a single top-level directory: derived from the first member name,
+      - anything else: the archive name without its last extension.
+    If `filepath` is neither a tar nor a zip archive, its directory is returned.
+    """
+    file_dir = os.path.dirname(filepath)
+
+    # Context managers close the archive even if listing the members fails.
+    if tarfile.is_tarfile(filepath):
+        with tarfile.open(filepath, "r:*") as files:
+            file_list = files.getnames()
+    elif zipfile.is_zipfile(filepath):
+        with zipfile.ZipFile(filepath, 'r') as files:
+            file_list = files.namelist()
+    else:
+        return file_dir
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+
+    return os.path.join(file_dir, rootpath)
+
+
+def _is_a_single_file(file_list: List[os.PathLike]) -> bool:
+    """
+    Return True when the archive listing is exactly one root-level file.
+
+    That means `file_list` has a single entry and the entry contains no
+    `os.sep`, i.e. it is not nested inside a directory.
+    """
+    # str.find() reports a missing separator as -1 and never goes lower, so
+    # the absence of os.sep is tested directly.
+    return len(file_list) == 1 and os.sep not in file_list[0]
+
+
+def _is_a_single_dir(file_list: List[os.PathLike]) -> bool:
+    """
+    Return True when every entry of `file_list` shares the same first path
+    component, i.e. the archive unpacks into one top-level directory.
+
+    Separators are normalized to `os.sep` before comparing.
+    """
+    top_levels = []
+    for entry in file_list:
+        # Only one separator style is rewritten per entry: '/' takes
+        # precedence, '\\' is handled when no '/' is present.
+        foreign_sep = '/' if '/' in entry else '\\'
+        normalized = entry.replace(foreign_sep, os.sep)
+        top_levels.append(normalized.split(os.sep)[0])
+
+    first = top_levels[0]
+    return all(name == first for name in top_levels[1:])


 def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:
@ -72,7 +133,17 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike:

    assert 'url' in archive and 'md5' in archive, \
        'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys()))
-    return download.get_path_from_url(archive['url'], path, archive['md5'])
+
+    # The archive is expected at <path>/<last component of the url>.
+    filepath = os.path.join(path, os.path.basename(archive['url']))
+    # Reuse an already-present archive only when its md5 check passes.
+    if os.path.isfile(filepath) and _md5check(filepath, archive['md5']):
+        uncompress_path = _get_uncompress_path(filepath)
+        # Decompress only when the predicted output directory is missing.
+        if not os.path.isdir(uncompress_path):
+            download._decompress(filepath)
+    else:
+        # No usable local archive: hand over to download.get_path_from_url.
+        uncompress_path = download.get_path_from_url(archive['url'], path,
+                                                     archive['md5'])
+
+    return uncompress_path


 def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike:
@ -128,11 +199,16 @@ class Logger(object):
            'EVAL': 22,
            'WARNING': 30,
            'ERROR': 40,
-            'CRITICAL': 50
+            'CRITICAL': 50,
+            'EXCEPTION': 100,
        }
        for key, level in log_config.items():
            logging.addLevelName(level, key)
-            self.__dict__[key.lower()] = functools.partial(self.__call__, level)
+            if key == 'EXCEPTION':
+                self.__dict__[key.lower()] = self.logger.exception
+            else:
+                self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                               level)

        self.format = logging.Formatter(
            fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s'
--- a/requirements.txt
+++ b/requirements.txt
@ -14,6 +14,7 @@ loguru
 matplotlib
 nara_wpe
 nltk
+paddleaudio
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
--- a/setup.py
+++ b/setup.py
@ -43,6 +43,7 @@ requirements = {
        "nara_wpe",
        "nltk",
        "pandas",
+        "paddleaudio",
        "paddlespeech_ctcdecoders",
        "paddlespeech_feat",
        "praatio~=4.1",
@ -197,7 +198,7 @@ setup_info = dict(
        "pwgan",
        "gan",
    ],
-    python_requires='>=3.6',
+    python_requires='>=3.7',
    install_requires=requirements["install"],
    extras_require={
        'develop':
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@ -20,7 +20,7 @@ mkdir -p conf/benchmark
 cp conf/conformer.yaml  conf/benchmark/conformer.yaml
 sed -i "s/  accum_grad: 2/  accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
-bs_item=(16 30)
+bs_item=(16)
 config_path=conf/benchmark/conformer.yaml
 seed=0
 output=exp/conformer
--- a/tests/benchmark/pwgan/run_all.sh
+++ b/tests/benchmark/pwgan/run_all.sh
@ -38,7 +38,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
      model_mode_list=(pwgan)
      fp_item_list=(fp32)
      # 满 bs 是 26
-      bs_item_list=(6 26)
+      bs_item_list=(6)
      for model_mode in ${model_mode_list[@]}; do
            for fp_item in ${fp_item_list[@]}; do
            for bs_item in ${bs_item_list[@]}; do
@ -55,4 +55,4 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
                  done
            done
      done
-fi
+fi