support hubert cli

2 years ago · 6f3585b199
parent 7658e54c47
commit 6f3585b199
6 changed files with 76 additions and 50 deletions
--- a/examples/librispeech/asr4/README.md
+++ b/examples/librispeech/asr4/README.md
@ -163,7 +163,7 @@ using the `tar` scripts to unpack the model and then you can use the script to t
 For example:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+wget https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
 tar xzvf hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
 source path.sh
 # If you have process the data and get the manifest file， you can skip the following 2 steps
@ -184,7 +184,7 @@ In some situations, you want to use the trained model to do the inference for th
 ```
 you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model through the script below:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+wget https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
 tar xzvf hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
 ```
 You can download the audio demo:
--- a/paddlespeech/cli/ssl/infer.py
+++ b/paddlespeech/cli/ssl/infer.py
@ -150,10 +150,10 @@ class SSLExecutor(BaseExecutor):
                        model_prefix = 'wav2vec2ASR_librispeech'
                    elif lang == 'zh':
                        model_prefix = 'wav2vec2ASR_aishell1'
-                    tag = model_prefix  + '-' + lang + '-' + sample_rate_str
+                    tag = model_prefix + '-' + lang + '-' + sample_rate_str
                elif model_type == 'hubert':
                    if lang == 'en':
-                        model_prefix = 'hubertASR_librispeech_100'
+                        model_prefix = 'hubertASR_librispeech-100h'
                    elif lang == 'zh':
                        logger.error("zh hubertASR is not supported yet")
                    tag = model_prefix + '-' + lang + '-' + sample_rate_str
@ -185,16 +185,17 @@ class SSLExecutor(BaseExecutor):
                    self.text_feature = TextFeaturizer(
                        unit_type=self.config.unit_type,
                        vocab=self.config.vocab_filepath)
                    self.config.output_dim = len(self.config.vocab_filepath)
                elif lang == 'zh':
                    self.text_feature = AutoTokenizer.from_pretrained(
                        self.config.tokenizer)
                    self.config.output_dim = self.text_feature.vocab_size
                self.config.decode.decoding_method = decode_method
-            model_name = model_type[:model_type.rindex(
+            model_name = model_prefix[:model_prefix.rindex(
                '_')]  # model_type: {model_name}_{dataset}
        else:
            model_name = model_type
        model_class = self.task_resource.get_model_class(model_name)
        model_conf = self.config
        model = model_class.from_config(model_conf)
        self.model = model
@ -264,8 +265,7 @@ class SSLExecutor(BaseExecutor):
        audio = self._inputs["audio"]
        if task == 'asr':
            cfg = self.config.decode
-            logger.debug(
+            logger.debug(f"we will use the {model_type}ASR like model.")
                f"we will use the {model_type}ASR like model.")
            try:
                result_transcripts = self.model.decode(
                    audio,
@ -278,7 +278,8 @@ class SSLExecutor(BaseExecutor):
                logger.exception(e)
        else:
            logger.debug(
-                f"we will use the {model_type} like model to extract audio feature.")
+                f"we will use the {model_type} like model to extract audio feature."
            )
            try:
                out_feature = self.model(audio[:, :, 0])
                self._outputs["result"] = out_feature[0]
@ -455,7 +456,7 @@ class SSLExecutor(BaseExecutor):
        if rtf:
            k = self.__class__.__name__
            CLI_TIMER[k]['start'].append(time.time())
-        self.preprocess(model, audio_file)
+        self.preprocess(audio_file)
        self.infer(model, task)
        res = self.postprocess()  # Retrieve result of asr.
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@ -117,6 +117,38 @@ ssl_dynamic_pretrained_models = {
            'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
        },
    },
    "hubert-en-16k": {
        '1.4': {
            'url':
            'https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60_ckpt_1.4.0.model.tar.gz',
            'md5':
            'efecfb87a8718aa9253b7459c1fe9b54',
            'cfg_path':
            'model.yaml',
            'ckpt_path':
            'hubert-large-lv60',
            'model':
            'hubert-large-lv60.pdparams',
            'params':
            'hubert-large-lv60.pdparams',
        },
    },
    "hubertASR_librispeech-100h-en-16k": {
        '1.4': {
            'url':
            'https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz',
            'md5':
            '574cefd11aaef5737969ce22a7f33ea2',
            'cfg_path':
            'model.yaml',
            'ckpt_path':
            'exp/hubertASR/checkpoints/avg_1',
            'model':
            'exp/hubertASR/checkpoints/avg_1.pdparams',
            'params':
            'exp/hubertASR/checkpoints/avg_1.pdparams',
        },
    },
 }
 # ---------------------------------
@ -521,38 +553,6 @@ asr_onnx_pretrained_models = {
            '29e02312deb2e59b3c8686c7966d4fe3'
        },
    },
    "hubert-en-16k": {
        '1.4': {
            'url':
            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/hubert-large-lv60_ckpt_1.4.0.model.tar.gz',
            'md5':
            '9f0bc943adb822789bf61e674b229d17',
            'cfg_path':
            'model.yaml',
            'ckpt_path':
            'hubert-large-lv60',
            'model':
            'hubert-large-lv60.pdparams',
            'params':
            'hubert-large-lv60.pdparams',
        },
    },
    "hubertASR_librispeech_100-en-16k": {
        '1.4': {
            'url':
            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz',
            'md5':
            '9f0bc943adb822789bf61e674b229d17',
            'cfg_path':
            'model.yaml',
            'ckpt_path':
            'exp/wav2vec2ASR/checkpoints/avg_1',
            'model':
            'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
            'params':
            'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
        },
    },
 }
 whisper_dynamic_pretrained_models = {
--- a/paddlespeech/s2t/models/hubert/hubert_ASR.py
+++ b/paddlespeech/s2t/models/hubert/hubert_ASR.py
@ -44,13 +44,11 @@ class HubertASR(nn.Layer):
        init_type = config.get("init_type", None)
        with DefaultInitializerContext(init_type):
            self.config = config
            with open(config.vocab_filepath) as f:
                dicts = [symbol.strip() for symbol in f.readlines()]
            task_cfg = self.merge_with_parent(HubertPretrainingConfig,
                                              dict(self.config.task_cfg))
            model_cfg = self.merge_with_parent(HubertConfig,
                                               dict(self.config.model_cfg))
-            hubert = HubertModel(model_cfg, task_cfg, dicts)
+            hubert = HubertModel(model_cfg, task_cfg, [None])
            self.normalize_wav = config.normalize_wav
            self.output_norm = config.output_norm
@ -329,13 +327,12 @@ class HubertBase(nn.Layer):
    def __init__(self, config: dict):
        super().__init__()
-        with open(config.vocab_filepath) as f:
+        self.config = config
            dicts = [symbol.strip() for symbol in f.readlines()]
        task_cfg = self.merge_with_parent(HubertPretrainingConfig,
                                          dict(self.config.task_cfg))
        model_cfg = self.merge_with_parent(HubertConfig,
                                           dict(self.config.model_cfg))
-        hubert = HubertModel(model_cfg, task_cfg, dicts)
+        hubert = HubertModel(model_cfg, task_cfg, [None])
        self.hubert = hubert
    @classmethod
@ -351,6 +348,21 @@ class HubertBase(nn.Layer):
        model = cls(configs)
        return model
    def merge_with_parent(self, dc: dataclass, cfg: dict):
        assert is_dataclass(dc)
        assert type(cfg) == dict
        cfg = deepcopy(cfg)
        def fix_cfg(cfg):
            target_keys = set(dc.__dataclass_fields__.keys())
            for k in list(cfg.keys()):
                if k not in target_keys:
                    del cfg[k]
        fix_cfg(cfg)
        assert len(cfg) > 0
        return dc(**cfg)
    def forward(self, wav):
-        out = self.hubert(wav)
+        out = self.hubert.extract_features(wav)
        return out
--- a/paddlespeech/s2t/models/hubert/modules/init.py
+++ b/paddlespeech/s2t/models/hubert/modules/init.py
@ -0,0 +1,13 @@
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/s2t/models/hubert/modules/hubert_model.py
+++ b/paddlespeech/s2t/models/hubert/modules/hubert_model.py
@ -377,7 +377,7 @@ class HubertModel(nn.Layer):
                min_space=self.mask_channel_min_space, )
            mask_channel_indices = (paddle.to_tensor(
                mask_channel_indices, dtype='int64', place=x.place).unsqueeze(1)
-                                    .expand(-1, T, -1))
+                                    .expand([-1, T, -1]))
            x[mask_channel_indices] = 0
        return x, mask_indices