PaddleSpeech/paddlespeech/t2s/frontend/rhy_prediction/rhy_predictor.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re

import paddle
import yaml
from paddlenlp.transformers import ErnieTokenizer
from yacs.config import CfgNode

from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.resource.pretrained_models import rhy_frontend_models
from paddlespeech.text.models.ernie_linear import ErnieLinear
from paddlespeech.utils.env import MODEL_HOME

DefinedClassifier = {
    'ErnieLinear': ErnieLinear,
}

model_version = '1.0'


class RhyPredictor():
    def __init__(
            self,
            model_dir: os.PathLike=MODEL_HOME, ):
        uncompress_path = download_and_decompress(
            rhy_frontend_models['rhy_e2e'][model_version], model_dir)
        with open(os.path.join(uncompress_path, 'rhy_default.yaml')) as f:
            config = CfgNode(yaml.safe_load(f))
        self.punc_list = []
        with open(os.path.join(uncompress_path, 'rhy_token'), 'r') as f:
            for line in f:
                self.punc_list.append(line.strip())
        self.punc_list = [0] + self.punc_list
        self.make_rhy_dict()
        self.model = DefinedClassifier["ErnieLinear"](**config["model"])
        pretrained_token = config['data_params']['pretrained_token']
        self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token)
        state_dict = paddle.load(
            os.path.join(uncompress_path, 'snapshot_iter_2600_main_params.pdz'))
        self.model.set_state_dict(state_dict)
        self.model.eval()

    def _clean_text(self, text):
        text = text.lower()
        text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text)
        text = re.sub(f'[{"".join([p for p in self.punc_list][1:])}]', '', text)
        return text

    def preprocess(self, text, tokenizer):
        clean_text = self._clean_text(text)
        assert len(clean_text) > 0, f'Invalid input string: {text}'
        tokenized_input = tokenizer(
            list(clean_text), return_length=True, is_split_into_words=True)
        _inputs = dict()
        _inputs['input_ids'] = tokenized_input['input_ids']
        _inputs['seg_ids'] = tokenized_input['token_type_ids']
        _inputs['seq_len'] = tokenized_input['seq_len']
        return _inputs

    def get_prediction(self, raw_text):
        _inputs = self.preprocess(raw_text, self.tokenizer)
        seq_len = _inputs['seq_len']
        input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0)
        seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0)
        logits, _ = self.model(input_ids, seg_ids)
        preds = paddle.argmax(logits, axis=-1).squeeze(0)
        tokens = self.tokenizer.convert_ids_to_tokens(
            _inputs['input_ids'][1:seq_len - 1])
        labels = preds[1:seq_len - 1].tolist()
        assert len(tokens) == len(labels)
        # add 0 for non punc
        text = ''
        for t, l in zip(tokens, labels):
            text += t
            if l != 0:  # Non punc.
                text += self.punc_list[l]
        return text

    def make_rhy_dict(self):
        self.rhy_dict = {}
        for i, p in enumerate(self.punc_list[1:]):
            self.rhy_dict[p] = 'sp' + str(i + 1)

    def pinyin_align(self, pinyins, rhy_pre):
        final_py = []
        j = 0
        for i in range(len(rhy_pre)):
            if rhy_pre[i] in self.rhy_dict:
                final_py.append(self.rhy_dict[rhy_pre[i]])
            else:
                final_py.append(pinyins[j])
                j += 1
        return final_py