You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
107 lines
4.0 KiB
107 lines
4.0 KiB
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import os
|
|
import re
|
|
|
|
import paddle
|
|
import yaml
|
|
from paddlenlp.transformers import ErnieTokenizer
|
|
from yacs.config import CfgNode
|
|
|
|
from paddlespeech.cli.utils import download_and_decompress
|
|
from paddlespeech.resource.pretrained_models import rhy_frontend_models
|
|
from paddlespeech.text.models.ernie_linear import ErnieLinear
|
|
from paddlespeech.utils.env import MODEL_HOME
|
|
|
|
# Lookup table from classifier name to the class that implements it.
DefinedClassifier = {'ErnieLinear': ErnieLinear}

# Version key used to pick an entry out of rhy_frontend_models['rhy_e2e'].
model_version = '1.0'
|
|
|
|
|
|
class RhyPredictor():
    """Insert marker tokens into text using an ``ErnieLinear`` token classifier.

    The marker vocabulary is read from the ``rhy_token`` file shipped with the
    downloaded model: one marker per line, where the 1-based line number is
    the class label the model predicts for "this marker follows the token".
    Label ``0`` means "no marker".

    Attributes set by ``__init__``:
        punc_list: ``[0, marker_1, marker_2, ...]``; indexed by predicted label.
        rhy_dict: maps each marker to ``'sp<label>'`` (see ``make_rhy_dict``).
        model: the ``ErnieLinear`` instance, switched to eval mode.
        tokenizer: the ``ErnieTokenizer`` named by the model config.
    """

    def __init__(
            self,
            model_dir: os.PathLike=MODEL_HOME, ):
        """Download/unpack the model archive and load config, markers, weights.

        Args:
            model_dir: Directory handed to ``download_and_decompress`` together
                with the ``rhy_e2e`` resource entry for ``model_version``.
        """
        uncompress_path = download_and_decompress(
            rhy_frontend_models['rhy_e2e'][model_version], model_dir)

        # Read both text files with an explicit encoding so the result does
        # not depend on the platform's locale default.
        with open(
                os.path.join(uncompress_path, 'rhy_default.yaml'),
                encoding='utf-8') as f:
            config = CfgNode(yaml.safe_load(f))
        with open(
                os.path.join(uncompress_path, 'rhy_token'), 'r',
                encoding='utf-8') as f:
            # Keep every line (even blank ones): a marker's position in the
            # file is its label, so dropping lines would shift the labels.
            markers = [line.strip() for line in f]

        # Slot 0 is a placeholder so that punc_list[label] works for labels
        # >= 1; label 0 is never looked up (see get_prediction).
        self.punc_list = [0] + markers
        self.make_rhy_dict()

        self.model = DefinedClassifier["ErnieLinear"](**config["model"])
        pretrained_token = config['data_params']['pretrained_token']
        self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token)
        state_dict = paddle.load(
            os.path.join(uncompress_path, 'snapshot_iter_2600_main_params.pdz'))
        self.model.set_state_dict(state_dict)
        self.model.eval()

    def _clean_text(self, text):
        """Lower-case ``text`` and strip everything the model should not see.

        First removes every character that is not an ASCII letter, a digit or
        in the range U+4E00-U+9FA5, then removes any character that occurs in
        one of the marker tokens.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string (possibly empty).
        """
        text = text.lower()
        text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text)
        marker_chars = ''.join(self.punc_list[1:])
        # Escape the markers before placing them in a character class:
        # characters such as '^', ']', '-' or '\\' would otherwise alter or
        # break the pattern. Skip entirely when there are no markers, because
        # an empty class '[]' is not a valid pattern.
        if marker_chars:
            text = re.sub(f'[{re.escape(marker_chars)}]', '', text)
        return text

    def preprocess(self, text, tokenizer):
        """Clean ``text`` and tokenize it one character at a time.

        Args:
            text: Raw input string.
            tokenizer: Callable tokenizer; invoked with the list of characters
                and ``return_length=True, is_split_into_words=True``.

        Returns:
            A dict with keys ``'input_ids'``, ``'seg_ids'`` (the tokenizer's
            ``token_type_ids``) and ``'seq_len'``.

        Raises:
            AssertionError: If nothing is left of ``text`` after cleaning.
        """
        clean_text = self._clean_text(text)
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert len(clean_text) > 0, f'Invalid input string: {text}'
        tokenized_input = tokenizer(
            list(clean_text), return_length=True, is_split_into_words=True)
        return {
            'input_ids': tokenized_input['input_ids'],
            'seg_ids': tokenized_input['token_type_ids'],
            'seq_len': tokenized_input['seq_len'],
        }

    def get_prediction(self, raw_text):
        """Return the cleaned text with predicted markers inserted.

        Args:
            raw_text: Raw input string.

        Returns:
            The tokens of the cleaned text concatenated in order, each followed
            by ``punc_list[label]`` whenever its predicted label is non-zero.

        Raises:
            AssertionError: If the cleaned text is empty, or if the number of
                tokens and predicted labels disagree.
        """
        _inputs = self.preprocess(raw_text, self.tokenizer)
        seq_len = _inputs['seq_len']
        input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0)
        seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0)

        # Pure inference: no gradients are needed, so do not record them.
        with paddle.no_grad():
            logits, _ = self.model(input_ids, seg_ids)
            preds = paddle.argmax(logits, axis=-1).squeeze(0)

        # Drop the first and last positions (presumably the tokenizer's
        # special start/end tokens -- confirm against ErnieTokenizer).
        tokens = self.tokenizer.convert_ids_to_tokens(
            _inputs['input_ids'][1:seq_len - 1])
        labels = preds[1:seq_len - 1].tolist()
        assert len(tokens) == len(labels)

        pieces = []
        for token, label in zip(tokens, labels):
            pieces.append(token)
            if label != 0:  # label 0 means "no marker after this token"
                pieces.append(self.punc_list[label])
        return ''.join(pieces)

    def make_rhy_dict(self):
        """Build ``self.rhy_dict``: marker token -> ``'sp<label>'``.

        The label is the marker's index in ``self.punc_list`` (1-based, since
        slot 0 is the placeholder). If a marker appears more than once, the
        last occurrence wins.
        """
        self.rhy_dict = {
            marker: 'sp' + str(label)
            for label, marker in enumerate(self.punc_list[1:], start=1)
        }

    def pinyin_align(self, pinyins, rhy_pre):
        """Merge a pinyin sequence with the markers found in ``rhy_pre``.

        Walks ``rhy_pre`` item by item: a marker is replaced by its ``'sp<n>'``
        name from ``self.rhy_dict``; any other item consumes the next entry of
        ``pinyins``.

        Args:
            pinyins: Indexable sequence supplying one entry per non-marker
                item of ``rhy_pre``.
            rhy_pre: Sequence (e.g. the string from ``get_prediction``) whose
                items are either markers or ordinary characters.

        Returns:
            A list with one element per item of ``rhy_pre``.

        Raises:
            IndexError: If ``rhy_pre`` has more non-marker items than
                ``pinyins`` has entries.
        """
        final_py = []
        j = 0  # index of the next unused entry in pinyins
        for item in rhy_pre:
            if item in self.rhy_dict:
                final_py.append(self.rhy_dict[item])
            else:
                final_py.append(pinyins[j])
                j += 1
        return final_py
|