# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the text featurizer class."""
import os
import codecs


class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: str
    """

    def __init__(self, vocab_filepath):
        # Token substituted for any character missing from the vocabulary.
        self.unk = '<unk>'
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of token indices in char-level. Note
        that the token indexing order follows the given vocabulary file.

        Out-of-vocabulary characters map to the index of the '<unk>' token.

        :param text: Text to process.
        :type text: str
        :return: List of char-level token indices.
        :rtype: list
        """
        tokens = self._char_tokenize(text)
        # Substitute <unk> for any character absent from the vocabulary,
        # then look up each token's index.
        return [
            self._vocab_dict[token if token in self._vocab_dict else self.unk]
            for token in tokens
        ]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer: strip surrounding whitespace and split the
        text into a list of single characters."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file.

        The file holds one token per line (UTF-8); line order defines the
        token indices.

        :param vocab_filepath: Path to the vocabulary file.
        :type vocab_filepath: str
        :return: Tuple of (token-to-index dict, token list).
        :rtype: tuple
        """
        with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
            # rstrip('\n') rather than line[:-1]: the latter truncates the
            # final token's last character when the file has no trailing
            # newline.
            vocab_list = [line.rstrip('\n') for line in vocab_file]
        vocab_dict = {token: idx for idx, token in enumerate(vocab_list)}
        return vocab_dict, vocab_list
|