replace space when build vocab

pull/854/head
Hui Zhang 4 years ago
parent 86f34784e3
commit 30563981f0

@@ -28,6 +28,7 @@
 #include "path_trie.h"
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+constexpr const char* kSPACE = "<space>";
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
         "The shape of probs_seq does not match with "
         "the shape of the vocabulary");
   }
-  // assign blank id
-  // size_t blank_id = vocabulary.size();
-  // size_t blank_id = 0;
   // assign space id
-  auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+  auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
   int space_id = it - vocabulary.begin();
   // if no space in vocabulary
   if ((size_t)space_id >= vocabulary.size()) {
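For orientation, here is a minimal sketch (in Python, mirroring the C++ logic above) of what the decoder's space-id lookup does after this change: it searches the vocabulary for the literal `<space>` token rather than a raw space character, and an index equal to the vocabulary size means no space token was found. The vocabulary list below is a made-up example.

```python
# Python sketch of the decoder's space-id lookup after this change.
# The vocabulary below is a hypothetical example, not a real model's vocab.
K_SPACE = "<space>"  # mirrors the new kSPACE constant in the C++ decoder

vocabulary = ["a", "b", "c", K_SPACE]

# std::find returns end() when the token is missing; the computed index then
# equals vocabulary.size(), which the decoder treats as "no space in vocabulary".
space_id = vocabulary.index(K_SPACE) if K_SPACE in vocabulary else len(vocabulary)

if space_id >= len(vocabulary):
    print("no <space> token in vocabulary")
else:
    print("space_id =", space_id)
```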

@@ -54,9 +54,9 @@ class TextFeaturizer():
         self.sp = spm.SentencePieceProcessor()
         self.sp.Load(spm_model)

-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
         if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
         elif self.unit_type == 'word':
             tokens = self.word_tokenize(text)
         else:  # spm
@@ -106,17 +106,19 @@ class TextFeaturizer():
         text = self.detokenize(tokens)
         return text

-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
         """Character tokenizer.

         Args:
             text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.

         Returns:
             List[str]: tokens.
         """
         text = text.strip()
-        text = text.replace(" ", SPACE)
+        if replace_space:
+            text = text.replace(" ", SPACE)
         return list(text)

     def char_detokenize(self, tokens):
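A behavioural sketch of the new flag (not the featurizer's exact code; `SPACE` is assumed to be the module-level `<space>` constant): with the default `replace_space=True` each raw space becomes the `<space>` token, while `replace_space=False` keeps the raw space, which is what vocabulary building wants.

```python
# Behavioural sketch only; maps character-by-character so the multi-character
# "<space>" marker stays a single token in the illustration.
SPACE = "<space>"  # assumed module-level constant used by the featurizer

def char_tokenize(text, replace_space=True):
    text = text.strip()
    return [SPACE if (ch == " " and replace_space) else ch for ch in text]

print(char_tokenize("ab c"))                       # ['a', 'b', '<space>', 'c']
print(char_tokenize("ab c", replace_space=False))  # ['a', 'b', ' ', 'c']
```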

@@ -61,7 +61,7 @@ args = parser.parse_args()
 def count_manifest(counter, text_feature, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'])
+        line = text_feature.tokenize(line_json['text'], replace_space=False)
         counter.update(line)

 def dump_text_manifest(fileobj, manifest_path, key='text'):
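To see why `replace_space=False` is used here: the counter should see raw characters, and the mapping from " " to `<space>` can then happen once, when the vocabulary file is written (that write-time mapping is an assumption based on the commit title; it is outside the hunks shown). A minimal sketch with made-up manifest text:

```python
from collections import Counter

# Count raw characters (spaces kept because replace_space=False) ...
counter = Counter()
for text in ["ab c", "a bc"]:         # hypothetical manifest transcripts
    counter.update(list(text.strip()))

# ... then emit the vocabulary, mapping the raw space to the <space> token the
# C++ decoder now looks up via kSPACE. This write-time mapping is assumed here.
with open("vocab.txt", "w") as fout:
    for token, _count in counter.most_common():
        fout.write(("<space>" if token == " " else token) + "\n")
```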
