From 30563981f09d311c38d88b002c7422214ee758c8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 23 Sep 2021 12:11:36 +0000 Subject: [PATCH] replace space when build vocab --- deepspeech/decoders/swig/ctc_beam_search_decoder.cpp | 8 ++------ deepspeech/frontend/featurizer/text_featurizer.py | 10 ++++++---- utils/build_vocab.py | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp index fcb1f7642..fc5fe62da 100644 --- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp +++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp @@ -28,6 +28,7 @@ #include "path_trie.h" using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>; +constexpr const char* kSPACE = "<space>"; std::vector<std::pair<double, std::string>> ctc_beam_search_decoder( const std::vector<std::vector<double>> &probs_seq, @@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder( "The shape of probs_seq does not match with " "the shape of the vocabulary"); } - - // assign blank id - // size_t blank_id = vocabulary.size(); - // size_t blank_id = 0; - // assign space id - auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); int space_id = it - vocabulary.begin(); // if no space in vocabulary if ((size_t)space_id >= vocabulary.size()) { diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index c95ed798a..026595c29 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -54,9 +54,9 @@ class TextFeaturizer(): self.sp = spm.SentencePieceProcessor() self.sp.Load(spm_model) - def tokenize(self, text): + def tokenize(self, text, replace_space=True): if self.unit_type == 'char': - tokens = self.char_tokenize(text) + tokens = self.char_tokenize(text, replace_space) elif self.unit_type == 'word': tokens = self.word_tokenize(text) else: # spm @@ -106,17 +106,19 @@ class 
TextFeaturizer(): text = self.detokenize(tokens) return text - def char_tokenize(self, text): + def char_tokenize(self, text, replace_space=True): """Character tokenizer. Args: text (str): text string. + replace_space (bool): False only used by build_vocab.py. Returns: List[str]: tokens. """ text = text.strip() - text = text.replace(" ", SPACE) + if replace_space: + text = text.replace(" ", SPACE) return list(text) def char_detokenize(self, tokens): diff --git a/utils/build_vocab.py b/utils/build_vocab.py index 8ba0be70c..67c22fbbf 100755 --- a/utils/build_vocab.py +++ b/utils/build_vocab.py @@ -61,7 +61,7 @@ args = parser.parse_args() def count_manifest(counter, text_feature, manifest_path): manifest_jsons = read_manifest(manifest_path) for line_json in manifest_jsons: - line = text_feature.tokenize(line_json['text']) + line = text_feature.tokenize(line_json['text'], replace_space=False) counter.update(line) def dump_text_manifest(fileobj, manifest_path, key='text'):