replace space when build vocab

pull/854/head
Hui Zhang 4 years ago
parent 86f34784e3
commit 30563981f0

@@ -28,6 +28,7 @@
 #include "path_trie.h"
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+constexpr const char* kSPACE = "<space>";
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
     const std::vector<std::vector<double>> &probs_seq,
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
         "The shape of probs_seq does not match with "
         "the shape of the vocabulary");
   }
-  // assign blank id
-  // size_t blank_id = vocabulary.size();
-  // size_t blank_id = 0;
   // assign space id
-  auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+  auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
   int space_id = it - vocabulary.begin();
   // if no space in vocabulary
   if ((size_t)space_id >= vocabulary.size()) {
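For orientation, here is a minimal sketch (in Python, mirroring the C++ logic above) of what the decoder's space-id lookup does after this change: it searches the vocabulary for the literal `<space>` token rather than a raw space character, and an index equal to the vocabulary size means no space token was found. The vocabulary list below is a made-up example.

```python
# Python sketch of the decoder's space-id lookup after this change.
# The vocabulary below is a hypothetical example, not a real model's vocab.
K_SPACE = "<space>"  # mirrors the new kSPACE constant in the C++ decoder

vocabulary = ["a", "b", "c", K_SPACE]

# std::find returns end() when the token is missing; the computed index then
# equals vocabulary.size(), which the decoder treats as "no space in vocabulary".
space_id = vocabulary.index(K_SPACE) if K_SPACE in vocabulary else len(vocabulary)

if space_id >= len(vocabulary):
    print("no <space> token in vocabulary")
else:
    print("space_id =", space_id)
```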

@@ -54,9 +54,9 @@ class TextFeaturizer():
         self.sp = spm.SentencePieceProcessor()
         self.sp.Load(spm_model)

-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
         if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
         elif self.unit_type == 'word':
             tokens = self.word_tokenize(text)
         else:  # spm
@@ -106,17 +106,19 @@ class TextFeaturizer():
         text = self.detokenize(tokens)
         return text

-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
         """Character tokenizer.

         Args:
             text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.

         Returns:
             List[str]: tokens.
         """
         text = text.strip()
-        text = text.replace(" ", SPACE)
+        if replace_space:
+            text = text.replace(" ", SPACE)
         return list(text)

     def char_detokenize(self, tokens):
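A behavioural sketch of the new flag (not the featurizer's exact code; `SPACE` is assumed to be the module-level `<space>` constant): with the default `replace_space=True` each raw space becomes the `<space>` token, while `replace_space=False` keeps the raw space, which is what vocabulary building wants.

```python
# Behavioural sketch only; maps character-by-character so the multi-character
# "<space>" marker stays a single token in the illustration.
SPACE = "<space>"  # assumed module-level constant used by the featurizer

def char_tokenize(text, replace_space=True):
    text = text.strip()
    return [SPACE if (ch == " " and replace_space) else ch for ch in text]

print(char_tokenize("ab c"))                       # ['a', 'b', '<space>', 'c']
print(char_tokenize("ab c", replace_space=False))  # ['a', 'b', ' ', 'c']
```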

@@ -61,7 +61,7 @@ args = parser.parse_args()
 def count_manifest(counter, text_feature, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'])
+        line = text_feature.tokenize(line_json['text'], replace_space=False)
         counter.update(line)

 def dump_text_manifest(fileobj, manifest_path, key='text'):
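To see why `replace_space=False` is used here: the counter should see raw characters, and the mapping from " " to `<space>` can then happen once, when the vocabulary file is written (that write-time mapping is an assumption based on the commit title; it is outside the hunks shown). A minimal sketch with made-up manifest text:

```python
from collections import Counter

# Count raw characters (spaces kept because replace_space=False) ...
counter = Counter()
for text in ["ab c", "a bc"]:         # hypothetical manifest transcripts
    counter.update(list(text.strip()))

# ... then emit the vocabulary, mapping the raw space to the <space> token the
# C++ decoder now looks up via kSPACE. This write-time mapping is assumed here.
with open("vocab.txt", "w") as fout:
    for token, _count in counter.most_common():
        fout.write(("<space>" if token == " " else token) + "\n")
```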
