From 30563981f09d311c38d88b002c7422214ee758c8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 23 Sep 2021 12:11:36 +0000 Subject: [PATCH] replace space when build vocab --- deepspeech/decoders/swig/ctc_beam_search_decoder.cpp | 8 ++------ deepspeech/frontend/featurizer/text_featurizer.py | 10 ++++++---- utils/build_vocab.py | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp index fcb1f7642..fc5fe62da 100644 --- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp +++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp @@ -28,6 +28,7 @@ #include "path_trie.h" using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>; +constexpr const char* kSPACE = "<space>"; std::vector<std::pair<double, std::string>> ctc_beam_search_decoder( const std::vector<std::vector<double>> &probs_seq, @@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder( "The shape of probs_seq does not match with " "the shape of the vocabulary"); } - - // assign blank id - // size_t blank_id = vocabulary.size(); - // size_t blank_id = 0; - // assign space id - auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); + auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE); int space_id = it - vocabulary.begin(); // if no space in vocabulary if ((size_t)space_id >= vocabulary.size()) { diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index c95ed798a..026595c29 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -54,9 +54,9 @@ class TextFeaturizer(): self.sp = spm.SentencePieceProcessor() self.sp.Load(spm_model) - def tokenize(self, text): + def tokenize(self, text, replace_space=True): if self.unit_type == 'char': - tokens = self.char_tokenize(text) + tokens = self.char_tokenize(text, replace_space) elif self.unit_type == 'word': tokens = self.word_tokenize(text) else: # spm @@ -106,17 +106,19 @@ class 
TextFeaturizer(): text = self.detokenize(tokens) return text - def char_tokenize(self, text): + def char_tokenize(self, text, replace_space=True): """Character tokenizer. Args: text (str): text string. + replace_space (bool): False only used by build_vocab.py. Returns: List[str]: tokens. """ text = text.strip() - text = text.replace(" ", SPACE) + if replace_space: + text = text.replace(" ", SPACE) return list(text) def char_detokenize(self, tokens): diff --git a/utils/build_vocab.py b/utils/build_vocab.py index 8ba0be70c..67c22fbbf 100755 --- a/utils/build_vocab.py +++ b/utils/build_vocab.py @@ -61,7 +61,7 @@ args = parser.parse_args() def count_manifest(counter, text_feature, manifest_path): manifest_jsons = read_manifest(manifest_path) for line_json in manifest_jsons: - line = text_feature.tokenize(line_json['text']) + line = text_feature.tokenize(line_json['text'], replace_space=False) counter.update(line) def dump_text_manifest(fileobj, manifest_path, key='text'):