From 9f0c3467e57057b9fa9cf668345243da058fa1b7 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 15:50:10 +0800 Subject: [PATCH 1/4] fix decoders: force indices in FST starting from one & add version check in setup --- .clang_format.hook | 2 +- decoders/swig/path_trie.cpp | 2 +- decoders/swig/scorer.cpp | 12 ++---------- decoders/swig/scorer.h | 2 +- decoders/swig/setup.py | 2 +- setup.sh | 2 +- 6 files changed, 7 insertions(+), 15 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 40d70f56..8141fffb 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.8" +readonly VERSION="3.6" version=$(clang-format -version) diff --git a/decoders/swig/path_trie.cpp b/decoders/swig/path_trie.cpp index 40d90970..152efa82 100644 --- a/decoders/swig/path_trie.cpp +++ b/decoders/swig/path_trie.cpp @@ -52,7 +52,7 @@ PathTrie* PathTrie::get_path_trie(int new_char, bool reset) { } else { if (has_dictionary_) { matcher_->SetState(dictionary_state_); - bool found = matcher_->Find(new_char); + bool found = matcher_->Find(new_char + 1); if (!found) { // Adding this character causes word outside dictionary auto FSTZERO = fst::TropicalWeight::Zero(); diff --git a/decoders/swig/scorer.cpp b/decoders/swig/scorer.cpp index 686c67c7..39da13d1 100644 --- a/decoders/swig/scorer.cpp +++ b/decoders/swig/scorer.cpp @@ -152,10 +152,8 @@ void Scorer::set_char_map(const std::vector& char_list) { for (size_t i = 0; i < char_list_.size(); i++) { if (char_list_[i] == " ") { SPACE_ID_ = i; - char_map_[' '] = i; - } else if (char_list_[i].size() == 1) { - char_map_[char_list_[i][0]] = i; } + char_map_[char_list_[i]] = i + 1; // Force index starting from zero } } @@ -193,17 +191,11 @@ std::vector Scorer::make_ngram(PathTrie* prefix) { void Scorer::fill_dictionary(bool add_space) { fst::StdVectorFst dictionary; - // First reverse char_list so ints can be accessed by chars - std::unordered_map char_map; - for (size_t i = 0; i < char_list_.size(); i++) { - char_map[char_list_[i]] = i; - } - // For each unigram convert to ints and put in trie int dict_size = 0; for (const auto& word : vocabulary_) { bool added = add_word_to_dictionary( - word, char_map, add_space, SPACE_ID_, &dictionary); + word, char_map_, add_space, SPACE_ID_ + 1, &dictionary); dict_size += added ? 1 : 0; } diff --git a/decoders/swig/scorer.h b/decoders/swig/scorer.h index 61836463..5ebc719c 100644 --- a/decoders/swig/scorer.h +++ b/decoders/swig/scorer.h @@ -104,7 +104,7 @@ private: int SPACE_ID_; std::vector char_list_; - std::unordered_map char_map_; + std::unordered_map char_map_; std::vector vocabulary_; }; diff --git a/decoders/swig/setup.py b/decoders/swig/setup.py index b6bc0ca0..a4bb2e9d 100644 --- a/decoders/swig/setup.py +++ b/decoders/swig/setup.py @@ -113,7 +113,7 @@ decoders_module = [ setup( name='swig_decoders', - version='1.0', + version='1.1', description="""CTC decoders""", ext_modules=decoders_module, py_modules=['swig_decoders'], ) diff --git a/setup.sh b/setup.sh index 7c40415d..ec5e47ec 100644 --- a/setup.sh +++ b/setup.sh @@ -27,7 +27,7 @@ if [ $? != 0 ]; then fi # install decoders -python -c "import swig_decoders" +python -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" if [ $? != 0 ]; then cd decoders/swig > /dev/null sh setup.sh From 2587ebf2f7c790195719a0fb659acec68f780e5b Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 15:50:55 +0800 Subject: [PATCH 2/4] revert clang_format version --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index 8141fffb..40d70f56 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.6" +readonly VERSION="3.8" version=$(clang-format -version) From 3ea19973c66a6a10320888ba47a8857bebf5abfa Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 17:19:53 +0800 Subject: [PATCH 3/4] add more comments to explain the modification --- .clang_format.hook | 2 +- decoders/swig/scorer.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 40d70f56..8141fffb 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.8" +readonly VERSION="3.6" version=$(clang-format -version) diff --git a/decoders/swig/scorer.cpp b/decoders/swig/scorer.cpp index 39da13d1..27b61cd0 100644 --- a/decoders/swig/scorer.cpp +++ b/decoders/swig/scorer.cpp @@ -149,11 +149,15 @@ void Scorer::set_char_map(const std::vector& char_list) { char_list_ = char_list; char_map_.clear(); + // Set the char map for the FST for spelling correction for (size_t i = 0; i < char_list_.size(); i++) { if (char_list_[i] == " ") { SPACE_ID_ = i; } - char_map_[char_list_[i]] = i + 1; // Force index starting from zero + // The initial state of FST is state 0, hence the index of chars in + // the FST should start from 1 to avoid the conflict with the initial + // state, otherwise wrong decoding results would be given. + char_map_[char_list_[i]] = i + 1; } } From dd770948a0cc71da4f96a0fd446deec0b631a369 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Fri, 17 Nov 2017 17:20:44 +0800 Subject: [PATCH 4/4] revert clang_format version --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index 8141fffb..4cbc972b 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -readonly VERSION="3.6" +readonly VERSION="3.9" version=$(clang-format -version)