rm ds2 && rm boost

pull/2786/head
YangZhou 3 years ago
parent 0547d7961e
commit f37f34d3ce

@@ -57,13 +57,13 @@ repos:
entry: bash .pre-commit-hooks/clang-format.hook -i
language: system
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders|speechx/speechx/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- id: cpplint
name: cpplint
description: Static code analysis of C/C++ files
language: python
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders|speechx/speechx/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
- repo: https://github.com/asottile/reorder_python_imports
rev: v2.4.0

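Editorial note: the only change in both hooks above is the new speechx/speechx/common/utils alternative in the exclude patterns. The pattern is a lookahead (?=prefix1|prefix2|...) followed by .*(\.ext)$, so it skips any file whose path starts with one of the listed prefixes and ends in a C/C++/Python extension. Below is a minimal sketch of that matching behavior; it is not part of the commit, std::regex (ECMAScript grammar, which supports the same lookahead) stands in for pre-commit's Python re engine, and the file paths are illustrative.

#include <iostream>
#include <regex>

int main() {
    const std::regex exclude(
        "(?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch"
        "|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders"
        "|speechx/speechx/common/utils).*(\\.cpp|\\.cc|\\.h|\\.hpp|\\.py)$");
    std::cout << std::boolalpha;
    // Newly skipped by this commit's change:
    std::cout << std::regex_search("speechx/speechx/common/utils/math.cc",
                                   exclude) << "\n";  // true
    // Still formatted/linted:
    std::cout << std::regex_search("speechx/speechx/decoder/foo.cc",
                                   exclude) << "\n";  // false
    return 0;
}
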
@@ -44,9 +44,6 @@ option(TEST_DEBUG "option for debug" OFF)
option(USE_PROFILING "enable c++ profling" OFF)
option(WITH_TESTING "unit test" ON)
-option(USING_U2 "compile u2 model." ON)
-option(USING_DS2 "compile with ds2 model." OFF)
option(USING_GPU "u2 compute on GPU." OFF)
###############################################################################
@@ -56,21 +53,6 @@ include(gflags)
include(glog)
-# boost
-# include(boost) # not work
-set(boost_SOURCE_DIR ${fc_patch}/boost-src)
-set(BOOST_ROOT ${boost_SOURCE_DIR})
-include_directories(${boost_SOURCE_DIR})
-link_directories(${boost_SOURCE_DIR}/stage/lib)
-# Eigen
-include(eigen)
-find_package(Eigen3 REQUIRED)
-# Kenlm
-include(kenlm)
-add_dependencies(kenlm eigen boost)
#openblas
include(openblas)

@@ -4,20 +4,5 @@ set -xe
# the build script had verified in the paddlepaddle docker image.
# please follow the instruction below to install PaddlePaddle image.
# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
-boost_SOURCE_DIR=$PWD/fc_patch/boost-src
-if [ ! -d ${boost_SOURCE_DIR} ]; then wget -c https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz
-tar xzfv boost_1_75_0.tar.gz
-mkdir -p $PWD/fc_patch
-mv boost_1_75_0 ${boost_SOURCE_DIR}
-cd ${boost_SOURCE_DIR}
-bash ./bootstrap.sh
-./b2
-cd -
-echo -e "\n"
-fi
-#rm -rf build
-mkdir -p build
-cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
+cmake -B build
cmake --build build -j

@@ -3,7 +3,7 @@
unset GREP_OPTIONS
SPEECHX_ROOT=$PWD/../../../
-SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx/asr
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
@@ -12,7 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_AL=C
-export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer
+export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/../common/frontend/audio:$SPEECHX_BUILD/recognizer
PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH

@@ -1,55 +1,22 @@
-include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders})
set(srcs)
-if (USING_DS2)
-list(APPEND srcs
-ctc_decoders/decoder_utils.cpp
-ctc_decoders/path_trie.cpp
-ctc_decoders/scorer.cpp
-ctc_beam_search_decoder.cc
-ctc_tlg_decoder.cc
-)
-endif()
-if (USING_U2)
list(APPEND srcs
ctc_prefix_beam_search_decoder.cc
)
-endif()
add_library(decoder STATIC ${srcs})
-target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
+target_link_libraries(decoder PUBLIC utils fst frontend nnet kaldi-decoder)
# test
-if (USING_DS2)
-set(BINS
-ctc_beam_search_decoder_main
-nnet_logprob_decoder_main
-ctc_tlg_decoder_main
-)
-foreach(bin_name IN LISTS BINS)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-endforeach()
-endif()
-if (USING_U2)
set(TEST_BINS
ctc_prefix_beam_search_decoder_main
)
foreach(bin_name IN LISTS TEST_BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endforeach()
-endif()

@@ -1,313 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/ctc_beam_search_decoder.h"
#include "base/common.h"
#include "decoder/ctc_decoders/decoder_utils.h"
#include "utils/file_utils.h"
namespace ppspeech {
using std::vector;
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
: opts_(opts), init_ext_scorer_(nullptr), space_id_(-1), root_(nullptr) {
LOG(INFO) << "dict path: " << opts_.dict_file;
if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) {
LOG(INFO) << "load the dict failed";
}
LOG(INFO) << "read the vocabulary success, dict size: "
<< vocabulary_.size();
LOG(INFO) << "language model path: " << opts_.lm_path;
if (opts_.lm_path != "") {
init_ext_scorer_ = std::make_shared<Scorer>(
opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_);
}
CHECK_EQ(opts_.blank, 0);
auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " ");
space_id_ = it - vocabulary_.begin();
// if no space in vocabulary
if (static_cast<size_t>(space_id_) >= vocabulary_.size()) {
space_id_ = -2;
}
}
void CTCBeamSearch::Reset() {
// num_frame_decoded_ = 0;
// ResetPrefixes();
InitDecoder();
}
void CTCBeamSearch::InitDecoder() {
num_frame_decoded_ = 0;
// ResetPrefixes();
prefixes_.clear();
root_ = std::make_shared<PathTrie>();
root_->score = root_->log_prob_b_prev = 0.0;
prefixes_.push_back(root_.get());
if (init_ext_scorer_ != nullptr &&
!init_ext_scorer_->is_character_based()) {
auto fst_dict =
static_cast<fst::StdVectorFst*>(init_ext_scorer_->dictionary);
fst::StdVectorFst* dict_ptr = fst_dict->Copy(true);
root_->set_dictionary(dict_ptr);
auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
root_->set_matcher(matcher);
}
}
void CTCBeamSearch::Decode(
std::shared_ptr<kaldi::DecodableInterface> decodable) {
return;
}
// todo rename, refactor
void CTCBeamSearch::AdvanceDecode(
const std::shared_ptr<kaldi::DecodableInterface>& decodable) {
while (1) {
vector<vector<BaseFloat>> likelihood;
vector<BaseFloat> frame_prob;
bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob);
if (flag == false) break;
likelihood.push_back(frame_prob);
AdvanceDecoding(likelihood);
}
}
void CTCBeamSearch::ResetPrefixes() {
for (size_t i = 0; i < prefixes_.size(); i++) {
if (prefixes_[i] != nullptr) {
delete prefixes_[i];
prefixes_[i] = nullptr;
}
}
prefixes_.clear();
}
int CTCBeamSearch::DecodeLikelihoods(const vector<vector<float>>& probs,
const vector<string>& nbest_words) {
kaldi::Timer timer;
AdvanceDecoding(probs);
LOG(INFO) << "ctc decoding elapsed time(s) "
<< static_cast<float>(timer.Elapsed()) / 1000.0f;
return 0;
}
vector<std::pair<double, string>> CTCBeamSearch::GetNBestPath(int n) {
int beam_size = n == -1 ? opts_.beam_size : std::min(n, opts_.beam_size);
return get_beam_search_result(prefixes_, vocabulary_, beam_size);
}
vector<std::pair<double, string>> CTCBeamSearch::GetNBestPath() {
return GetNBestPath(-1);
}
string CTCBeamSearch::GetBestPath() {
std::vector<std::pair<double, std::string>> result;
result = get_beam_search_result(prefixes_, vocabulary_, opts_.beam_size);
return result[0].second;
}
string CTCBeamSearch::GetFinalBestPath() {
CalculateApproxScore();
LMRescore();
return GetBestPath();
}
void CTCBeamSearch::AdvanceDecoding(const vector<vector<BaseFloat>>& probs) {
size_t num_time_steps = probs.size();
size_t beam_size = opts_.beam_size;
double cutoff_prob = opts_.cutoff_prob;
size_t cutoff_top_n = opts_.cutoff_top_n;
vector<vector<double>> probs_seq(probs.size(),
vector<double>(probs[0].size(), 0));
int row = probs.size();
int col = probs[0].size();
for (int i = 0; i < row; i++) {
for (int j = 0; j < col; j++) {
probs_seq[i][j] = static_cast<double>(probs[i][j]);
}
}
for (size_t time_step = 0; time_step < num_time_steps; time_step++) {
const auto& prob = probs_seq[time_step];
float min_cutoff = -NUM_FLT_INF;
bool full_beam = false;
if (init_ext_scorer_ != nullptr) {
size_t num_prefixes_ = std::min(prefixes_.size(), beam_size);
std::sort(prefixes_.begin(),
prefixes_.begin() + num_prefixes_,
prefix_compare);
if (num_prefixes_ == 0) {
continue;
}
min_cutoff = prefixes_[num_prefixes_ - 1]->score +
std::log(prob[opts_.blank]) -
std::max(0.0, init_ext_scorer_->beta);
full_beam = (num_prefixes_ == beam_size);
}
vector<std::pair<size_t, float>> log_prob_idx =
get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n);
// loop over chars
size_t log_prob_idx_len = log_prob_idx.size();
for (size_t index = 0; index < log_prob_idx_len; index++) {
SearchOneChar(full_beam, log_prob_idx[index], min_cutoff);
}
prefixes_.clear();
// update log probs
root_->iterate_to_vec(prefixes_);
// only preserve top beam_size prefixes_
if (prefixes_.size() >= beam_size) {
std::nth_element(prefixes_.begin(),
prefixes_.begin() + beam_size,
prefixes_.end(),
prefix_compare);
for (size_t i = beam_size; i < prefixes_.size(); ++i) {
prefixes_[i]->remove();
}
} // end if
num_frame_decoded_++;
} // end for probs_seq
}
int32 CTCBeamSearch::SearchOneChar(
const bool& full_beam,
const std::pair<size_t, BaseFloat>& log_prob_idx,
const BaseFloat& min_cutoff) {
size_t beam_size = opts_.beam_size;
const auto& c = log_prob_idx.first;
const auto& log_prob_c = log_prob_idx.second;
size_t prefixes_len = std::min(prefixes_.size(), beam_size);
for (size_t i = 0; i < prefixes_len; ++i) {
auto prefix = prefixes_[i];
if (full_beam && log_prob_c + prefix->score < min_cutoff) {
break;
}
if (c == opts_.blank) {
prefix->log_prob_b_cur =
log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score);
continue;
}
// repeated character
if (c == prefix->character) {
// p_{nb}(l;x_{1:t}) = p(c;x_{t})p(l;x_{1:t-1})
prefix->log_prob_nb_cur = log_sum_exp(
prefix->log_prob_nb_cur, log_prob_c + prefix->log_prob_nb_prev);
}
// get new prefix
auto prefix_new = prefix->get_path_trie(c);
if (prefix_new != nullptr) {
float log_p = -NUM_FLT_INF;
if (c == prefix->character &&
prefix->log_prob_b_prev > -NUM_FLT_INF) {
// p_{nb}(l^{+};x_{1:t}) = p(c;x_{t})p_{b}(l;x_{1:t-1})
log_p = log_prob_c + prefix->log_prob_b_prev;
} else if (c != prefix->character) {
// p_{nb}(l^{+};x_{1:t}) = p(c;x_{t}) p(l;x_{1:t-1})
log_p = log_prob_c + prefix->score;
}
// language model scoring
if (init_ext_scorer_ != nullptr &&
(c == space_id_ || init_ext_scorer_->is_character_based())) {
PathTrie* prefix_to_score = nullptr;
// skip scoring the space
if (init_ext_scorer_->is_character_based()) {
prefix_to_score = prefix_new;
} else {
prefix_to_score = prefix;
}
float score = 0.0;
vector<string> ngram;
ngram = init_ext_scorer_->make_ngram(prefix_to_score);
// lm score: p_{lm}(W)^{\alpha} + \beta
score = init_ext_scorer_->get_log_cond_prob(ngram) *
init_ext_scorer_->alpha;
log_p += score;
log_p += init_ext_scorer_->beta;
}
// p_{nb}(l;x_{1:t})
prefix_new->log_prob_nb_cur =
log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
}
} // end of loop over prefix
return 0;
}
void CTCBeamSearch::CalculateApproxScore() {
size_t beam_size = opts_.beam_size;
size_t num_prefixes_ = std::min(prefixes_.size(), beam_size);
std::sort(
prefixes_.begin(), prefixes_.begin() + num_prefixes_, prefix_compare);
// compute aproximate ctc score as the return score, without affecting the
// return order of decoding result. To delete when decoder gets stable.
for (size_t i = 0; i < beam_size && i < prefixes_.size(); ++i) {
double approx_ctc = prefixes_[i]->score;
if (init_ext_scorer_ != nullptr) {
vector<int> output;
prefixes_[i]->get_path_vec(output);
auto prefix_length = output.size();
auto words = init_ext_scorer_->split_labels(output);
// remove word insert
approx_ctc = approx_ctc - prefix_length * init_ext_scorer_->beta;
// remove language model weight:
approx_ctc -= (init_ext_scorer_->get_sent_log_prob(words)) *
init_ext_scorer_->alpha;
}
prefixes_[i]->approx_ctc = approx_ctc;
}
}
void CTCBeamSearch::LMRescore() {
size_t beam_size = opts_.beam_size;
if (init_ext_scorer_ != nullptr &&
!init_ext_scorer_->is_character_based()) {
for (size_t i = 0; i < beam_size && i < prefixes_.size(); ++i) {
auto prefix = prefixes_[i];
if (!prefix->is_empty() && prefix->character != space_id_) {
float score = 0.0;
vector<string> ngram = init_ext_scorer_->make_ngram(prefix);
score = init_ext_scorer_->get_log_cond_prob(ngram) *
init_ext_scorer_->alpha;
score += init_ext_scorer_->beta;
prefix->score += score;
}
}
}
}
} // namespace ppspeech

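Editorial note: the file removed above is the standard CTC prefix beam search. For reference, here is a self-contained sketch of the three per-character updates that SearchOneChar applies, in the same log domain; it is not from the repo, and all names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

constexpr float kNegInf = -std::numeric_limits<float>::max();

// Same semantics as the repo's log_sum_exp helper.
float LogSumExp(float x, float y) {
    if (x <= kNegInf) return y;
    if (y <= kNegInf) return x;
    float m = std::max(x, y);
    return std::log(std::exp(x - m) + std::exp(y - m)) + m;
}

int main() {
    // One prefix ending in character 'a', with blank id 0.
    float log_prob_b_prev = std::log(0.4f);   // p_b(l; x_{1:t-1})
    float log_prob_nb_prev = std::log(0.3f);  // p_nb(l; x_{1:t-1})
    float score = LogSumExp(log_prob_b_prev, log_prob_nb_prev);

    float log_p_blank = std::log(0.6f);  // p(blank; x_t)
    float log_p_a = std::log(0.3f);      // p('a'; x_t), a repeated char

    // Rule 1: blank extends both paths without changing the prefix.
    float log_prob_b_cur = log_p_blank + score;
    // Rule 2: repeated char with no intervening blank keeps the prefix.
    float log_prob_nb_cur = log_p_a + log_prob_nb_prev;
    // Rule 3: repeated char after a blank starts the new prefix "aa".
    float log_prob_nb_new = log_p_a + log_prob_b_prev;

    std::printf("p_b=%f p_nb=%f p_nb(l+)=%f\n",
                std::exp(log_prob_b_cur), std::exp(log_prob_nb_cur),
                std::exp(log_prob_nb_new));
    return 0;
}
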
@@ -1,73 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// used by deepspeech2
#pragma once
#include "decoder/ctc_beam_search_opt.h"
#include "decoder/ctc_decoders/path_trie.h"
#include "decoder/ctc_decoders/scorer.h"
#include "decoder/decoder_itf.h"
namespace ppspeech {
class CTCBeamSearch : public DecoderBase {
public:
explicit CTCBeamSearch(const CTCBeamSearchOptions& opts);
~CTCBeamSearch() {}
void InitDecoder();
void Reset();
void AdvanceDecode(
const std::shared_ptr<kaldi::DecodableInterface>& decodable);
void Decode(std::shared_ptr<kaldi::DecodableInterface> decodable);
std::string GetBestPath();
std::vector<std::pair<double, std::string>> GetNBestPath();
std::vector<std::pair<double, std::string>> GetNBestPath(int n);
std::string GetFinalBestPath();
std::string GetPartialResult() {
CHECK(false) << "Not implement.";
return {};
}
int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
const std::vector<std::string>& nbest_words);
private:
void ResetPrefixes();
int32 SearchOneChar(const bool& full_beam,
const std::pair<size_t, BaseFloat>& log_prob_idx,
const BaseFloat& min_cutoff);
void CalculateApproxScore();
void LMRescore();
void AdvanceDecoding(const std::vector<std::vector<BaseFloat>>& probs);
CTCBeamSearchOptions opts_;
std::shared_ptr<Scorer> init_ext_scorer_; // todo separate later
std::vector<std::string> vocabulary_; // todo remove later
int space_id_;
std::shared_ptr<PathTrie> root_;
std::vector<PathTrie*> prefixes_;
DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch);
};
} // namespace ppspeech

@@ -1,167 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// used by deepspeech2
#include "base/flags.h"
#include "base/log.h"
#include "decoder/ctc_beam_search_decoder.h"
#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/ds2_nnet.h"
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
DEFINE_string(lm_path, "", "language model");
DEFINE_int32(receptive_field_length,
7,
"receptive field of two CNN(kernel=3) downsampling module.");
DEFINE_int32(subsampling_rate,
4,
"two CNN(kernel=3) module downsampling rate.");
DEFINE_string(
model_input_names,
"audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
"model input names");
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk");
using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;
// test ds2 online decoder by feeding speech feature
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
CHECK_NE(FLAGS_result_wspecifier, "");
CHECK_NE(FLAGS_feature_rspecifier, "");
kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_rspecifier);
kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
std::string model_path = FLAGS_model_path;
std::string model_params = FLAGS_param_path;
std::string dict_file = FLAGS_dict_file;
std::string lm_path = FLAGS_lm_path;
LOG(INFO) << "model path: " << model_path;
LOG(INFO) << "model param: " << model_params;
LOG(INFO) << "dict path: " << dict_file;
LOG(INFO) << "lm path: " << lm_path;
int32 num_done = 0, num_err = 0;
ppspeech::CTCBeamSearchOptions opts;
opts.dict_file = dict_file;
opts.lm_path = lm_path;
ppspeech::CTCBeamSearch decoder(opts);
ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags();
std::shared_ptr<ppspeech::PaddleNnet> nnet(
new ppspeech::PaddleNnet(model_opts));
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nnet, raw_data));
int32 chunk_size = FLAGS_receptive_field_length +
(FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate;
int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk;
int32 receptive_field_length = FLAGS_receptive_field_length;
LOG(INFO) << "chunk size (frame): " << chunk_size;
LOG(INFO) << "chunk stride (frame): " << chunk_stride;
LOG(INFO) << "receptive field (frame): " << receptive_field_length;
decoder.InitDecoder();
kaldi::Timer timer;
for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key();
kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
raw_data->SetDim(feature.NumCols());
LOG(INFO) << "process utt: " << utt;
LOG(INFO) << "rows: " << feature.NumRows();
LOG(INFO) << "cols: " << feature.NumCols();
int32 row_idx = 0;
int32 padding_len = 0;
int32 ori_feature_len = feature.NumRows();
if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
padding_len =
chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
feature.Resize(feature.NumRows() + padding_len,
feature.NumCols(),
kaldi::kCopyData);
}
int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
feature.NumCols());
int32 feature_chunk_size = 0;
if (ori_feature_len > chunk_idx * chunk_stride) {
feature_chunk_size = std::min(
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (feature_chunk_size < receptive_field_length) break;
int32 start = chunk_idx * chunk_stride;
for (int row_id = 0; row_id < chunk_size; ++row_id) {
kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
feature_chunk.Data() + row_id * feature.NumCols(),
feature.NumCols());
f_chunk_tmp.CopyFromVec(tmp);
++start;
}
raw_data->Accept(feature_chunk);
if (chunk_idx == num_chunks - 1) {
raw_data->SetFinished();
}
decoder.AdvanceDecode(decodable);
}
std::string result;
result = decoder.GetFinalBestPath();
decodable->Reset();
decoder.Reset();
if (result.empty()) {
// the TokenWriter can not write empty string.
++num_err;
KALDI_LOG << " the result of " << utt << " is empty";
continue;
}
KALDI_LOG << " the result of " << utt << " is " << result;
result_writer.Write(utt, result);
++num_done;
}
KALDI_LOG << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
double elapsed = timer.Elapsed();
KALDI_LOG << " cost:" << elapsed << " s";
return (num_done != 0 ? 0 : 1);
}

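Editorial note: the removed ds2 binary above slices the feature matrix into overlapping chunks before decoding. A standalone sketch of that chunking arithmetic (not from the repo), using the flag defaults from the file:

#include <cstdio>

int main() {
    const int receptive_field_length = 7;  // FLAGS_receptive_field_length
    const int subsampling_rate = 4;        // FLAGS_subsampling_rate
    const int nnet_decoder_chunk = 1;      // FLAGS_nnet_decoder_chunk
    int num_frames = 100;                  // stand-in for feature.NumRows()

    const int chunk_size =
        receptive_field_length + (nnet_decoder_chunk - 1) * subsampling_rate;
    const int chunk_stride = subsampling_rate * nnet_decoder_chunk;

    int padding_len = 0;
    if ((num_frames - chunk_size) % chunk_stride != 0) {
        padding_len = chunk_stride - (num_frames - chunk_size) % chunk_stride;
        num_frames += padding_len;  // mirrors feature.Resize(..., kCopyData)
    }
    const int num_chunks = (num_frames - chunk_size) / chunk_stride + 1;
    // For 100 input frames: chunk_size=7, stride=4, padding=3, num_chunks=25.
    std::printf("chunk_size=%d stride=%d padding=%d chunks=%d\n",
                chunk_size, chunk_stride, padding_len, num_chunks);
    return 0;
}
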
@@ -1,9 +0,0 @@
ThreadPool/
build/
dist/
kenlm/
openfst-1.6.3/
openfst-1.6.3.tar.gz
swig_decoders.egg-info/
decoders_wrap.cxx
swig_decoders.py

@@ -1,607 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "ctc_beam_search_decoder.h"
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <utility>
#include "ThreadPool.h"
#include "fst/fstlib.h"
#include "decoder_utils.h"
#include "path_trie.h"
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
std::vector<std::pair<double, std::string>> ctc_beam_search_decoding(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
// vocabulary.size() + 1,
vocabulary.size(),
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// init prefixes' root
PathTrie root;
root.score = root.log_prob_b_prev = 0.0;
std::vector<PathTrie *> prefixes;
prefixes.push_back(&root);
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
auto fst_dict =
static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
fst::StdVectorFst *dict_ptr = fst_dict->Copy(true);
root.set_dictionary(dict_ptr);
auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
root.set_matcher(matcher);
}
// prefix search over time
for (size_t time_step = 0; time_step < num_time_steps; ++time_step) {
auto &prob = probs_seq[time_step];
float min_cutoff = -NUM_FLT_INF;
bool full_beam = false;
if (ext_scorer != nullptr) {
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(prefixes.begin(),
prefixes.begin() + num_prefixes,
prefix_compare);
min_cutoff = prefixes[num_prefixes - 1]->score +
std::log(prob[blank_id]) -
std::max(0.0, ext_scorer->beta);
full_beam = (num_prefixes == beam_size);
}
std::vector<std::pair<size_t, float>> log_prob_idx =
get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n);
// loop over chars
for (size_t index = 0; index < log_prob_idx.size(); index++) {
auto c = log_prob_idx[index].first;
auto log_prob_c = log_prob_idx[index].second;
for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) {
auto prefix = prefixes[i];
if (full_beam && log_prob_c + prefix->score < min_cutoff) {
break;
}
// blank
if (c == blank_id) {
prefix->log_prob_b_cur = log_sum_exp(
prefix->log_prob_b_cur, log_prob_c + prefix->score);
continue;
}
// repeated character
if (c == prefix->character) {
prefix->log_prob_nb_cur =
log_sum_exp(prefix->log_prob_nb_cur,
log_prob_c + prefix->log_prob_nb_prev);
}
// get new prefix
auto prefix_new = prefix->get_path_trie(c);
if (prefix_new != nullptr) {
float log_p = -NUM_FLT_INF;
if (c == prefix->character &&
prefix->log_prob_b_prev > -NUM_FLT_INF) {
log_p = log_prob_c + prefix->log_prob_b_prev;
} else if (c != prefix->character) {
log_p = log_prob_c + prefix->score;
}
// language model scoring
if (ext_scorer != nullptr &&
(c == space_id || ext_scorer->is_character_based())) {
PathTrie *prefix_to_score = nullptr;
// skip scoring the space
if (ext_scorer->is_character_based()) {
prefix_to_score = prefix_new;
} else {
prefix_to_score = prefix;
}
float score = 0.0;
std::vector<std::string> ngram;
ngram = ext_scorer->make_ngram(prefix_to_score);
score = ext_scorer->get_log_cond_prob(ngram) *
ext_scorer->alpha;
log_p += score;
log_p += ext_scorer->beta;
}
prefix_new->log_prob_nb_cur =
log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
}
} // end of loop over prefix
} // end of loop over vocabulary
prefixes.clear();
// update log probs
root.iterate_to_vec(prefixes);
// only preserve top beam_size prefixes
if (prefixes.size() >= beam_size) {
std::nth_element(prefixes.begin(),
prefixes.begin() + beam_size,
prefixes.end(),
prefix_compare);
for (size_t i = beam_size; i < prefixes.size(); ++i) {
prefixes[i]->remove();
}
}
} // end of loop over time
// score the last word of each prefix that doesn't end with space
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
auto prefix = prefixes[i];
if (!prefix->is_empty() && prefix->character != space_id) {
float score = 0.0;
std::vector<std::string> ngram = ext_scorer->make_ngram(prefix);
score =
ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
score += ext_scorer->beta;
prefix->score += score;
}
}
}
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(
prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
// compute approximate ctc score as the return score, without affecting the
// return order of decoding result. To delete when decoder gets stable.
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
double approx_ctc = prefixes[i]->score;
if (ext_scorer != nullptr) {
std::vector<int> output;
prefixes[i]->get_path_vec(output);
auto prefix_length = output.size();
auto words = ext_scorer->split_labels(output);
// remove word insert
approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
// remove language model weight:
approx_ctc -=
(ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
}
prefixes[i]->approx_ctc = approx_ctc;
}
return get_beam_search_result(prefixes, vocabulary, beam_size);
}
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoding_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &vocabulary,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
// thread pool
ThreadPool pool(num_processes);
// number of samples
size_t batch_size = probs_split.size();
// enqueue the tasks of decoding
std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
for (size_t i = 0; i < batch_size; ++i) {
res.emplace_back(pool.enqueue(ctc_beam_search_decoding,
probs_split[i],
vocabulary,
beam_size,
cutoff_prob,
cutoff_top_n,
ext_scorer,
blank_id));
}
// get decoding results
std::vector<std::vector<std::pair<double, std::string>>> batch_results;
for (size_t i = 0; i < batch_size; ++i) {
batch_results.emplace_back(res[i].get());
}
return batch_results;
}
void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer) {
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
auto fst_dict =
static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
fst::StdVectorFst *dict_ptr = fst_dict->Copy(true);
root->set_dictionary(dict_ptr);
auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
root->set_matcher(matcher);
}
}
void ctc_beam_search_decode_chunk(
PathTrie *root,
std::vector<PathTrie *> &prefixes,
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
// vocabulary.size() + 1,
vocabulary.size(),
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// init prefixes' root
//
// prefix search over time
for (size_t time_step = 0; time_step < num_time_steps; ++time_step) {
auto &prob = probs_seq[time_step];
float min_cutoff = -NUM_FLT_INF;
bool full_beam = false;
if (ext_scorer != nullptr) {
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(prefixes.begin(),
prefixes.begin() + num_prefixes,
prefix_compare);
min_cutoff = prefixes[num_prefixes - 1]->score +
std::log(prob[blank_id]) -
std::max(0.0, ext_scorer->beta);
full_beam = (num_prefixes == beam_size);
}
std::vector<std::pair<size_t, float>> log_prob_idx =
get_pruned_log_probs(prob, cutoff_prob, cutoff_top_n);
// loop over chars
for (size_t index = 0; index < log_prob_idx.size(); index++) {
auto c = log_prob_idx[index].first;
auto log_prob_c = log_prob_idx[index].second;
for (size_t i = 0; i < prefixes.size() && i < beam_size; ++i) {
auto prefix = prefixes[i];
if (full_beam && log_prob_c + prefix->score < min_cutoff) {
break;
}
// blank
if (c == blank_id) {
prefix->log_prob_b_cur = log_sum_exp(
prefix->log_prob_b_cur, log_prob_c + prefix->score);
continue;
}
// repeated character
if (c == prefix->character) {
prefix->log_prob_nb_cur =
log_sum_exp(prefix->log_prob_nb_cur,
log_prob_c + prefix->log_prob_nb_prev);
}
// get new prefix
auto prefix_new = prefix->get_path_trie(c);
if (prefix_new != nullptr) {
float log_p = -NUM_FLT_INF;
if (c == prefix->character &&
prefix->log_prob_b_prev > -NUM_FLT_INF) {
log_p = log_prob_c + prefix->log_prob_b_prev;
} else if (c != prefix->character) {
log_p = log_prob_c + prefix->score;
}
// language model scoring
if (ext_scorer != nullptr &&
(c == space_id || ext_scorer->is_character_based())) {
PathTrie *prefix_to_score = nullptr;
// skip scoring the space
if (ext_scorer->is_character_based()) {
prefix_to_score = prefix_new;
} else {
prefix_to_score = prefix;
}
float score = 0.0;
std::vector<std::string> ngram;
ngram = ext_scorer->make_ngram(prefix_to_score);
score = ext_scorer->get_log_cond_prob(ngram) *
ext_scorer->alpha;
log_p += score;
log_p += ext_scorer->beta;
}
prefix_new->log_prob_nb_cur =
log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
}
} // end of loop over prefix
} // end of loop over vocabulary
prefixes.clear();
// update log probs
root->iterate_to_vec(prefixes);
// only preserve top beam_size prefixes
if (prefixes.size() >= beam_size) {
std::nth_element(prefixes.begin(),
prefixes.begin() + beam_size,
prefixes.end(),
prefix_compare);
for (size_t i = beam_size; i < prefixes.size(); ++i) {
prefixes[i]->remove();
}
}
} // end of loop over time
return;
}
std::vector<std::pair<double, std::string>> get_decode_result(
std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size,
Scorer *ext_scorer) {
auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
space_id = -2;
}
// score the last word of each prefix that doesn't end with space
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
auto prefix = prefixes[i];
if (!prefix->is_empty() && prefix->character != space_id) {
float score = 0.0;
std::vector<std::string> ngram = ext_scorer->make_ngram(prefix);
score =
ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
score += ext_scorer->beta;
prefix->score += score;
}
}
}
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(
prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
// compute aproximate ctc score as the return score, without affecting the
// return order of decoding result. To delete when decoder gets stable.
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
double approx_ctc = prefixes[i]->score;
if (ext_scorer != nullptr) {
std::vector<int> output;
prefixes[i]->get_path_vec(output);
auto prefix_length = output.size();
auto words = ext_scorer->split_labels(output);
// remove word insert
approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
// remove language model weight:
approx_ctc -=
(ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
}
prefixes[i]->approx_ctc = approx_ctc;
}
std::vector<std::pair<double, std::string>> res =
get_beam_search_result(prefixes, vocabulary, beam_size);
// pay back the last word of each prefix that doesn't end with space (for
// decoding by chunk)
if (ext_scorer != nullptr && !ext_scorer->is_character_based()) {
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
auto prefix = prefixes[i];
if (!prefix->is_empty() && prefix->character != space_id) {
float score = 0.0;
std::vector<std::string> ngram = ext_scorer->make_ngram(prefix);
score =
ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
score += ext_scorer->beta;
prefix->score -= score;
}
}
}
return res;
}
void free_storage(std::unique_ptr<CtcBeamSearchDecoderStorage> &storage) {
storage = nullptr;
}
CtcBeamSearchDecoderBatch::~CtcBeamSearchDecoderBatch() {}
CtcBeamSearchDecoderBatch::CtcBeamSearchDecoderBatch(
const std::vector<std::string> &vocabulary,
size_t batch_size,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id)
: batch_size(batch_size),
beam_size(beam_size),
num_processes(num_processes),
cutoff_prob(cutoff_prob),
cutoff_top_n(cutoff_top_n),
ext_scorer(ext_scorer),
blank_id(blank_id) {
VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!");
VALID_CHECK_GT(
this->num_processes, 0, "num_processes must be nonnegative!");
this->vocabulary = vocabulary;
for (size_t i = 0; i < batch_size; i++) {
this->decoder_storage_vector.push_back(
std::unique_ptr<CtcBeamSearchDecoderStorage>(
new CtcBeamSearchDecoderStorage()));
ctc_beam_search_decode_chunk_begin(
this->decoder_storage_vector[i]->root, ext_scorer);
}
};
/**
* Input
* probs_split: shape [B, T, D]
*/
void CtcBeamSearchDecoderBatch::next(
const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &has_value) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
// thread pool
size_t num_has_value = 0;
for (int i = 0; i < has_value.size(); i++)
if (has_value[i] == "true") num_has_value += 1;
ThreadPool pool(std::min(num_processes, num_has_value));
// number of samples
size_t probs_num = probs_split.size();
VALID_CHECK_EQ(this->batch_size,
probs_num,
"The batch size of the current input data should be same "
"with the input data before");
// enqueue the tasks of decoding
std::vector<std::future<void>> res;
for (size_t i = 0; i < batch_size; ++i) {
if (has_value[i] == "true") {
res.emplace_back(pool.enqueue(
ctc_beam_search_decode_chunk,
std::ref(this->decoder_storage_vector[i]->root),
std::ref(this->decoder_storage_vector[i]->prefixes),
probs_split[i],
this->vocabulary,
this->beam_size,
this->cutoff_prob,
this->cutoff_top_n,
this->ext_scorer,
this->blank_id));
}
}
for (size_t i = 0; i < batch_size; ++i) {
res[i].get();
}
return;
};
/**
* Return
* batch_result: shape[B, beam_size,(-approx_ctc score, string)]
*/
std::vector<std::vector<std::pair<double, std::string>>>
CtcBeamSearchDecoderBatch::decode() {
VALID_CHECK_GT(
this->num_processes, 0, "num_processes must be nonnegative!");
// thread pool
ThreadPool pool(this->num_processes);
// number of samples
// enqueue the tasks of decoding
std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
for (size_t i = 0; i < this->batch_size; ++i) {
res.emplace_back(
pool.enqueue(get_decode_result,
std::ref(this->decoder_storage_vector[i]->prefixes),
this->vocabulary,
this->beam_size,
this->ext_scorer));
}
// get decoding results
std::vector<std::vector<std::pair<double, std::string>>> batch_results;
for (size_t i = 0; i < this->batch_size; ++i) {
batch_results.emplace_back(res[i].get());
}
return batch_results;
}
/**
* reset the state of ctcBeamSearchDecoderBatch
*/
void CtcBeamSearchDecoderBatch::reset_state(size_t batch_size,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n) {
this->batch_size = batch_size;
this->beam_size = beam_size;
this->num_processes = num_processes;
this->cutoff_prob = cutoff_prob;
this->cutoff_top_n = cutoff_top_n;
VALID_CHECK_GT(this->beam_size, 0, "beam_size must be greater than 0!");
VALID_CHECK_GT(
this->num_processes, 0, "num_processes must be nonnegative!");
// thread pool
ThreadPool pool(this->num_processes);
// number of samples
// enqueue the tasks of decoding
std::vector<std::future<void>> res;
size_t storage_size = decoder_storage_vector.size();
for (size_t i = 0; i < storage_size; i++) {
res.emplace_back(pool.enqueue(
free_storage, std::ref(this->decoder_storage_vector[i])));
}
for (size_t i = 0; i < storage_size; ++i) {
res[i].get();
}
std::vector<std::unique_ptr<CtcBeamSearchDecoderStorage>>().swap(
decoder_storage_vector);
for (size_t i = 0; i < this->batch_size; i++) {
this->decoder_storage_vector.push_back(
std::unique_ptr<CtcBeamSearchDecoderStorage>(
new CtcBeamSearchDecoderStorage()));
ctc_beam_search_decode_chunk_begin(
this->decoder_storage_vector[i]->root, this->ext_scorer);
}
}

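Editorial note: CtcBeamSearchDecoderBatch (removed above) fans one decoding task per utterance out to a third-party ThreadPool and collects the futures. A self-contained sketch of that enqueue/collect pattern, with std::async swapped in for ThreadPool so it compiles on its own; decode_one is a placeholder, not the repo's API.

#include <cstdio>
#include <future>
#include <string>
#include <vector>

// Placeholder task; the real code enqueues ctc_beam_search_decode_chunk or
// get_decode_result per utterance.
std::string decode_one(int utt_id) {
    return "result-" + std::to_string(utt_id);
}

int main() {
    const size_t batch_size = 4;
    std::vector<std::future<std::string>> res;
    for (size_t i = 0; i < batch_size; ++i) {
        // pool.enqueue(...) in the original; one future per utterance.
        res.emplace_back(
            std::async(std::launch::async, decode_one, static_cast<int>(i)));
    }
    for (size_t i = 0; i < batch_size; ++i) {
        std::printf("%s\n", res[i].get().c_str());  // get() blocks until done
    }
    return 0;
}
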
@@ -1,175 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CTC_BEAM_SEARCH_DECODER_H_
#define CTC_BEAM_SEARCH_DECODER_H_
#include <string>
#include <utility>
#include <vector>
#include "scorer.h"
/* CTC Beam Search Decoder
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* beam_size: The width of beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A vector that each element is a pair of score and decoding result,
* in desending order.
*/
std::vector<std::pair<double, std::string>> ctc_beam_search_decoding(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/* CTC Beam Search Decoder for batch data
* Parameters:
* probs_seq: 3-D vector that each element is a 2-D vector that can be used
* by ctc_beam_search_decoder().
* vocabulary: A vector of vocabulary.
* beam_size: The width of beam search.
* num_processes: Number of threads for beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A 2-D vector that each element is a vector of beam search decoding
* result for one audio sample.
*/
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoding_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &vocabulary,
size_t beam_size,
size_t num_processes,
double cutoff_prob = 1.0,
size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/**
* Store the root and prefixes for decoder
*/
class CtcBeamSearchDecoderStorage {
public:
PathTrie *root = nullptr;
std::vector<PathTrie *> prefixes;
CtcBeamSearchDecoderStorage() {
// init prefixes' root
this->root = new PathTrie();
this->root->log_prob_b_prev = 0.0;
// The score of root is in log scale.Since the prob=1.0, the prob score
// in log scale is 0.0
this->root->score = root->log_prob_b_prev;
// std::vector<PathTrie *> prefixes;
this->prefixes.push_back(root);
};
~CtcBeamSearchDecoderStorage() {
if (root != nullptr) {
delete root;
root = nullptr;
}
};
};
/**
* The ctc beam search decoder, support batchsize >= 1
*/
class CtcBeamSearchDecoderBatch {
public:
CtcBeamSearchDecoderBatch(const std::vector<std::string> &vocabulary,
size_t batch_size,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id);
~CtcBeamSearchDecoderBatch();
void next(const std::vector<std::vector<std::vector<double>>> &probs_split,
const std::vector<std::string> &has_value);
std::vector<std::vector<std::pair<double, std::string>>> decode();
void reset_state(size_t batch_size,
size_t beam_size,
size_t num_processes,
double cutoff_prob,
size_t cutoff_top_n);
private:
std::vector<std::string> vocabulary;
size_t batch_size;
size_t beam_size;
size_t num_processes;
double cutoff_prob;
size_t cutoff_top_n;
Scorer *ext_scorer;
size_t blank_id;
std::vector<std::unique_ptr<CtcBeamSearchDecoderStorage>>
decoder_storage_vector;
};
/**
* function for chunk decoding
*/
void ctc_beam_search_decode_chunk(
PathTrie *root,
std::vector<PathTrie *> &prefixes,
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t beam_size,
double cutoff_prob,
size_t cutoff_top_n,
Scorer *ext_scorer,
size_t blank_id);
std::vector<std::pair<double, std::string>> get_decode_result(
std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size,
Scorer *ext_scorer);
/**
* free the CtcBeamSearchDecoderStorage
*/
void free_storage(std::unique_ptr<CtcBeamSearchDecoderStorage> &storage);
/**
* initialize the root
*/
void ctc_beam_search_decode_chunk_begin(PathTrie *root, Scorer *ext_scorer);
#endif // CTC_BEAM_SEARCH_DECODER_H_

@@ -1,61 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "ctc_greedy_decoder.h"
#include "decoder_utils.h"
std::string ctc_greedy_decoding(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary,
size_t blank_id) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size(),
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec;
for (size_t i = 0; i < num_time_steps; ++i) {
double max_prob = 0.0;
size_t max_idx = 0;
const std::vector<double> &probs_step = probs_seq[i];
for (size_t j = 0; j < probs_step.size(); ++j) {
if (max_prob < probs_step[j]) {
max_idx = j;
max_prob = probs_step[j];
}
}
// id with maximum probability in current time step
max_idx_vec[i] = max_idx;
// deduplicate
if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) {
idx_vec.push_back(max_idx_vec[i]);
}
}
std::string best_path_result;
for (size_t i = 0; i < idx_vec.size(); ++i) {
if (idx_vec[i] != blank_id) {
std::string ch = vocabulary[idx_vec[i]];
best_path_result += (ch == kSPACE) ? tSPACE : ch;
}
}
return best_path_result;
}

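Editorial note: the removed ctc_greedy_decoding above reduces to argmax per frame, merge repeats, drop blanks. A toy run of that logic (not from the repo) on a 4-frame, 3-symbol distribution:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Per-frame distributions over {<blank>=0, "a"=1, "b"=2}.
    const std::vector<std::vector<double>> probs_seq = {
        {0.1, 0.8, 0.1},   // argmax 1 -> "a"
        {0.1, 0.7, 0.2},   // argmax 1, repeat -> merged
        {0.8, 0.1, 0.1},   // argmax 0, blank -> dropped
        {0.2, 0.1, 0.7}};  // argmax 2 -> "b"
    const std::vector<std::string> vocab = {"<blank>", "a", "b"};
    const size_t blank_id = 0;

    std::string result;
    size_t prev = SIZE_MAX;  // sentinel: no previous frame yet
    for (const auto& step : probs_seq) {
        size_t best = 0;
        for (size_t j = 1; j < step.size(); ++j) {
            if (step[j] > step[best]) best = j;
        }
        if (best != prev && best != blank_id) result += vocab[best];
        prev = best;
    }
    std::printf("%s\n", result.c_str());  // prints "ab"
    return 0;
}
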
@@ -1,35 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CTC_GREEDY_DECODER_H
#define CTC_GREEDY_DECODER_H
#include <string>
#include <vector>
/* CTC Greedy (Best Path) Decoder
*
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* Return:
* The decoding result in string
*/
std::string ctc_greedy_decoding(
const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary,
size_t blank_id);
#endif // CTC_GREEDY_DECODER_H

@@ -1,193 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder_utils.h"
#include <algorithm>
#include <cmath>
#include <limits>
std::vector<std::pair<size_t, float>> get_pruned_log_probs(
const std::vector<double> &prob_step,
double cutoff_prob,
size_t cutoff_top_n) {
std::vector<std::pair<int, double>> prob_idx;
for (size_t i = 0; i < prob_step.size(); ++i) {
prob_idx.push_back(std::pair<int, double>(i, prob_step[i]));
}
// pruning of vocabulary
size_t cutoff_len = prob_step.size();
if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) {
std::sort(prob_idx.begin(),
prob_idx.end(),
pair_comp_second_rev<int, double>);
if (cutoff_prob < 1.0) {
double cum_prob = 0.0;
cutoff_len = 0;
for (size_t i = 0; i < prob_idx.size(); ++i) {
cum_prob += prob_idx[i].second;
cutoff_len += 1;
if (cum_prob >= cutoff_prob || cutoff_len >= cutoff_top_n)
break;
}
}
prob_idx = std::vector<std::pair<int, double>>(
prob_idx.begin(), prob_idx.begin() + cutoff_len);
}
std::vector<std::pair<size_t, float>> log_prob_idx;
for (size_t i = 0; i < cutoff_len; ++i) {
log_prob_idx.push_back(std::pair<int, float>(
prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN)));
}
return log_prob_idx;
}
std::vector<std::pair<double, std::string>> get_beam_search_result(
const std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size) {
// allow for the post processing
std::vector<PathTrie *> space_prefixes;
if (space_prefixes.empty()) {
for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) {
space_prefixes.push_back(prefixes[i]);
}
}
std::sort(space_prefixes.begin(), space_prefixes.end(), prefix_compare);
std::vector<std::pair<double, std::string>> output_vecs;
for (size_t i = 0; i < beam_size && i < space_prefixes.size(); ++i) {
std::vector<int> output;
space_prefixes[i]->get_path_vec(output);
// convert index to string
std::string output_str;
for (size_t j = 0; j < output.size(); j++) {
std::string ch = vocabulary[output[j]];
output_str += (ch == kSPACE) ? tSPACE : ch;
}
std::pair<double, std::string> output_pair(
-space_prefixes[i]->approx_ctc, output_str);
output_vecs.emplace_back(output_pair);
}
return output_vecs;
}
size_t get_utf8_str_len(const std::string &str) {
size_t str_len = 0;
for (char c : str) {
str_len += ((c & 0xc0) != 0x80);
}
return str_len;
}
std::vector<std::string> split_utf8_str(const std::string &str) {
std::vector<std::string> result;
std::string out_str;
for (char c : str) {
if ((c & 0xc0) != 0x80) // new UTF-8 character
{
if (!out_str.empty()) {
result.push_back(out_str);
out_str.clear();
}
}
out_str.append(1, c);
}
result.push_back(out_str);
return result;
}
std::vector<std::string> split_str(const std::string &s,
const std::string &delim) {
std::vector<std::string> result;
std::size_t start = 0, delim_len = delim.size();
while (true) {
std::size_t end = s.find(delim, start);
if (end == std::string::npos) {
if (start < s.size()) {
result.push_back(s.substr(start));
}
break;
}
if (end > start) {
result.push_back(s.substr(start, end - start));
}
start = end + delim_len;
}
return result;
}
bool prefix_compare(const PathTrie *x, const PathTrie *y) {
if (x->score == y->score) {
if (x->character == y->character) {
return false;
} else {
return (x->character < y->character);
}
} else {
return x->score > y->score;
}
}
void add_word_to_fst(const std::vector<int> &word,
fst::StdVectorFst *dictionary) {
if (dictionary->NumStates() == 0) {
fst::StdVectorFst::StateId start = dictionary->AddState();
assert(start == 0);
dictionary->SetStart(start);
}
fst::StdVectorFst::StateId src = dictionary->Start();
fst::StdVectorFst::StateId dst;
for (auto c : word) {
dst = dictionary->AddState();
dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
src = dst;
}
dictionary->SetFinal(dst, fst::StdArc::Weight::One());
}
bool add_word_to_dictionary(
const std::string &word,
const std::unordered_map<std::string, int> &char_map,
bool add_space,
int SPACE_ID,
fst::StdVectorFst *dictionary) {
auto characters = split_utf8_str(word);
std::vector<int> int_word;
for (auto &c : characters) {
if (c == " ") {
int_word.push_back(SPACE_ID);
} else {
auto int_c = char_map.find(c);
if (int_c != char_map.end()) {
int_word.push_back(int_c->second);
} else {
return false; // return without adding
}
}
}
if (add_space) {
int_word.push_back(SPACE_ID);
}
add_word_to_fst(int_word, dictionary);
return true; // return with successful adding
}

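Editorial note: get_utf8_str_len and split_utf8_str (removed above) both key off the same test: a byte starts a UTF-8 character iff (c & 0xc0) != 0x80, i.e. it is not a 10xxxxxx continuation byte. A small standalone check, not from the repo:

#include <cstdio>
#include <string>

// Count UTF-8 characters by counting non-continuation bytes.
size_t utf8_len(const std::string& s) {
    size_t n = 0;
    for (unsigned char c : s) n += ((c & 0xc0) != 0x80);
    return n;
}

int main() {
    std::printf("%zu\n", utf8_len("abc"));  // 3: one byte per character
    std::printf("%zu\n", utf8_len("\xe4\xbd\xa0\xe5\xa5\xbd"));  // 2: U+4F60 U+597D
    return 0;
}
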
@@ -1,111 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef DECODER_UTILS_H_
#define DECODER_UTILS_H_
#include <string>
#include <utility>
#include "fst/log.h"
#include "path_trie.h"
const std::string kSPACE = "<space>";
const std::string tSPACE = " ";
const float NUM_FLT_INF = std::numeric_limits<float>::max();
const float NUM_FLT_MIN = std::numeric_limits<float>::min();
// inline function for validation check
inline void check(
bool x, const char *expr, const char *file, int line, const char *err) {
if (!x) {
std::cout << "[" << file << ":" << line << "] ";
LOG(FATAL) << "\"" << expr << "\" check failed. " << err;
}
}
#define VALID_CHECK(x, info) \
check(static_cast<bool>(x), #x, __FILE__, __LINE__, info)
#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info)
#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info)
#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info)
// Function template for comparing two pairs
template <typename T1, typename T2>
bool pair_comp_first_rev(const std::pair<T1, T2> &a,
const std::pair<T1, T2> &b) {
return a.first > b.first;
}
// Function template for comparing two pairs
template <typename T1, typename T2>
bool pair_comp_second_rev(const std::pair<T1, T2> &a,
const std::pair<T1, T2> &b) {
return a.second > b.second;
}
// Return the sum of two probabilities in log scale
template <typename T>
T log_sum_exp(const T &x, const T &y) {
static T num_min = -std::numeric_limits<T>::max();
if (x <= num_min) return y;
if (y <= num_min) return x;
T xmax = std::max(x, y);
return std::log(std::exp(x - xmax) + std::exp(y - xmax)) + xmax;
}
// Get pruned probability vector for each time step's beam search
std::vector<std::pair<size_t, float>> get_pruned_log_probs(
const std::vector<double> &prob_step,
double cutoff_prob,
size_t cutoff_top_n);
// Get beam search result from prefixes in trie tree
std::vector<std::pair<double, std::string>> get_beam_search_result(
const std::vector<PathTrie *> &prefixes,
const std::vector<std::string> &vocabulary,
size_t beam_size);
// Function for prefix comparison
bool prefix_compare(const PathTrie *x, const PathTrie *y);
/* Get length of utf8 encoding string
* See: http://stackoverflow.com/a/4063229
*/
size_t get_utf8_str_len(const std::string &str);
/* Split a string into a list of strings on a given string
* delimiter. NB: delimiters on beginning / end of string are
* trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"].
*/
std::vector<std::string> split_str(const std::string &s,
const std::string &delim);
/* Splits string into vector of strings representing
* UTF-8 characters (not same as chars)
*/
std::vector<std::string> split_utf8_str(const std::string &str);
// Add a word, given as character indexes, to the dictionary FST
void add_word_to_fst(const std::vector<int> &word,
fst::StdVectorFst *dictionary);
// Add a word, given as a string, to the dictionary FST
bool add_word_to_dictionary(
const std::string &word,
const std::unordered_map<std::string, int> &char_map,
bool add_space,
int SPACE_ID,
fst::StdVectorFst *dictionary);
#endif  // DECODER_UTILS_H_

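As a quick sanity check on the header above, a numeric sketch (not from the repo) of log_sum_exp: it computes log(exp(x) + exp(y)) stably by factoring out max(x, y).
// Standalone check; expects log(0.5 + 0.25) = log(0.75), about -0.2877.
#include <cmath>
#include <cstdio>
#include "decoder_utils.h"
int main() {
    double x = std::log(0.5);
    double y = std::log(0.25);
    std::printf("%.4f\n", log_sum_exp(x, y));
    return 0;
}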
@ -1,164 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "path_trie.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "decoder_utils.h"
PathTrie::PathTrie() {
log_prob_b_prev = -NUM_FLT_INF;
log_prob_nb_prev = -NUM_FLT_INF;
log_prob_b_cur = -NUM_FLT_INF;
log_prob_nb_cur = -NUM_FLT_INF;
score = -NUM_FLT_INF;
ROOT_ = -1;
character = ROOT_;
exists_ = true;
parent = nullptr;
dictionary_ = nullptr;
dictionary_state_ = 0;
has_dictionary_ = false;
matcher_ = nullptr;
}
PathTrie::~PathTrie() {
for (auto child : children_) {
delete child.second;
child.second = nullptr;
}
}
PathTrie* PathTrie::get_path_trie(int new_char, bool reset) {
auto child = children_.begin();
for (child = children_.begin(); child != children_.end(); ++child) {
if (child->first == new_char) {
break;
}
}
if (child != children_.end()) {
if (!child->second->exists_) {
child->second->exists_ = true;
child->second->log_prob_b_prev = -NUM_FLT_INF;
child->second->log_prob_nb_prev = -NUM_FLT_INF;
child->second->log_prob_b_cur = -NUM_FLT_INF;
child->second->log_prob_nb_cur = -NUM_FLT_INF;
}
return (child->second);
} else {
if (has_dictionary_) {
matcher_->SetState(dictionary_state_);
bool found = matcher_->Find(new_char + 1);
if (!found) {
// Adding this character takes the word outside the dictionary
auto FSTZERO = fst::TropicalWeight::Zero();
auto final_weight = dictionary_->Final(dictionary_state_);
bool is_final = (final_weight != FSTZERO);
if (is_final && reset) {
dictionary_state_ = dictionary_->Start();
}
return nullptr;
} else {
PathTrie* new_path = new PathTrie;
new_path->character = new_char;
new_path->parent = this;
new_path->dictionary_ = dictionary_;
new_path->dictionary_state_ = matcher_->Value().nextstate;
new_path->has_dictionary_ = true;
new_path->matcher_ = matcher_;
children_.push_back(std::make_pair(new_char, new_path));
return new_path;
}
} else {
PathTrie* new_path = new PathTrie;
new_path->character = new_char;
new_path->parent = this;
children_.push_back(std::make_pair(new_char, new_path));
return new_path;
}
}
}
PathTrie* PathTrie::get_path_vec(std::vector<int>& output) {
return get_path_vec(output, ROOT_);
}
PathTrie* PathTrie::get_path_vec(std::vector<int>& output,
int stop,
size_t max_steps) {
if (character == stop || character == ROOT_ || output.size() == max_steps) {
std::reverse(output.begin(), output.end());
return this;
} else {
output.push_back(character);
return parent->get_path_vec(output, stop, max_steps);
}
}
void PathTrie::iterate_to_vec(std::vector<PathTrie*>& output) {
if (exists_) {
log_prob_b_prev = log_prob_b_cur;
log_prob_nb_prev = log_prob_nb_cur;
log_prob_b_cur = -NUM_FLT_INF;
log_prob_nb_cur = -NUM_FLT_INF;
score = log_sum_exp(log_prob_b_prev, log_prob_nb_prev);
output.push_back(this);
}
for (auto child : children_) {
child.second->iterate_to_vec(output);
}
}
void PathTrie::remove() {
exists_ = false;
if (children_.size() == 0) {
if (parent != nullptr) {
auto child = parent->children_.begin();
for (child = parent->children_.begin();
child != parent->children_.end();
++child) {
if (child->first == character) {
parent->children_.erase(child);
break;
}
}
if (parent->children_.size() == 0 && !parent->exists_) {
parent->remove();
}
}
delete this;
}
}
void PathTrie::set_dictionary(fst::StdVectorFst* dictionary) {
dictionary_ = dictionary;
dictionary_state_ = dictionary->Start();
has_dictionary_ = true;
}
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
void PathTrie::set_matcher(std::shared_ptr<FSTMATCH> matcher) {
matcher_ = matcher;
}

@ -1,82 +0,0 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PATH_TRIE_H
#define PATH_TRIE_H
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include "fst/fstlib.h"
/* Trie tree for prefix storing and manipulating, with a dictionary in
* finite-state transducer for spelling correction.
*/
class PathTrie {
public:
PathTrie();
~PathTrie();
// get new prefix after appending new char
PathTrie* get_path_trie(int new_char, bool reset = true);
// get the prefix in index from root to current node
PathTrie* get_path_vec(std::vector<int>& output);
// get the prefix in index from a given stop node to the current node
PathTrie* get_path_vec(
std::vector<int>& output,
int stop,
size_t max_steps = std::numeric_limits<size_t>::max());
// update log probs
void iterate_to_vec(std::vector<PathTrie*>& output);
// set dictionary for FST
void set_dictionary(fst::StdVectorFst* dictionary);
void set_matcher(std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>>);
bool is_empty() { return ROOT_ == character; }
// remove current path from root
void remove();
float log_prob_b_prev;
float log_prob_nb_prev;
float log_prob_b_cur;
float log_prob_nb_cur;
float score;
float approx_ctc;
int character;
PathTrie* parent;
private:
int ROOT_;
bool exists_;
bool has_dictionary_;
std::vector<std::pair<int, PathTrie*>> children_;
// pointer to dictionary of FST
fst::StdVectorFst* dictionary_;
fst::StdVectorFst::StateId dictionary_state_;
// matcher used for finding arcs in the FST
std::shared_ptr<fst::SortedMatcher<fst::StdVectorFst>> matcher_;
};
#endif // PATH_TRIE_H

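A hedged sketch (not part of the commit) of one decoding step against the PathTrie API above: extend the root with a character id, score the new prefix, roll the scores over, and read the prefix back.
// Toy single-step walkthrough; character id 5 is arbitrary.
#include <vector>
#include "path_trie.h"
void toy_step() {
    PathTrie root;
    PathTrie* prefix = root.get_path_trie(/*new_char=*/5);
    prefix->log_prob_nb_cur = -0.1f;  // pretend per-frame emission score
    std::vector<PathTrie*> prefixes;
    root.iterate_to_vec(prefixes);  // copies *_cur into *_prev, updates score
    std::vector<int> chars;
    prefix->get_path_vec(chars);    // chars == {5}
}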
@ -1,232 +0,0 @@
// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the
// "COPYING.LESSER.3");
#include "scorer.h"
#include <unistd.h>
#include <iostream>
#include "lm/config.hh"
#include "lm/model.hh"
#include "lm/state.hh"
#include "decoder_utils.h"
using namespace lm::ngram;
// If your platform is Windows, you need this define.
#define F_OK 0
Scorer::Scorer(double alpha,
double beta,
const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
this->alpha = alpha;
this->beta = beta;
dictionary = nullptr;
is_character_based_ = true;
language_model_ = nullptr;
max_order_ = 0;
dict_size_ = 0;
SPACE_ID_ = -1;
setup(lm_path, vocab_list);
}
Scorer::~Scorer() {
if (language_model_ != nullptr) {
delete static_cast<lm::base::Model*>(language_model_);
}
if (dictionary != nullptr) {
delete static_cast<fst::StdVectorFst*>(dictionary);
}
}
void Scorer::setup(const std::string& lm_path,
const std::vector<std::string>& vocab_list) {
// load language model
load_lm(lm_path);
// set char map for scorer
set_char_map(vocab_list);
// fill the dictionary for FST
if (!is_character_based()) {
fill_dictionary(true);
}
}
void Scorer::load_lm(const std::string& lm_path) {
const char* filename = lm_path.c_str();
VALID_CHECK_EQ(access(filename, F_OK), 0, "Invalid language model path");
RetriveStrEnumerateVocab enumerate;
lm::ngram::Config config;
config.enumerate_vocab = &enumerate;
language_model_ = lm::ngram::LoadVirtual(filename, config);
max_order_ = static_cast<lm::base::Model*>(language_model_)->Order();
vocabulary_ = enumerate.vocabulary;
for (size_t i = 0; i < vocabulary_.size(); ++i) {
if (is_character_based_ && vocabulary_[i] != UNK_TOKEN &&
vocabulary_[i] != START_TOKEN && vocabulary_[i] != END_TOKEN &&
get_utf8_str_len(enumerate.vocabulary[i]) > 1) {
is_character_based_ = false;
}
}
}
double Scorer::get_log_cond_prob(const std::vector<std::string>& words) {
lm::base::Model* model = static_cast<lm::base::Model*>(language_model_);
double cond_prob;
lm::ngram::State state, tmp_state, out_state;
// avoid inserting <s> at the beginning
model->NullContextWrite(&state);
for (size_t i = 0; i < words.size(); ++i) {
lm::WordIndex word_index = model->BaseVocabulary().Index(words[i]);
// encounter OOV
if (word_index == 0) {
return OOV_SCORE;
}
cond_prob = model->BaseScore(&state, word_index, &out_state);
tmp_state = state;
state = out_state;
out_state = tmp_state;
}
// return log10 prob
return cond_prob;
}
double Scorer::get_sent_log_prob(const std::vector<std::string>& words) {
std::vector<std::string> sentence;
if (words.size() == 0) {
for (size_t i = 0; i < max_order_; ++i) {
sentence.push_back(START_TOKEN);
}
} else {
for (size_t i = 0; i < max_order_ - 1; ++i) {
sentence.push_back(START_TOKEN);
}
sentence.insert(sentence.end(), words.begin(), words.end());
}
sentence.push_back(END_TOKEN);
return get_log_prob(sentence);
}
double Scorer::get_log_prob(const std::vector<std::string>& words) {
assert(words.size() > max_order_);
double score = 0.0;
for (size_t i = 0; i < words.size() - max_order_ + 1; ++i) {
std::vector<std::string> ngram(words.begin() + i,
words.begin() + i + max_order_);
score += get_log_cond_prob(ngram);
}
return score;
}
void Scorer::reset_params(float alpha, float beta) {
this->alpha = alpha;
this->beta = beta;
}
std::string Scorer::vec2str(const std::vector<int>& input) {
std::string word;
for (auto ind : input) {
word += char_list_[ind];
}
return word;
}
std::vector<std::string> Scorer::split_labels(const std::vector<int>& labels) {
if (labels.empty()) return {};
std::string s = vec2str(labels);
std::vector<std::string> words;
if (is_character_based_) {
words = split_utf8_str(s);
} else {
words = split_str(s, " ");
}
return words;
}
void Scorer::set_char_map(const std::vector<std::string>& char_list) {
char_list_ = char_list;
char_map_.clear();
// Set the char map for the FST for spelling correction
for (size_t i = 0; i < char_list_.size(); i++) {
if (char_list_[i] == kSPACE) {
SPACE_ID_ = i;
}
// The initial state of FST is state 0, hence the index of chars in
// the FST should start from 1 to avoid the conflict with the initial
// state, otherwise wrong decoding results would be given.
char_map_[char_list_[i]] = i + 1;
}
}
std::vector<std::string> Scorer::make_ngram(PathTrie* prefix) {
std::vector<std::string> ngram;
PathTrie* current_node = prefix;
PathTrie* new_node = nullptr;
for (int order = 0; order < max_order_; order++) {
std::vector<int> prefix_vec;
if (is_character_based_) {
new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_, 1);
current_node = new_node;
} else {
new_node = current_node->get_path_vec(prefix_vec, SPACE_ID_);
current_node = new_node->parent; // Skipping spaces
}
// reconstruct word
std::string word = vec2str(prefix_vec);
ngram.push_back(word);
if (new_node->character == -1) {
// No more spaces, but still need order
for (int i = 0; i < max_order_ - order - 1; i++) {
ngram.push_back(START_TOKEN);
}
break;
}
}
std::reverse(ngram.begin(), ngram.end());
return ngram;
}
void Scorer::fill_dictionary(bool add_space) {
fst::StdVectorFst dictionary;
// For each unigram convert to ints and put in trie
int dict_size = 0;
for (const auto& word : vocabulary_) {
bool added = add_word_to_dictionary(
word, char_map_, add_space, SPACE_ID_ + 1, &dictionary);
dict_size += added ? 1 : 0;
}
dict_size_ = dict_size;
/* Simplify FST
* This gets rid of "epsilon" transitions in the FST.
* These are transitions that don't require a string input to be taken.
* Getting rid of them is necessary to make the FST deterministic, but
* can greatly increase the size of the FST
*/
fst::RmEpsilon(&dictionary);
fst::StdVectorFst* new_dict = new fst::StdVectorFst;
/* This makes the FST deterministic, meaning for any string input there's
* only one possible state the FST could be in. It is assumed our
* dictionary is deterministic when using it.
* (lest we'd have to check for multiple transitions at each state)
*/
fst::Determinize(dictionary, new_dict);
/* Finds the simplest equivalent fst. This is unnecessary but decreases
* memory usage of the dictionary
*/
fst::Minimize(new_dict);
this->dictionary = new_dict;
}

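To make the scoring flow above concrete, a hedged sketch (not from the repo): get_sent_log_prob pads with <s>, appends </s>, and sums get_log_cond_prob over sliding max-order n-grams. The model path below is a placeholder that must exist on disk.
// Hypothetical caller; "lm.klm", alpha and beta are placeholders.
#include <string>
#include <vector>
#include "scorer.h"
double score_sentence(const std::vector<std::string>& vocab) {
    Scorer scorer(/*alpha=*/2.5, /*beta=*/0.3, "lm.klm", vocab);
    // Internally: log P(HELLO | <s>...) + log P(WORLD | ... HELLO) + ...
    return scorer.get_sent_log_prob({"HELLO", "WORLD"});
}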
@ -1,114 +0,0 @@
// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the
// "COPYING.LESSER.3");
#ifndef SCORER_H_
#define SCORER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"
#include "path_trie.h"
const double OOV_SCORE = -1000.0;
const std::string START_TOKEN = "<s>";
const std::string UNK_TOKEN = "<unk>";
const std::string END_TOKEN = "</s>";
// Implement a callback to retrieve the vocabulary of the language model.
class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
public:
RetriveStrEnumerateVocab() {}
void Add(lm::WordIndex index, const StringPiece &str) {
vocabulary.push_back(std::string(str.data(), str.length()));
}
std::vector<std::string> vocabulary;
};
/* External scorer to query score for n-gram or sentence, including language
* model scoring and word insertion.
*
* Example:
* Scorer scorer(alpha, beta, "path_of_language_model");
* scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
* scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
*/
class Scorer {
public:
Scorer(double alpha,
double beta,
const std::string &lm_path,
const std::vector<std::string> &vocabulary);
~Scorer();
double get_log_cond_prob(const std::vector<std::string> &words);
double get_sent_log_prob(const std::vector<std::string> &words);
// return the max order
size_t get_max_order() const { return max_order_; }
// return the dictionary size of language model
size_t get_dict_size() const { return dict_size_; }
// return true if the language model is character based
bool is_character_based() const { return is_character_based_; }
// reset params alpha & beta
void reset_params(float alpha, float beta);
// make ngram for a given prefix
std::vector<std::string> make_ngram(PathTrie *prefix);
// transform the labels in index to the vector of words (word based lm) or
// the vector of characters (character based lm)
std::vector<std::string> split_labels(const std::vector<int> &labels);
// language model weight
double alpha;
// word insertion weight
double beta;
// pointer to the dictionary of FST
void *dictionary;
protected:
// necessary setup: load language model, set char map, fill FST's dictionary
void setup(const std::string &lm_path,
const std::vector<std::string> &vocab_list);
// load language model from given path
void load_lm(const std::string &lm_path);
// fill dictionary for FST
void fill_dictionary(bool add_space);
// set char map
void set_char_map(const std::vector<std::string> &char_list);
double get_log_prob(const std::vector<std::string> &words);
// translate the vector in index to string
std::string vec2str(const std::vector<int> &input);
private:
void *language_model_;
bool is_character_based_;
size_t max_order_;
size_t dict_size_;
int SPACE_ID_;
std::vector<std::string> char_list_;
std::unordered_map<std::string, int> char_map_;
std::vector<std::string> vocabulary_;
};
#endif // SCORER_H_

@ -1,77 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// TODO: refactor, replace with gtest
#include "base/flags.h"
#include "base/log.h"
#include "decoder/ctc_beam_search_decoder.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
DEFINE_string(nnet_prob_respecifier, "", "test nnet prob rspecifier");
DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
DEFINE_string(lm_path, "lm.klm", "language model");
using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;
// test decoder by feeding nnet posterior probability
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
FLAGS_nnet_prob_respecifier);
std::string dict_file = FLAGS_dict_file;
std::string lm_path = FLAGS_lm_path;
LOG(INFO) << "dict path: " << dict_file;
LOG(INFO) << "lm path: " << lm_path;
int32 num_done = 0, num_err = 0;
ppspeech::CTCBeamSearchOptions opts;
opts.dict_file = dict_file;
opts.lm_path = lm_path;
ppspeech::CTCBeamSearch decoder(opts);
std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nullptr, nullptr));
decoder.InitDecoder();
for (; !likelihood_reader.Done(); likelihood_reader.Next()) {
string utt = likelihood_reader.Key();
const kaldi::Matrix<BaseFloat> likelihood = likelihood_reader.Value();
LOG(INFO) << "process utt: " << utt;
LOG(INFO) << "rows: " << likelihood.NumRows();
LOG(INFO) << "cols: " << likelihood.NumCols();
decodable->Acceptlikelihood(likelihood);
decoder.AdvanceDecode(decodable);
std::string result;
result = decoder.GetFinalBestPath();
KALDI_LOG << " the result of " << utt << " is " << result;
decodable->Reset();
decoder.Reset();
++num_done;
}
KALDI_LOG << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
}

@ -15,8 +15,7 @@
 #pragma once
 #include "base/common.h"
-#include "decoder/ctc_beam_search_decoder.h"
-#include "decoder/ctc_tlg_decoder.h"
+//#include "decoder/ctc_tlg_decoder.h"
 // feature
 DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");

@ -1,30 +1,12 @@
 set(srcs decodable.cc nnet_producer.cc)
-if(USING_DS2)
-  list(APPEND srcs ds2_nnet.cc)
-endif()
-if(USING_U2)
-  list(APPEND srcs u2_nnet.cc)
-endif()
+list(APPEND srcs u2_nnet.cc)
 add_library(nnet STATIC ${srcs})
 target_link_libraries(nnet utils)
-if(USING_U2)
-  target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
-  target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
-endif()
+target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
+target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
-if(USING_DS2)
-  set(bin_name ds2_nnet_main)
-  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-  target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
-  target_link_libraries(${bin_name} ${DEPS})
-endif()
 # test bin
 #if(USING_U2)

@ -1,218 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nnet/ds2_nnet.h"
#include "utils/strings.h"
namespace ppspeech {
using kaldi::Matrix;
using kaldi::Vector;
using std::shared_ptr;
using std::string;
using std::vector;
void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) {
std::vector<std::string> cache_names;
cache_names = StrSplit(opts.cache_names, ",");
std::vector<std::string> cache_shapes;
cache_shapes = StrSplit(opts.cache_shape, ",");
assert(cache_shapes.size() == cache_names.size());
cache_encouts_.clear();
cache_names_idx_.clear();
for (size_t i = 0; i < cache_shapes.size(); i++) {
std::vector<std::string> tmp_shape;
tmp_shape = StrSplit(cache_shapes[i], "-");
std::vector<int> cur_shape;
std::transform(tmp_shape.begin(),
tmp_shape.end(),
std::back_inserter(cur_shape),
[](const std::string& s) { return atoi(s.c_str()); });
cache_names_idx_[cache_names[i]] = i;
std::shared_ptr<Tensor<BaseFloat>> cache_eout =
std::make_shared<Tensor<BaseFloat>>(cur_shape);
cache_encouts_.push_back(cache_eout);
}
}
PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
subsampling_rate_ = opts.subsample_rate;
paddle_infer::Config config;
config.SetModel(opts.model_path, opts.param_path);
if (opts.use_gpu) {
config.EnableUseGpu(500, 0);
}
config.SwitchIrOptim(opts.switch_ir_optim);
if (opts.enable_fc_padding == false) {
config.DisableFCPadding();
}
if (opts.enable_profile) {
config.EnableProfile();
}
pool.reset(
new paddle_infer::services::PredictorPool(config, opts.thread_num));
if (pool == nullptr) {
LOG(ERROR) << "create the predictor pool failed";
}
pool_usages.resize(opts.thread_num);
std::fill(pool_usages.begin(), pool_usages.end(), false);
LOG(INFO) << "load paddle model success";
LOG(INFO) << "start to check the predictor input and output names";
LOG(INFO) << "input names: " << opts.input_names;
LOG(INFO) << "output names: " << opts.output_names;
std::vector<std::string> input_names_vec = StrSplit(opts.input_names, ",");
std::vector<std::string> output_names_vec = StrSplit(opts.output_names, ",");
paddle_infer::Predictor* predictor = GetPredictor();
std::vector<std::string> model_input_names = predictor->GetInputNames();
assert(input_names_vec.size() == model_input_names.size());
for (size_t i = 0; i < model_input_names.size(); i++) {
assert(input_names_vec[i] == model_input_names[i]);
}
std::vector<std::string> model_output_names = predictor->GetOutputNames();
assert(output_names_vec.size() == model_output_names.size());
for (size_t i = 0; i < output_names_vec.size(); i++) {
assert(output_names_vec[i] == model_output_names[i]);
}
ReleasePredictor(predictor);
InitCacheEncouts(opts);
}
void PaddleNnet::Reset() { InitCacheEncouts(opts_); }
paddle_infer::Predictor* PaddleNnet::GetPredictor() {
paddle_infer::Predictor* predictor = nullptr;
std::lock_guard<std::mutex> guard(pool_mutex);
int pred_id = 0;
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
predictor = pool->Retrive(pred_id);
break;
}
++pred_id;
}
if (predictor) {
pool_usages[pred_id] = true;
predictor_to_thread_id[predictor] = pred_id;
} else {
LOG(INFO) << "Failed to get predictor from pool !!!";
}
return predictor;
}
int PaddleNnet::ReleasePredictor(paddle_infer::Predictor* predictor) {
std::lock_guard<std::mutex> guard(pool_mutex);
auto iter = predictor_to_thread_id.find(predictor);
if (iter == predictor_to_thread_id.end()) {
LOG(INFO) << "there is no such predictor";
return 0;
}
pool_usages[iter->second] = false;
predictor_to_thread_id.erase(predictor);
return 0;
}
shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
auto iter = cache_names_idx_.find(name);
if (iter == cache_names_idx_.end()) {
return nullptr;
}
assert(iter->second < cache_encouts_.size());
return cache_encouts_[iter->second];
}
void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
const int32& feature_dim,
NnetOut* out) {
paddle_infer::Predictor* predictor = GetPredictor();
int feat_row = features.Dim() / feature_dim;
std::vector<std::string> input_names = predictor->GetInputNames();
std::vector<std::string> output_names = predictor->GetOutputNames();
// feed inputs
std::unique_ptr<paddle_infer::Tensor> input_tensor =
predictor->GetInputHandle(input_names[0]);
std::vector<int> INPUT_SHAPE = {1, feat_row, feature_dim};
input_tensor->Reshape(INPUT_SHAPE);
input_tensor->CopyFromCpu(features.Data());
std::unique_ptr<paddle_infer::Tensor> input_len =
predictor->GetInputHandle(input_names[1]);
std::vector<int> input_len_size = {1};
input_len->Reshape(input_len_size);
std::vector<int64_t> audio_len;
audio_len.push_back(feat_row);
input_len->CopyFromCpu(audio_len.data());
std::unique_ptr<paddle_infer::Tensor> state_h =
predictor->GetInputHandle(input_names[2]);
shared_ptr<Tensor<BaseFloat>> h_cache = GetCacheEncoder(input_names[2]);
state_h->Reshape(h_cache->get_shape());
state_h->CopyFromCpu(h_cache->get_data().data());
std::unique_ptr<paddle_infer::Tensor> state_c =
predictor->GetInputHandle(input_names[3]);
shared_ptr<Tensor<float>> c_cache = GetCacheEncoder(input_names[3]);
state_c->Reshape(c_cache->get_shape());
state_c->CopyFromCpu(c_cache->get_data().data());
// forward
bool success = predictor->Run();
if (success == false) {
LOG(INFO) << "predictor run occurs error";
}
// fetch outputs
std::unique_ptr<paddle_infer::Tensor> h_out =
predictor->GetOutputHandle(output_names[2]);
assert(h_cache->get_shape() == h_out->shape());
h_out->CopyToCpu(h_cache->get_data().data());
std::unique_ptr<paddle_infer::Tensor> c_out =
predictor->GetOutputHandle(output_names[3]);
assert(c_cache->get_shape() == c_out->shape());
c_out->CopyToCpu(c_cache->get_data().data());
std::unique_ptr<paddle_infer::Tensor> output_tensor =
predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
int32 row = output_shape[1];
int32 col = output_shape[2];
// inferences->Resize(row * col);
// *inference_dim = col;
out->logprobs.Resize(row * col);
out->vocab_dim = col;
output_tensor->CopyToCpu(out->logprobs.Data());
ReleasePredictor(predictor);
}
} // namespace ppspeech

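The checkout/return discipline in GetPredictor/ReleasePredictor above generalizes to any fixed-size resource pool. A self-contained sketch with illustrative names (not repo code):
// Mirrors PaddleNnet's pool_usages + pool_mutex bookkeeping.
#include <mutex>
#include <vector>
struct SlotPool {
    std::vector<bool> busy;  // one flag per pooled resource
    std::mutex mtx;
    explicit SlotPool(size_t n) : busy(n, false) {}
    int Acquire() {
        std::lock_guard<std::mutex> guard(mtx);
        for (size_t i = 0; i < busy.size(); ++i) {
            if (!busy[i]) {
                busy[i] = true;
                return static_cast<int>(i);
            }
        }
        return -1;  // pool exhausted, caller must handle
    }
    void Release(int slot) {
        std::lock_guard<std::mutex> guard(mtx);
        if (slot >= 0 && slot < static_cast<int>(busy.size())) {
            busy[slot] = false;
        }
    }
};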
@ -1,97 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <numeric>
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "paddle_inference_api.h"
namespace ppspeech {
template <typename T>
class Tensor {
public:
Tensor() {}
explicit Tensor(const std::vector<int>& shape) : _shape(shape) {
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
LOG(INFO) << "Tensor neml: " << neml;
_data.resize(neml, 0);
}
void reshape(const std::vector<int>& shape) {
_shape = shape;
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
_data.resize(neml, 0);
}
const std::vector<int>& get_shape() const { return _shape; }
std::vector<T>& get_data() { return _data; }
private:
std::vector<int> _shape;
std::vector<T> _data;
};
class PaddleNnet : public NnetBase {
public:
explicit PaddleNnet(const ModelOptions& opts);
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
const int32& feature_dim,
NnetOut* out) override;
void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) override {
VLOG(2) << "deepspeech2 not has AttentionRescoring.";
}
void Dim();
void Reset() override;
bool IsLogProb() override { return false; }
std::shared_ptr<Tensor<kaldi::BaseFloat>> GetCacheEncoder(
const std::string& name);
void InitCacheEncouts(const ModelOptions& opts);
void EncoderOuts(std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out)
const override {}
private:
paddle_infer::Predictor* GetPredictor();
int ReleasePredictor(paddle_infer::Predictor* predictor);
std::unique_ptr<paddle_infer::services::PredictorPool> pool;
std::vector<bool> pool_usages;
std::mutex pool_mutex;
std::map<paddle_infer::Predictor*, int> predictor_to_thread_id;
std::map<std::string, int> cache_names_idx_;
std::vector<std::shared_ptr<Tensor<kaldi::BaseFloat>>> cache_encouts_;
ModelOptions opts_;
public:
DISALLOW_COPY_AND_ASSIGN(PaddleNnet);
};
} // namespace ppspeech

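A small sketch (not part of the diff) of the Tensor<T> helper above, which is just a flat buffer plus a shape, used here to hold RNN cache state:
// Toy usage; shape {1, 2, 3} allocates 6 zero-initialized floats.
#include "nnet/ds2_nnet.h"
void tensor_example() {
    ppspeech::Tensor<float> cache({1, 2, 3});
    cache.get_data()[0] = 1.0f;  // flat, row-major indexing
    cache.reshape({2, 3});       // resizes the buffer to 6 elements
}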
@ -1,142 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/ds2_nnet.h"
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier");
using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
kaldi::SequentialBaseFloatMatrixReader feature_reader(
FLAGS_feature_rspecifier);
kaldi::BaseFloatMatrixWriter nnet_writer(FLAGS_nnet_prob_wspecifier);
std::string model_graph = FLAGS_model_path;
std::string model_params = FLAGS_param_path;
LOG(INFO) << "model path: " << model_graph;
LOG(INFO) << "model param: " << model_params;
int32 num_done = 0, num_err = 0;
ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags();
std::shared_ptr<ppspeech::PaddleNnet> nnet(
new ppspeech::PaddleNnet(model_opts));
std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
int32 chunk_size = FLAGS_receptive_field_length +
(FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate;
int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk;
int32 receptive_field_length = FLAGS_receptive_field_length;
LOG(INFO) << "chunk size (frame): " << chunk_size;
LOG(INFO) << "chunk stride (frame): " << chunk_stride;
LOG(INFO) << "receptive field (frame): " << receptive_field_length;
kaldi::Timer timer;
for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key();
kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
raw_data->SetDim(feature.NumCols());
LOG(INFO) << "process utt: " << utt;
LOG(INFO) << "rows: " << feature.NumRows();
LOG(INFO) << "cols: " << feature.NumCols();
int32 row_idx = 0;
int32 padding_len = 0;
int32 ori_feature_len = feature.NumRows();
if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
padding_len =
chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
feature.Resize(feature.NumRows() + padding_len,
feature.NumCols(),
kaldi::kCopyData);
}
int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
int32 frame_idx = 0;
std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
feature.NumCols());
int32 feature_chunk_size = 0;
if (ori_feature_len > chunk_idx * chunk_stride) {
feature_chunk_size = std::min(
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (feature_chunk_size < receptive_field_length) break;
int32 start = chunk_idx * chunk_stride;
for (int row_id = 0; row_id < chunk_size; ++row_id) {
kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
feature_chunk.Data() + row_id * feature.NumCols(),
feature.NumCols());
f_chunk_tmp.CopyFromVec(tmp);
++start;
}
raw_data->Accept(feature_chunk);
if (chunk_idx == num_chunks - 1) {
raw_data->SetFinished();
}
vector<kaldi::BaseFloat> prob;
while (decodable->FrameLikelihood(frame_idx, &prob)) {
kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
std::memcpy(vec_tmp.Data(),
prob.data(),
sizeof(kaldi::BaseFloat) * prob.size());
prob_vec.push_back(vec_tmp);
frame_idx++;
}
}
decodable->Reset();
if (prob_vec.size() == 0) {
            // the TokenWriter cannot write an empty string.
++num_err;
KALDI_LOG << " the nnet prob of " << utt << " is empty";
continue;
}
kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
prob_vec[0].Dim());
for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
}
}
nnet_writer.Write(utt, result);
++num_done;
}
double elapsed = timer.Elapsed();
KALDI_LOG << " cost:" << elapsed << " s";
KALDI_LOG << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
}

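The chunking arithmetic in the loop above is easy to check by hand; a sketch with invented flag values (the real values come from gflags):
// receptive_field=7, subsampling=4, decoder_chunk=8 (hypothetical)
// => chunk_size = 7 + (8 - 1) * 4 = 35, chunk_stride = 4 * 8 = 32.
#include <cstdio>
int main() {
    const int receptive_field_length = 7;
    const int subsampling_rate = 4;
    const int nnet_decoder_chunk = 8;
    const int chunk_size =
        receptive_field_length + (nnet_decoder_chunk - 1) * subsampling_rate;
    const int chunk_stride = subsampling_rate * nnet_decoder_chunk;
    std::printf("chunk_size=%d chunk_stride=%d\n", chunk_size, chunk_stride);
    return 0;
}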
@ -65,7 +65,6 @@ bool NnetProducer::Compute() {
     size_t nframes = logprobs.Dim() / vocab_dim;
     VLOG(2) << "Forward out " << nframes << " decoder frames.";
     std::vector<BaseFloat> logprob(vocab_dim);
-    // remove later.
     for (size_t idx = 0; idx < nframes; ++idx) {
         for (size_t prob_idx = 0; prob_idx < vocab_dim; ++prob_idx) {
             logprob[prob_idx] = logprobs(idx * vocab_dim + prob_idx);

@ -1,46 +1,23 @@
 set(srcs)
-if (USING_DS2)
-  list(APPEND srcs
-    recognizer.cc
-  )
-endif()
-if (USING_U2)
-  list(APPEND srcs
-    u2_recognizer.cc
-  )
-endif()
+list(APPEND srcs
+  u2_recognizer.cc
+)
 add_library(recognizer STATIC ${srcs})
 target_link_libraries(recognizer PUBLIC decoder)
-# test
-if (USING_DS2)
-  set(BINS recognizer_main)
-  foreach(bin_name IN LISTS BINS)
-    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-    target_link_libraries(${bin_name} PUBLIC recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-  endforeach()
-endif()
-if (USING_U2)
-  set(TEST_BINS
-    u2_recognizer_main
-    u2_recognizer_thread_main
-  )
-  foreach(bin_name IN LISTS TEST_BINS)
-    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-    target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
-    target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
-    target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
-    target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
-  endforeach()
-endif()
+set(TEST_BINS
+  u2_recognizer_main
+  u2_recognizer_thread_main
+)
+foreach(bin_name IN LISTS TEST_BINS)
+  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+  target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
+  target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
+  target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+  target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
+endforeach()

@ -1,70 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "recognizer/recognizer.h"
namespace ppspeech {
using kaldi::BaseFloat;
using kaldi::SubVector;
using kaldi::Vector;
using kaldi::VectorBase;
using std::unique_ptr;
using std::vector;
Recognizer::Recognizer(const RecognizerResource& resource) {
// resource_ = resource;
const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts;
feature_pipeline_.reset(new FeaturePipeline(feature_opts));
std::shared_ptr<PaddleNnet> nnet(new PaddleNnet(resource.model_opts));
BaseFloat ac_scale = resource.acoustic_scale;
decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale));
decoder_.reset(new TLGDecoder(resource.tlg_opts));
input_finished_ = false;
}
void Recognizer::Accept(const Vector<BaseFloat>& waves) {
feature_pipeline_->Accept(waves);
}
void Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); }
std::string Recognizer::GetFinalResult() {
return decoder_->GetFinalBestPath();
}
std::string Recognizer::GetPartialResult() {
return decoder_->GetPartialResult();
}
void Recognizer::SetFinished() {
feature_pipeline_->SetFinished();
input_finished_ = true;
}
bool Recognizer::IsFinished() { return input_finished_; }
void Recognizer::Reset() {
feature_pipeline_->Reset();
decodable_->Reset();
decoder_->Reset();
}
} // namespace ppspeech

@ -1,70 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// TODO: refactor later (SGoat)
#pragma once
#include "decoder/ctc_beam_search_decoder.h"
#include "decoder/ctc_tlg_decoder.h"
#include "frontend/audio/feature_pipeline.h"
#include "nnet/decodable.h"
#include "nnet/ds2_nnet.h"
DECLARE_double(acoustic_scale);
namespace ppspeech {
struct RecognizerResource {
kaldi::BaseFloat acoustic_scale{1.0};
FeaturePipelineOptions feature_pipeline_opts{};
ModelOptions model_opts{};
TLGDecoderOptions tlg_opts{};
// CTCBeamSearchOptions beam_search_opts;
static RecognizerResource InitFromFlags() {
RecognizerResource resource;
resource.acoustic_scale = FLAGS_acoustic_scale;
resource.feature_pipeline_opts =
FeaturePipelineOptions::InitFromFlags();
resource.feature_pipeline_opts.assembler_opts.fill_zero = true;
LOG(INFO) << "ds2 need fill zero be true: "
<< resource.feature_pipeline_opts.assembler_opts.fill_zero;
resource.model_opts = ModelOptions::InitFromFlags();
resource.tlg_opts = TLGDecoderOptions::InitFromFlags();
return resource;
}
};
class Recognizer {
public:
explicit Recognizer(const RecognizerResource& resouce);
void Accept(const kaldi::Vector<kaldi::BaseFloat>& waves);
void Decode();
std::string GetFinalResult();
std::string GetPartialResult();
void SetFinished();
bool IsFinished();
void Reset();
private:
// std::shared_ptr<RecognizerResource> resource_;
// RecognizerResource resource_;
std::shared_ptr<FeaturePipeline> feature_pipeline_;
std::shared_ptr<Decodable> decodable_;
std::unique_ptr<TLGDecoder> decoder_;
bool input_finished_;
};
} // namespace ppspeech

@ -1,105 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/param.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/recognizer.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate");
int main(int argc, char* argv[]) {
gflags::SetUsageMessage("Usage:");
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
google::InstallFailureSignalHandler();
FLAGS_logtostderr = 1;
ppspeech::RecognizerResource resource =
ppspeech::RecognizerResource::InitFromFlags();
ppspeech::Recognizer recognizer(resource);
kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
FLAGS_wav_rspecifier);
kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
int sample_rate = FLAGS_sample_rate;
float streaming_chunk = FLAGS_streaming_chunk;
int chunk_sample_size = streaming_chunk * sample_rate;
LOG(INFO) << "sr: " << sample_rate;
LOG(INFO) << "chunk size (s): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
int32 num_done = 0, num_err = 0;
double tot_wav_duration = 0.0;
kaldi::Timer timer;
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
const kaldi::WaveData& wave_data = wav_reader.Value();
int32 this_channel = 0;
kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
this_channel);
int tot_samples = waveform.Dim();
tot_wav_duration += tot_samples * 1.0 / sample_rate;
LOG(INFO) << "wav len (sample): " << tot_samples;
int sample_offset = 0;
std::vector<kaldi::Vector<BaseFloat>> feats;
int feature_rows = 0;
while (sample_offset < tot_samples) {
int cur_chunk_size =
std::min(chunk_sample_size, tot_samples - sample_offset);
kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
for (int i = 0; i < cur_chunk_size; ++i) {
wav_chunk(i) = waveform(sample_offset + i);
}
// wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size);
recognizer.Accept(wav_chunk);
if (cur_chunk_size < chunk_sample_size) {
recognizer.SetFinished();
}
recognizer.Decode();
// no overlap
sample_offset += cur_chunk_size;
}
std::string result;
result = recognizer.GetFinalResult();
recognizer.Reset();
if (result.empty()) {
            // the TokenWriter cannot write an empty string.
++num_err;
KALDI_LOG << " the result of " << utt << " is empty";
continue;
}
KALDI_LOG << " the result of " << utt << " is " << result;
result_writer.Write(utt, result);
++num_done;
}
double elapsed = timer.Elapsed();
KALDI_LOG << "Done " << num_done << " out of " << (num_err + num_done);
KALDI_LOG << " cost:" << elapsed << " s";
KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
}

@ -20,15 +20,12 @@
 #include "kaldi/matrix/kaldi-matrix.h"
 #include "kaldi/util/kaldi-io.h"
 #include "utils/file_utils.h"
-// #include "boost/json.hpp"
-#include <boost/json/src.hpp>
+#include "utils/picojson.h"
 DEFINE_string(json_file, "", "cmvn json file");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
 DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
-using namespace boost::json;  // from <boost/json.hpp>
 int main(int argc, char* argv[]) {
     gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
@ -40,36 +37,49 @@ int main(int argc, char* argv[]) {
     auto ifs = std::ifstream(FLAGS_json_file);
     std::string json_str = ppspeech::ReadFile2String(FLAGS_json_file);
-    auto value = boost::json::parse(json_str);
-    if (!value.is_object()) {
+    picojson::value value;
+    std::string err;
+    const char* json_end = picojson::parse(
+        value, json_str.c_str(), json_str.c_str() + json_str.size(), &err);
+    if (!value.is<picojson::object>()) {
         LOG(ERROR) << "Input json file format error.";
     }
-    for (auto obj : value.as_object()) {
-        if (obj.key() == "mean_stat") {
-            VLOG(2) << "mean_stat:" << obj.value();
+    const picojson::value::object& obj = value.get<picojson::object>();
+    for (picojson::value::object::const_iterator elem = obj.begin();
+         elem != obj.end();
+         ++elem) {
+        if (elem->first == "mean_stat") {
+            VLOG(2) << "mean_stat:" << elem->second;
+            // const picojson::value tmp =
+            //     elem->second.get(0);  //<picojson::array>();
+            double tmp =
+                elem->second.get(0).get<double>();  //<picojson::array>();
+            VLOG(2) << "tmp: " << tmp;
         }
-        if (obj.key() == "var_stat") {
-            VLOG(2) << "var_stat: " << obj.value();
+        if (elem->first == "var_stat") {
+            VLOG(2) << "var_stat: " << elem->second;
         }
-        if (obj.key() == "frame_num") {
-            VLOG(2) << "frame_num: " << obj.value();
+        if (elem->first == "frame_num") {
+            VLOG(2) << "frame_num: " << elem->second;
         }
     }
-    boost::json::array mean_stat = value.at("mean_stat").as_array();
+    const picojson::value::array& mean_stat =
+        value.get("mean_stat").get<picojson::array>();
     std::vector<kaldi::BaseFloat> mean_stat_vec;
     for (auto it = mean_stat.begin(); it != mean_stat.end(); it++) {
-        mean_stat_vec.push_back(it->as_double());
+        mean_stat_vec.push_back((*it).get<double>());
     }
-    boost::json::array var_stat = value.at("var_stat").as_array();
+    const picojson::value::array& var_stat =
+        value.get("var_stat").get<picojson::array>();
     std::vector<kaldi::BaseFloat> var_stat_vec;
     for (auto it = var_stat.begin(); it != var_stat.end(); it++) {
-        var_stat_vec.push_back(it->as_double());
+        var_stat_vec.push_back((*it).get<double>());
     }
-    kaldi::int32 frame_num = uint64_t(value.at("frame_num").as_int64());
+    kaldi::int32 frame_num = value.get("frame_num").get<int64_t>();
     LOG(INFO) << "nframe: " << frame_num;
     size_t mean_size = mean_stat_vec.size();

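The picojson calls the new code relies on, shown standalone as a hedged sketch (picojson is header-only; get<int64_t>() requires PICOJSON_USE_INT64, which the bundled header is assumed to enable):
#define PICOJSON_USE_INT64
#include <string>
#include "utils/picojson.h"
void parse_example() {
    const std::string s = "{\"frame_num\": 3, \"mean_stat\": [0.5, 1.5]}";
    picojson::value v;
    std::string err;
    picojson::parse(v, s.c_str(), s.c_str() + s.size(), &err);
    if (!err.empty()) return;  // malformed input
    int64_t n = v.get("frame_num").get<int64_t>();  // 3
    double m0 = v.get("mean_stat").get<picojson::array>()[0].get<double>();  // 0.5
    (void)n;
    (void)m0;
}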
File diff suppressed because it is too large