opt to compile asr,cls,vad; add vad; format code ()

pull/2993/head
Hui Zhang 2 years ago committed by GitHub
parent 78e29c8ec4
commit b35fc01a3a

@@ -1,3 +1,6 @@
+engine/common/base/flags.h
+engine/common/base/log.h
 tools/valgrind*
 *log
 fc_patch/*

@@ -20,8 +20,7 @@ project(paddlespeech VERSION 0.1)
 set(CMAKE_VERBOSE_MAKEFILE on)
-# set std-14
-set(CMAKE_CXX_STANDARD 14)
 include(FetchContent)
 include(ExternalProject)
@@ -31,15 +30,28 @@ set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
+set(CMAKE_CXX_FLAGS)
+set(CMAKE_CXX_FLAGS_DEBUG)
+set(CMAKE_CXX_FLAGS_RELEASE)
+# set std-14
+set(CMAKE_CXX_STANDARD 14)
 # compiler option
 # Keep the same with openfst, -fPIC or -fpic
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ldl")
 SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ggdb")
 SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall")
-add_compile_options(-fPIC)
 ###############################################################################
 # Option Configurations
 ###############################################################################
+option(WITH_ASR "build asr" ON)
+option(WITH_CLS "build cls" ON)
+option(WITH_VAD "build vad" ON)
 option(TEST_DEBUG "option for debug" OFF)
 option(USE_PROFILING "enable c++ profling" OFF)
 option(WITH_TESTING "unit test" ON)
@@ -47,31 +59,40 @@ option(WITH_TESTING "unit test" ON)
 option(USING_GPU "u2 compute on GPU." OFF)
 ###############################################################################
-# Include third party
+# Include Third Party
 ###############################################################################
 include(gflags)
 include(glog)
-# openfst
-include(openfst)
-add_dependencies(openfst gflags glog)
-# paddle lib
-include(paddleinference)
 # gtest
 if(WITH_TESTING)
 include(gtest) # download, build, install gtest
 endif()
+# fastdeploy
+include(fastdeploy)
+if(WITH_ASR)
+# openfst
+include(openfst)
+add_dependencies(openfst gflags glog)
+endif()
+###############################################################################
+# Find Package
+###############################################################################
 # python/pybind11/threads
 find_package(Threads REQUIRED)
 # https://cmake.org/cmake/help/latest/module/FindPython3.html#module:FindPython3
 find_package(Python3 COMPONENTS Interpreter Development)
 find_package(pybind11 CONFIG)
-if(Python3_FOUND)
+if(WITH_ASR)
+if(Python3_FOUND)
 message(STATUS "Python3_FOUND = ${Python3_FOUND}")
 message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}")
 message(STATUS "Python3_LIBRARIES = ${Python3_LIBRARIES}")
@@ -79,70 +100,76 @@ if(Python3_FOUND)
 message(STATUS "Python3_LINK_OPTIONS = ${Python3_LINK_OPTIONS}")
 set(PYTHON_LIBRARIES ${Python3_LIBRARIES} CACHE STRING "python lib" FORCE)
 set(PYTHON_INCLUDE_DIR ${Python3_INCLUDE_DIRS} CACHE STRING "python inc" FORCE)
 endif()
 message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}")
 message(STATUS "PYTHON_INCLUDE_DIR = ${PYTHON_INCLUDE_DIR}")
 if(pybind11_FOUND)
 message(STATUS "pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}")
 message(STATUS "pybind11_LIBRARIES=${pybind11_LIBRARIES}")
 message(STATUS "pybind11_DEFINITIONS=${pybind11_DEFINITIONS}")
 endif()
 # paddle libpaddle.so
 # paddle include and link option
 # -L/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so
 execute_process(
 COMMAND python -c "\
 import os;\
 import paddle;\
 include_dir=paddle.sysconfig.get_include();\
 paddle_dir=os.path.split(include_dir)[0];\
 libs_dir=os.path.join(paddle_dir, 'libs');\
 fluid_dir=os.path.join(paddle_dir, 'fluid');\
 out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\
 out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\
 "
 OUTPUT_VARIABLE PADDLE_LINK_FLAGS
 RESULT_VARIABLE SUCESS)
 message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS})
 string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS)
 # paddle compile option
 # -I/workspace/DeepSpeech-2.x/engine/venv/lib/python3.7/site-packages/paddle/include
 execute_process(
 COMMAND python -c "\
 import paddle; \
 include_dir = paddle.sysconfig.get_include(); \
 print(f\"-I{include_dir}\"); \
 "
 OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
 message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS})
 string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS)
 # for LD_LIBRARY_PATH
 # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/)
 execute_process(
 COMMAND python -c "\
 import os; \
 import paddle; \
 include_dir=paddle.sysconfig.get_include(); \
 paddle_dir=os.path.split(include_dir)[0]; \
 libs_dir=os.path.join(paddle_dir, 'libs'); \
 fluid_dir=os.path.join(paddle_dir, 'fluid'); \
 out=':'.join([libs_dir, fluid_dir]); print(out); \
 "
 OUTPUT_VARIABLE PADDLE_LIB_DIRS)
 message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS})
+endif()
+add_compile_options(-fPIC)
 ###############################################################################
 # Add local library
 ###############################################################################
 set(ENGINE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/engine)
+message(STATUS "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
+message(STATUS "CMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}")
+message(STATUS "CMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}")
 add_subdirectory(engine)

@@ -4,5 +4,5 @@ set -xe
 # the build script had verified in the paddlepaddle docker image.
 # please follow the instruction below to install PaddlePaddle image.
 # https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
-cmake -B build
+cmake -B build -DWITH_ASR=OFF -DWITH_CLS=OFF
 cmake --build build -j

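The three WITH_* options introduced above default to ON; the updated build.sh simply switches ASR and CLS off so only the VAD path is compiled. A minimal configure-and-build sketch along the same lines (assuming the same Linux toolchain and working directory the script targets; -DWITH_VAD=ON is already the default and is spelled out only for clarity):

# VAD-only build: turn ASR and CLS off, keep VAD (default ON)
cmake -B build -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON
cmake --build build -j
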
@@ -8,11 +8,11 @@ windows_x86")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 set(FASTDEPLOY_DIR ${CMAKE_SOURCE_DIR}/fc_patch/fastdeploy)
-if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2.tgz)
+if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz)
 exec_program("mkdir -p ${FASTDEPLOY_DIR} &&
-wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.2.tgz -P ${FASTDEPLOY_DIR} &&
-tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2.tgz -C ${FASTDEPLOY_DIR} &&
-mv ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.2 ${FASTDEPLOY_DIR}/linux-x64")
+wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz -P ${FASTDEPLOY_DIR} &&
+tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz -C ${FASTDEPLOY_DIR} &&
+mv ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4 ${FASTDEPLOY_DIR}/linux-x64")
 endif()
 if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-android-1.0.0-shared.tgz)
@@ -36,4 +36,9 @@ elseif (ARCH STREQUAL "android_armv7")
 endif()
 include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+# fix compiler flags conflict, since fastdeploy using c++11 for project
+set(CMAKE_CXX_STANDARD 14)
 include_directories(${FASTDEPLOY_INCS})
+message(STATUS "FASTDEPLOY_INCS=${FASTDEPLOY_INCS}")

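Because the download step only runs when the 1.0.4 tarball is missing, the SDK can also be fetched by hand before configuring. A sketch that mirrors the exec_program sequence above (assuming the commands run from the directory holding the top-level CMakeLists.txt, so the paths match ${CMAKE_SOURCE_DIR}/fc_patch/fastdeploy):

mkdir -p fc_patch/fastdeploy
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz -P fc_patch/fastdeploy
tar xzvf fc_patch/fastdeploy/fastdeploy-linux-x64-1.0.4.tgz -C fc_patch/fastdeploy
mv fc_patch/fastdeploy/fastdeploy-linux-x64-1.0.4 fc_patch/fastdeploy/linux-x64
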
@@ -6,8 +6,19 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/kaldi)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/common)
-add_subdirectory(asr)
-add_subdirectory(common)
 add_subdirectory(kaldi)
+add_subdirectory(common)
+if(WITH_ASR)
+add_subdirectory(asr)
+endif()
+if(WITH_CLS)
+add_subdirectory(cls)
+endif()
+if(WITH_VAD)
+add_subdirectory(vad)
+endif()
 add_subdirectory(codelab)
-add_subdirectory(cls)

@@ -38,7 +38,8 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource)
 decoder_ = std::make_unique<CTCPrefixBeamSearch>(
 resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts);
 } else {
-decoder_ = std::make_unique<TLGDecoder>(resource.decoder_opts.tlg_decoder_opts);
+decoder_ = std::make_unique<TLGDecoder>(
+resource.decoder_opts.tlg_decoder_opts);
 }
 symbol_table_ = decoder_->WordSymbolTable();

@@ -3,7 +3,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/../
 )
 add_subdirectory(utils)
+add_subdirectory(base)
 add_subdirectory(matrix)
 include_directories(

@@ -0,0 +1,20 @@
+if(WITH_ASR)
+add_compile_options(-DWITH_ASR)
+set(PPS_FLAGS_LIB "fst/flags.h")
+set(PPS_GLOB_LIB "fst/log.h")
+else()
+set(PPS_FLAGS_LIB "gflags/gflags.h")
+set(PPS_GLOB_LIB "glog/logging.h")
+endif()
+configure_file(
+${CMAKE_CURRENT_SOURCE_DIR}/flags.h.in
+${CMAKE_CURRENT_SOURCE_DIR}/flags.h @ONLY
+)
+message(STATUS "Generated ${CMAKE_CURRENT_SOURCE_DIR}/flags.h")
+configure_file(
+${CMAKE_CURRENT_SOURCE_DIR}/log.h.in
+${CMAKE_CURRENT_SOURCE_DIR}/log.h @ONLY
+)
+message(STATUS "Generated ${CMAKE_CURRENT_SOURCE_DIR}/log.h")

@@ -14,4 +14,4 @@
 #pragma once
-#include "fst/flags.h"
+#include "@PPS_FLAGS_LIB@"

@@ -14,4 +14,4 @@
 #pragma once
-#include "fst/log.h"
+#include "@PPS_GLOB_LIB@"

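The configure_file calls above bake the selected backend into engine/common/base/flags.h and log.h at configure time: with WITH_ASR=ON the placeholders resolve to fst/flags.h and fst/log.h, otherwise to gflags/gflags.h and glog/logging.h. A quick sanity check after configuring (a sketch; the header paths follow the .gitignore entries added at the top of this diff, since the generated files live in the source tree):

grep '#include' engine/common/base/flags.h engine/common/base/log.h
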
@@ -33,7 +33,7 @@ CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
 dim_ = mean_stats_.size() - 1;
 }
-void CMVN::ReadCMVNFromJson(string cmvn_file) {
+void CMVN::ReadCMVNFromJson(std::string cmvn_file) {
 std::string json_str = ppspeech::ReadFile2String(cmvn_file);
 picojson::value value;
 std::string err;

@@ -21,6 +21,7 @@
 #ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
 #define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+#include <limits>
 #include <map>
 #include "frontend/feature-window.h"

@@ -7,6 +7,7 @@
 #include "frontend/feature-window.h"
 #include <cmath>
+#include <limits>
 #include <vector>
 #ifndef M_2PI

@@ -17,12 +17,12 @@
 */
 #include "frontend/rfft.h"
-#include "base/log.h"
 #include <cmath>
+#include <memory>
 #include <vector>
+#include "base/log.h"
 // see fftsg.c
 #ifdef __cplusplus
 extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);

@@ -25,40 +25,41 @@
 namespace kaldi {
 /// Empty constructor
-template<typename Real>
-Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
+template <typename Real>
+Matrix<Real>::Matrix() : MatrixBase<Real>(NULL, 0, 0, 0) {}
 /*
 template<>
 template<>
-void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
+void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float>
+&ra, const VectorBase<float> &rb);
 template<>
 template<>
-void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb);
+void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double>
+&ra, const VectorBase<double> &rb);
 */
-template<typename Real>
-inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {
+template <typename Real>
+inline std::ostream& operator<<(std::ostream& os, const MatrixBase<Real>& M) {
 M.Write(os, false);
 return os;
 }
-template<typename Real>
-inline std::istream & operator >> (std::istream & is, Matrix<Real> & M) {
+template <typename Real>
+inline std::istream& operator>>(std::istream& is, Matrix<Real>& M) {
 M.Read(is, false);
 return is;
 }
-template<typename Real>
-inline std::istream & operator >> (std::istream & is, MatrixBase<Real> & M) {
+template <typename Real>
+inline std::istream& operator>>(std::istream& is, MatrixBase<Real>& M) {
 M.Read(is, false);
 return is;
 }
-}// namespace kaldi
+} // namespace kaldi
 #endif // KALDI_MATRIX_KALDI_MATRIX_INL_H_

File diff suppressed because it is too large.

@ -38,7 +38,7 @@ namespace kaldi {
/// Base class which provides matrix operations not involving resizing /// Base class which provides matrix operations not involving resizing
/// or allocation. Classes Matrix and SubMatrix inherit from it and take care /// or allocation. Classes Matrix and SubMatrix inherit from it and take care
/// of allocation and resizing. /// of allocation and resizing.
template<typename Real> template <typename Real>
class MatrixBase { class MatrixBase {
public: public:
// so this child can access protected members of other instances. // so this child can access protected members of other instances.
@ -62,22 +62,20 @@ class MatrixBase {
} }
/// Gives pointer to raw data (const). /// Gives pointer to raw data (const).
inline const Real* Data() const { inline const Real *Data() const { return data_; }
return data_;
}
/// Gives pointer to raw data (non-const). /// Gives pointer to raw data (non-const).
inline Real* Data() { return data_; } inline Real *Data() { return data_; }
/// Returns pointer to data for one row (non-const) /// Returns pointer to data for one row (non-const)
inline Real* RowData(MatrixIndexT i) { inline Real *RowData(MatrixIndexT i) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) < KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(num_rows_)); static_cast<UnsignedMatrixIndexT>(num_rows_));
return data_ + i * stride_; return data_ + i * stride_;
} }
/// Returns pointer to data for one row (const) /// Returns pointer to data for one row (const)
inline const Real* RowData(MatrixIndexT i) const { inline const Real *RowData(MatrixIndexT i) const {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) < KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(num_rows_)); static_cast<UnsignedMatrixIndexT>(num_rows_));
return data_ + i * stride_; return data_ + i * stride_;
@ -85,8 +83,9 @@ class MatrixBase {
/// Indexing operator, non-const /// Indexing operator, non-const
/// (only checks sizes if compiled with -DKALDI_PARANOID) /// (only checks sizes if compiled with -DKALDI_PARANOID)
inline Real& operator() (MatrixIndexT r, MatrixIndexT c) { inline Real &operator()(MatrixIndexT r, MatrixIndexT c) {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) < KALDI_PARANOID_ASSERT(
static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) && static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) < static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_cols_)); static_cast<UnsignedMatrixIndexT>(num_cols_));
@ -94,12 +93,13 @@ class MatrixBase {
} }
/// Indexing operator, provided for ease of debugging (gdb doesn't work /// Indexing operator, provided for ease of debugging (gdb doesn't work
/// with parenthesis operator). /// with parenthesis operator).
Real &Index (MatrixIndexT r, MatrixIndexT c) { return (*this)(r, c); } Real &Index(MatrixIndexT r, MatrixIndexT c) { return (*this)(r, c); }
/// Indexing operator, const /// Indexing operator, const
/// (only checks sizes if compiled with -DKALDI_PARANOID) /// (only checks sizes if compiled with -DKALDI_PARANOID)
inline const Real operator() (MatrixIndexT r, MatrixIndexT c) const { inline const Real operator()(MatrixIndexT r, MatrixIndexT c) const {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) < KALDI_PARANOID_ASSERT(
static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) && static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) < static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_cols_)); static_cast<UnsignedMatrixIndexT>(num_cols_));
@ -115,22 +115,22 @@ class MatrixBase {
/// Sets to zero, except ones along diagonal [for non-square matrices too] /// Sets to zero, except ones along diagonal [for non-square matrices too]
/// Copy given matrix. (no resize is done). /// Copy given matrix. (no resize is done).
template<typename OtherReal> template <typename OtherReal>
void CopyFromMat(const MatrixBase<OtherReal> & M, void CopyFromMat(const MatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans); MatrixTransposeType trans = kNoTrans);
/// Copy from compressed matrix. /// Copy from compressed matrix.
//void CopyFromMat(const CompressedMatrix &M); // void CopyFromMat(const CompressedMatrix &M);
/// Copy given tpmatrix. (no resize is done). /// Copy given tpmatrix. (no resize is done).
//template<typename OtherReal> // template<typename OtherReal>
//void CopyFromTp(const TpMatrix<OtherReal> &M, // void CopyFromTp(const TpMatrix<OtherReal> &M,
//MatrixTransposeType trans = kNoTrans); // MatrixTransposeType trans = kNoTrans);
/// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h /// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h
//template<typename OtherReal> // template<typename OtherReal>
//void CopyFromMat(const CuMatrixBase<OtherReal> &M, // void CopyFromMat(const CuMatrixBase<OtherReal> &M,
//MatrixTransposeType trans = kNoTrans); // MatrixTransposeType trans = kNoTrans);
/// This function has two modes of operation. If v.Dim() == NumRows() * /// This function has two modes of operation. If v.Dim() == NumRows() *
/// NumCols(), then treats the vector as a row-by-row concatenation of a /// NumCols(), then treats the vector as a row-by-row concatenation of a
@ -138,10 +138,11 @@ class MatrixBase {
/// if v.Dim() == NumCols(), it sets each row of *this to a copy of v. /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v.
void CopyRowsFromVec(const VectorBase<Real> &v); void CopyRowsFromVec(const VectorBase<Real> &v);
/// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc /// This version of CopyRowsFromVec is implemented in
//void CopyRowsFromVec(const CuVectorBase<Real> &v); /// ../cudamatrix/cu-vector.cc
// void CopyRowsFromVec(const CuVectorBase<Real> &v);
template<typename OtherReal> template <typename OtherReal>
void CopyRowsFromVec(const VectorBase<OtherReal> &v); void CopyRowsFromVec(const VectorBase<OtherReal> &v);
/// Copies vector into matrix, column-by-column. /// Copies vector into matrix, column-by-column.
@ -177,8 +178,8 @@ class MatrixBase {
const MatrixIndexT num_rows, const MatrixIndexT num_rows,
const MatrixIndexT col_offset, const MatrixIndexT col_offset,
const MatrixIndexT num_cols) const { const MatrixIndexT num_cols) const {
return SubMatrix<Real>(*this, row_offset, num_rows, return SubMatrix<Real>(
col_offset, num_cols); *this, row_offset, num_rows, col_offset, num_cols);
} }
inline SubMatrix<Real> RowRange(const MatrixIndexT row_offset, inline SubMatrix<Real> RowRange(const MatrixIndexT row_offset,
const MatrixIndexT num_rows) const { const MatrixIndexT num_rows) const {
@ -189,7 +190,7 @@ class MatrixBase {
return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols); return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
} }
/* /*
/// Returns sum of all elements in matrix. /// Returns sum of all elements in matrix.
Real Sum() const; Real Sum() const;
/// Returns trace of matrix. /// Returns trace of matrix.
@ -223,7 +224,8 @@ class MatrixBase {
/// each row by a scalar taken from that dimension of the vector. /// each row by a scalar taken from that dimension of the vector.
void MulRowsVec(const VectorBase<Real> &scale); void MulRowsVec(const VectorBase<Real> &scale);
/// Divide each row into src.NumCols() equal groups, and then scale i'th row's /// Divide each row into src.NumCols() equal groups, and then scale i'th
row's
/// j'th group of elements by src(i, j). Requires src.NumRows() == /// j'th group of elements by src(i, j). Requires src.NumRows() ==
/// this->NumRows() and this->NumCols() % src.NumCols() == 0. /// this->NumRows() and this->NumCols() % src.NumCols() == 0.
void MulRowsGroupMat(const MatrixBase<Real> &src); void MulRowsGroupMat(const MatrixBase<Real> &src);
@ -242,77 +244,79 @@ class MatrixBase {
/// Does inversion in double precision even if matrix was not double. /// Does inversion in double precision even if matrix was not double.
void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL, void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
bool inverse_needed = true); bool inverse_needed = true);
*/ */
/// Inverts all the elements of the matrix /// Inverts all the elements of the matrix
void InvertElements(); void InvertElements();
/* /*
/// Transpose the matrix. This one is only /// Transpose the matrix. This one is only
/// applicable to square matrices (the one in the /// applicable to square matrices (the one in the
/// Matrix child class works also for non-square. /// Matrix child class works also for non-square.
void Transpose(); void Transpose();
*/ */
/// Copies column r from column indices[r] of src. /// Copies column r from column indices[r] of src.
/// As a special case, if indexes[i] == -1, sets column i to zero. /// As a special case, if indexes[i] == -1, sets column i to zero.
/// all elements of "indices" must be in [-1, src.NumCols()-1], /// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows() /// and src.NumRows() must equal this.NumRows()
void CopyCols(const MatrixBase<Real> &src, void CopyCols(const MatrixBase<Real> &src, const MatrixIndexT *indices);
const MatrixIndexT *indices);
/// Copies row r from row indices[r] of src (does nothing /// Copies row r from row indices[r] of src (does nothing
/// As a special case, if indexes[i] == -1, sets row i to zero. /// As a special case, if indexes[i] == -1, sets row i to zero.
/// all elements of "indices" must be in [-1, src.NumRows()-1], /// all elements of "indices" must be in [-1, src.NumRows()-1],
/// and src.NumCols() must equal this.NumCols() /// and src.NumCols() must equal this.NumCols()
void CopyRows(const MatrixBase<Real> &src, void CopyRows(const MatrixBase<Real> &src, const MatrixIndexT *indices);
const MatrixIndexT *indices);
/// Add column indices[r] of src to column r. /// Add column indices[r] of src to column r.
/// As a special case, if indexes[i] == -1, skip column i /// As a special case, if indexes[i] == -1, skip column i
/// indices.size() must equal this->NumCols(), /// indices.size() must equal this->NumCols(),
/// all elements of "reorder" must be in [-1, src.NumCols()-1], /// all elements of "reorder" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows() /// and src.NumRows() must equal this.NumRows()
//void AddCols(const MatrixBase<Real> &src, // void AddCols(const MatrixBase<Real> &src,
// const MatrixIndexT *indices); // const MatrixIndexT *indices);
/// Copies row r of this matrix from an array of floats at the location given /// Copies row r of this matrix from an array of floats at the location
/// given
/// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero. /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
/// Note: we are using "pointer to const pointer to const object" for "src", /// Note: we are using "pointer to const pointer to const object" for "src",
/// because we may create "src" by calling Data() of const CuArray /// because we may create "src" by calling Data() of const CuArray
void CopyRows(const Real *const *src); void CopyRows(const Real *const *src);
/// Copies row r of this matrix to the array of floats at the location given /// Copies row r of this matrix to the array of floats at the location given
/// by dst[r]. If dst[r] is NULL, does not copy anywhere. Requires that none /// by dst[r]. If dst[r] is NULL, does not copy anywhere. Requires that
/// none
/// of the memory regions pointed to by the pointers in "dst" overlap (e.g. /// of the memory regions pointed to by the pointers in "dst" overlap (e.g.
/// none of the pointers should be the same). /// none of the pointers should be the same).
void CopyToRows(Real *const *dst) const; void CopyToRows(Real *const *dst) const;
/// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).
/// If indexes[r] < 0, does not add anything. all elements of "indexes" must /// If indexes[r] < 0, does not add anything. all elements of "indexes" must
/// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols(). /// be in [-1, src.NumRows()-1], and src.NumCols() must equal
/// this.NumCols().
// void AddRows(Real alpha, // void AddRows(Real alpha,
// const MatrixBase<Real> &src, // const MatrixBase<Real> &src,
// const MatrixIndexT *indexes); // const MatrixIndexT *indexes);
/// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as
/// the
/// beginning of a region of memory representing a vector of floats, of the /// beginning of a region of memory representing a vector of floats, of the
/// same length as this.NumCols(). If src[r] is NULL, does not add anything. /// same length as this.NumCols(). If src[r] is NULL, does not add anything.
//void AddRows(Real alpha, const Real *const *src); // void AddRows(Real alpha, const Real *const *src);
/// For each row r of this matrix, adds it (times alpha) to the array of /// For each row r of this matrix, adds it (times alpha) to the array of
/// floats at the location given by dst[r]. If dst[r] is NULL, does not do /// floats at the location given by dst[r]. If dst[r] is NULL, does not do
/// anything for that row. Requires that none of the memory regions pointed /// anything for that row. Requires that none of the memory regions pointed
/// to by the pointers in "dst" overlap (e.g. none of the pointers should be /// to by the pointers in "dst" overlap (e.g. none of the pointers should be
/// the same). /// the same).
//void AddToRows(Real alpha, Real *const *dst) const; // void AddToRows(Real alpha, Real *const *dst) const;
/// For each row i of *this, adds this->Row(i) to /// For each row i of *this, adds this->Row(i) to
/// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing. /// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing.
/// Requires that all the indexes[i] that are >= 0 /// Requires that all the indexes[i] that are >= 0
/// be distinct, otherwise the behavior is undefined. /// be distinct, otherwise the behavior is undefined.
//void AddToRows(Real alpha, // void AddToRows(Real alpha,
// const MatrixIndexT *indexes, // const MatrixIndexT *indexes,
// MatrixBase<Real> *dst) const; // MatrixBase<Real> *dst) const;
/* /*
inline void ApplyPow(Real power) { inline void ApplyPow(Real power) {
this -> Pow(*this, power); this -> Pow(*this, power);
} }
@ -349,66 +353,82 @@ class MatrixBase {
inline void ApplyLog() { inline void ApplyLog() {
this -> Log(*this); this -> Log(*this);
} }
*/ */
/// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) =
/// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is /// P D
/// slightly complicated, due to the need for P to be real. In the symmetric /// P^{-1}. Be careful: the relationship of D to the eigenvalues we output
/// is
/// slightly complicated, due to the need for P to be real. In the
/// symmetric
/// case D is diagonal and real, but in /// case D is diagonal and real, but in
/// the non-symmetric case there may be complex-conjugate pairs of eigenvalues. /// the non-symmetric case there may be complex-conjugate pairs of
/// In this case, for the equation (*this) = P D P^{-1} to hold, D must actually /// eigenvalues.
/// be block diagonal, with 2x2 blocks corresponding to any such pairs. If a /// In this case, for the equation (*this) = P D P^{-1} to hold, D must
/// actually
/// be block diagonal, with 2x2 blocks corresponding to any such pairs. If
/// a
/// pair is lambda +- i*mu, D will have a corresponding 2x2 block /// pair is lambda +- i*mu, D will have a corresponding 2x2 block
/// [lambda, mu; -mu, lambda]. /// [lambda, mu; -mu, lambda].
/// Note that if the input matrix (*this) is non-invertible, P may not be invertible /// Note that if the input matrix (*this) is non-invertible, P may not be
/// so in this case instead of the equation (*this) = P D P^{-1} holding, we have /// invertible
/// so in this case instead of the equation (*this) = P D P^{-1} holding, we
/// have
/// instead (*this) P = P D. /// instead (*this) P = P D.
/// ///
/// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag. /// The non-member function CreateEigenvalueMatrix creates D from eigs_real
//void Eig(MatrixBase<Real> *P, /// and eigs_imag.
// void Eig(MatrixBase<Real> *P,
// VectorBase<Real> *eigs_real, // VectorBase<Real> *eigs_real,
// VectorBase<Real> *eigs_imag) const; // VectorBase<Real> *eigs_imag) const;
/// The Power method attempts to take the matrix to a power using a method that /// The Power method attempts to take the matrix to a power using a method
/// works in general for fractional and negative powers. The input matrix must /// that
/// works in general for fractional and negative powers. The input matrix
/// must
/// be invertible and have reasonable condition (or we don't guarantee the /// be invertible and have reasonable condition (or we don't guarantee the
/// results. The method is based on the eigenvalue decomposition. It will /// results. The method is based on the eigenvalue decomposition. It will
/// return false and leave the matrix unchanged, if at entry the matrix had /// return false and leave the matrix unchanged, if at entry the matrix had
/// real negative eigenvalues (or if it had zero eigenvalues and the power was /// real negative eigenvalues (or if it had zero eigenvalues and the power
/// was
/// negative). /// negative).
// bool Power(Real pow); // bool Power(Real pow);
/** Singular value decomposition /** Singular value decomposition
Major limitations: Major limitations:
For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we return For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we
return
the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the
one on the left is rectangular. one on the left is rectangular.
In Svd, *this = U*diag(S)*Vt. In Svd, *this = U*diag(S)*Vt.
Null pointers for U and/or Vt at input mean we do not want that output. We Null pointers for U and/or Vt at input mean we do not want that output.
We
expect that S.Dim() == m, U is either NULL or m by n, expect that S.Dim() == m, U is either NULL or m by n,
and v is either NULL or n by n. and v is either NULL or n by n.
The singular values are not sorted (use SortSvd for that). */ The singular values are not sorted (use SortSvd for that). */
//void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, // void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
// MatrixBase<Real> *Vt); // Destroys calling matrix. // MatrixBase<Real> *Vt); // Destroys calling matrix.
/// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is already /// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is
/// already
/// transposed; the normal formulation is U diag(s) V^T. /// transposed; the normal formulation is U diag(s) V^T.
/// Null pointers for U or V mean we don't want that output (this saves /// Null pointers for U or V mean we don't want that output (this saves
/// compute). The singular values are not sorted (use SortSvd for that). /// compute). The singular values are not sorted (use SortSvd for that).
//void Svd(VectorBase<Real> *s, MatrixBase<Real> *U, // void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
// MatrixBase<Real> *Vt) const; // MatrixBase<Real> *Vt) const;
/// Compute SVD but only retain the singular values. /// Compute SVD but only retain the singular values.
//void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); } // void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
/// Returns smallest singular value. /// Returns smallest singular value.
//Real MinSingularValue() const { // Real MinSingularValue() const {
// Vector<Real> tmp(std::min(NumRows(), NumCols())); // Vector<Real> tmp(std::min(NumRows(), NumCols()));
//Svd(&tmp); // Svd(&tmp);
//return tmp.Min(); // return tmp.Min();
//} //}
//void TestUninitialized() const; // This function is designed so that if any element // void TestUninitialized() const; // This function is designed so that if
// any element
// if the matrix is uninitialized memory, valgrind will complain. // if the matrix is uninitialized memory, valgrind will complain.
/// Returns condition number by computing Svd. Works even if cols > rows. /// Returns condition number by computing Svd. Works even if cols > rows.
@ -422,16 +442,19 @@ class MatrixBase {
/// Returns true if matrix is Diagonal. /// Returns true if matrix is Diagonal.
bool IsDiagonal(Real cutoff = 1.0e-05) const; // replace magic number bool IsDiagonal(Real cutoff = 1.0e-05) const; // replace magic number
/// Returns true if the matrix is all zeros, except for ones on diagonal. (it /// Returns true if the matrix is all zeros, except for ones on diagonal.
(it
/// does not have to be square). More specifically, this function returns /// does not have to be square). More specifically, this function returns
/// false if for any i, j, (*this)(i, j) differs by more than cutoff from the /// false if for any i, j, (*this)(i, j) differs by more than cutoff from
the
/// expression (i == j ? 1 : 0). /// expression (i == j ? 1 : 0).
bool IsUnit(Real cutoff = 1.0e-05) const; // replace magic number bool IsUnit(Real cutoff = 1.0e-05) const; // replace magic number
/// Returns true if matrix is all zeros. /// Returns true if matrix is all zeros.
bool IsZero(Real cutoff = 1.0e-05) const; // replace magic number bool IsZero(Real cutoff = 1.0e-05) const; // replace magic number
/// Frobenius norm, which is the sqrt of sum of square elements. Same as Schatten 2-norm, /// Frobenius norm, which is the sqrt of sum of square elements. Same as
Schatten 2-norm,
/// or just "2-norm". /// or just "2-norm".
Real FrobeniusNorm() const; Real FrobeniusNorm() const;
@ -461,7 +484,8 @@ class MatrixBase {
/// Sets each element to the Heaviside step function (x > 0 ? 1 : 0) of the /// Sets each element to the Heaviside step function (x > 0 ? 1 : 0) of the
/// corresponding element in "src". Note: in general you can make different /// corresponding element in "src". Note: in general you can make different
/// choices for x = 0, but for now please leave it as it (i.e. returning zero) /// choices for x = 0, but for now please leave it as it (i.e. returning
zero)
/// because it affects the RectifiedLinearComponent in the neural net code. /// because it affects the RectifiedLinearComponent in the neural net code.
void Heaviside(const MatrixBase<Real> &src); void Heaviside(const MatrixBase<Real> &src);
@ -477,7 +501,8 @@ class MatrixBase {
/// If the power is negative and the input to the power is zero, /// If the power is negative and the input to the power is zero,
/// The output will be set zero. If include_sign is true, it will /// The output will be set zero. If include_sign is true, it will
/// multiply the result by the sign of the input. /// multiply the result by the sign of the input.
void PowAbs(const MatrixBase<Real> &src, Real power, bool include_sign=false); void PowAbs(const MatrixBase<Real> &src, Real power, bool
include_sign=false);
void Floor(const MatrixBase<Real> &src, Real floor_val); void Floor(const MatrixBase<Real> &src, Real floor_val);
@ -492,36 +517,52 @@ class MatrixBase {
/// Floor(src, lower_limit); /// Floor(src, lower_limit);
/// Ceiling(src, upper_limit); /// Ceiling(src, upper_limit);
/// Exp(src) /// Exp(src)
void ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real upper_limit); void ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real
upper_limit);
/// Set each element to y = log(1 + exp(x)) /// Set each element to y = log(1 + exp(x))
void SoftHinge(const MatrixBase<Real> &src); void SoftHinge(const MatrixBase<Real> &src);
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p). /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 /
/// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0. p).
/// Requires src.NumRows() == this->NumRows() and src.NumCols() %
this->NumCols() == 0.
void GroupPnorm(const MatrixBase<Real> &src, Real power); void GroupPnorm(const MatrixBase<Real> &src, Real power);
/// Calculate derivatives for the GroupPnorm function above... /// Calculate derivatives for the GroupPnorm function above...
/// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable), /// if "input" is the input to the GroupPnorm function above (i.e. the "src"
/// and "output" is the result of the computation (i.e. the "this" of that function variable),
/// call), and *this has the same dimension as "input", then it sets each element /// and "output" is the result of the computation (i.e. the "this" of that
/// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where function
/// "output-elem" is whichever element of output depends on that input element. /// call), and *this has the same dimension as "input", then it sets each
void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output, element
/// of *this to the derivative d(output-elem)/d(input-elem) for each element
of "input", where
/// "output-elem" is whichever element of output depends on that input
element.
void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real>
&output,
Real power); Real power);
/// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j /// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j
/// Requires src.NumRows() == this->NumRows() and src.NumCols() % this->NumCols() == 0. /// Requires src.NumRows() == this->NumRows() and src.NumCols() %
this->NumCols() == 0.
void GroupMax(const MatrixBase<Real> &src); void GroupMax(const MatrixBase<Real> &src);
/// Calculate derivatives for the GroupMax function above, where /// Calculate derivatives for the GroupMax function above, where
/// "input" is the input to the GroupMax function above (i.e. the "src" variable), /// "input" is the input to the GroupMax function above (i.e. the "src"
/// and "output" is the result of the computation (i.e. the "this" of that function variable),
/// and "output" is the result of the computation (i.e. the "this" of that
function
/// call), and *this must have the same dimension as "input". Each element /// call), and *this must have the same dimension as "input". Each element
/// of *this will be set to 1 if the corresponding input equals the output of /// of *this will be set to 1 if the corresponding input equals the output
/// the group, and 0 otherwise. The equals the function derivative where it is of
/// defined (it's not defined where multiple inputs in the group are equal to the output). /// the group, and 0 otherwise. The equals the function derivative where it
void GroupMaxDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output); is
/// defined (it's not defined where multiple inputs in the group are equal
to the output).
void GroupMaxDeriv(const MatrixBase<Real> &input, const MatrixBase<Real>
&output);
/// Set each element to the tanh of the corresponding element of "src". /// Set each element to the tanh of the corresponding element of "src".
void Tanh(const MatrixBase<Real> &src); void Tanh(const MatrixBase<Real> &src);
@ -535,55 +576,56 @@ class MatrixBase {
// element-by-element, set *this = diff * (1.0 - value^2). // element-by-element, set *this = diff * (1.0 - value^2).
void DiffTanh(const MatrixBase<Real> &value, void DiffTanh(const MatrixBase<Real> &value,
const MatrixBase<Real> &diff); const MatrixBase<Real> &diff);
*/ */
/** Uses Svd to compute the eigenvalue decomposition of a symmetric positive /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
* semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
* orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not * orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not
* positive semi-definite (check_thresh controls how stringent the check is; * positive semi-definite (check_thresh controls how stringent the check is;
* set it to 2 to ensure it won't ever complain, but it will zero out negative * set it to 2 to ensure it won't ever complain, but it will zero out
* negative
* dimensions in your matrix. * dimensions in your matrix.
* *
* Caution: if you want the eigenvalues, it may make more sense to convert to * Caution: if you want the eigenvalues, it may make more sense to convert
* SpMatrix and use Eig() function there, which uses eigenvalue decomposition * to
* SpMatrix and use Eig() function there, which uses eigenvalue
* decomposition
* directly rather than SVD. * directly rather than SVD.
*/ */
/// stream read. /// stream read.
/// Use instead of stream<<*this, if you want to add to existing contents. /// Use instead of stream<<*this, if you want to add to existing contents.
// Will throw exception on failure. // Will throw exception on failure.
void Read(std::istream & in, bool binary); void Read(std::istream &in, bool binary);
/// write to stream. /// write to stream.
void Write(std::ostream & out, bool binary) const; void Write(std::ostream &out, bool binary) const;
// Below is internal methods for Svd, user does not have to know about this. // Below is internal methods for Svd, user does not have to know about this.
protected: protected:
/// Initializer, callable only from child. /// Initializer, callable only from child.
explicit MatrixBase(Real *data, MatrixIndexT cols, MatrixIndexT rows, MatrixIndexT stride) : explicit MatrixBase(Real *data,
data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) { MatrixIndexT cols,
MatrixIndexT rows,
MatrixIndexT stride)
: data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) {
KALDI_ASSERT_IS_FLOATING_TYPE(Real); KALDI_ASSERT_IS_FLOATING_TYPE(Real);
} }
/// Initializer, callable only from child. /// Initializer, callable only from child.
/// Empty initializer, for un-initialized matrix. /// Empty initializer, for un-initialized matrix.
explicit MatrixBase(): data_(NULL) { explicit MatrixBase() : data_(NULL) { KALDI_ASSERT_IS_FLOATING_TYPE(Real); }
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
}
// Make sure pointers to MatrixBase cannot be deleted. // Make sure pointers to MatrixBase cannot be deleted.
~MatrixBase() { } ~MatrixBase() {}
/// A workaround that allows SubMatrix to get a pointer to non-const data /// A workaround that allows SubMatrix to get a pointer to non-const data
/// for const Matrix. Unfortunately C++ does not allow us to declare a /// for const Matrix. Unfortunately C++ does not allow us to declare a
/// "public const" inheritance or anything like that, so it would require /// "public const" inheritance or anything like that, so it would require
/// a lot of work to make the SubMatrix class totally const-correct-- /// a lot of work to make the SubMatrix class totally const-correct--
/// we would have to override many of the Matrix functions. /// we would have to override many of the Matrix functions.
inline Real* Data_workaround() const { inline Real *Data_workaround() const { return data_; }
return data_;
}
/// data memory area /// data memory area
Real* data_; Real *data_;
/// these attributes store the real matrix size as it is stored in memory /// these attributes store the real matrix size as it is stored in memory
/// including memalignment /// including memalignment
@ -592,63 +634,66 @@ class MatrixBase {
/** True number of columns for the internal matrix. This number may differ /** True number of columns for the internal matrix. This number may differ
* from num_cols_ as memory alignment might be used. */ * from num_cols_ as memory alignment might be used. */
MatrixIndexT stride_; MatrixIndexT stride_;
private: private:
KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase); KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase);
}; };
/// A class for storing matrices. /// A class for storing matrices.
template<typename Real> template <typename Real>
class Matrix : public MatrixBase<Real> { class Matrix : public MatrixBase<Real> {
public: public:
/// Empty constructor. /// Empty constructor.
Matrix(); Matrix();
/// Basic constructor. /// Basic constructor.
Matrix(const MatrixIndexT r, const MatrixIndexT c, Matrix(const MatrixIndexT r,
const MatrixIndexT c,
MatrixResizeType resize_type = kSetZero, MatrixResizeType resize_type = kSetZero,
MatrixStrideType stride_type = kDefaultStride): MatrixStrideType stride_type = kDefaultStride)
MatrixBase<Real>() { Resize(r, c, resize_type, stride_type); } : MatrixBase<Real>() {
Resize(r, c, resize_type, stride_type);
}
/// Swaps the contents of *this and *other. Shallow swap. /// Swaps the contents of *this and *other. Shallow swap.
void Swap(Matrix<Real> *other); void Swap(Matrix<Real> *other);
/// Constructor from any MatrixBase. Can also copy with transpose. /// Constructor from any MatrixBase. Can also copy with transpose.
/// Allocates new memory. /// Allocates new memory.
explicit Matrix(const MatrixBase<Real> & M, explicit Matrix(const MatrixBase<Real> &M,
MatrixTransposeType trans = kNoTrans); MatrixTransposeType trans = kNoTrans);
/// Same as above, but need to avoid default copy constructor. /// Same as above, but need to avoid default copy constructor.
Matrix(const Matrix<Real> & M); // (cannot make explicit) Matrix(const Matrix<Real> &M); // (cannot make explicit)
/// Copy constructor: as above, but from another type. /// Copy constructor: as above, but from another type.
template<typename OtherReal> template <typename OtherReal>
explicit Matrix(const MatrixBase<OtherReal> & M, explicit Matrix(const MatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans); MatrixTransposeType trans = kNoTrans);
/// Copy constructor taking TpMatrix... /// Copy constructor taking TpMatrix...
//template <typename OtherReal> // template <typename OtherReal>
//explicit Matrix(const TpMatrix<OtherReal> & M, // explicit Matrix(const TpMatrix<OtherReal> & M,
//MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() { // MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
//if (trans == kNoTrans) { // if (trans == kNoTrans) {
//Resize(M.NumRows(), M.NumCols(), kUndefined); // Resize(M.NumRows(), M.NumCols(), kUndefined);
//this->CopyFromTp(M); // this->CopyFromTp(M);
//} else { //} else {
//Resize(M.NumCols(), M.NumRows(), kUndefined); // Resize(M.NumCols(), M.NumRows(), kUndefined);
//this->CopyFromTp(M, kTrans); // this->CopyFromTp(M, kTrans);
//} //}
//} //}
/// read from stream. /// read from stream.
// Unlike one in base, allows resizing. // Unlike one in base, allows resizing.
void Read(std::istream & in, bool binary); void Read(std::istream &in, bool binary);
/// Remove a specified row. /// Remove a specified row.
void RemoveRow(MatrixIndexT i); void RemoveRow(MatrixIndexT i);
/// Transpose the matrix. Works for non-square /// Transpose the matrix. Works for non-square
/// matrices as well as square ones. /// matrices as well as square ones.
//void Transpose(); // void Transpose();
/// Distructor to free matrices. /// Distructor to free matrices.
~Matrix() { Destroy(); } ~Matrix() { Destroy(); }
@ -671,7 +716,7 @@ class Matrix : public MatrixBase<Real> {
MatrixStrideType stride_type = kDefaultStride); MatrixStrideType stride_type = kDefaultStride);
/// Assignment operator that takes MatrixBase. /// Assignment operator that takes MatrixBase.
Matrix<Real> &operator=(const MatrixBase<Real> &other) {
if (MatrixBase<Real>::NumRows() != other.NumRows() ||
MatrixBase<Real>::NumCols() != other.NumCols())
Resize(other.NumRows(), other.NumCols(), kUndefined);
@ -680,7 +725,7 @@ class Matrix : public MatrixBase<Real> {
}
/// Assignment operator. Needed for inclusion in std::vector.
Matrix<Real> &operator=(const Matrix<Real> &other) {
if (MatrixBase<Real>::NumRows() != other.NumRows() ||
MatrixBase<Real>::NumCols() != other.NumCols())
Resize(other.NumRows(), other.NumCols(), kUndefined);
@ -694,13 +739,14 @@ class Matrix : public MatrixBase<Real> {
void Destroy();
/// Init assumes the current class contents are invalid (i.e. junk or have
/// already been freed), and it sets the matrix to newly allocated memory with
/// the specified number of rows and columns. r == c == 0 is acceptable. The data
/// memory contents will be undefined.
void Init(const MatrixIndexT r,
const MatrixIndexT c,
const MatrixStrideType stride_type);
};
/// @} end "addtogroup matrix_group"
@ -710,7 +756,7 @@ class Matrix : public MatrixBase<Real> {
/// A structure containing the HTK header.
/// [TODO: change the style of the variables to Kaldi-compliant]
template <typename Real>
class SubMatrix : public MatrixBase<Real> {
public:
// Initialize a SubMatrix from part of a matrix; this is
@ -718,7 +764,7 @@ class SubMatrix : public MatrixBase<Real> {
// This initializer is against the proper semantics of "const", since
// SubMatrix can change its contents. It would be hard to implement
// a "const-safe" version of this class.
SubMatrix(const MatrixBase<Real> &T,
const MatrixIndexT ro, // row offset, 0 < ro < NumRows()
const MatrixIndexT r, // number of rows, r > 0
const MatrixIndexT co, // column offset, 0 < co < NumCols()
@ -735,13 +781,13 @@ class SubMatrix : public MatrixBase<Real> {
/// This type of constructor is needed for Range() to work [in Matrix base
/// class]. Cannot make it explicit.
SubMatrix<Real>(const SubMatrix &other)
: MatrixBase<Real>(
other.data_, other.num_cols_, other.num_rows_, other.stride_) {}
private:
/// Disallow assignment.
SubMatrix<Real> &operator=(const SubMatrix<Real> &other);
};
/// @} End of "addtogroup matrix_funcs_io".
@ -794,25 +840,33 @@ Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
/// the same as U->NumCols(), and we sort s from greatest to least absolute
/// value (if sort_on_absolute_value == true) or greatest to least value
/// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
/// exists, around in the same way. Note: the "absolute value" part won't matter
/// if this is an actual SVD, since singular values are non-negative.
template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real>* Vt = NULL,
bool sort_on_absolute_value = true);
/// Creates the eigenvalue matrix D that is part of the decomposition used Matrix::Eig.
/// D will be block-diagonal with blocks of size 1 (for real eigenvalues) or 2x2
/// for complex pairs. If a complex pair is lambda +- i*mu, D will have a corresponding
/// 2x2 block [lambda, mu; -mu, lambda].
/// This function will throw if any complex eigenvalues are not in complex conjugate
/// pairs (or the members of such pairs are not consecutively numbered).
template<typename Real>
void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
MatrixBase<Real> *D);
/// The following function is used in Matrix::Power, and separately tested, so we
/// declare it here mainly for the testing code to see. It takes a complex value to
/// a power using a method that will work for noninteger powers (but will fail if the
/// complex value is real and negative).
template<typename Real>
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
@ -823,17 +877,17 @@ bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
/// \addtogroup matrix_funcs_io
/// @{
template <typename Real>
std::ostream &operator<<(std::ostream &Out, const MatrixBase<Real> &M);
template <typename Real>
std::istream &operator>>(std::istream &In, MatrixBase<Real> &M);
// The Matrix read allows resizing, so we override the MatrixBase one.
template <typename Real>
std::istream &operator>>(std::istream &In, Matrix<Real> &M);
template <typename Real>
bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
}
@ -844,7 +898,6 @@ bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
} // namespace kaldi
// we need to include the implementation and some
// template specializations.
#include "matrix/kaldi-matrix-inl.h"
@ -26,32 +26,33 @@
namespace kaldi {
template <typename Real>
std::ostream &operator<<(std::ostream &os, const VectorBase<Real> &rv) {
rv.Write(os, false);
return os;
}
template <typename Real>
std::istream &operator>>(std::istream &is, VectorBase<Real> &rv) {
rv.Read(is, false);
return is;
}
template <typename Real>
std::istream &operator>>(std::istream &is, Vector<Real> &rv) {
rv.Read(is, false);
return is;
}
// template<>
// template<>
// void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
// template<>
// template<>
// void VectorBase<double>::AddVec<double>(const double alpha,
// const VectorBase<double> &rv);
} // namespace kaldi
(File diff suppressed because it is too large.)
@ -37,7 +37,7 @@ namespace kaldi {
/// Provides a vector abstraction class.
/// This class provides a way to work with vectors in kaldi.
/// It encapsulates basic operations and memory optimizations.
template <typename Real>
class VectorBase {
public:
/// Set vector to all zeros.
@ -53,23 +53,23 @@ class VectorBase {
inline MatrixIndexT Dim() const { return dim_; }
/// Returns the size in memory of the vector, in bytes.
inline MatrixIndexT SizeInBytes() const { return (dim_ * sizeof(Real)); }
/// Returns a pointer to the start of the vector's data.
inline Real *Data() { return data_; }
/// Returns a pointer to the start of the vector's data (const).
inline const Real *Data() const { return data_; }
/// Indexing operator (const).
inline Real operator()(MatrixIndexT i) const {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return *(data_ + i);
}
/// Indexing operator (non-const).
inline Real &operator()(MatrixIndexT i) {
KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
static_cast<UnsignedMatrixIndexT>(dim_));
return *(data_ + i);
@ -98,12 +98,12 @@ class VectorBase {
void CopyFromVec(const VectorBase<Real> &v);
/// Copy data from another vector of different type (double vs. float)
template <typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &v);
/// Performs a row stack of the matrix M
void CopyRowsFromMat(const MatrixBase<Real> &M);
template <typename OtherReal>
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
/// Performs a column stack of the matrix M
@ -113,12 +113,12 @@ class VectorBase {
/// this->Copy(M[row]).
void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
/// Extracts a row of the matrix M with type conversion.
template <typename OtherReal>
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
/// Extracts a column of the matrix M.
template <typename OtherReal>
void CopyColFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT col);
/// Reads from C++ stream (option to add to existing contents).
/// Throws exception on failure
@ -129,19 +129,21 @@ class VectorBase {
friend class VectorBase<double>;
friend class VectorBase<float>;
protected:
/// Destructor; does not deallocate memory, this is handled by child classes.
/// This destructor is protected so this object can only be
/// deleted via a child.
~VectorBase() {}
/// Empty initializer, corresponds to vector of zero size.
explicit VectorBase() : data_(NULL), dim_(0) {
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
}
/// data memory area
Real *data_;
/// dimension of vector
MatrixIndexT dim_;
KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
@ -151,25 +153,28 @@ class VectorBase {
*
* This class provides a way to work with vectors in kaldi.
* It encapsulates basic operations and memory optimizations. */
template <typename Real>
class Vector : public VectorBase<Real> {
public:
/// Constructor that takes no arguments. Initializes to empty.
Vector() : VectorBase<Real>() {}
/// Constructor with specific size. Sets to all-zero by default
/// if set_zero == false, memory contents are undefined.
explicit Vector(const MatrixIndexT s,
MatrixResizeType resize_type = kSetZero)
: VectorBase<Real>() {
Resize(s, resize_type);
}
/// Copy constructor from CUDA vector
/// This is defined in ../cudamatrix/cu-vector.h
// template<typename OtherReal>
// explicit Vector(const CuVectorBase<OtherReal> &cu);
/// Copy constructor. The need for this is controversial.
Vector(const Vector<Real> &v)
: VectorBase<Real>() { // (cannot be explicit)
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
@ -181,19 +186,19 @@ class Vector: public VectorBase<Real> {
}
/// Type conversion constructor.
template <typename OtherReal>
explicit Vector(const VectorBase<OtherReal> &v) : VectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
// Took this out since it is unsafe : Arnab
// /// Constructor from a pointer and a size; copies the data to a location
// /// it owns.
// Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
// Resize(s);
// CopyFromPtr(Data, s);
// }
/// Swaps the contents of *this and *other. Shallow swap.
@ -219,59 +224,63 @@ class Vector: public VectorBase<Real> {
void RemoveElement(MatrixIndexT i);
/// Assignment operator.
Vector<Real> &operator=(const Vector<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
/// Assignment operator that takes VectorBase.
Vector<Real> &operator=(const VectorBase<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
private:
/// Init assumes the current contents of the class are invalid (i.e. junk or
/// has already been freed), and it sets the vector to newly allocated memory
/// with the specified dimension. dim == 0 is acceptable. The memory contents
/// pointed to by data_ will be undefined.
void Init(const MatrixIndexT dim);
/// Destroy function, called internally.
void Destroy();
};
/// Represents a non-allocating general vector which can be defined
/// as a sub-vector of higher-level vector [or as the row of a matrix].
template <typename Real>
class SubVector : public VectorBase<Real> {
public:
/// Constructor from a Vector or SubVector.
/// SubVectors are not const-safe and it's very hard to make them
/// so for now we just give up. This function contains const_cast.
SubVector(const VectorBase<Real> &t,
const MatrixIndexT origin,
const MatrixIndexT length)
: VectorBase<Real>() {
// following assert equiv to origin>=0 && length>=0 &&
// origin+length <= rt.dim_
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin) +
static_cast<UnsignedMatrixIndexT>(length) <=
static_cast<UnsignedMatrixIndexT>(t.Dim()));
VectorBase<Real>::data_ = const_cast<Real *>(t.Data() + origin);
VectorBase<Real>::dim_ = length;
}
/// This constructor initializes the vector to point at the contents
/// of this packed matrix (SpMatrix or TpMatrix).
// SubVector(const PackedMatrix<Real> &M) {
// VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
// VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
//}
/// Copy constructor
SubVector(const SubVector &other) : VectorBase<Real>() {
// this copy constructor needed for Range() to work in base class.
VectorBase<Real>::data_ = other.data_;
VectorBase<Real>::dim_ = other.dim_;
@ -280,14 +289,14 @@ class SubVector : public VectorBase<Real> {
/// Constructor from a pointer to memory and a length. Keeps a pointer
/// to the data but does not take ownership (will never delete).
/// Caution: this constructor enables you to evade const constraints.
SubVector(const Real *data, MatrixIndexT length) : VectorBase<Real>() {
VectorBase<Real>::data_ = const_cast<Real *>(data);
VectorBase<Real>::dim_ = length;
}
/// This operation does not preserve const-ness, so be careful.
SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
VectorBase<Real>::data_ = const_cast<Real *>(matrix.RowData(row));
VectorBase<Real>::dim_ = matrix.NumCols();
}
@ -295,7 +304,7 @@ class SubVector : public VectorBase<Real> {
private:
/// Disallow assignment operator.
SubVector &operator=(const SubVector &other) {}
};
/// @} end of "addtogroup matrix_group"
@ -303,43 +312,41 @@ class SubVector : public VectorBase<Real> {
/// @{
/// Output to a C++ stream. Non-binary by default (use Write for
/// binary output).
template <typename Real>
std::ostream &operator<<(std::ostream &out, const VectorBase<Real> &v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
template <typename Real>
std::istream &operator>>(std::istream &in, VectorBase<Real> &v);
/// Input from a C++ stream. Will automatically read text or
/// binary data from the stream.
template <typename Real>
std::istream &operator>>(std::istream &in, Vector<Real> &v);
/// @} end of \addtogroup matrix_funcs_io
/// \addtogroup matrix_funcs_scalar
/// @{
// template<typename Real>
// bool ApproxEqual(const VectorBase<Real> &a,
// const VectorBase<Real> &b, Real tol = 0.01) {
// return a.ApproxEqual(b, tol);
//}
// template<typename Real>
// inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
// float tol = 0.01) {
// KALDI_ASSERT(a.ApproxEqual(b, tol));
//}
} // namespace kaldi
// we need to include the implementation
#include "matrix/kaldi-vector-inl.h"
#endif // KALDI_MATRIX_KALDI_VECTOR_H_
@ -27,18 +27,15 @@
namespace kaldi {
// this enums equal to CblasTrans and CblasNoTrans constants from CBLAS library
// we are writing them as literals because we don't want to include here matrix/kaldi-blas.h,
// which puts many symbols into global scope (like "real") via the header f2c.h
typedef enum {
kTrans = 112, // = CblasTrans
kNoTrans = 111 // = CblasNoTrans
} MatrixTransposeType;
typedef enum { kSetZero, kUndefined, kCopyData } MatrixResizeType;
typedef enum {
@ -53,24 +50,33 @@ typedef enum {
kTakeMeanAndCheck
} SpCopyType;
template <typename Real>
class VectorBase;
template <typename Real>
class Vector;
template <typename Real>
class SubVector;
template <typename Real>
class MatrixBase;
template <typename Real>
class SubMatrix;
template <typename Real>
class Matrix;
/// This class provides a way for switching between double and float types.
template <typename T>
class OtherReal {}; // useful in reading+writing routines
// to switch double and float.
/// A specialized class for switching from float to double.
template <>
class OtherReal<float> {
public:
typedef double Real;
};
/// A specialized class for switching from double to float.
template <>
class OtherReal<double> {
public:
typedef float Real;
};
@ -81,12 +87,10 @@ typedef int32 SignedMatrixIndexT;
typedef uint32 UnsignedMatrixIndexT;
// If you want to use size_t for the index type, do as follows instead:
// typedef size_t MatrixIndexT;
// typedef ssize_t SignedMatrixIndexT;
// typedef size_t UnsignedMatrixIndexT;
}
#endif // KALDI_MATRIX_MATRIX_COMMON_H_
@ -1,14 +1,15 @@
project(kaldi)
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
)
add_subdirectory(base)
add_subdirectory(util)
if(WITH_ASR)
add_subdirectory(lat)
add_subdirectory(fstext)
add_subdirectory(decoder)
add_subdirectory(lm)
add_subdirectory(fstbin)
add_subdirectory(lmbin)
endif()
@ -44,7 +44,19 @@ typedef float BaseFloat;
#ifndef COMPILE_WITHOUT_OPENFST
#ifdef WITH_ASR
#include <fst/types.h>
#else
using int8 = int8_t;
using int16 = int16_t;
using int32 = int32_t;
using int64 = int64_t;
using uint8 = uint8_t;
using uint16 = uint16_t;
using uint32 = uint32_t;
using uint64 = uint64_t;
#endif
namespace kaldi {
using ::int16;
@ -0,0 +1,18 @@
# set(CMAKE_CXX_STANDARD 11)
# # fastdeploy
# set(FASTDEPLOY_INSTALL_DIR "fdlib/fastdeploy-linux-x64-1.0.4" CACHE STRING force)
# if(NOT EXISTS ${FASTDEPLOY_INSTALL_DIR})
# message(FATAL_ERROR "Please using cmake -B build -DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR}")
# endif()
# include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
# # FastDeploy
# include_directories(${FASTDEPLOY_INCS})
add_executable(infer_onnx_silero_vad ${CMAKE_CURRENT_SOURCE_DIR}/infer_onnx_silero_vad.cc wav.h vad.cc vad.h)
# FastDeploy
target_link_libraries(infer_onnx_silero_vad ${FASTDEPLOY_LIBS})
@ -0,0 +1,121 @@
English | [简体中文](README_CN.md)
# Silero VAD Deployment Example
This directory provides an example in which `infer_onnx_silero_vad` quickly deploys the Silero VAD model on CPU/GPU.
Before deployment, confirm the following two steps.
- 1. The software and hardware must meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
- 2. Download the precompiled deployment library and sample code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
Taking VAD inference on Linux as an example, the build and test can be completed by executing the following commands in this directory.
```bash
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose the appropriate version from the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the VAD model file and test audio. After decompression, place the model and test audio in the same directory as infer_onnx_silero_vad.cc
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# inference
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
```
- The above commands work on Linux and macOS. For Windows, refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md)
## VAD C++ Interface
### Vad Class
```c++
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```
**Parameter**
> * **model_file**(str): Model file path
> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default (i.e. the default configuration is used)
### setAudioCofig function
**Must be called before the `init` function**
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
**Parameter**
> * **sr**(int): sampling rate
> * **frame_ms**(int): Length of each detection frame in milliseconds; used to compute the detection window size
> * **threshold**(float): Probability threshold above which a frame is treated as speech
> * **min_silence_duration_ms**(int): Minimum silence duration used to decide that a speech segment has ended
> * **speech_pad_ms**(int): Padding used when computing the end time of a speech segment
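As a minimal usage sketch of the call order described above (the file name and values are illustrative assumptions; note that the bundled `infer_onnx_silero_vad.cc` example uses the `SetConfig`/`Init` spellings instead, so adapt the names to the header you actually build against):
```c++
#include "vad.h"

int main() {
    Vad vad("silero_vad.onnx");
    // Configuration must precede init(): 16 kHz audio, 32 ms frames,
    // 0.5 speech-probability threshold, 200 ms minimum silence, 0 ms padding.
    vad.setAudioCofig(16000, 32, 0.5f, 200, 0);
    vad.init();
    return 0;
}
```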
### init function
Used to initialize audio-related parameters.
```c++
void Vad::init();
```
### loadAudio function
Load audio.
```c++
void Vad::loadAudio(const std::string& wavPath)
```
**Parameter**
> * **wavPath**(str): Audio file path
### Predict function
Runs model inference.
```c++
bool Vad::Predict();
```
### getResult function
**Used to obtain the inference results**
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
float mergeThreshold = 0.3);
```
**Parameter**
> * **removeThreshold**(float): Threshold for discarding result segments; segments shorter than this are dropped
> * **expandHeadThreshold**(float): Offset applied to the start of each segment; the detected start time may sit too close to the speech, so it is moved earlier by this amount
> * **expandTailThreshold**(float): Offset applied to the end of each segment; the detected end time may sit too close to the speech, so it is moved later by this amount
> * **mergeThreshold**(float): Segments closer to each other than this are merged into one
**The output format is** `std::vector<std::map<std::string, float>>`
> The output is a list; each element is one speech segment.
>
> Each segment exposes 'start' for its start time and 'end' for its end time (in seconds).
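A hedged sketch of consuming the result (it assumes a `vad` object has already been configured, initialized, loaded with audio, and that `Predict` has returned; the threshold values are simply the documented defaults):
```c++
// Retrieve the detected segments and print their boundaries in seconds.
std::vector<std::map<std::string, float>> segments =
    vad.getResult(1.6, 0.32, 0, 0.3);
for (const auto& seg : segments) {
    std::cout << "speech from " << seg.at("start") << " s to "
              << seg.at("end") << " s" << std::endl;
}
```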
### Tips
1. The `setAudioCofig` function must be called before the `init` function
2. The sampling rate of the input audio file must be consistent with that set in the code
- [Model Description](../)
- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md)
@ -0,0 +1,119 @@
[English](README.md) | Simplified Chinese
# Silero VAD Deployment Example
This directory provides `infer_onnx_silero_vad`, which quickly deploys the Silero VAD model on CPU/GPU.
Before deployment, confirm the following two steps.
- 1. The software and hardware environment meets the requirements; see [FastDeploy Environment Requirements](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Download the precompiled deployment library and sample code according to your development environment; see [FastDeploy Precompiled Library](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
Taking VAD inference on Linux as an example, run the following commands in this directory to complete the build and test.
```bash
mkdir build
cd build
# Download the FastDeploy precompiled library. Choose the appropriate version from the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the VAD model file and test audio. After extraction, place the model and test audio in the same directory as infer_onnx_silero_vad.cc
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# Inference
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
```
The above commands only work on Linux or macOS. For how to use the SDK on Windows, refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/cn/faq/use_sdk_on_windows.md)
## VAD C++ Interface
### Vad Class
```c++
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```
**Parameter**
> * **model_file**(str): Model file path
> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default (i.e. the default configuration is used)
### setAudioCofig function
**Must be called before the `init` function**
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
**Parameter**
> * **sr**(int): sampling rate
> * **frame_ms**(int): Length of each detection frame; used to compute the detection window size
> * **threshold**(float): Probability threshold for the detection result
> * **min_silence_duration_ms**(int): Threshold used to decide whether a span counts as silence
> * **speech_pad_ms**(int): Used to compute the end time of the speech
### init function
Initializes the audio-related parameters.
```c++
void Vad::init();
```
### loadAudio function
Loads the audio.
```c++
void Vad::loadAudio(const std::string& wavPath)
```
**Parameter**
> * **wavPath**(str): Audio file path
### Predict function
Runs model inference.
```c++
bool Vad::Predict();
```
### getResult function
**Used to obtain the inference results**
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
float mergeThreshold = 0.3);
```
**Parameter**
> * **removeThreshold**(float): Threshold for discarding result segments; segments that are too short are dropped according to this threshold
> * **expandHeadThreshold**(float): Offset for the start of each segment; the detected start time may sit too close to the speech, so it is moved earlier by this amount
> * **expandTailThreshold**(float): Offset for the end of each segment; the detected end time may sit too close to the speech, so it is moved later by this amount
> * **mergeThreshold**(float): Some result segments are very close together and can be merged into one; voiced segments are merged according to this threshold
**The output format is** `std::vector<std::map<std::string, float>>`
> The output is a list; each element is one speech segment.
>
> Each segment exposes 'start' for its start time and 'end' for its end time.
### Tips
1. The `setAudioCofig` function must be called before the `init` function
2. The sampling rate of the input audio file must match the one set in the code
- [Model Description](../)
- [How to switch the model inference backend engine](../../../../docs/cn/faq/how_to_change_backend.md)
@ -0,0 +1,65 @@
#include "vad.h"
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio "
"run_option, "
"e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav"
<< std::endl;
return -1;
}
std::string model_file = argv[1];
std::string audio_file = argv[2];
int sr = 16000;
Vad vad(model_file);
// custom config, but must be set before init
vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
vad.Init();
std::vector<float> inputWav; // [0, 1]
wav::WavReader wav_reader = wav::WavReader(audio_file);
assert(wav_reader.sample_rate() == sr);
auto num_samples = wav_reader.num_samples();
inputWav.resize(num_samples);
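// Scale 16-bit PCM samples to floats in [-1, 1) before feeding them to the model.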
for (int i = 0; i < num_samples; i++) {
inputWav[i] = wav_reader.data()[i] / 32768;
}
int window_size_samples = vad.WindowSizeSamples();
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
auto start = j;
auto end = start + window_size_samples >= num_samples
? num_samples
: start + window_size_samples;
auto current_chunk_size = end - start;
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == current_chunk_size);
if (!vad.ForwardChunk(r)) {
std::cerr << "Failed to inference while using model:"
<< vad.ModelName() << "." << std::endl;
return 1;
}
Vad::State s = vad.Postprocess();
std::cout << s << " ";
}
std::cout << std::endl;
std::vector<std::map<std::string, float>> result = vad.GetResult();
for (auto& res : result) {
std::cout << "speak start: " << res["start"]
<< " s, end: " << res["end"] << " s | ";
}
std::cout << "\b\b " << std::endl;
vad.Reset();
return 0;
}
@ -0,0 +1,306 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad.h"
#include <cstring>
#include <iomanip>
#ifdef NDEBUG
#define LOG_DEBUG \
::fastdeploy::FDLogger(true, "[DEBUG]") << __REL_FILE__ << "(" << __LINE__ \
<< ")::" << __FUNCTION__ << "\t"
#else
#define LOG_DEBUG \
::fastdeploy::FDLogger(false, "[DEBUG]") \
<< __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
#endif
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption&
custom_option /* = fastdeploy::RuntimeOption() */) {
valid_cpu_backends = {fastdeploy::Backend::ORT,
fastdeploy::Backend::OPENVINO};
valid_gpu_backends = {fastdeploy::Backend::ORT, fastdeploy::Backend::TRT};
runtime_option = custom_option;
// ORT backend
runtime_option.UseCpu();
runtime_option.UseOrtBackend();
runtime_option.model_format = fastdeploy::ModelFormat::ONNX;
// graph opt level
runtime_option.ort_option.graph_optimization_level = 99;
// one-thread
runtime_option.ort_option.intra_op_num_threads = 1;
runtime_option.ort_option.inter_op_num_threads = 1;
// model path
runtime_option.model_file = model_file;
}
void Vad::Init() {
std::call_once(init_, [&]() { initialized = Initialize(); });
}
std::string Vad::ModelName() const { return "VAD"; }
void Vad::SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms) {
if (initialized) {
fastdeploy::FDERROR << "SetConfig must be called before init"
<< std::endl;
throw std::runtime_error("SetConfig must be called before init");
}
sample_rate_ = sr;
sr_per_ms_ = sr / 1000;
threshold_ = threshold;
frame_ms_ = frame_ms;
min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
speech_pad_right_samples_ = speech_pad_right_ms * sr_per_ms_;
// init chunk size
window_size_samples_ = frame_ms * sr_per_ms_;
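// e.g. at 16 kHz, sr_per_ms_ = 16, so a 32 ms frame gives a 512-sample window.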
current_chunk_size_ = window_size_samples_;
fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold
<< " frame_ms=" << frame_ms
<< " min_silence_duration_ms=" << min_silence_duration_ms
<< " speech_pad_left_ms=" << speech_pad_left_ms
<< " speech_pad_right_ms=" << speech_pad_right_ms;
}
void Vad::Reset() {
std::memset(h_.data(), 0.0f, h_.size() * sizeof(float));
std::memset(c_.data(), 0.0f, c_.size() * sizeof(float));
triggerd_ = false;
temp_end_ = 0;
current_sample_ = 0;
speakStart_.clear();
speakEnd_.clear();
states_.clear();
}
bool Vad::Initialize() {
// input & output holder
inputTensors_.resize(4);
outputTensors_.resize(3);
// input shape
input_node_dims_.emplace_back(1);
input_node_dims_.emplace_back(window_size_samples_);
// sr buffer
sr_.resize(1);
sr_[0] = sample_rate_;
// hidden state buffer
h_.resize(size_hc_);
c_.resize(size_hc_);
Reset();
// InitRuntime
if (!InitRuntime()) {
fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
<< std::endl;
return false;
}
fastdeploy::FDINFO << "init done.";
return true;
}
bool Vad::ForwardChunk(std::vector<float>& chunk) {
// last chunk may not be window_size_samples_
input_node_dims_.back() = chunk.size();
assert(window_size_samples_ >= chunk.size());
current_chunk_size_ = chunk.size();
inputTensors_[0].name = "input";
inputTensors_[0].SetExternalData(
input_node_dims_, fastdeploy::FDDataType::FP32, chunk.data());
inputTensors_[1].name = "sr";
inputTensors_[1].SetExternalData(
sr_node_dims_, fastdeploy::FDDataType::INT64, sr_.data());
inputTensors_[2].name = "h";
inputTensors_[2].SetExternalData(
hc_node_dims_, fastdeploy::FDDataType::FP32, h_.data());
inputTensors_[3].name = "c";
inputTensors_[3].SetExternalData(
hc_node_dims_, fastdeploy::FDDataType::FP32, c_.data());
if (!Infer(inputTensors_, &outputTensors_)) {
return false;
}
// Push forward sample index
current_sample_ += current_chunk_size_;
return true;
}
const Vad::State& Vad::Postprocess() {
// update prob, h, c
outputProb_ = *(float*)outputTensors_[0].Data();
auto* hn = static_cast<float*>(outputTensors_[1].MutableData());
std::memcpy(h_.data(), hn, h_.size() * sizeof(float));
auto* cn = static_cast<float*>(outputTensors_[2].MutableData());
std::memcpy(c_.data(), cn, c_.size() * sizeof(float));
if (outputProb_ < threshold_ && !triggerd_) {
// 1. Silence
LOG_DEBUG << "{ silence: " << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::SIL);
} else if (outputProb_ >= threshold_ && !triggerd_) {
// 2. Start
triggerd_ = true;
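// current_sample_ already includes this chunk, so step back one chunk
// (plus the left padding) to locate the start of the speech segment.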
speech_start_ =
current_sample_ - current_chunk_size_ - speech_pad_left_samples_;
float start_sec = 1.0 * speech_start_ / sample_rate_;
speakStart_.emplace_back(start_sec);
LOG_DEBUG << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::START);
} else if (outputProb_ >= threshold_ - 0.15 && triggerd_) {
// 3. Continue
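// Hysteresis: once speech is active, a relaxed threshold (threshold_ - 0.15)
// keeps the segment open through brief dips in the speech probability.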
if (temp_end_ != 0) {
// speech prob relaxation, speech continues again
LOG_DEBUG << "{ speech fake end(sil < min_silence_ms) to continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
temp_end_ = 0;
} else {
// speech prob relaxation, keep tracking speech
LOG_DEBUG << "{ speech continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
}
states_.emplace_back(Vad::State::SPEECH);
} else if (outputProb_ < threshold_ - 0.15 && triggerd_) {
// 4. End
if (temp_end_ == 0) {
temp_end_ = current_sample_;
}
// check possible speech end
if (current_sample_ - temp_end_ < min_silence_samples_) {
// a. silence < min_silence_samples, continue speaking
LOG_DEBUG << "{ speech fake end(sil < min_silence_ms): "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::SIL);
} else {
// b. silence >= min_silence_samples, end speaking
speech_end_ = current_sample_ + speech_pad_right_samples_;
temp_end_ = 0;
triggerd_ = false;
auto end_sec = 1.0 * speech_end_ / sample_rate_;
speakEnd_.emplace_back(end_sec);
LOG_DEBUG << "{ speech end: " << end_sec
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::END);
}
}
return states_.back();
}
const std::vector<std::map<std::string, float>> Vad::GetResult(
float removeThreshold,
float expandHeadThreshold,
float expandTailThreshold,
float mergeThreshold) const {
float audioLength = 1.0 * current_sample_ / sample_rate_;
if (speakStart_.empty() && speakEnd_.empty()) {
return {};
}
if (speakEnd_.size() != speakStart_.size()) {
// set the audio length as the last end
speakEnd_.emplace_back(audioLength);
}
// Remove too short segments
// auto startIter = speakStart_.begin();
// auto endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) {
// if (removeThreshold < audioLength &&
// *endIter - *startIter < removeThreshold) {
// startIter = speakStart_.erase(startIter);
// endIter = speakEnd_.erase(endIter);
// } else {
// startIter++;
// endIter++;
// }
// }
// // Expand to avoid to tight cut.
// startIter = speakStart_.begin();
// endIter = speakEnd_.begin();
// *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
// *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
// endIter = speakEnd_.end() - 1;
// startIter = speakStart_.end() - 1;
// *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
// *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
// for (int i = 1; i < speakStart_.size() - 1; ++i) {
// speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
// speakEnd_[i - 1]);
// speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
// speakStart_[i + 1]);
// }
// // Merge very closed segments
// startIter = speakStart_.begin() + 1;
// endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) {
// if (*startIter - *endIter < mergeThreshold) {
// startIter = speakStart_.erase(startIter);
// endIter = speakEnd_.erase(endIter);
// } else {
// startIter++;
// endIter++;
// }
// }
std::vector<std::map<std::string, float>> result;
for (int i = 0; i < speakStart_.size(); ++i) {
result.emplace_back(std::map<std::string, float>(
{{"start", speakStart_[i]}, {"end", speakEnd_[i]}}));
}
return result;
}
std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
switch (s) {
case Vad::State::SIL:
os << "[SIL]";
break;
case Vad::State::START:
os << "[STA]";
break;
case Vad::State::SPEECH:
os << "[SPE]";
break;
case Vad::State::END:
os << "[END]";
break;
default:
// illegal state
os << "[ILL]";
break;
}
return os;
}
@ -0,0 +1,124 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <vector>
#include "./wav.h"
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/runtime.h"
class Vad : public fastdeploy::FastDeployModel {
public:
enum class State { SIL = 0, START, SPEECH, END };
friend std::ostream& operator<<(std::ostream& os, const Vad::State& s);
Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option =
fastdeploy::RuntimeOption());
void Init();
void Reset();
void SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms);
bool ForwardChunk(std::vector<float>& chunk);
const State& Postprocess();
const std::vector<std::map<std::string, float>> GetResult(
float removeThreshold = 0.0,
float expandHeadThreshold = 0.0,
float expandTailThreshold = 0,
float mergeThreshold = 0.0) const;
const std::vector<State> GetStates() const { return states_; }
int SampleRate() const { return sample_rate_; }
int FrameMs() const { return frame_ms_; }
int64_t WindowSizeSamples() const { return window_size_samples_; }
float Threshold() const { return threshold_; }
int MinSilenceDurationMs() const {
return min_silence_samples_ / sample_rate_;
}
int SpeechPadLeftMs() const {
return speech_pad_left_samples_ / sample_rate_;
}
int SpeechPadRightMs() const {
return speech_pad_right_samples_ / sample_rate_;
}
int MinSilenceSamples() const { return min_silence_samples_; }
int SpeechPadLeftSamples() const { return speech_pad_left_samples_; }
int SpeechPadRightSamples() const { return speech_pad_right_samples_; }
std::string ModelName() const override;
private:
bool Initialize();
private:
std::once_flag init_;
// input and output
std::vector<fastdeploy::FDTensor> inputTensors_;
std::vector<fastdeploy::FDTensor> outputTensors_;
// model states
bool triggerd_ = false;
unsigned int speech_start_ = 0;
unsigned int speech_end_ = 0;
unsigned int temp_end_ = 0;
unsigned int current_sample_ = 0;
unsigned int current_chunk_size_ = 0;
// MAX 4294967295 samples / 8 samples per ms / 1000 / 60 = 8947 minutes
float outputProb_;
std::vector<float> speakStart_;
mutable std::vector<float> speakEnd_;
std::vector<State> states_;
/* ========================================================================
*/
int sample_rate_ = 16000;
int frame_ms_ = 32; // 32, 64, 96 for 16k
float threshold_ = 0.5f;
int64_t window_size_samples_; // support 256 512 768 for 8k; 512 1024 1536
// for 16k.
int sr_per_ms_; // support 8 or 16
int min_silence_samples_; // sr_per_ms_ * frame_ms_
int speech_pad_left_samples_{0}; // usually 250ms
int speech_pad_right_samples_{0}; // usually 0
/* ========================================================================
*/
std::vector<int64_t> sr_;
const size_t size_hc_ = 2 * 1 * 64; // It's FIXED.
std::vector<float> h_;
std::vector<float> c_;
std::vector<int64_t> input_node_dims_;
const std::vector<int64_t> sr_node_dims_ = {1};
const std::vector<int64_t> hc_node_dims_ = {2, 1, 64};
};
@ -0,0 +1,197 @@
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
namespace wav {
struct WavHeader {
char riff[4]; // "riff"
unsigned int size;
char wav[4]; // "WAVE"
char fmt[4]; // "fmt "
unsigned int fmt_size;
uint16_t format;
uint16_t channels;
unsigned int sample_rate;
unsigned int bytes_per_second;
uint16_t block_size;
uint16_t bit;
char data[4]; // "data"
unsigned int data_size;
};
class WavReader {
public:
WavReader() : data_(nullptr) {}
explicit WavReader(const std::string& filename) { Open(filename); }
bool Open(const std::string& filename) {
FILE* fp = fopen(filename.c_str(), "rb");
if (NULL == fp) {
std::cout << "Error in read " << filename;
return false;
}
WavHeader header;
fread(&header, 1, sizeof(header), fp);
if (header.fmt_size < 16) {
fprintf(stderr,
"WaveData: expect PCM format data "
"to have fmt chunk of at least size 16.\n");
return false;
} else if (header.fmt_size > 16) {
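// The next chunk header begins 12 (RIFF) + 8 ("fmt " header) + fmt_size
// bytes into the file, i.e. 44 - 8 + fmt_size - 16; seek there and read it.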
int offset = 44 - 8 + header.fmt_size - 16;
fseek(fp, offset, SEEK_SET);
fread(header.data, 8, sizeof(char), fp);
}
// check "riff" "WAVE" "fmt " "data"
// Skip any sub-chunks between "fmt" and "data". Usually there will
// be a single "fact" sub chunk, but on Windows there can also be a
// "list" sub chunk.
while (0 != strncmp(header.data, "data", 4)) {
// We will just ignore the data in these chunks.
fseek(fp, header.data_size, SEEK_CUR);
// read next sub chunk
fread(header.data, 8, sizeof(char), fp);
}
num_channel_ = header.channels;
sample_rate_ = header.sample_rate;
bits_per_sample_ = header.bit;
int num_data = header.data_size / (bits_per_sample_ / 8);
data_ = new float[num_data]; // Create 1-dim array
num_samples_ = num_data / num_channel_;
for (int i = 0; i < num_data; ++i) {
switch (bits_per_sample_) {
case 8: {
char sample;
fread(&sample, 1, sizeof(char), fp);
data_[i] = static_cast<float>(sample);
break;
}
case 16: {
int16_t sample;
fread(&sample, 1, sizeof(int16_t), fp);
// std::cout << sample;
data_[i] = static_cast<float>(sample);
// std::cout << data_[i];
break;
}
case 32: {
int sample;
fread(&sample, 1, sizeof(int), fp);
data_[i] = static_cast<float>(sample);
break;
}
default:
fprintf(stderr, "unsupported quantization bits");
exit(1);
}
}
fclose(fp);
return true;
}
int num_channel() const { return num_channel_; }
int sample_rate() const { return sample_rate_; }
int bits_per_sample() const { return bits_per_sample_; }
int num_samples() const { return num_samples_; }
const float* data() const { return data_; }
private:
int num_channel_;
int sample_rate_;
int bits_per_sample_;
int num_samples_; // sample points per channel
float* data_;
};
class WavWriter {
public:
WavWriter(const float* data,
int num_samples,
int num_channel,
int sample_rate,
int bits_per_sample)
: data_(data),
num_samples_(num_samples),
num_channel_(num_channel),
sample_rate_(sample_rate),
bits_per_sample_(bits_per_sample) {}
void Write(const std::string& filename) {
FILE* fp = fopen(filename.c_str(), "w");
// init char 'riff' 'WAVE' 'fmt ' 'data'
WavHeader header;
char wav_header[44] = {
0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 0x41, 0x56,
0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
memcpy(&header, wav_header, sizeof(header));
header.channels = num_channel_;
header.bit = bits_per_sample_;
header.sample_rate = sample_rate_;
header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
header.size = sizeof(header) - 8 + header.data_size;
header.bytes_per_second =
sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
header.block_size = num_channel_ * (bits_per_sample_ / 8);
fwrite(&header, 1, sizeof(header), fp);
for (int i = 0; i < num_samples_; ++i) {
for (int j = 0; j < num_channel_; ++j) {
switch (bits_per_sample_) {
case 8: {
char sample =
static_cast<char>(data_[i * num_channel_ + j]);
fwrite(&sample, 1, sizeof(sample), fp);
break;
}
case 16: {
int16_t sample =
static_cast<int16_t>(data_[i * num_channel_ + j]);
fwrite(&sample, 1, sizeof(sample), fp);
break;
}
case 32: {
int sample =
static_cast<int>(data_[i * num_channel_ + j]);
fwrite(&sample, 1, sizeof(sample), fp);
break;
}
}
}
}
fclose(fp);
}
private:
const float* data_;
int num_samples_; // total float points in data_
int num_channel_;
int sample_rate_;
int bits_per_sample_;
};
} // namespace wav