From 532b620454486d1b7b29f66ac1617ee8555d0006 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 03:17:28 +0000 Subject: [PATCH 01/60] refactor speechx cmake --- speechx/CMakeLists.txt | 44 +------------------ speechx/build.sh | 10 ++--- speechx/cmake/{external => }/absl.cmake | 0 speechx/cmake/{external => }/boost.cmake | 0 speechx/cmake/{external => }/eigen.cmake | 0 speechx/cmake/{external => }/gflags.cmake | 0 speechx/cmake/{external => }/glog.cmake | 0 speechx/cmake/{external => }/gtest.cmake | 0 speechx/cmake/{external => }/kenlm.cmake | 0 speechx/cmake/{external => }/libsndfile.cmake | 0 speechx/cmake/{external => }/openblas.cmake | 5 ++- speechx/cmake/{external => }/openfst.cmake | 0 speechx/cmake/paddleinference.cmake | 42 ++++++++++++++++++ 13 files changed, 50 insertions(+), 51 deletions(-) rename speechx/cmake/{external => }/absl.cmake (100%) rename speechx/cmake/{external => }/boost.cmake (100%) rename speechx/cmake/{external => }/eigen.cmake (100%) rename speechx/cmake/{external => }/gflags.cmake (100%) rename speechx/cmake/{external => }/glog.cmake (100%) rename speechx/cmake/{external => }/gtest.cmake (100%) rename speechx/cmake/{external => }/kenlm.cmake (100%) rename speechx/cmake/{external => }/libsndfile.cmake (100%) rename speechx/cmake/{external => }/openblas.cmake (92%) rename speechx/cmake/{external => }/openfst.cmake (100%) create mode 100644 speechx/cmake/paddleinference.cmake diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 4b5838e5..8307d992 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -13,7 +13,6 @@ set(CMAKE_CXX_STANDARD 14) set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) # Modules -list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}/external) list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}) include(FetchContent) include(ExternalProject) @@ -83,48 +82,7 @@ add_dependencies(openfst gflags glog) # paddle lib -set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) -set(paddle_PREFIX_DIR 
${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle - URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz - URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 - PREFIX ${paddle_PREFIX_DIR} - SOURCE_DIR ${paddle_SOURCE_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") - -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") - -##paddle with mkl -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") -include_directories("${MATH_LIB_PATH}/include") -set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") -include_directories("${MKLDNN_PATH}/include") -set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") - -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp - ${EXTERNAL_LIB}) - +include(paddleinference) ############################################################################### diff --git a/speechx/build.sh b/speechx/build.sh index a6eef656..e0a38675 100755 --- a/speechx/build.sh +++ 
b/speechx/build.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -xe # the build script had verified in the paddlepaddle docker image. # please follow the instruction below to install PaddlePaddle image. @@ -17,11 +18,6 @@ fi #rm -rf build mkdir -p build -cd build -cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -#cmake .. - -make -j - -cd - +cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} +cmake --build build diff --git a/speechx/cmake/external/absl.cmake b/speechx/cmake/absl.cmake similarity index 100% rename from speechx/cmake/external/absl.cmake rename to speechx/cmake/absl.cmake diff --git a/speechx/cmake/external/boost.cmake b/speechx/cmake/boost.cmake similarity index 100% rename from speechx/cmake/external/boost.cmake rename to speechx/cmake/boost.cmake diff --git a/speechx/cmake/external/eigen.cmake b/speechx/cmake/eigen.cmake similarity index 100% rename from speechx/cmake/external/eigen.cmake rename to speechx/cmake/eigen.cmake diff --git a/speechx/cmake/external/gflags.cmake b/speechx/cmake/gflags.cmake similarity index 100% rename from speechx/cmake/external/gflags.cmake rename to speechx/cmake/gflags.cmake diff --git a/speechx/cmake/external/glog.cmake b/speechx/cmake/glog.cmake similarity index 100% rename from speechx/cmake/external/glog.cmake rename to speechx/cmake/glog.cmake diff --git a/speechx/cmake/external/gtest.cmake b/speechx/cmake/gtest.cmake similarity index 100% rename from speechx/cmake/external/gtest.cmake rename to speechx/cmake/gtest.cmake diff --git a/speechx/cmake/external/kenlm.cmake b/speechx/cmake/kenlm.cmake similarity index 100% rename from speechx/cmake/external/kenlm.cmake rename to speechx/cmake/kenlm.cmake diff --git a/speechx/cmake/external/libsndfile.cmake b/speechx/cmake/libsndfile.cmake similarity index 100% rename from speechx/cmake/external/libsndfile.cmake rename to speechx/cmake/libsndfile.cmake diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/openblas.cmake similarity index 92% rename from 
speechx/cmake/external/openblas.cmake rename to speechx/cmake/openblas.cmake index 5c196527..a8c3dd2d 100644 --- a/speechx/cmake/external/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -43,6 +43,7 @@ ExternalProject_Add( # https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +message(STATUS "OPENBLAS install dir: ${INSTALL_DIR}") set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) add_library(openblas STATIC IMPORTED) add_dependencies(openblas OPENBLAS) @@ -55,4 +56,6 @@ set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_P # ${CMAKE_INSTALL_LIBDIR} lib # ${CMAKE_INSTALL_INCLUDEDIR} include link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) -include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file +# include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) +# fix for can not find `cblas.h` +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/openblas) \ No newline at end of file diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/openfst.cmake similarity index 100% rename from speechx/cmake/external/openfst.cmake rename to speechx/cmake/openfst.cmake diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake new file mode 100644 index 00000000..d1f71f00 --- /dev/null +++ b/speechx/cmake/paddleinference.cmake @@ -0,0 +1,42 @@ +set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) +set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) +ExternalProject_Add(paddle + URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz + URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 + PREFIX ${paddle_PREFIX_DIR} + SOURCE_DIR ${paddle_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" +) + 
+set(PADDLE_LIB ${fc_patch}/paddle-lib) +include_directories("${PADDLE_LIB}/paddle/include") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") +link_directories("${PADDLE_LIB}/paddle/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") + +##paddle with mkl +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") +include_directories("${MATH_LIB_PATH}/include") +set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) +set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") +include_directories("${MKLDNN_PATH}/include") +set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +# global vars +set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE FORCE "DEPS") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash cryptopp + ${EXTERNAL_LIB} CACHE FORCE "DEPS") From b621b5b97489a76d5d48b0eb3900a955b7eefa11 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 06:05:35 +0000 Subject: [PATCH 02/60] add math and macros --- speechx/cmake/paddleinference.cmake | 7 ++- speechx/speechx/base/common.h | 3 + speechx/speechx/base/macros.h | 9 ++- speechx/speechx/decoder/CMakeLists.txt | 1 + speechx/speechx/decoder/param.h | 5 +- speechx/speechx/utils/math.cc | 82 ++++++++++++++++++++++++++ speechx/speechx/utils/math.h | 28 +++++++++ 7 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 
speechx/speechx/utils/math.cc create mode 100644 speechx/speechx/utils/math.h diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index d1f71f00..957e423c 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -22,6 +22,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") ##paddle with mkl set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") @@ -29,14 +30,16 @@ set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") include_directories("${MATH_LIB_PATH}/include") set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") include_directories("${MKLDNN_PATH}/include") set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) set(EXTERNAL_LIB "-lrt -ldl -lpthread") # global vars -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE FORCE "DEPS") +set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags protobuf xxhash cryptopp - ${EXTERNAL_LIB} CACHE FORCE "DEPS") + ${EXTERNAL_LIB} CACHE INTERNAL "deps") +message(STATUS "Deps libraries: ${DEPS}") diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index a9303cbb..778c06d7 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -38,3 +38,6 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" + +#include "utils/file_utils.h" +#include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 
d7d5a78d..14332a80 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + namespace ppspeech { #ifndef DISALLOW_COPY_AND_ASSIGN @@ -22,4 +25,8 @@ namespace ppspeech { void operator=(const TypeName&) = delete #endif -} // namespace pp_speech \ No newline at end of file +constexpr float kFloatMax = std::numeric_limits::max(); + +const std::string kSpaceSymbol = "\xe2\x96\x81"; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 1df93511..0383c3ea 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -18,6 +18,7 @@ set(BINS tlg_decoder_main ) +message(STATUS "xxxxxxxxxx: " ${DEPS}) foreach(bin_name IN LISTS BINS) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index d6ee2705..ed895aed 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -56,7 +56,9 @@ DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); + namespace ppspeech { + // todo refactor later FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; @@ -115,4 +117,5 @@ RecognizerResource InitRecognizerResoure() { resource.tlg_opts = InitDecoderOptions(); return resource; } -} + +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc new file mode 100644 index 00000000..fe5c7118 --- /dev/null +++ b/speechx/speechx/utils/math.cc @@ -0,0 +1,82 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "utils/math.h" + +#include "base/common.h" + +#include +#include +#include +#include + + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y) { + if (x <= -kFloatMax) return y; + if (y <= -kFloatMax) return x; + float max = std::max(x, y); + return max + std::log(std::exp(x - max) + std::exp(y - max)); +} + +// greater compare for smallest priority_queue +template +struct ValGreaterComp { + bool operator()(const std::pair& lhs, const std::pair& rhs) const { + return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); + } +} + +template +void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); + + // smallest heap, (val, idx) + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++){ + smallest_heap.emplace_back(data[i], i); + } + + // smallest priority_queue + std::priority_queue, std::vector>, ValGreaterComp> pq(ValGreaterComp(), std::move(smallest_heap)); + + // top k + for (int i = k ; i < n; i++){ + if (pq.top().first < data[i]){ + pq.pop(); + pq.emplace_back(data[i], i); + } + } + + values->resize(min_k_n); + indices->resize(min_k_n); + + // from largest to samllest + int cur = values->size() - 1; + while(!pq.empty()){ + const auto& item = pq.top(); + pq.pop(); + + (*values)[cur] = item.first; + (*indices)[cur] = item.second; + + cur--; + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h new file mode 
100644 index 00000000..452bf089 --- /dev/null +++ b/speechx/speechx/utils/math.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y); + +template +void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices); + +} // namespace ppspeech \ No newline at end of file From 75c578804d6738b40a644a1c38c18a40f0252eed Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 07:54:32 +0000 Subject: [PATCH 03/60] using FetchContent_Declare for paddleinference --- speechx/cmake/openblas.cmake | 4 ++-- speechx/cmake/paddleinference.cmake | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/speechx/cmake/openblas.cmake b/speechx/cmake/openblas.cmake index a8c3dd2d..27e13207 100644 --- a/speechx/cmake/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) -set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) +set(OpenBLAS_SOURCE_DIR ${fc_patch}/openblas-src) +set(OpenBLAS_PREFIX ${fc_patch}/openblas-prefix) # ###################################################################################################################### # OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 diff --git 
a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index 957e423c..311804d6 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -1,6 +1,18 @@ set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle +# ExternalProject_Add(paddle +# URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz +# URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 +# PREFIX ${paddle_PREFIX_DIR} +# SOURCE_DIR ${paddle_SOURCE_DIR} +# CONFIGURE_COMMAND "" +# BUILD_COMMAND "" +# INSTALL_COMMAND "" +# ) + +include(FetchContent) +FetchContent_Declare( + paddle URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 PREFIX ${paddle_PREFIX_DIR} @@ -9,10 +21,11 @@ ExternalProject_Add(paddle BUILD_COMMAND "" INSTALL_COMMAND "" ) +FetchContent_MakeAvailable(paddle) + +set(PADDLE_LIB_THIRD_PARTY_PATH "${paddle_SOURCE_DIR}/third_party/install/") -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${paddle_SOURCE_DIR}/paddle/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") @@ -20,7 +33,7 @@ include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") +link_directories("${paddle_SOURCE_DIR}/paddle/lib") 
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") @@ -37,7 +50,7 @@ set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) set(EXTERNAL_LIB "-lrt -ldl -lpthread") # global vars -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") +set(DEPS ${paddle_SOURCE_DIR}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags protobuf xxhash cryptopp From e1fc57deb1454c926c8925fba040ada210183168 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 09:29:37 +0000 Subject: [PATCH 04/60] add math and rename ds2 nnet --- speechx/speechx/base/common.h | 5 +++ .../ctc_prefix_beam_search_decoder_main.cc | 2 +- speechx/speechx/decoder/recognizer.h | 2 +- speechx/speechx/decoder/tlg_decoder_main.cc | 2 +- speechx/speechx/nnet/CMakeLists.txt | 8 ++-- .../nnet/{paddle_nnet.cc => ds2_nnet.cc} | 2 +- .../nnet/{paddle_nnet.h => ds2_nnet.h} | 0 ...{nnet_forward_main.cc => ds2_nnet_main.cc} | 2 +- speechx/speechx/utils/math.cc | 37 ++++++++++++------- speechx/speechx/utils/math.h | 9 +++-- 10 files changed, 42 insertions(+), 27 deletions(-) rename speechx/speechx/nnet/{paddle_nnet.cc => ds2_nnet.cc} (99%) rename speechx/speechx/nnet/{paddle_nnet.h => ds2_nnet.h} (100%) rename speechx/speechx/nnet/{nnet_forward_main.cc => ds2_nnet_main.cc} (99%) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 778c06d7..dfb14885 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -14,19 +14,24 @@ #pragma once +#include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc 
index 7cfee06c..e4e5c2af 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 35e1e167..e47ca433 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -20,7 +20,7 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc index b175ed13..93f84da3 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/tlg_decoder_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index c325ce75..565bba3e 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -2,13 +2,11 @@ project(nnet) add_library(nnet STATIC decodable.cc - paddle_nnet.cc + ds2_nnet.cc ) target_link_libraries(nnet absl::strings) -set(bin_name nnet_forward_main) +set(bin_name ds2_nnet_main) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} 
${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) - - +target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) \ No newline at end of file diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc similarity index 99% rename from speechx/speechx/nnet/paddle_nnet.cc rename to speechx/speechx/nnet/ds2_nnet.cc index 881a82f5..a89c0f20 100644 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" #include "absl/strings/str_split.h" namespace ppspeech { diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/ds2_nnet.h similarity index 100% rename from speechx/speechx/nnet/paddle_nnet.h rename to speechx/speechx/nnet/ds2_nnet.h diff --git a/speechx/speechx/nnet/nnet_forward_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc similarity index 99% rename from speechx/speechx/nnet/nnet_forward_main.cc rename to speechx/speechx/nnet/ds2_nnet_main.cc index 0d4ea8ff..e2904208 100644 --- a/speechx/speechx/nnet/nnet_forward_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "nnet/ds2_nnet.h" #include "base/flags.h" #include "base/log.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index fe5c7118..7c319295 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -1,4 +1,5 @@ +// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,10 +18,10 @@ #include "base/common.h" -#include #include -#include +#include #include +#include namespace ppspeech { @@ -36,28 +37,36 @@ float LogSumExp(float x, float y) { // greater compare for smallest priority_queue template struct ValGreaterComp { - bool operator()(const std::pair& lhs, const std::pair& rhs) const { - return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); + bool operator()(const std::pair& lhs, + const std::pair& rhs) const { + return lhs.first > rhs.first || + (lhs.first == rhs.first && lhs.second < rhs.second); } } -template -void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices) { - int n = data.size(); - int min_k_n = std::min(k, n); +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); // smallest heap, (val, idx) - std::vector> smallest_heap; - for (int i = 0; i < min_k_n; i++){ + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++) { smallest_heap.emplace_back(data[i], i); } // smallest priority_queue - std::priority_queue, std::vector>, ValGreaterComp> pq(ValGreaterComp(), std::move(smallest_heap)); + 
std::priority_queue, + std::vector>, + ValGreaterComp> + pq(ValGreaterComp(), std::move(smallest_heap)); // top k - for (int i = k ; i < n; i++){ - if (pq.top().first < data[i]){ + for (int i = k; i < n; i++) { + if (pq.top().first < data[i]) { pq.pop(); pq.emplace_back(data[i], i); } @@ -68,7 +77,7 @@ void TopK(const std::vector& data, int32_t k, std::vector* values, std::ve // from largest to samllest int cur = values->size() - 1; - while(!pq.empty()){ + while (!pq.empty()) { const auto& item = pq.top(); pq.pop(); diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h index 452bf089..7c863b00 100644 --- a/speechx/speechx/utils/math.h +++ b/speechx/speechx/utils/math.h @@ -14,15 +14,18 @@ #pragma once -#include #include +#include namespace ppspeech { // Sum in log scale float LogSumExp(float x, float y); -template -void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices); +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices); } // namespace ppspeech \ No newline at end of file From 290c23b9d72b82785aba0e3fe010e461adba9888 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 10 Oct 2022 08:39:51 +0000 Subject: [PATCH 05/60] add u2 nnet, u2 nnet main, codelab, and can compile --- examples/wenetspeech/asr1/local/test_wav.sh | 1 + paddlespeech/s2t/exps/u2/bin/test_wav.py | 9 + speechx/.clang-format | 29 + speechx/CMakeLists.txt | 41 +- speechx/README.md | 12 +- speechx/cmake/gflags.cmake | 5 +- speechx/cmake/gtest.cmake | 4 +- speechx/examples/codelab/feat/.gitignore | 2 + speechx/examples/codelab/feat/path.sh | 4 +- speechx/examples/codelab/feat/run.sh | 6 + speechx/examples/codelab/nnet/path.sh | 2 +- speechx/examples/codelab/u2nnet/.gitignore | 3 + speechx/examples/codelab/u2nnet/README.md | 3 + speechx/examples/codelab/u2nnet/path.sh | 19 + speechx/examples/codelab/u2nnet/run.sh | 59 ++ speechx/examples/codelab/u2nnet/valgrind.sh | 21 + 
speechx/examples/u2pp_ol/README.md | 5 + speechx/speechx/base/common.h | 1 + .../codelab/glog/glog_logtostderr_main.cc | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 3 + speechx/speechx/decoder/CMakeLists.txt | 1 - .../ctc_prefix_beam_search_decoder_main.cc | 3 + .../decoder/nnet_logprob_decoder_main.cc | 3 + speechx/speechx/decoder/recognizer_main.cc | 3 + speechx/speechx/decoder/tlg_decoder_main.cc | 3 + speechx/speechx/frontend/audio/CMakeLists.txt | 2 - .../frontend/audio/cmvn_json2kaldi_main.cc | 3 + .../frontend/audio/compute_fbank_main.cc | 67 +- .../audio/compute_linear_spectrogram_main.cc | 3 + speechx/speechx/model/CMakeLists.txt | 0 speechx/speechx/nnet/CMakeLists.txt | 46 +- speechx/speechx/nnet/decodable.cc | 32 +- speechx/speechx/nnet/decodable.h | 21 +- speechx/speechx/nnet/ds2_nnet_main.cc | 8 +- speechx/speechx/nnet/u2_nnet.cc | 706 ++++++++++++++++++ speechx/speechx/nnet/u2_nnet.h | 157 ++++ speechx/speechx/nnet/u2_nnet_main.cc | 180 +++++ speechx/speechx/protocol/CMakeLists.txt | 2 - .../speechx/protocol/websocket/CMakeLists.txt | 2 +- speechx/speechx/utils/CMakeLists.txt | 1 + speechx/speechx/utils/math.cc | 4 +- speechx/tools/venv.sh | 5 + 42 files changed, 1425 insertions(+), 58 deletions(-) create mode 100644 speechx/.clang-format create mode 100644 speechx/examples/codelab/feat/.gitignore create mode 100644 speechx/examples/codelab/u2nnet/.gitignore create mode 100644 speechx/examples/codelab/u2nnet/README.md create mode 100644 speechx/examples/codelab/u2nnet/path.sh create mode 100755 speechx/examples/codelab/u2nnet/run.sh create mode 100755 speechx/examples/codelab/u2nnet/valgrind.sh create mode 100644 speechx/examples/u2pp_ol/README.md delete mode 100644 speechx/speechx/model/CMakeLists.txt create mode 100644 speechx/speechx/nnet/u2_nnet.cc create mode 100644 speechx/speechx/nnet/u2_nnet.h create mode 100644 speechx/speechx/nnet/u2_nnet_main.cc create mode 100755 speechx/tools/venv.sh diff --git 
a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 47464262..c3a17f49 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -42,6 +42,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ + --debug True \ --ngpu ${ngpu} \ --config ${config_path} \ --decode_cfg ${decode_config_path} \ diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 2e067ab6..67ef2e53 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -16,6 +16,8 @@ import os import sys from pathlib import Path +import distutils +import numpy as np import paddle import soundfile from yacs.config import CfgNode @@ -74,6 +76,8 @@ class U2Infer(): # fbank feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + if self.args.debug: + np.savetxt("feat.transform.txt", feat) ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) @@ -125,6 +129,11 @@ if __name__ == "__main__": "--result_file", type=str, help="path of save the asr result") parser.add_argument( "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/speechx/.clang-format b/speechx/.clang-format new file mode 100644 index 00000000..af946a4a --- /dev/null +++ b/speechx/.clang-format @@ -0,0 +1,29 @@ +# This file is used by clang-format to autoformat paddle source code +# +# The clang-format is part of llvm toolchain. +# It need to install llvm and clang to format source code style. 
+# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. +# The -i means inplace change. +# +# The document of clang-format is +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 4 +TabWidth: 4 +ContinuationIndentWidth: 4 +MaxEmptyLinesToKeep: 2 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false +... + diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 8307d992..17e64c04 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -31,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall ############################################################################### # Option Configurations ############################################################################### -# option configurations option(TEST_DEBUG "option for debug" OFF) +option(USE_PROFILING "enable c++ profling" OFF) +option(USING_U2 "compile u2 model." ON) +option(USING_DS2 "compile with ds2 model." ON) + +option(USING_GPU "u2 compute on GPU." 
OFF) ############################################################################### # Include third party @@ -85,6 +89,41 @@ add_dependencies(openfst gflags glog) include(paddleinference) +# paddle core.so +find_package(Threads REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) +find_package(pybind11 CONFIG) + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") +message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") + +# paddle include and link option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')" + OUTPUT_VARIABLE PADDLE_LINK_FLAGS + RESULT_VARIABLE SUCESS) + +message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) +string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) + +# paddle compile option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_compile_flags()), end='')" + OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) +message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) +string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) + + +# for LD_LIBRARY_PATH +# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) +execute_process( + COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')" + OUTPUT_VARIABLE PADDLE_LIB_DIRS) +message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) + + ############################################################################### # Add local library ############################################################################### diff --git a/speechx/README.md b/speechx/README.md index cd1cd62c..cc7b13e6 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -3,11 +3,14 @@ ## Environment We 
develop under: +* python - 3.7 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 +> Please using `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. + > We make sure all things work fun under docker, and recommend using it to develop and deploy. * [How to Install Docker](https://docs.docker.com/engine/install/) @@ -24,13 +27,16 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). +2. Create python environment. -2. Build `speechx` and `examples`. +``` +bash tools/venv.sh +``` -> Do not source venv. +2. Build `speechx` and `examples`. ``` -pushd /path/to/speechx +source venv/bin/activate ./build.sh ``` diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake index 66ae47f7..36bebc87 100644 --- a/speechx/cmake/gflags.cmake +++ b/speechx/cmake/gflags.cmake @@ -2,10 +2,9 @@ include(FetchContent) FetchContent_Declare( gflags - URL https://github.com/gflags/gflags/archive/v2.2.1.zip - URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a + URL https://github.com/gflags/gflags/archive/v2.2.2.zip + URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 ) - FetchContent_MakeAvailable(gflags) # openfst need diff --git a/speechx/cmake/gtest.cmake b/speechx/cmake/gtest.cmake index 7fe397fc..1ea8ed0b 100644 --- a/speechx/cmake/gtest.cmake +++ b/speechx/cmake/gtest.cmake @@ -1,8 +1,8 @@ include(FetchContent) FetchContent_Declare( gtest - URL https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 + URL https://github.com/google/googletest/archive/release-1.11.0.zip + URL_HASH 
SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a ) FetchContent_MakeAvailable(gtest) diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/codelab/feat/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh index 3b89d01e..9d229174 100644 --- a/speechx/examples/codelab/feat/path.sh +++ b/speechx/examples/codelab/feat/path.sh @@ -1,12 +1,12 @@ # This contains the locations of binarys build required for running the examples. SPEECHX_ROOT=$PWD/../../../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 1fa37f98..66bd8ae2 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -54,4 +54,10 @@ compute_linear_spectrogram_main \ --cmvn_file=$exp_dir/cmvn.ark echo "compute linear spectrogram feature." +compute_fbank_main \ + --num_bins 161 \ + --wav_rspecifier=scp:$data_dir/wav.scp \ + --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute fbank feature." 
diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh index 7d395d64..11c8aef8 100644 --- a/speechx/examples/codelab/nnet/path.sh +++ b/speechx/examples/codelab/nnet/path.sh @@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore new file mode 100644 index 00000000..d6fe69bc --- /dev/null +++ b/speechx/examples/codelab/u2nnet/.gitignore @@ -0,0 +1,3 @@ +data +exp +*log diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md new file mode 100644 index 00000000..772a58f0 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/README.md @@ -0,0 +1,3 @@ +# Deepspeech2 Streaming NNet Test + +Using for ds2 streaming nnet inference test. diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2nnet/path.sh new file mode 100644 index 00000000..564e9fed --- /dev/null +++ b/speechx/examples/codelab/u2nnet/path.sh @@ -0,0 +1,19 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. 
please ensure that the project build successfully"; } + +export LC_AL=C + +SPEECHX_BIN=$SPEECHX_BUILD/nnet +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN + +PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh new file mode 100755 index 00000000..b309bc6f --- /dev/null +++ b/speechx/examples/codelab/u2nnet/run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false +echo "convert json cmvn to kaldi ark." + +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark +echo "compute fbank feature." 
+ +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_prob_wspecifier=ark,t:$exp/probs.ark diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh new file mode 100755 index 00000000..a5aab663 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/valgrind.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# this script is for memory check, so please run ./run.sh first. + +set +x +set -e + +. ./path.sh + +if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then + echo "please install valgrind in the speechx tools dir.\n" + exit 1 +fi + +ckpt_dir=./data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ + +valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ + ds2_model_test_main \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdparams diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md new file mode 100644 index 00000000..ce01a8fc --- /dev/null +++ b/speechx/examples/u2pp_ol/README.md @@ -0,0 +1,5 @@ +# U2/U2++ Streaming ASR + +## Examples + +* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing. diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index dfb14885..90fc96a1 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc index b0616a7d..c891827a 100644 --- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc +++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc @@ -17,7 +17,7 @@ int main(int argc, char* argv[]) { // Initialize Google’s logging library. 
google::InitGoogleLogging(argv[0]); - + google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; LOG(INFO) << "Found " << 10 << " cookies"; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 283466dc..7d99e857 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -195,8 +195,11 @@ void model_forward_test() { } int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; model_forward_test(); return 0; diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 0383c3ea..1df93511 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -18,7 +18,6 @@ set(BINS tlg_decoder_main ) -message(STATUS "xxxxxxxxxx: " ${DEPS}) foreach(bin_name IN LISTS BINS) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index e4e5c2af..445f470f 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -53,8 +53,11 @@ using std::vector; // test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; CHECK(FLAGS_result_wspecifier != ""); CHECK(FLAGS_feature_rspecifier != ""); diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc 
b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc index 0e249cc6..e0acbe77 100644 --- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc +++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc @@ -30,8 +30,11 @@ using std::vector; // test decoder by feeding nnet posterior probability int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader likelihood_reader( FLAGS_nnet_prob_respecifier); diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 23251353..05026646 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -23,8 +23,11 @@ DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); ppspeech::Recognizer recognizer(resource); diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc index 93f84da3..b633022a 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/tlg_decoder_main.cc @@ -55,8 +55,11 @@ using std::vector; // test TLG decoder by feeding speech feature. 
int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 8ae63256..050d78be 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -1,5 +1,3 @@ -project(frontend) - add_library(frontend STATIC cmvn.cc db_norm.cc diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 0def1466..93bad688 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)"); using namespace boost::json; // from int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; LOG(INFO) << "cmvn josn path: " << FLAGS_json_file; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index f7a42315..93a6d407 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -32,13 +32,21 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(num_bins, 161, "fbank num bins"); +DEFINE_int32(sample_rate, 16000, "sampe rate: 16k, 8k."); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); 
gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + CHECK(FLAGS_wav_rspecifier.size() > 0); + CHECK(FLAGS_feature_wspecifier.size() > 0); kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); + kaldi::SequentialTableReader wav_info_reader( + FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); int32 num_done = 0, num_err = 0; @@ -54,6 +62,10 @@ int main(int argc, char* argv[]) { opt.frame_opts.frame_shift_ms = 10; opt.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0; + LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms; + LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins; + LOG(INFO) << "dither: " << opt.frame_opts.dither; std::unique_ptr fbank( new ppspeech::Fbank(opt, std::move(data_source))); @@ -61,53 +73,73 @@ int main(int argc, char* argv[]) { std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank))); - ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. 
+ ppspeech::FeatureCacheOptions feat_cache_opts; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); - int sample_rate = 16000; + float streaming_chunk = FLAGS_streaming_chunk; - int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; - LOG(INFO) << "chunk size (s): " << streaming_chunk; + int chunk_sample_size = streaming_chunk * FLAGS_sample_rate; + LOG(INFO) << "sr: " << FLAGS_sample_rate; + LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); + for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) { + const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); - LOG(INFO) << "process utt: " << utt; + const std::string& utt2 = wav_info_reader.Key(); + const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); + + CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!"; + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "samples: " << wave_info.SampleCount(); + LOG(INFO) << "dur: " << wave_info.Duration() << " sec"; + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + + // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); + + // compute feat chunk by chunk int tot_samples = waveform.Dim(); - LOG(INFO) << "wav len (sample): " << tot_samples; - int sample_offset = 0; std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { + // cur chunk size int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + // get chunk wav kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = 
waveform(sample_offset + i); } - kaldi::Vector features; + // compute feat feature_cache.Accept(wav_chunk); + + // send finish signal if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } + + // read feat + kaldi::Vector features; bool flag = true; do { flag = feature_cache.Read(&features); - feats.push_back(features); - feature_rows += features.Dim() / feature_cache.Dim(); + if (flag && features.Dim() != 0) { + feats.push_back(features); + feature_rows += features.Dim() / feature_cache.Dim(); + } } while (flag == true && features.Dim() != 0); + + // forward offset sample_offset += cur_chunk_size; } @@ -125,14 +157,19 @@ int main(int argc, char* argv[]) { ++cur_idx; } } + LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols(); feat_writer.Write(utt, features); + + // reset frontend pipeline state feature_cache.Reset(); if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 
0 : 1); } diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 162c3529..889f5663 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -31,8 +31,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 565bba3e..2a1812fd 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -1,12 +1,40 @@ -project(nnet) +set(srcs decodable.cc) -add_library(nnet STATIC - decodable.cc - ds2_nnet.cc -) +if(USING_DS2) + list(APPEND srcs ds2_nnet.cc) +endif() + +if(USING_U2) + list(APPEND srcs u2_nnet.cc) +endif() + +add_library(nnet STATIC ${srcs}) target_link_libraries(nnet absl::strings) -set(bin_name ds2_nnet_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) \ No newline at end of file +if(USING_U2) + target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) + target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() + + +if(USING_DS2) + set(bin_name 
ds2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_link_libraries(${bin_name} ${DEPS}) +endif() + +# test bin +if(USING_U2) + set(bin_name u2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 465f64a9..7780e5ae 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -30,6 +30,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, frames_ready_(0), acoustic_scale_(acoustic_scale) {} +// for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { nnet_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); @@ -41,6 +42,7 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { // return the size of frame have computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } + // frame idx is from 0 to frame_ready_ -1; bool Decodable::IsLastFrame(int32 frame) { bool flag = EnsureFrameHaveComputed(frame); @@ -72,26 +74,38 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { + // no feat or frontend_ not init. 
return false; } - int32 nnet_dim = 0; - Vector inferences; - nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim); - nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim); - nnet_cache_.CopyRowsFromVec(inferences); + // forward feats + int32 vocab_dim = 0; + Vector probs; + nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + + // cache nnet outupts + nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_cache_.CopyRowsFromVec(probs); + + // update state frame_offset_ = frames_ready_; frames_ready_ += nnet_cache_.NumRows(); return true; } +// read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { - std::vector result; - if (EnsureFrameHaveComputed(frame) == false) return false; - likelihood->resize(nnet_cache_.NumCols()); - for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + int vocab_size = nnet_cache_.NumCols(); + likelihood->resize(vocab_size); + + for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; } diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 9555fea7..241d0419 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -27,35 +27,54 @@ class Decodable : public kaldi::DecodableInterface { explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale = 1.0); + // void Init(DecodableOpts config); + + // nnet logprob output virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); + virtual bool IsLastFrame(int32 frame); + + // nnet output dim, e.g. 
vocab size virtual int32 NumIndices() const; - // not logprob + + // nnet prob output virtual bool FrameLikelihood(int32 frame, std::vector* likelihood); + virtual int32 NumFramesReady() const; + // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); + void Reset(); + bool IsInputFinished() const { return frontend_->IsFinished(); } + bool EnsureFrameHaveComputed(int32 frame); + int32 TokenId2NnetId(int32 token_id); private: bool AdvanceChunk(); + std::shared_ptr frontend_; std::shared_ptr nnet_; + + // nnet outputs' cache kaldi::Matrix nnet_cache_; + // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame // eg: 35 frame features output 8 frame inferences int32 frame_offset_; int32 frames_ready_; + // todo: feature frame mismatch with nnet inference frame // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index e2904208..943d7e5f 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -13,8 +13,7 @@ // limitations under the License. 
#include "nnet/ds2_nnet.h" -#include "base/flags.h" -#include "base/log.h" +#include "base/common.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" @@ -49,8 +48,11 @@ using kaldi::Matrix; using std::vector; int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); @@ -146,7 +148,7 @@ int main(int argc, char* argv[]) { } kaldi::Matrix result(prob_vec.size(), prob_vec[0].Dim()); - for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { + for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) { result(row_idx, col_idx) = prob_vec[row_idx](col_idx); } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc new file mode 100644 index 00000000..67ef0952 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -0,0 +1,706 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "nnet/u2_nnet.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif // end USE_PROFILING + +namespace ppspeech { + +int U2NnetBase::num_frames_for_chunk(bool start) const { + int num_needed_frames = 0; // num feat frames + bool first = !start; // start == false is first + + if (chunk_size_ > 0) { + // streaming mode + if (first) { + // first chunk + // 1 decoder frame need `context` feat frames + int context = this->context(); + num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context; + } else { + // after first chunk, we need stride this num frames. + num_needed_frames = chunk_size_ * subsampling_rate_; + } + } else { + // non-streaming mode. feed all feats once. + num_needed_frames = std::numeric_limits::max(); + } + + return num_needed_frames; +} + +// cache feats for next chunk +void U2NnetBase::CacheFeature(const std::vector& chunk_feats, + int32 feat_dim) { + // chunk_feats is nframes*feat_dim + const int chunk_size = chunk_feats.size() / feat_dim; + const int cached_feat_size = this->context() - subsampling_rate_; + if (chunk_size >= cached_feat_size) { + cached_feats_.resize(cached_feat_size); + for (int i = 0; i < cached_feat_size; ++i) { + auto start = + chunk_feats.begin() + chunk_size - cached_feat_size + i; + auto end = start + feat_dim; + cached_feats_[i] = std::vector(start, end); + } + } +} + +void U2NnetBase::ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) { + ctc_probs->clear(); + // int num_frames = cached_feats_.size() + chunk_feats.size(); + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "foward encoder chunk: " << num_frames << " frames"; + VLOG(3) << "context: " << this->context() << " frames"; + + if (num_frames >= this->context()) { + this->ForwardEncoderChunkImpl( + chunk_feats, feat_dim, ctc_probs, vocab_dim); + VLOG(3) << 
"after forward chunk"; + this->CacheFeature(chunk_feats, feat_dim); + } +} + + +void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { + paddle::jit::utils::InitKernelSignatureMap(); + +#ifdef USE_GPU + dev_ = phi::GPUPlace(); +#else + dev_ = phi::CPUPlace(); +#endif + paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); + model_ = std::make_shared(std::move(model)); + + subsampling_rate_ = model_->Attribute("subsampling_rate"); + right_context_ = model_->Attribute("right_context"); + sos_ = model_->Attribute("sos_symbol"); + eos_ = model_->Attribute("eos_symbol"); + is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); + + forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); + forward_attention_decoder_ = model_->Function("forward_attention_decoder"); + ctc_activation_ = model_->Function("ctc_activation"); + CHECK(forward_encoder_chunk_.IsValid()); + CHECK(forward_attention_decoder_.IsValid()); + CHECK(ctc_activation_.IsValid()); + + LOG(INFO) << "Paddle Model Info: "; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; + + Warmup(); +} + +void U2Nnet::Warmup() { +#ifdef USE_PROFILING + RecordEvent event("warmup", TracerEventType::UserDefined, 1); +#endif + + { +#ifdef USE_PROFILING + RecordEvent event( + "warmup-encoder-ctc", TracerEventType::UserDefined, 1); +#endif + int feat_dim = 80; + int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + + // (receptive_field - downsample_rate) + paddle::Tensor feats = paddle::full( + {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); + paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); + paddle::Tensor att_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + paddle::Tensor cnn_cache = + paddle::zeros({0, 0, 0, 0}, 
paddle::DataType::FLOAT32); + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; + std::vector outputs = forward_encoder_chunk_(inputs); + + auto chunk_out = outputs[0]; + inputs = std::move(std::vector({chunk_out})); + outputs = ctc_activation_(inputs); + } + + { +#ifdef USE_PROFILING + RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); +#endif + auto hyps = + paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); + auto hyps_lens = + paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); + auto encoder_out = paddle::ones( + {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); + + std::vector inputs{ + hyps, hyps_lens, encoder_out}; + + std::vector outputs = + forward_attention_decoder_(inputs); + } + + Reset(); +} + +U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) { + LoadModel(opts_.model_path); +} + +// shallow copy +U2Nnet::U2Nnet(const U2Nnet& other) { + // copy meta + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidecoder_ = other.is_bidecoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + + forward_encoder_chunk_ = other.forward_encoder_chunk_; + forward_attention_decoder_ = other.forward_attention_decoder_; + ctc_activation_ = other.ctc_activation_; + + // offset_ = other.offset_; // TODO: not used in nnets + + // copy model ptr + model_ = other.model_; + + // ignore inner states +} + +std::shared_ptr U2Nnet::Copy() const { + auto asr_model = std::make_shared(*this); + // reset inner state for new decoding + asr_model->Reset(); + return asr_model; +} + +void U2Nnet::Reset() { + // offset_ = 0; + // cached_feats_.clear(); // TODO: not used in nnets + + att_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + cnn_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + + 
encoder_outs_.clear(); +} + +// Debug API +void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { + // encoder_out (T,D) + encoder_outs_.clear(); + encoder_outs_.push_back(encoder_out); +} + + +void U2Nnet::FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) { + std::vector chunk_feats(features.Data(), + features.Data() + features.Dim()); + std::vector ctc_probs; + ForwardEncoderChunkImpl( + chunk_feats, feature_dim, &ctc_probs, inference_dim); + inferences->Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(inferences->Data(), + ctc_probs.data(), + ctc_probs.size() * sizeof(kaldi::BaseFloat)); +} + + +void U2Nnet::ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* out_prob, + int32* vocab_dim) { +#ifdef USE_PROFILING + RecordEvent event( + "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); +#endif + + // 1. splice cached_feature, and chunk_feats + // First dimension is B, which is 1. 
+ // int num_frames = cached_feats_.size() + chunk_feats.size(); + + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "num_frames: " << num_frames; + VLOG(3) << "feat_dim: " << feat_dim; + + // feats (B=1,T,D) + paddle::Tensor feats = + paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); + float* feats_ptr = feats.mutable_data(); + + // for (size_t i = 0; i < cached_feats_.size(); ++i) { + // float* row = feats_ptr + i * feat_dim; + // std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float)); + // } + + // for (size_t i = 0; i < chunk_feats.size(); ++i) { + // float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim; + // std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float)); + // } + + // not cache feature in nnet + CHECK(cached_feats_.size() == 0); + // CHECK_EQ(std::is_same::value, true); + std::memcpy(feats_ptr, + chunk_feats.data(), + chunk_feats.size() * sizeof(kaldi::BaseFloat)); + + VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] + << ", " << feats.shape()[2]; + +#ifdef TEST_DEBUG + { + std::stringstream path("feat", std::ios_base::app | std::ios_base::out); + path << offset_; + std::ofstream feat_fobj(path.str().c_str(), std::ios::out); + CHECK(feat_fobj.is_open()); + // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " + // << feats.shape()[2] << "\n"; + for (int i = 0; i < feats.numel(); i++) { + feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; + if ((i + 1) % feat_dim == 0) { + feat_fobj << "\n"; + } + } + feat_fobj << "\n"; + } +#endif + +// Endocer chunk forward +#ifdef USE_GPU + feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); + att_cache_ = att_cache_.copy_to(paddle::GPUPlace()), /*blocking*/ false; + cnn_cache_ = cnn_cache_.copy_to(Paddle::GPUPlace(), /*blocking*/ false); +#endif + + int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 + // must be scalar, but paddle do not have scalar. 
+ paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); + // freeze `required_cache_size` in graph, so not specific it in function + // call. + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; + VLOG(3) << "inputs size: " << inputs.size(); + CHECK(inputs.size() == 4); + std::vector outputs = forward_encoder_chunk_(inputs); + VLOG(3) << "outputs size: " << outputs.size(); + CHECK(outputs.size() == 3); + +#ifdef USE_GPU + paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); + att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); + cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); +#else + paddle::Tensor chunk_out = outputs[0]; + att_cache_ = outputs[1]; + cnn_cache_ = outputs[2]; +#endif + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits", + std::ios_base::app | std::ios_base::out); + auto i = offset_ - chunk_out.shape()[1]; + path << std::max(i, 0L); + std::ofstream logits_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_fobj.is_open()); + logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] + << " " << chunk_out.shape()[2] << "\n"; + const float* chunk_out_ptr = chunk_out.data(); + logits_fobj << chunk_out_ptr << std::endl; + for (int i = 0; i < chunk_out.numel(); i++) { + logits_fobj << chunk_out_ptr[i] << " "; + } + logits_fobj << "\n"; + } +#endif // end TEST_DEBUG + + // current offset in decoder frame + // not used in nnet + offset_ += chunk_out.shape()[1]; + + // collects encoder outs. 
+ VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + encoder_outs_.push_back(chunk_out); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#ifdef USE_GPU + +#error "Not implementation." + +#else + // compute ctc_activation == log_softmax + inputs.clear(); + outputs.clear(); + inputs.push_back(chunk_out); + CHECK(inputs.size() == 1); + outputs = ctc_activation_(inputs); + CHECK(outputs.size() == 1); + paddle::Tensor ctc_log_probs = outputs[0]; + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logprob", + std::ios_base::app | std::ios_base::out); + path << offset_ - chunk_out.shape()[1]; + + std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(logprob_fobj.is_open()); + logprob_fobj << ctc_log_probs.shape()[0] << " " + << ctc_log_probs.shape()[1] << " " + << ctc_log_probs.shape()[2] << "\n"; + const float* logprob_ptr = ctc_log_probs.data(); + for (int i = 0; i < ctc_log_probs.numel(); i++) { + logprob_fobj << logprob_ptr[i] << " "; + if ((i + 1) % ctc_log_probs.shape()[2] == 0) { + logprob_fobj << "\n"; + } + } + logprob_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#endif // end USE_GPU + + // Copy to output, (B=1,T,D) + std::vector ctc_log_probs_shape = ctc_log_probs.shape(); + CHECK(ctc_log_probs_shape.size() == 3); + int B = ctc_log_probs_shape[0]; + CHECK(B == 1); + int T = ctc_log_probs_shape[1]; + 
int D = ctc_log_probs_shape[2]; + *vocab_dim = D; + + float* ctc_log_probs_ptr = ctc_log_probs.data(); + + // // vector> + // out_prob->resize(T); + // for (int i = 0; i < T; i++) { + // (*out_prob)[i].resize(D); + // float* dst_ptr = (*out_prob)[i].data(); + // float* src_ptr = ctc_log_probs_ptr + (i * D); + // std::memcpy(dst_ptr, src_ptr, D * sizeof(float)); + // } + // CHECK(std::is_same::value); + out_prob->resize(T * D); + std::memcpy( + out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list_ctc", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + + return; +} + +float U2Nnet::ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos) { + // sum `hyp` path scores in `prob` + // prob (1, Umax, V) + // hyp (U,) + float score = 0.0f; + std::vector dims = prob.shape(); + CHECK(dims.size() == 3); + VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; + CHECK(dims[0] == 1); + int vocab_dim = static_cast(dims[2]); + + const float* prob_ptr = prob.data(); + for (size_t i = 0; i < hyp.size(); ++i) { + const float* row = prob_ptr + i * vocab_dim; + score += row[hyp[i]]; + } + const float* row = prob_ptr + hyp.size() * vocab_dim; + score += row[eos]; + return score; +} + + +void U2Nnet::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { 
+#ifdef USE_PROFILING + RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); +#endif + + CHECK(rescoring_score != nullptr); + + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) return; + VLOG(2) << "num hyps: " << num_hyps; + + if (encoder_outs_.size() == 0) { + // no encoder outs + std::cerr << "encoder_outs_.size() is zero. Please check it." + << std::endl; + return; + } + + // prepare input + paddle::Tensor hyps_lens = + paddle::zeros({num_hyps}, paddle::DataType::INT64); + int64_t* hyps_len_ptr = hyps_lens.mutable_data(); + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int len = hyps[i].size() + 1; // eos + max_hyps_len = std::max(max_hyps_len, len); + hyps_len_ptr[i] = static_cast(len); + } + + paddle::Tensor hyps_tensor = + paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); + int64_t* hyps_ptr = hyps_tensor.mutable_data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + int64_t* row = hyps_ptr + max_hyps_len * i; + row[0] = sos_; + for (size_t j = 0; j < hyp.size(); ++j) { + row[j + 1] = hyp[j]; + } + } + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_concat", + std::ios_base::app | std::ios_base::out); + for (int j = 0; j < encoder_outs_.size(); j++) { + path << j; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[j].shape()[0] << " " + << encoder_outs_[j].shape()[1] << " " + << encoder_outs_[j].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[j].data(); + for (int i = 0; i < encoder_outs_[j].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } + } +#endif // end TEST_DEBUG + + // forward attention decoder by hyps and correspoinding encoder_outs_ + paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); + VLOG(2) << "encoder_outs_ size: " << 
encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out0", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_outs_[0].data(); + + size_t size = encoder_outs_[0].numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_out.shape()[0] << " " + << encoder_out.shape()[1] << " " + << encoder_out.shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_out.data(); + + size_t size = encoder_out.numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + std::vector inputs{ + hyps_tensor, hyps_lens, encoder_out}; + std::vector outputs = forward_attention_decoder_(inputs); + CHECK(outputs.size() == 2); + + // (B, Umax, V) + paddle::Tensor probs = outputs[0]; + std::vector probs_shape = probs.shape(); + CHECK(probs_shape.size() == 3); + CHECK(probs_shape[0] == num_hyps); + CHECK(probs_shape[1] == max_hyps_len); + +#ifdef TEST_DEBUG + { + std::stringstream path("decoder_logprob", + std::ios_base::app | std::ios_base::out); + std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(dec_logprob_fobj.is_open()); + + dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " + << probs.shape()[2] << "\n"; + const float* dec_logprob_ptr = probs.data(); + + size_t size = probs.numel(); + for (int i = 0; i < size; i++) { + dec_logprob_fobj << dec_logprob_ptr[i] 
<< "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_lens", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_len_fobj.is_open()); + + const int64_t* hyps_lens_ptr = hyps_lens.data(); + + size_t size = hyps_lens.numel(); + for (int i = 0; i < size; i++) { + hyps_len_fobj << hyps_lens_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_tensor", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_tensor_fobj.is_open()); + + const int64_t* hyps_tensor_ptr = hyps_tensor.data(); + + size_t size = hyps_tensor.numel(); + for (int i = 0; i < size; i++) { + hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + paddle::Tensor r_probs = outputs[1]; + std::vector r_probs_shape = r_probs.shape(); + if (is_bidecoder_ && reverse_weight > 0) { + CHECK(r_probs_shape.size() == 3); + CHECK(r_probs_shape[0] == num_hyps); + CHECK(r_probs_shape[1] == max_hyps_len); + } else { + // dump r_probs + CHECK(r_probs_shape.size() == 1); + CHECK(r_probs_shape[0] == 1) << r_probs_shape[0]; + } + + // compute rescoring score + using IntArray = paddle::experimental::IntArray; + std::vector probs_v = + paddle::experimental::split_with_num(probs, num_hyps, 0); + VLOG(2) << "split prob: " << probs_v.size() << " " + << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] + << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; + CHECK(static_cast(probs_v.size()) == num_hyps) + << ": is " << probs_v.size() << " expect: " << num_hyps; + + std::vector r_probs_v; + if (is_bidecoder_ && reverse_weight > 0) { + r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); + CHECK(static_cast(r_probs_v.size()) == num_hyps) + << "r_probs_v size: is " << r_probs_v.size() + << " expect: " << num_hyps; + } + + 
for (int i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + + // left-to-right decoder score + float score = 0.0f; + score = ComputePathScore(probs_v[i], hyp, eos_); + + // right-to-left decoder score + float r_score = 0.0f; + if (is_bidecoder_ && reverse_weight > 0) { + std::vector r_hyp(hyp.size()); + std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); + r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); + } + + // combinded left-to-right and right-to-lfet score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight; + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h new file mode 100644 index 00000000..ddc85b45 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.h @@ -0,0 +1,157 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/matrix/kaldi-matrix.h" + +#include "kaldi/util/options-itf.h" +#include "nnet/nnet_itf.h" + +#include "paddle/extension.h" +#include "paddle/jit/all.h" +#include "paddle/phi/api/all.h" + +namespace ppspeech { + +struct U2ModelOptions { + std::string model_path; + int thread_num; + bool use_gpu; + U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("model-path", &model_path, "model file path"); + opts->Register("thread-num", &thread_num, "thread num"); + opts->Register("use-gpu", &use_gpu, "if use gpu"); + } +}; + + +class U2NnetBase : public NnetInterface { + public: + virtual int context() const { return right_context_ + 1; } + virtual int right_context() const { return right_context_; } + virtual int subsampling_rate() const { return subsampling_rate_; } + virtual int eos() const { return eos_; } + virtual int sos() const { return sos_; } + virtual int is_bidecoder() const { return is_bidecoder_; } + // current offset in decoder frame + virtual int offset() const { return offset_; } + virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } + virtual void set_num_left_chunks(int num_left_chunks) { + num_left_chunks_ = num_left_chunks; + } + // start: false, it is the start chunk of one sentence, else true + virtual int num_frames_for_chunk(bool start) const; + + virtual std::shared_ptr Copy() const = 0; + + virtual void ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim); + + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + + protected: + virtual void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) = 0; + + virtual void CacheFeature(const std::vector& chunk_feats, + int32 feat_dim); + + protected: + 
// model specification + int right_context_{0}; + int subsampling_rate_{1}; + + int sos_{0}; + int eos_{0}; + + bool is_bidecoder_{false}; + + int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming + // case. Otherwise, none streaming case + int num_left_chunks_{-1}; // -1 means all left chunks + + // asr decoder state + int offset_{0}; // current offset in encoder output time stamp. Used by + // position embedding. + std::vector> cached_feats_{}; // features cache +}; + + +class U2Nnet : public U2NnetBase { + public: + U2Nnet(const U2ModelOptions& opts); + U2Nnet(const U2Nnet& other); + + void FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) override; + + void Reset() override; + + void Dim(); + + void LoadModel(const std::string& model_path_w_prefix); + void Warmup(); + + std::shared_ptr model() const { return model_; } + + std::shared_ptr Copy() const override; + + void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) override; + + float ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + + // debug + void FeedEncoderOuts(paddle::Tensor& encoder_out); + + private: + U2ModelOptions opts_; + + phi::Place dev_; + std::shared_ptr model_{nullptr}; + std::vector encoder_outs_; + // transformer/conformer attention cache + paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + // conformer-only conv_module cache + paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + + paddle::jit::Function forward_encoder_chunk_; + paddle::jit::Function forward_attention_decoder_; + paddle::jit::Function ctc_activation_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc 
b/speechx/speechx/nnet/u2_nnet_main.cc new file mode 100644 index 00000000..1a1a5e02 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "nnet/u2_nnet.h" +#include "base/common.h" +#include "frontend/audio/assembler.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); + +DEFINE_string(model_path, "", "paddle nnet model"); + +DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk"); +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=3) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=3) module downsampling rate."); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK(FLAGS_feature_rspecifier.size() > 0); + CHECK(FLAGS_nnet_prob_wspecifier.size() > 0); + CHECK(FLAGS_model_path.size() > 0); + LOG(INFO) << "input 
rspecifier: " << FLAGS_feature_rspecifier; + LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; + LOG(INFO) << "model path: " << FLAGS_model_path; + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + + ppspeech::U2ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + + int32 chunk_size = + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate + + FLAGS_receptive_field_length; + int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + kaldi::Timer timer; + + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + // // pad feats + // int32 padding_len = 0; + // if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { + // padding_len = + // chunk_stride - (feature.NumRows() - chunk_size) % + // chunk_stride; + // feature.Resize(feature.NumRows() + padding_len, + // feature.NumCols(), + // kaldi::kCopyData); + // } + + int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + int32 frame_idx = 0; + std::vector> prob_vec; + int32 ori_feature_len = feature.NumRows(); + + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + kaldi::Vector 
feature_chunk(chunk_size * + feat_dim); + + int32 feature_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + feature_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (feature_chunk_size < receptive_field_length) { + LOG(WARNING) << "utt: " << utt << " skip last " + << feature_chunk_size << " frames, expect is " + << receptive_field_length; + break; + } + + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < chunk_size; ++row_id) { + kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // get nnet outputs + vector prob; + while (decodable->FrameLikelihood(frame_idx, &prob)) { + kaldi::Vector vec_tmp(prob.size()); + std::memcpy(vec_tmp.Data(), + prob.data(), + sizeof(kaldi::BaseFloat) * prob.size()); + prob_vec.push_back(vec_tmp); + frame_idx++; + } + } + + // after process one utt, then reset decoder state. + decodable->Reset(); + + if (prob_vec.size() == 0) { + // the TokenWriter can not write empty string. 
+ ++num_err; + LOG(WARNING) << " the nnet prob of " << utt << " is empty"; + continue; + } + + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix result(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + result(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + LOG(INFO) << " cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt index 98b2f38b..71b33daa 100644 --- a/speechx/speechx/protocol/CMakeLists.txt +++ b/speechx/speechx/protocol/CMakeLists.txt @@ -1,3 +1 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - add_subdirectory(websocket) diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index c3454c39..0f73fd24 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,4 +1,4 @@ -project(websocket) +# project(websocket) add_library(websocket STATIC websocket_server.cc diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt index 95e86574..c1e875be 100644 --- a/speechx/speechx/utils/CMakeLists.txt +++ b/speechx/speechx/utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(utils file_utils.cc + math.cc ) \ No newline at end of file diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 7c319295..5087ac60 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -38,11 +38,11 @@ float LogSumExp(float x, float y) { template struct ValGreaterComp { bool 
operator()(const std::pair& lhs, - const std::pair& rhs) const { + const std::pair& rhs) const { return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); } -} +}; template void TopK(const std::vector& data, diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh new file mode 100755 index 00000000..3952988c --- /dev/null +++ b/speechx/tools/venv.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -ex + +PYTHON=python3.7 +test -d venv || virtualenv -p ${PYTHON} venv From cd1ced4ea0f9f85835a63b7afd2b47f8f14a963f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 06:43:07 +0000 Subject: [PATCH 06/60] add nnetout struct --- speechx/examples/ds2_ol/aishell/run.sh | 4 ++-- speechx/speechx/nnet/CMakeLists.txt | 1 - speechx/speechx/nnet/decodable.cc | 25 +++++++++++++------------ speechx/speechx/nnet/decodable.h | 2 +- speechx/speechx/nnet/ds2_nnet.cc | 15 +++++++++------ speechx/speechx/nnet/ds2_nnet.h | 5 ++--- speechx/speechx/nnet/nnet_itf.h | 17 ++++++++++++++--- speechx/speechx/nnet/u2_nnet.cc | 18 ++++++++++-------- speechx/speechx/nnet/u2_nnet.h | 15 ++++++++------- 9 files changed, 59 insertions(+), 43 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 82e889ce..a29be17b 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -set +x +set -x set -e . path.sh @@ -11,7 +11,7 @@ stop_stage=100 . utils/parse_options.sh # 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then +if [ ! 
-d ${SPEECHX_BUILD} ]; then pushd ${SPEECHX_ROOT} bash build.sh popd diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 2a1812fd..43566616 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings) if(USING_U2) target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 7780e5ae..40fac182 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, // for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { - nnet_cache_ = likelihood; + nnet_out_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); } @@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_cache_.NumCols()); + CHECK_LE(index, nnet_out_cache_.NumCols()); CHECK_LE(frame, frames_ready_); int32 frame_idx = frame - frame_offset_; // the nnet output is prob ranther than log prob // the index - 1, because the ilabel return acoustic_scale_ * - std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + + std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) + std::numeric_limits::min()); } @@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() { } // forward feats - int32 vocab_dim = 0; - Vector probs; - nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + NnetOut out; + nnet_->FeedForward(features, frontend_->Dim(), &out); + int32& vocab_dim = out.vocab_dim; + Vector& probs = out.logprobs; // cache nnet outupts - 
nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); - nnet_cache_.CopyRowsFromVec(probs); + nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(probs); // update state frame_offset_ = frames_ready_; - frames_ready_ += nnet_cache_.NumRows(); + frames_ready_ += nnet_out_cache_.NumRows(); return true; } @@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return false; } - int vocab_size = nnet_cache_.NumCols(); + int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = - nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; + nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; } return true; } @@ -117,7 +118,7 @@ void Decodable::Reset() { if (nnet_ != nullptr) nnet_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_cache_.Resize(0, 0); + nnet_out_cache_.Resize(0, 0); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 241d0419..8786e4f2 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr nnet_; // nnet outputs' cache - kaldi::Matrix nnet_cache_; + kaldi::Matrix nnet_out_cache_; // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index a89c0f20..c6add03c 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -143,9 +143,8 @@ shared_ptr> PaddleNnet::GetCacheEncoder(const string& name) { } void PaddleNnet::FeedForward(const Vector& features, - int32 feature_dim, - Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { paddle_infer::Predictor* predictor = GetPredictor(); int 
feat_row = features.Dim() / feature_dim; @@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector& features, std::vector output_shape = output_tensor->shape(); int32 row = output_shape[1]; int32 col = output_shape[2]; - inferences->Resize(row * col); - *inference_dim = col; - output_tensor->CopyToCpu(inferences->Data()); + + + // inferences->Resize(row * col); + // *inference_dim = col; + out->logprobs.Resize(row*col); + out->vocab_dim = col; + output_tensor->CopyToCpu(out->logprobs.Data()); ReleasePredictor(predictor); } diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index e2b3d5bc..717bdb72 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface { PaddleNnet(const ModelOptions& opts); virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim); + const int32& feature_dim, + NnetOut* out); void Dim(); virtual void Reset(); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index ac040fba..12fe3c27 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -21,12 +21,23 @@ namespace ppspeech { +struct NnetOut{ + // nnet out, maybe logprob or prob + kaldi::Vector logprobs; + int32 vocab_dim; + + // nnet state. Only using in Attention model. 
+ std::vector> encoder_outs; + + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} +}; + + class NnetInterface { public: virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) = 0; + const int32& feature_dim, + NnetOut* out) = 0; virtual void Reset() = 0; virtual ~NnetInterface() {} }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 67ef0952..26d7da8f 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector& chunk_feats, void U2NnetBase::ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) { ctc_probs->clear(); @@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { void U2Nnet::FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { std::vector chunk_feats(features.Data(), features.Data() + features.Dim()); + std::vector ctc_probs; ForwardEncoderChunkImpl( - chunk_feats, feature_dim, &ctc_probs, inference_dim); - inferences->Resize(ctc_probs.size(), kaldi::kSetZero); - std::memcpy(inferences->Data(), + chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim); + + out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(out->logprobs.Data(), ctc_probs.data(), ctc_probs.size() * sizeof(kaldi::BaseFloat)); } @@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector& features, void U2Nnet::ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { + #ifdef USE_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 
ddc85b45..87442959 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface { virtual void ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim); @@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface { protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) = 0; @@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface { // case. Otherwise, none streaming case int num_left_chunks_{-1}; // -1 means all left chunks - // asr decoder state + // asr decoder state, not used in nnet int offset_{0}; // current offset in encoder output time stamp. Used by // position embedding. std::vector> cached_feats_{}; // features cache @@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase { U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) override; + const int32& feature_dim, + NnetOut* out) override; void Reset() override; @@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase { void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) override; @@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); + const std::vector& EncoderOuts() const {return encoder_outs_; } + private: U2ModelOptions opts_; From a75abc1828e46e27ed368b61a6ee4ab7639eaec7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 07:51:40 +0000 Subject: [PATCH 07/60] fix u2 nnet out frames num --- speechx/.gitignore | 1 + speechx/speechx/nnet/nnet_itf.h | 14 +++++++------- speechx/speechx/nnet/u2_nnet.cc | 6 +++--- speechx/speechx/nnet/u2_nnet.h | 4 +++- speechx/speechx/nnet/u2_nnet_main.cc 
| 18 +++++++++--------- speechx/tools/clang-format.sh | 3 +++ 6 files changed, 26 insertions(+), 20 deletions(-) create mode 100755 speechx/tools/clang-format.sh diff --git a/speechx/.gitignore b/speechx/.gitignore index e0c61847..9a93805c 100644 --- a/speechx/.gitignore +++ b/speechx/.gitignore @@ -1 +1,2 @@ tools/valgrind* +*log diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 12fe3c27..b98f5ebd 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -21,15 +21,15 @@ namespace ppspeech { -struct NnetOut{ - // nnet out, maybe logprob or prob - kaldi::Vector logprobs; - int32 vocab_dim; +struct NnetOut { + // nnet out, maybe logprob or prob + kaldi::Vector logprobs; + int32 vocab_dim; - // nnet state. Only using in Attention model. - std::vector> encoder_outs; + // nnet state. Only using in Attention model. + std::vector> encoder_outs; - NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 26d7da8f..ddb815d2 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -313,10 +313,8 @@ void U2Nnet::ForwardEncoderChunkImpl( // call. std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; - VLOG(3) << "inputs size: " << inputs.size(); CHECK(inputs.size() == 4); std::vector outputs = forward_encoder_chunk_(inputs); - VLOG(3) << "outputs size: " << outputs.size(); CHECK(outputs.size() == 3); #ifdef USE_GPU @@ -351,10 +349,12 @@ void U2Nnet::ForwardEncoderChunkImpl( // current offset in decoder frame // not used in nnet offset_ += chunk_out.shape()[1]; + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] << " total: " << offset_ ; + // collects encoder outs. 
- VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); encoder_outs_.push_back(chunk_out); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); #ifdef TEST_DEBUG { diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 87442959..775a078a 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -137,7 +137,9 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); - const std::vector& EncoderOuts() const {return encoder_outs_; } + const std::vector& EncoderOuts() const { + return encoder_outs_; + } private: U2ModelOptions opts_; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 1a1a5e02..b602ac4d 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -95,29 +95,29 @@ int main(int argc, char* argv[]) { // kaldi::kCopyData); // } - int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; int32 frame_idx = 0; std::vector> prob_vec; int32 ori_feature_len = feature.NumRows(); + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { - kaldi::Vector feature_chunk(chunk_size * - feat_dim); - - int32 feature_chunk_size = 0; + int32 this_chunk_size = 0; if (ori_feature_len > chunk_idx * chunk_stride) { - feature_chunk_size = std::min( + this_chunk_size = std::min( ori_feature_len - chunk_idx * chunk_stride, chunk_size); } - if (feature_chunk_size < receptive_field_length) { + if (this_chunk_size < receptive_field_length) { LOG(WARNING) << "utt: " << utt << " skip last " - << feature_chunk_size << " frames, expect is " + << this_chunk_size << " frames, expect is " << receptive_field_length; break; } + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); int32 start = chunk_idx * chunk_stride; - for (int row_id = 0; row_id < chunk_size; ++row_id) { + for 
(int row_id = 0; row_id < this_chunk_size; ++row_id) { kaldi::SubVector feat_row(feature, start); kaldi::SubVector feature_chunk_row( feature_chunk.Data() + row_id * feat_dim, feat_dim); diff --git a/speechx/tools/clang-format.sh b/speechx/tools/clang-format.sh new file mode 100755 index 00000000..30f636ff --- /dev/null +++ b/speechx/tools/clang-format.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +find speechx -name '*.c' -o -name '*.h' -not -path "*kaldi*" | xargs -I{} clang-format -i {} From 5cc874e1c3e6015e2c73fc9ca098a650aa4ef730 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 09:51:07 +0000 Subject: [PATCH 08/60] u2 nnet get encoder out and align with py --- speechx/examples/codelab/u2nnet/run.sh | 7 +++- speechx/speechx/nnet/decodable.h | 2 + speechx/speechx/nnet/ds2_nnet.h | 14 +++++-- speechx/speechx/nnet/nnet_itf.h | 14 ++++++- speechx/speechx/nnet/u2_nnet.cc | 26 +++++++++++++ speechx/speechx/nnet/u2_nnet.h | 5 +-- speechx/speechx/nnet/u2_nnet_main.cc | 51 +++++++++++++++++++------- 7 files changed, 96 insertions(+), 23 deletions(-) diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh index b309bc6f..704653e7 100755 --- a/speechx/examples/codelab/u2nnet/run.sh +++ b/speechx/examples/codelab/u2nnet/run.sh @@ -40,6 +40,7 @@ cmvn_json2kaldi_main \ --json_file $model_dir/mean_std.json \ --cmvn_write_path $exp/cmvn.ark \ --binary=false + echo "convert json cmvn to kaldi ark." compute_fbank_main \ @@ -47,6 +48,7 @@ compute_fbank_main \ --wav_rspecifier=scp:$data/wav.scp \ --cmvn_file=$exp/cmvn.ark \ --feature_wspecifier=ark,t:$exp/fbank.ark + echo "compute fbank feature." u2_nnet_main \ @@ -56,4 +58,7 @@ u2_nnet_main \ --receptive_field_length=7 \ --downsampling_rate=4 \ --acoustic_scale=1.0 \ - --nnet_prob_wspecifier=ark,t:$exp/probs.ark + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark + +echo "u2 nnet decode." 
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 8786e4f2..39b38dc1 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -55,6 +55,8 @@ class Decodable : public kaldi::DecodableInterface { int32 TokenId2NnetId(int32 token_id); + std::shared_ptr Nnet() { return nnet_; } + private: bool AdvanceChunk(); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 717bdb72..80be6927 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -96,16 +96,22 @@ class PaddleNnet : public NnetInterface { public: PaddleNnet(const ModelOptions& opts); - virtual void FeedForward(const kaldi::Vector& features, - const int32& feature_dim, - NnetOut* out); + void FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) override; void Dim(); - virtual void Reset(); + + void Reset() override; + std::shared_ptr> GetCacheEncoder( const std::string& name); + void InitCacheEncouts(const ModelOptions& opts); + void EncoderOuts(std::vector>* encoder_out) + const override {} + private: paddle_infer::Predictor* GetPredictor(); int ReleasePredictor(paddle_infer::Predictor* predictor); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index b98f5ebd..5dde72a8 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -22,7 +22,7 @@ namespace ppspeech { struct NnetOut { - // nnet out, maybe logprob or prob + // nnet out. maybe logprob or prob. Almost time this is logprob. kaldi::Vector logprobs; int32 vocab_dim; @@ -35,11 +35,21 @@ struct NnetOut { class NnetInterface { public: + virtual ~NnetInterface() {} + + // forward feat with nnet. + // nnet do not cache feats, feats cached by frontend. + // nnet cache model outputs, i.e. logprobs/encoder_outs. virtual void FeedForward(const kaldi::Vector& features, const int32& feature_dim, NnetOut* out) = 0; + + // reset nnet state, e.g. 
nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; - virtual ~NnetInterface() {} + + // using to get encoder outs. e.g. seq2seq with Attention model. + virtual void EncoderOuts( + std::vector>* encoder_out) const = 0; }; } // namespace ppspeech diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index ddb815d2..74f8cf78 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -705,4 +705,30 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, } } + +void U2Nnet::EncoderOuts(std::vector>* encoder_out) const { + // list of (B=1,T,D) + int size = encoder_outs_.size(); + VLOG(1) << "encoder_outs_ size: " << size; + + for (int i = 0; i < size; i++){ + const paddle::Tensor& item = encoder_outs_[i]; + const std::vector shape = item.shape(); + CHECK(shape.size() == 3); + const int& B = shape[0]; + const int& T = shape[1]; + const int& D = shape[2]; + CHECK(B == 1) << "Only support batch one."; + VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; + + const float *this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++){ + const float* cur = this_tensor_ptr + j * D; + kaldi::Vector out(D); + std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); + encoder_out->emplace_back(out); + } + } + } + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 775a078a..8ce45f43 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -137,9 +137,8 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); - const std::vector& EncoderOuts() const { - return encoder_outs_; - } + void EncoderOuts( + std::vector>* encoder_out) const; private: U2ModelOptions opts_; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index b602ac4d..fb9fec23 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc 
+++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -21,6 +21,7 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); +DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier"); DEFINE_string(model_path, "", "paddle nnet model"); @@ -52,9 +53,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; - kaldi::SequentialBaseFloatMatrixReader feature_reader( - FLAGS_feature_rspecifier); + + kaldi::SequentialBaseFloatMatrixReader feature_reader(FLAGS_feature_rspecifier); kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); ppspeech::U2ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; @@ -97,6 +99,7 @@ int main(int argc, char* argv[]) { int32 frame_idx = 0; std::vector> prob_vec; + std::vector> encoder_out_vec; int32 ori_feature_len = feature.NumRows(); int32 num_chunks = feature.NumRows() / chunk_stride + 1; LOG(INFO) << "num_chunks: " << num_chunks; @@ -144,29 +147,51 @@ int main(int argc, char* argv[]) { prob_vec.push_back(vec_tmp); frame_idx++; } + + } + // get encoder out + decodable->Nnet()->EncoderOuts(&encoder_out_vec); + // after process one utt, then reset decoder state. decodable->Reset(); - if (prob_vec.size() == 0) { + if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) { // the TokenWriter can not write empty string. 
++num_err; - LOG(WARNING) << " the nnet prob of " << utt << " is empty"; + LOG(WARNING) << " the nnet prob/encoder_out of " << utt << " is empty"; continue; } - // writer nnet output - kaldi::MatrixIndexT nrow = prob_vec.size(); - kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); - LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; - kaldi::Matrix result(nrow, ncol); - for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { - for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { - result(row_idx, col_idx) = prob_vec[row_idx](col_idx); + { + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix nnet_out(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + nnet_out(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, nnet_out); + } + + + { + // writer nnet encoder outs + kaldi::MatrixIndexT nrow = encoder_out_vec.size(); + kaldi::MatrixIndexT ncol = encoder_out_vec[0].Dim(); + LOG(INFO) << "nnet encoder outs shape: " << nrow << ", " << ncol; + kaldi::Matrix encoder_outs(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + encoder_outs(row_idx, col_idx) = encoder_out_vec[row_idx](col_idx); + } } + nnet_encoder_outs_writer.Write(utt, encoder_outs); } - nnet_out_writer.Write(utt, result); ++num_done; } From 6987751ff82415d3ff211c1624c315520d88aba2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 05:59:37 +0000 Subject: [PATCH 09/60] fix LogLikelihood and add AdvanceChunk --- speechx/speechx/base/common.h | 1 + .../frontend/audio/cmvn_json2kaldi_main.cc | 8 +-- speechx/speechx/kaldi/decoder/decodable-itf.h | 11 ++-- speechx/speechx/nnet/decodable.cc | 64 +++++++++++++++---- speechx/speechx/nnet/decodable.h | 6 +- speechx/speechx/nnet/ds2_nnet.h | 2 + 
speechx/speechx/nnet/nnet_itf.h | 6 +- speechx/speechx/nnet/u2_nnet.h | 2 + speechx/speechx/nnet/u2_nnet_main.cc | 20 +++--- 9 files changed, 87 insertions(+), 33 deletions(-) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 90fc96a1..70b11b69 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 93bad688..713c9ef1 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -47,13 +47,13 @@ int main(int argc, char* argv[]) { for (auto obj : value.as_object()) { if (obj.key() == "mean_stat") { - LOG(INFO) << "mean_stat:" << obj.value(); + VLOG(2) << "mean_stat:" << obj.value(); } if (obj.key() == "var_stat") { - LOG(INFO) << "var_stat: " << obj.value(); + VLOG(2) << "var_stat: " << obj.value(); } if (obj.key() == "frame_num") { - LOG(INFO) << "frame_num: " << obj.value(); + VLOG(2) << "frame_num: " << obj.value(); } } @@ -79,7 +79,7 @@ int main(int argc, char* argv[]) { cmvn_stats(1, idx) = var_stat_vec[idx]; } cmvn_stats(0, mean_size) = frame_num; - LOG(INFO) << cmvn_stats; + VLOG(2) << cmvn_stats; kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary); LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path; diff --git a/speechx/speechx/kaldi/decoder/decodable-itf.h b/speechx/speechx/kaldi/decoder/decodable-itf.h index b8ce9143..a7c12588 100644 --- a/speechx/speechx/kaldi/decoder/decodable-itf.h +++ b/speechx/speechx/kaldi/decoder/decodable-itf.h @@ -101,7 +101,9 @@ namespace kaldi { */ class DecodableInterface { public: - /// Returns the log likelihood, which will be negated in the decoder. 
+ virtual ~DecodableInterface() {} + + /// Returns the log likelihood(logprob), which will be negated in the decoder. /// The "frame" starts from zero. You should verify that NumFramesReady() > /// frame /// before calling this. @@ -143,11 +145,12 @@ class DecodableInterface { /// this is for compatibility with OpenFst). virtual int32 NumIndices() const = 0; + /// Returns the likelihood(prob), which will be postive in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. virtual bool FrameLikelihood( int32 frame, std::vector* likelihood) = 0; - - - virtual ~DecodableInterface() {} }; /// @} } // namespace Kaldi diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 40fac182..1483949b 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -55,18 +55,10 @@ int32 Decodable::NumIndices() const { return 0; } // id. int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } -BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_out_cache_.NumCols()); - CHECK_LE(frame, frames_ready_); - int32 frame_idx = frame - frame_offset_; - // the nnet output is prob ranther than log prob - // the index - 1, because the ilabel - return acoustic_scale_ * - std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) + - std::numeric_limits::min()); -} + bool Decodable::EnsureFrameHaveComputed(int32 frame) { + // decoding frame if (frame >= frames_ready_) { return AdvanceChunk(); } @@ -74,26 +66,48 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + kaldi::Timer timer; // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. 
return false; } + VLOG(2) << "Forward with " << features.Dim() << " frames."; // forward feats NnetOut out; nnet_->FeedForward(features, frontend_->Dim(), &out); int32& vocab_dim = out.vocab_dim; - Vector& probs = out.logprobs; + Vector& logprobs = out.logprobs; // cache nnet outupts - nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); - nnet_out_cache_.CopyRowsFromVec(probs); + nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(logprobs); - // update state + // update state, decoding frame. frame_offset_ = frames_ready_; frames_ready_ += nnet_out_cache_.NumRows(); + VLOG(2) << "Forward feat chunk cost: " << timer.Elapsed() << " sec."; + return true; +} + +bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim) { + if (AdvanceChunk() == false) { + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0){ + LOG(WARNING) << "No new nnet out in cache."; + return false; + } + + logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); + logprobs->CopyRowsFromMat(nnet_out_cache_); + + *vocab_dim = nnet_out_cache_.NumCols(); return true; } @@ -113,6 +127,28 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return true; } +BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + CHECK_LE(index, nnet_out_cache_.NumCols()); + CHECK_LE(frame, frames_ready_); + + // the nnet output is prob ranther than log prob + // the index - 1, because the ilabel + BaseFloat logprob = 0.0; + int32 frame_idx = frame - frame_offset_; + BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); + if (nnet_->IsLogProb()){ + logprob = nnet_out; + } else { + logprob = std::log(nnet_out + std::numeric_limits::epsilon()); + } + CHECK(!std::isnan(logprob) && !std::isinf(logprob)); + return acoustic_scale_ * logprob; +} + void 
Decodable::Reset() { if (frontend_ != nullptr) frontend_->Reset(); if (nnet_ != nullptr) nnet_->Reset(); diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 39b38dc1..1ee6afbf 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -57,9 +57,13 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr Nnet() { return nnet_; } - private: + // forward nnet with feats bool AdvanceChunk(); + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim); + private: std::shared_ptr frontend_; std::shared_ptr nnet_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 80be6927..9e2cb77b 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -104,6 +104,8 @@ class PaddleNnet : public NnetInterface { void Reset() override; + bool IsLogProb() override { return false; } + std::shared_ptr> GetCacheEncoder( const std::string& name); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 5dde72a8..d05aabea 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -39,7 +39,8 @@ class NnetInterface { // forward feat with nnet. // nnet do not cache feats, feats cached by frontend. - // nnet cache model outputs, i.e. logprobs/encoder_outs. + // nnet cache model state, i.e. encoder_outs, att_cache, cnn_cache, + // frame_offset. virtual void FeedForward(const kaldi::Vector& features, const int32& feature_dim, NnetOut* out) = 0; @@ -47,6 +48,9 @@ class NnetInterface { // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; + // true, nnet output is logprob; otherwise is prob, + virtual bool IsLogProb() = 0; + // using to get encoder outs. e.g. seq2seq with Attention model. 
virtual void EncoderOuts( std::vector>* encoder_out) const = 0; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 8ce45f43..4ecbac26 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -111,6 +111,8 @@ class U2Nnet : public U2NnetBase { void Reset() override; + bool IsLogProb() override { return true; } + void Dim(); void LoadModel(const std::string& model_path_w_prefix); diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index fb9fec23..0c5aed54 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -98,6 +98,7 @@ int main(int argc, char* argv[]) { // } int32 frame_idx = 0; + int vocab_dim = 0; std::vector> prob_vec; std::vector> encoder_out_vec; int32 ori_feature_len = feature.NumRows(); @@ -138,17 +139,17 @@ int main(int argc, char* argv[]) { } // get nnet outputs - vector prob; - while (decodable->FrameLikelihood(frame_idx, &prob)) { - kaldi::Vector vec_tmp(prob.size()); - std::memcpy(vec_tmp.Data(), - prob.data(), - sizeof(kaldi::BaseFloat) * prob.size()); + kaldi::Timer timer; + kaldi::Vector logprobs; + bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim); + CHECK(isok == true); + for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; row_idx ++) { + kaldi::Vector vec_tmp(vocab_dim); + std::memcpy(vec_tmp.Data(), logprobs.Data() + row_idx*vocab_dim, sizeof(kaldi::BaseFloat) * vocab_dim); prob_vec.push_back(vec_tmp); - frame_idx++; } - + VLOG(2) << "frame_idx: " << frame_idx << " elapsed: " << timer.Elapsed() << " sec."; } // get encoder out @@ -196,8 +197,9 @@ int main(int argc, char* argv[]) { ++num_done; } + double elapsed = timer.Elapsed(); - LOG(INFO) << " cost:" << elapsed << " sec"; + LOG(INFO) << "Program cost:" << elapsed << " sec"; LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; From 5c8725e8cdc25b9fe7e697f1cde0b79449f8a652 Mon Sep 17 00:00:00 2001 From: Hui 
Zhang Date: Wed, 12 Oct 2022 06:34:09 +0000 Subject: [PATCH 10/60] unify model opts; add attention rescore in decodable; rename ds2 ctc beam search --- speechx/examples/codelab/decoder/run.sh | 2 +- speechx/examples/ds2_ol/aishell/run.sh | 4 +- speechx/examples/ds2_ol/aishell/run_fbank.sh | 4 +- speechx/speechx/decoder/CMakeLists.txt | 2 +- .../speechx/decoder/ctc_beam_search_decoder.h | 2 + ...ain.cc => ctc_beam_search_decoder_main.cc} | 2 +- .../speechx/decoder/ctc_prefix_beam_search.cc | 0 speechx/speechx/decoder/param.h | 1 + speechx/speechx/nnet/decodable.cc | 6 +++ speechx/speechx/nnet/decodable.h | 31 ++++++----- speechx/speechx/nnet/ds2_nnet.h | 51 +++---------------- speechx/speechx/nnet/nnet_itf.h | 51 +++++++++++++++++++ speechx/speechx/nnet/u2_nnet.cc | 2 +- speechx/speechx/nnet/u2_nnet.h | 24 ++------- speechx/speechx/nnet/u2_nnet_main.cc | 2 +- 15 files changed, 96 insertions(+), 88 deletions(-) rename speechx/speechx/decoder/{ctc_prefix_beam_search_decoder_main.cc => ctc_beam_search_decoder_main.cc} (99%) create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search.cc diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh index a911eb03..1a9e3cd7 100755 --- a/speechx/examples/codelab/decoder/run.sh +++ b/speechx/examples/codelab/decoder/run.sh @@ -69,7 +69,7 @@ compute_linear_spectrogram_main \ echo "compute linear spectrogram feature." 
# run ctc beam search decoder as streaming -ctc_prefix_beam_search_decoder_main \ +ctc_beam_search_decoder_main \ --result_wspecifier=ark,t:$exp_dir/result.txt \ --feature_rspecifier=ark:$feat_wspecifier \ --model_path=$model_dir/avg_1.jit.pdmodel \ diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index a29be17b..e5fccc03 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -103,7 +103,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 72072835..88ed6287 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ @@ -102,7 +102,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \ - 
ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 1df93511..8d04a997 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -12,7 +12,7 @@ add_library(decoder STATIC target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) set(BINS - ctc_prefix_beam_search_decoder_main + ctc_beam_search_decoder_main nnet_logprob_decoder_main recognizer_main tlg_decoder_main diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 9d0a5d14..19dbf2f6 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// used by deepspeech2 + #include "base/common.h" #include "decoder/ctc_decoders/path_trie.h" #include "decoder/ctc_decoders/scorer.h" diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc similarity index 99% rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc rename to speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index 445f470f..7e245e9b 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// todo refactor, repalce with gtest +// used by deepspeech2 #include "base/flags.h" #include "base/log.h" diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search.cc b/speechx/speechx/decoder/ctc_prefix_beam_search.cc new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index ed895aed..8a5990dc 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -67,6 +67,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 1483949b..b76c6280 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -157,4 +157,10 @@ void Decodable::Reset() { nnet_out_cache_.Resize(0, 0); } +void Decodable::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score){ + nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); +} + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 1ee6afbf..bfb75067 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -30,23 +30,31 @@ class Decodable : public kaldi::DecodableInterface { // void Init(DecodableOpts config); - // nnet logprob output + // nnet logprob output, used by wfst virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); + // nnet output + virtual bool FrameLikelihood(int32 frame, + std::vector* likelihood); + + // forward nnet with feats + bool AdvanceChunk(); + + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim); + + void 
AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score); + virtual bool IsLastFrame(int32 frame); // nnet output dim, e.g. vocab size virtual int32 NumIndices() const; - // nnet prob output - virtual bool FrameLikelihood(int32 frame, - std::vector* likelihood); - virtual int32 NumFramesReady() const; - // for offline test - void Acceptlikelihood(const kaldi::Matrix& likelihood); - void Reset(); bool IsInputFinished() const { return frontend_->IsFinished(); } @@ -57,11 +65,8 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr Nnet() { return nnet_; } - // forward nnet with feats - bool AdvanceChunk(); - // forward nnet with feats, and get nnet output - bool AdvanceChunk(kaldi::Vector* logprobs, - int* vocab_dim); + // for offline test + void Acceptlikelihood(const kaldi::Matrix& likelihood); private: std::shared_ptr frontend_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 9e2cb77b..cd1648b4 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -15,56 +15,11 @@ #include #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" #include "nnet/nnet_itf.h" #include "paddle_inference_api.h" namespace ppspeech { -struct ModelOptions { - std::string model_path; - std::string param_path; - int thread_num; // predictor thread pool size - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(2), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - 
opts->Register("model-param", ¶m_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - "paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); - } -}; template class Tensor { @@ -100,6 +55,12 @@ class PaddleNnet : public NnetInterface { const int32& feature_dim, NnetOut* out) override; + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override { + VLOG(2) << "deepspeech2 not has AttentionRescoring."; + } + void Dim(); void Reset() override; diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index d05aabea..2e21ff9b 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -18,9 +18,56 @@ #include "base/basic_types.h" #include "kaldi/base/kaldi-types.h" #include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" namespace ppspeech { + +struct ModelOptions { + std::string model_path; + std::string param_path; + int thread_num; // predictor thread pool size for ds2; + bool use_gpu; + bool switch_ir_optim; + std::string input_names; + std::string output_names; + std::string cache_names; + std::string cache_shape; + bool enable_fc_padding; + bool enable_profile; + ModelOptions() + : model_path(""), + param_path(""), + thread_num(1), + use_gpu(false), + input_names(""), + output_names(""), + cache_names(""), + cache_shape(""), + switch_ir_optim(false), + enable_fc_padding(false), 
+ enable_profile(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("model-path", &model_path, "model file path"); + opts->Register("model-param", ¶m_path, "params model file path"); + opts->Register("thread-num", &thread_num, "thread num"); + opts->Register("use-gpu", &use_gpu, "if use gpu"); + opts->Register("input-names", &input_names, "paddle input names"); + opts->Register("output-names", &output_names, "paddle output names"); + opts->Register("cache-names", &cache_names, "cache names"); + opts->Register("cache-shape", &cache_shape, "cache shape"); + opts->Register("switch-ir-optiom", + &switch_ir_optim, + "paddle SwitchIrOptim option"); + opts->Register("enable-fc-padding", + &enable_fc_padding, + "paddle EnableFCPadding option"); + opts->Register( + "enable-profile", &enable_profile, "paddle EnableProfile option"); + } +}; + struct NnetOut { // nnet out. maybe logprob or prob. Almost time this is logprob. kaldi::Vector logprobs; @@ -45,6 +92,10 @@ class NnetInterface { const int32& feature_dim, NnetOut* out) = 0; + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. 
virtual void Reset() = 0; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 74f8cf78..71252477 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -166,7 +166,7 @@ void U2Nnet::Warmup() { Reset(); } -U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) { +U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) { LoadModel(opts_.model_path); } diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 4ecbac26..1bac652e 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -17,28 +17,14 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" -#include "nnet/nnet_itf.h" +#include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" #include "paddle/phi/api/all.h" namespace ppspeech { -struct U2ModelOptions { - std::string model_path; - int thread_num; - bool use_gpu; - U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - } -}; - class U2NnetBase : public NnetInterface { public: @@ -65,10 +51,6 @@ class U2NnetBase : public NnetInterface { std::vector* ctc_probs, int32* vocab_dim); - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, @@ -102,7 +84,7 @@ class U2NnetBase : public NnetInterface { class U2Nnet : public U2NnetBase { public: - U2Nnet(const U2ModelOptions& opts); + U2Nnet(const ModelOptions& opts); U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, @@ -143,7 +125,7 @@ class U2Nnet : public U2NnetBase { std::vector>* encoder_out) const; private: - 
U2ModelOptions opts_; + ModelOptions opts_; phi::Place dev_; std::shared_ptr model_{nullptr}; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 0c5aed54..2dd1fa0d 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); - ppspeech::U2ModelOptions model_opts; + ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; int32 chunk_size = From bc1b6c2e7c2e9c61702f60d4dd44a101e79da679 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 09:00:54 +0000 Subject: [PATCH 11/60] refactor ctc opts, extract decoder interface, add ctc beamsearch score --- speechx/examples/ds2_ol/aishell/run.sh | 2 +- speechx/examples/ds2_ol/aishell/run_fbank.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 2 +- .../decoder/ctc_beam_search_decoder.cc | 10 +-- .../speechx/decoder/ctc_beam_search_decoder.h | 69 ++++++---------- speechx/speechx/decoder/ctc_beam_search_opt.h | 78 +++++++++++++++++++ .../speechx/decoder/ctc_prefix_beam_search.cc | 0 .../decoder/ctc_prefix_beam_search_decoder.cc | 13 ++++ .../decoder/ctc_prefix_beam_search_decoder.h | 64 +++++++++++++++ .../decoder/ctc_prefix_beam_search_score.h | 68 ++++++++++++++++ speechx/speechx/decoder/ctc_tlg_decoder.cc | 12 +-- speechx/speechx/decoder/ctc_tlg_decoder.h | 29 ++++--- ...ecoder_main.cc => ctc_tlg_decoder_main.cc} | 10 ++- speechx/speechx/decoder/decoder_itf.h | 56 +++++++++++++ speechx/speechx/nnet/decodable.h | 8 +- speechx/speechx/nnet/ds2_nnet.h | 6 +- 16 files changed, 351 insertions(+), 78 deletions(-) create mode 100644 speechx/speechx/decoder/ctc_beam_search_opt.h delete mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search.cc create mode 100644 
speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_score.h rename speechx/speechx/decoder/{tlg_decoder_main.cc => ctc_tlg_decoder_main.cc} (99%) create mode 100644 speechx/speechx/decoder/decoder_itf.h diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index e5fccc03..794b533f 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -135,7 +135,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 88ed6287..1c3c3e01 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -133,7 +133,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 8d04a997..20e93523 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -15,7 +15,7 @@ set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main recognizer_main - tlg_decoder_main + ctc_tlg_decoder_main ) foreach(bin_name IN LISTS BINS) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc 
b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 5a12c0b5..ff3298b2 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/ctc_beam_search_decoder.h" -#include "base/basic_types.h" +#include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" +#include "decoder/ctc_beam_search_decoder.h" #include "utils/file_utils.h" namespace ppspeech { @@ -26,7 +26,7 @@ using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts), init_ext_scorer_(nullptr), - blank_id_(-1), + blank_id_(opts.blank), space_id_(-1), num_frame_decoded_(0), root_(nullptr) { @@ -43,9 +43,9 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - blank_id_ = 0; - auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); + CHECK(blank_id_==0); + auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary if ((size_t)space_id_ >= vocabulary_.size()) { diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 19dbf2f6..e36eb4a0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -14,67 +14,48 @@ // used by deepspeech2 -#include "base/common.h" +#pragma once + +#include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_decoders/path_trie.h" #include "decoder/ctc_decoders/scorer.h" -#include "kaldi/decoder/decodable-itf.h" -#include "util/parse-options.h" - -#pragma once +#include "decoder/decoder_itf.h" namespace ppspeech { -struct CTCBeamSearchOptions { - std::string dict_file; - std::string lm_path; - BaseFloat alpha; - BaseFloat beta; - 
BaseFloat cutoff_prob; - int beam_size; - int cutoff_top_n; - int num_proc_bsearch; - CTCBeamSearchOptions() - : dict_file("vocab.txt"), - lm_path(""), - alpha(1.9f), - beta(5.0), - beam_size(300), - cutoff_prob(0.99f), - cutoff_top_n(40), - num_proc_bsearch(10) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("dict", &dict_file, "dict file "); - opts->Register("lm-path", &lm_path, "language model file"); - opts->Register("alpha", &alpha, "alpha"); - opts->Register("beta", &beta, "beta"); - opts->Register( - "beam-size", &beam_size, "beam size for beam search method"); - opts->Register("cutoff-prob", &cutoff_prob, "cutoff probs"); - opts->Register("cutoff-top-n", &cutoff_top_n, "cutoff top n"); - opts->Register( - "num-proc-bsearch", &num_proc_bsearch, "num proc bsearch"); - } -}; - -class CTCBeamSearch { +class CTCBeamSearch : public DecoderInterface { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} + void InitDecoder(); + + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + std::string GetFinalBestPath(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + void Decode(std::shared_ptr decodable); + std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); + + int NumFrameDecoded(); + int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); private: void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, const std::pair& log_prob_idx, const BaseFloat& min_cutoff); @@ -93,4 +74,4 @@ class CTCBeamSearch { DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h new file mode 100644 index 00000000..dcb62258 --- /dev/null 
+++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "base/common.h" +#include "util/parse-options.h" + +#pragma once + +namespace ppspeech { + +struct CTCBeamSearchOptions { + // common + int blank; + + // ds2 + std::string dict_file; + std::string lm_path; + int beam_size; + BaseFloat alpha; + BaseFloat beta; + BaseFloat cutoff_prob; + int cutoff_top_n; + int num_proc_bsearch; + + // u2 + int first_beam_size; + int second_beam_size; + CTCBeamSearchOptions() + : blank(0), + dict_file("vocab.txt"), + lm_path(""), + alpha(1.9f), + beta(5.0), + beam_size(300), + cutoff_prob(0.99f), + cutoff_top_n(40), + num_proc_bsearch(10), + first_beam_size(10), + second_beam_size(10) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "Ds2BeamSearchConfig: "; + opts->Register("dict", &dict_file, module + "vocab file path."); + opts->Register( + "lm-path", &lm_path, module + "ngram language model path."); + opts->Register("alpha", &alpha, module + "alpha"); + opts->Register("beta", &beta, module + "beta"); + opts->Register("beam-size", + &beam_size, + module + "beam size for beam search method"); + opts->Register("cutoff-prob", &cutoff_prob, module + "cutoff probs"); + opts->Register("cutoff-top-n", &cutoff_top_n, module + "cutoff top n"); + opts->Register( + "num-proc-bsearch", &num_proc_bsearch, module + "num proc 
bsearch"); + + opts->Register("blank", &blank, "blank id, default is 0."); + + module = "U2BeamSearchConfig: "; + opts->Register( + "first-beam-size", &first_beam_size, module + "first beam size."); + opts->Register("second-beam-size", + &second_beam_size, + module + "second beam size."); + } +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search.cc b/speechx/speechx/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc new file mode 100644 index 00000000..0544a1e2 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h new file mode 100644 index 00000000..745c4a83 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/decoder_itf.h" + +#include "kaldi/decoder/decodable-itf.h" + +namespace ppspeech { + +class CTCPrefixBeamSearch : public DecoderInterface { + public: + explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); + ~CTCPrefixBeamSearch() {} + + void InitDecoder(); + + void Decode(std::shared_ptr decodable); + + std::string GetBestPath(); + + std::vector> GetNBestPath(); + + std::string GetFinalBestPath(); + + int NumFrameDecoded(); + + int DecodeLikelihoods(const std::vector>& probs, + std::vector& nbest_words); + + void AdvanceDecode( + const std::shared_ptr& decodable); + void Reset(); + + private: + void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, + const std::pair& log_prob_idx, + const BaseFloat& min_cutoff); + void CalculateApproxScore(); + void LMRescore(); + void AdvanceDecoding(const std::vector>& probs); + + CTCBeamSearchOptions opts_; + size_t blank_id_; + int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); +}; + +} // namespace basr \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h new file mode 100644 index 00000000..19423b5e --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -0,0 +1,68 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" +#include "utils/math.h" + +namespace ppspeech { + +struct PrefxiScore { + // decoding, unit in log scale + float b = -kFloatMax; // blank ending score + float nb = -kFloatMax; // none-blank ending score + + // timestamp, unit in log sclae + float v_b = -kFloatMax; // viterbi blank ending score + float v_nb = -kFloatMax; // niterbi none-blank ending score + float cur_token_prob = -kFloatMax; // prob of current token + std::vector times_b; // times of viterbi blank path + std::vector times_nb; // times of viterbi non-blank path + + // context state + bool has_context = false; + int context_state = 0; + float context_score = 0; + + // decoding score, sum + float Score() const { return LogSumExp(b, nb); } + + // decodign score with context bias + float TotalScore() const { return Score() + context_score; } + + // timestamp score, max + float ViterbiScore() const { return std::max(v_b, v_nb); } + + // get timestamp + const std::vector& Times() const { + return v_b > v_nb ? 
times_b : times_nb; + } +}; + +struct PrefixScoreHash { + // https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector + std::size_t operator()(const std::vector& prefix) const { + std::size_t seed = prefix.size(); + for (auto& i : prefix) { + seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +using PrefixWithScoreType = std::pair, PrefixScoreHash>; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 712d27dd..de97f6ad 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -22,24 +22,24 @@ TLGDecoder::TLGDecoder(TLGDecoderOptions opts) { fst::SymbolTable::ReadText(opts.word_symbol_table)); decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts)); decoder_->InitDecoding(); - frame_decoded_size_ = 0; + num_frame_decoded_ = 0; } void TLGDecoder::InitDecoder() { decoder_->InitDecoding(); - frame_decoded_size_ = 0; + num_frame_decoded_ = 0; } void TLGDecoder::AdvanceDecode( const std::shared_ptr& decodable) { - while (!decodable->IsLastFrame(frame_decoded_size_)) { + while (!decodable->IsLastFrame(num_frame_decoded_)) { AdvanceDecoding(decodable.get()); } } void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { decoder_->AdvanceDecoding(decodable, 1); - frame_decoded_size_++; + num_frame_decoded_++; } void TLGDecoder::Reset() { @@ -48,7 +48,7 @@ void TLGDecoder::Reset() { } std::string TLGDecoder::GetPartialResult() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call // BestPathEnd if no frames were decoded.") return std::string(""); @@ -68,7 +68,7 @@ std::string TLGDecoder::GetPartialResult() { } std::string TLGDecoder::GetFinalBestPath() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You 
cannot call // BestPathEnd if no frames were decoded.") return std::string(""); diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 1ac46ac6..f2282cb8 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -14,8 +14,9 @@ #pragma once -#include "base/basic_types.h" -#include "kaldi/decoder/decodable-itf.h" +#include "base/common.h" +#include "decoder/decoder_itf.h" + #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" @@ -30,21 +31,31 @@ struct TLGDecoderOptions { TLGDecoderOptions() : word_symbol_table(""), fst_path("") {} }; -class TLGDecoder { +class TLGDecoder : public DecoderInterface { public: explicit TLGDecoder(TLGDecoderOptions opts); + ~TLGDecoder() = default; + void InitDecoder(); + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + + std::string GetFinalBestPath(); + std::string GetPartialResult(); + + void Decode(); + std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); - std::string GetPartialResult(); + int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); + private: void AdvanceDecoding(kaldi::DecodableInterface* decodable); @@ -53,7 +64,7 @@ class TLGDecoder { std::shared_ptr> fst_; std::shared_ptr word_symbol_table_; // the frame size which have decoded starts from 0. 
- int32 frame_decoded_size_; + int32 num_frame_decoded_; }; diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc similarity index 99% rename from speechx/speechx/decoder/tlg_decoder_main.cc rename to speechx/speechx/decoder/ctc_tlg_decoder_main.cc index b633022a..cd1249d8 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -14,13 +14,15 @@ // todo refactor, repalce with gtest -#include "base/flags.h" -#include "base/log.h" -#include "decoder/ctc_tlg_decoder.h" +#include "base/common.h" + #include "frontend/audio/data_cache.h" -#include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +#include "decoder/ctc_tlg_decoder.h" + +#include "kaldi/util/table-types.h" + DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h new file mode 100644 index 00000000..01061939 --- /dev/null +++ b/speechx/speechx/decoder/decoder_itf.h @@ -0,0 +1,56 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/decoder/decodable-itf.h" + +namespace ppspeech { + +class DecoderInterface { + public: + virtual ~DecoderInterface() {} + + virtual void InitDecoder() = 0; + + virtual void Reset() = 0; + + virtual void AdvanceDecode( + const std::shared_ptr& decodable) = 0; + + + virtual std::string GetFinalBestPath() = 0; + + virtual std::string GetPartialResult() = 0; + + // void Decode(); + + // std::string GetBestPath(); + // std::vector> GetNBestPath(); + + // int NumFrameDecoded(); + // int DecodeLikelihoods(const std::vector>& probs, + // std::vector& nbest_words); + + + private: + // void AdvanceDecoding(kaldi::DecodableInterface* decodable); + + // current decoding frame number + int32 num_frame_decoded_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index bfb75067..70a16e2c 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -39,14 +39,14 @@ class Decodable : public kaldi::DecodableInterface { // forward nnet with feats bool AdvanceChunk(); - + // forward nnet with feats, and get nnet output bool AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim); - + void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score); + float reverse_weight, + std::vector* rescoring_score); virtual bool IsLastFrame(int32 frame); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index cd1648b4..e8a49c7d 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -56,9 +56,9 @@ class PaddleNnet : public NnetInterface { NnetOut* out) override; void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override { - VLOG(2) << "deepspeech2 not has AttentionRescoring."; + float reverse_weight, + std::vector* rescoring_score) override { + VLOG(2) << "deepspeech2 not 
has AttentionRescoring."; } void Dim(); From 3c3aa6b59421f8f911247cd667426095f2298d58 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 12:31:20 +0000 Subject: [PATCH 12/60] simple ctc prefix beam search compile ok --- speechx/speechx/base/common.h | 2 + speechx/speechx/base/macros.h | 3 +- speechx/speechx/decoder/CMakeLists.txt | 3 +- .../decoder/ctc_beam_search_decoder.cc | 12 +- .../speechx/decoder/ctc_beam_search_decoder.h | 3 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 304 ++++++++++++++++++ .../decoder/ctc_prefix_beam_search_decoder.h | 70 +++- .../decoder/ctc_prefix_beam_search_score.h | 50 ++- speechx/speechx/decoder/ctc_tlg_decoder.h | 2 - speechx/speechx/decoder/decoder_itf.h | 3 +- speechx/speechx/utils/math.cc | 4 +- 11 files changed, 406 insertions(+), 50 deletions(-) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 70b11b69..b470b9de 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 14332a80..faf39373 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -25,8 +25,7 @@ namespace ppspeech { void operator=(const TypeName&) = delete #endif -constexpr float kFloatMax = std::numeric_limits::max(); - +// kSpaceSymbol in UTF-8 is: ▁ const std::string kSpaceSymbol = "\xe2\x96\x81"; } // namespace ppspeech diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 20e93523..b08aaba5 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -2,10 +2,11 @@ project(decoder) include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) add_library(decoder STATIC - ctc_beam_search_decoder.cc ctc_decoders/decoder_utils.cpp 
ctc_decoders/path_trie.cpp ctc_decoders/scorer.cpp + ctc_beam_search_decoder.cc + ctc_prefix_beam_search_decoder.cc ctc_tlg_decoder.cc recognizer.cc ) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index ff3298b2..76342b87 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -26,9 +26,7 @@ using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts), init_ext_scorer_(nullptr), - blank_id_(opts.blank), space_id_(-1), - num_frame_decoded_(0), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { @@ -43,7 +41,7 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(blank_id_==0); + CHECK(opts_.blank==0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); @@ -167,7 +165,7 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { continue; } min_cutoff = prefixes_[num_prefixes_ - 1]->score + - std::log(prob[blank_id_]) - + std::log(prob[opts_.blank]) - std::max(0.0, init_ext_scorer_->beta); full_beam = (num_prefixes_ == beam_size); @@ -195,9 +193,9 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { for (size_t i = beam_size; i < prefixes_.size(); ++i) { prefixes_[i]->remove(); } - } // if + } // end if num_frame_decoded_++; - } // for probs_seq + } // end for probs_seq } int32 CTCBeamSearch::SearchOneChar( @@ -215,7 +213,7 @@ int32 CTCBeamSearch::SearchOneChar( break; } - if (c == blank_id_) { + if (c == opts_.blank) { prefix->log_prob_b_cur = log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score); continue; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index e36eb4a0..516f8b2c 100644 --- 
a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -66,11 +66,10 @@ class CTCBeamSearch : public DecoderInterface { CTCBeamSearchOptions opts_; std::shared_ptr init_ext_scorer_; // todo separate later std::vector vocabulary_; // todo remove later - size_t blank_id_; int space_id_; std::shared_ptr root_; std::vector prefixes_; - int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 0544a1e2..fd689023 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -11,3 +11,307 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + + +#include "base/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "utils/math.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif + +namespace ppspeech { + +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) + : opts_(opts) { + InitDecoder(); +} + +void CTCPrefixBeamSearch::InitDecoder() { + num_frame_decoded_ = 0; + + cur_hyps_.clear(); + + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); + + abs_time_step_ = 0; + + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.b = 0.0f; // log(1) + prefix_score.nb = -kBaseFloatMax; // log(0) + prefix_score.v_b = 0.0f; // log(1) + prefix_score.v_nb = 0.0f; // log(1) + cur_hyps_[empty] = prefix_score; + + outputs_.emplace_back(empty); + 
hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); + +} + +void CTCPrefixBeamSearch::Reset() { + InitDecoder(); +} + +void CTCPrefixBeamSearch::Decode( + std::shared_ptr decodable) { + return; +} + +int32 CTCPrefixBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } + + +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + // const std::vector& start_boundaries = prefix.second.start_boundaries; + // const std::vector& end_boundaries = prefix.second.end_boundaries; + + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + // if (s < start_boundaries.size() && i == start_boundaries[s]){ + // // + // output.emplace_back(context_graph_->start_tag_id()); + // ++s; + // } + + output.emplace_back(input[i]); + + // if (e < end_boundaries.size() && i == end_boundaries[e]){ + // // + // output.emplace_back(context_graph_->end_tag_id()); + // ++e; + // } + } + + outputs_.emplace_back(output); +} + + +void CTCPrefixBeamSearch::AdvanceDecode( + const std::shared_ptr& decodable) { + while (1) { + std::vector frame_prob; + bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); + if (flag == false) break; + std::vector> likelihood; + likelihood.push_back(frame_prob); + AdvanceDecoding(likelihood); + } +} + +static bool PrefixScoreCompare( + const std::pair, PrefixScore>& a, + const std::pair, PrefixScore>& b) { + // log domain + return a.second.TotalScore() > b.second.TotalScore(); +} + + +void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector>& logp) { +#ifdef USE_PROFILING + RecordEvent event( + "CtcPrefixBeamSearch::AdvanceDecoding", TracerEventType::UserDefined, 1); +#endif + + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { + const 
std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + + // 2. token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert + // PrefixScore(-inf, -inf) by default, since the default constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. + + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. 
+ next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = abs_time_step_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(abs_time_step_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(abs_time_step_); + } + + // Prefix changed, calculate the context Score. 
+ if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr(next_hyps.begin(), + next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + + num_frame_decoded_++; + } // end for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) +} + + +void CTCPrefixBeamSearch::UpdateHypotheses( + const std::vector, PrefixScore>>& hyps) { + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } +} + +void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } + + +void CTCPrefixBeamSearch::UpdateFinalContext() { + if (context_graph_ == nullptr) return; + assert(hypotheses_.size() == cur_hyps_.size()); + assert(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. 
+ for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + // prefix_score.UpdateContext(context_graph_, prefix_score, 0, + // prefix.size()); + } + } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 745c4a83..b67733e8 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -18,10 +18,8 @@ #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" -#include "kaldi/decoder/decodable-itf.h" - namespace ppspeech { - +class ContextGraph; class CTCPrefixBeamSearch : public DecoderInterface { public: explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); @@ -29,36 +27,74 @@ class CTCPrefixBeamSearch : public DecoderInterface { void InitDecoder(); + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + std::string GetFinalBestPath(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + void Decode(std::shared_ptr decodable); std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } + + const std::vector>& Inputs() const { return hypotheses_; } + + const std::vector>& Outputs() const { return outputs_; } + + const std::vector& Likelihood() const { return likelihood_; } + const std::vector>& Times() const 
{ return times_; } + private: - void ResetPrefixes(); - int32 SearchOneChar(const bool& full_beam, - const std::pair& log_prob_idx, - const BaseFloat& min_cutoff); - void CalculateApproxScore(); - void LMRescore(); - void AdvanceDecoding(const std::vector>& probs); + void AdvanceDecoding(const std::vector>& logp); + + void FinalizeSearch(); + void UpdateOutputs(const std::pair, PrefixScore>& prefix); + void UpdateHypotheses( + const std::vector, PrefixScore>>& prefix); + void UpdateFinalContext(); + + + private: CTCBeamSearchOptions opts_; - size_t blank_id_; - int num_frame_decoded_; + + int abs_time_step_ = 0; + + std::unordered_map, PrefixScore, PrefixScoreHash> + cur_hyps_; + + // n-best list and corresponding likelihood, in sorted order + std::vector> hypotheses_; + std::vector likelihood_; + + std::vector> times_; + std::vector viterbi_likelihood_; + + // Outputs contain the hypotheses_ and tags lik: and + std::vector> outputs_; + + std::shared_ptr context_graph_ = nullptr; + DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 19423b5e..da2fb80a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -20,35 +20,55 @@ namespace ppspeech { -struct PrefxiScore { +class ContextGraph; + +struct PrefixScore { // decoding, unit in log scale - float b = -kFloatMax; // blank ending score - float nb = -kFloatMax; // none-blank ending score + float b = -kBaseFloatMax; // blank ending score + float nb = -kBaseFloatMax; // none-blank ending score + + // decoding score, sum + float Score() const { return LogSumExp(b, nb); } // timestamp, unit in log sclae - float v_b = -kFloatMax; // viterbi blank ending score - float v_nb = -kFloatMax; // niterbi none-blank ending score 
- float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_b; // times of viterbi blank path - std::vector times_nb; // times of viterbi non-blank path + float v_b = -kBaseFloatMax; // viterbi blank ending score + float v_nb = -kBaseFloatMax; // niterbi none-blank ending score + float cur_token_prob = -kBaseFloatMax; // prob of current token + std::vector times_b; // times of viterbi blank path + std::vector times_nb; // times of viterbi non-blank path + + + // timestamp score, max + float ViterbiScore() const { return std::max(v_b, v_nb); } + + // get timestamp + const std::vector& Times() const { + return v_b > v_nb ? times_b : times_nb; + } // context state bool has_context = false; int context_state = 0; float context_score = 0; + std::vector start_boundaries; + std::vector end_boundaries; - // decoding score, sum - float Score() const { return LogSumExp(b, nb); } // decodign score with context bias float TotalScore() const { return Score() + context_score; } - // timestamp score, max - float ViterbiScore() const { return std::max(v_b, v_nb); } + void CopyContext(const PrefixScore& prefix_score) { + context_state = prefix_score.context_state; + context_score = prefix_score.context_score; + start_boundaries = prefix_score.start_boundaries; + end_boundaries = prefix_score.end_boundaries; + } - // get timestamp - const std::vector& Times() const { - return v_b > v_nb ? 
times_b : times_nb; + void UpdateContext(const std::shared_ptr& constext_graph, + const PrefixScore& prefix_score, + int word_id, + int prefix_len) { + CHECK(false); } }; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f2282cb8..f3ecde73 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -63,8 +63,6 @@ class TLGDecoder : public DecoderInterface { std::shared_ptr decoder_; std::shared_ptr> fst_; std::shared_ptr word_symbol_table_; - // the frame size which have decoded starts from 0. - int32 num_frame_decoded_; }; diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index 01061939..1bbc6b11 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -31,7 +31,6 @@ class DecoderInterface { virtual void AdvanceDecode( const std::shared_ptr& decodable) = 0; - virtual std::string GetFinalBestPath() = 0; virtual std::string GetPartialResult() = 0; @@ -46,7 +45,7 @@ class DecoderInterface { // std::vector& nbest_words); - private: + protected: // void AdvanceDecoding(kaldi::DecodableInterface* decodable); // current decoding frame number diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 5087ac60..6a13f69b 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -28,8 +28,8 @@ namespace ppspeech { // Sum in log scale float LogSumExp(float x, float y) { - if (x <= -kFloatMax) return y; - if (y <= -kFloatMax) return x; + if (x <= -kBaseFloatMax) return y; + if (y <= -kBaseFloatMax) return x; float max = std::max(x, y); return max + std::log(std::exp(x - max) + std::exp(y - max)); } From 7dc9cba3be0706cb024f1d998c69b97a5d6816f3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 13 Oct 2022 11:51:54 +0000 Subject: [PATCH 13/60] ctc prefix beam search for u2, test can run --- speechx/examples/codelab/u2/.gitignore | 1 + 
speechx/examples/codelab/u2/README.md | 1 + speechx/examples/codelab/u2/local/decode.sh | 22 + speechx/examples/codelab/u2/local/feat.sh | 27 + speechx/examples/codelab/u2/local/nnet.sh | 23 + .../examples/codelab/{u2nnet => u2}/path.sh | 3 +- .../examples/codelab/{u2nnet => u2}/run.sh | 27 +- speechx/examples/codelab/u2nnet/.gitignore | 3 - speechx/examples/codelab/u2nnet/README.md | 3 - speechx/examples/codelab/u2nnet/valgrind.sh | 21 - speechx/speechx/decoder/CMakeLists.txt | 13 +- .../decoder/ctc_beam_search_decoder.cc | 10 +- .../speechx/decoder/ctc_beam_search_decoder.h | 13 +- speechx/speechx/decoder/ctc_beam_search_opt.h | 65 +++ .../decoder/ctc_prefix_beam_search_decoder.cc | 519 ++++++++++-------- .../decoder/ctc_prefix_beam_search_decoder.h | 71 ++- .../ctc_prefix_beam_search_decoder_main.cc | 188 +++++++ .../decoder/ctc_prefix_beam_search_result.h | 41 ++ speechx/speechx/decoder/ctc_tlg_decoder.cc | 17 +- speechx/speechx/decoder/ctc_tlg_decoder.h | 23 +- speechx/speechx/decoder/decoder_itf.h | 22 +- speechx/speechx/nnet/u2_nnet_main.cc | 11 - speechx/speechx/utils/math.cc | 7 +- 23 files changed, 763 insertions(+), 368 deletions(-) create mode 100644 speechx/examples/codelab/u2/.gitignore create mode 100644 speechx/examples/codelab/u2/README.md create mode 100755 speechx/examples/codelab/u2/local/decode.sh create mode 100755 speechx/examples/codelab/u2/local/feat.sh create mode 100755 speechx/examples/codelab/u2/local/nnet.sh rename speechx/examples/codelab/{u2nnet => u2}/path.sh (84%) rename speechx/examples/codelab/{u2nnet => u2}/run.sh (54%) delete mode 100644 speechx/examples/codelab/u2nnet/.gitignore delete mode 100644 speechx/examples/codelab/u2nnet/README.md delete mode 100755 speechx/examples/codelab/u2nnet/valgrind.sh create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_result.h diff --git a/speechx/examples/codelab/u2/.gitignore 
b/speechx/examples/codelab/u2/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/speechx/examples/codelab/u2/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/codelab/u2/README.md b/speechx/examples/codelab/u2/README.md new file mode 100644 index 00000000..3c85dc91 --- /dev/null +++ b/speechx/examples/codelab/u2/README.md @@ -0,0 +1 @@ +# u2/u2pp Streaming Test diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh new file mode 100755 index 00000000..12297661 --- /dev/null +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --result_wspecifier=ark,t:$exp/result.ark + +echo "u2 ctc prefix beam search decode." diff --git a/speechx/examples/codelab/u2/local/feat.sh b/speechx/examples/codelab/u2/local/feat.sh new file mode 100755 index 00000000..1eec3aae --- /dev/null +++ b/speechx/examples/codelab/u2/local/feat.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark + +echo "compute fbank feature." 
diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh new file mode 100755 index 00000000..78663e9c --- /dev/null +++ b/speechx/examples/codelab/u2/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." + diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2/path.sh similarity index 84% rename from speechx/examples/codelab/u2nnet/path.sh rename to speechx/examples/codelab/u2/path.sh index 564e9fed..7f32fbce 100644 --- a/speechx/examples/codelab/u2nnet/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -12,8 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -SPEECHX_BIN=$SPEECHX_BUILD/nnet -export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2/run.sh similarity index 54% rename from speechx/examples/codelab/u2nnet/run.sh rename to speechx/examples/codelab/u2/run.sh index 704653e7..d314262b 100755 --- a/speechx/examples/codelab/u2nnet/run.sh +++ b/speechx/examples/codelab/u2/run.sh @@ -36,29 +36,8 @@ ckpt_dir=./data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ -cmvn_json2kaldi_main \ - --json_file 
$model_dir/mean_std.json \ - --cmvn_write_path $exp/cmvn.ark \ - --binary=false +./local/feat.sh -echo "convert json cmvn to kaldi ark." +./local/nnet.sh -compute_fbank_main \ - --num_bins 80 \ - --wav_rspecifier=scp:$data/wav.scp \ - --cmvn_file=$exp/cmvn.ark \ - --feature_wspecifier=ark,t:$exp/fbank.ark - -echo "compute fbank feature." - -u2_nnet_main \ - --model_path=$model_dir/export.jit \ - --feature_rspecifier=ark,t:$exp/fbank.ark \ - --nnet_decoder_chunk=16 \ - --receptive_field_length=7 \ - --downsampling_rate=4 \ - --acoustic_scale=1.0 \ - --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ - --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark - -echo "u2 nnet decode." +./local/decode.sh diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore deleted file mode 100644 index d6fe69bc..00000000 --- a/speechx/examples/codelab/u2nnet/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -data -exp -*log diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md deleted file mode 100644 index 772a58f0..00000000 --- a/speechx/examples/codelab/u2nnet/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Deepspeech2 Streaming NNet Test - -Using for ds2 streaming nnet inference test. diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh deleted file mode 100755 index a5aab663..00000000 --- a/speechx/examples/codelab/u2nnet/valgrind.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# this script is for memory check, so please run ./run.sh first. - -set +x -set -e - -. ./path.sh - -if [ ! 
-d ${SPEECHX_TOOLS}/valgrind/install ]; then - echo "please install valgrind in the speechx tools dir.\n" - exit 1 -fi - -ckpt_dir=./data/model -model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ - -valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ - ds2_model_test_main \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdparams diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index b08aaba5..8cf94a10 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -10,8 +10,9 @@ add_library(decoder STATIC ctc_tlg_decoder.cc recognizer.cc ) -target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) +target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) +# test set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main @@ -24,3 +25,13 @@ foreach(bin_name IN LISTS BINS) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) endforeach() + + +# u2 +set(bin_name ctc_prefix_beam_search_decoder_main) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) +target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) +target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 76342b87..3f00ee35 100644 --- 
a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -82,8 +82,6 @@ void CTCBeamSearch::Decode( return; } -int32 CTCBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } - // todo rename, refactor void CTCBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { @@ -110,15 +108,19 @@ void CTCBeamSearch::ResetPrefixes() { int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, vector& nbest_words) { kaldi::Timer timer; - timer.Reset(); AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " << static_cast(timer.Elapsed()) / 1000.0f; return 0; } +vector> CTCBeamSearch::GetNBestPath(int n) { + int beam_size = n == -1 ? opts_.beam_size: std::min(n, opts_.beam_size); + return get_beam_search_result(prefixes_, vocabulary_, beam_size); +} + vector> CTCBeamSearch::GetNBestPath() { - return get_beam_search_result(prefixes_, vocabulary_, opts_.beam_size); + return GetNBestPath(-1); } string CTCBeamSearch::GetBestPath() { diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 516f8b2c..479754c3 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -35,6 +35,11 @@ class CTCBeamSearch : public DecoderInterface { void AdvanceDecode( const std::shared_ptr& decodable); + void Decode(std::shared_ptr decodable); + + std::string GetBestPath(); + std::vector> GetNBestPath(); + std::vector> GetNBestPath(int n); std::string GetFinalBestPath(); std::string GetPartialResult() { @@ -42,14 +47,6 @@ class CTCBeamSearch : public DecoderInterface { return {}; } - void Decode(std::shared_ptr decodable); - - std::string GetBestPath(); - std::vector> GetNBestPath(); - - - int NumFrameDecoded(); - int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h 
b/speechx/speechx/decoder/ctc_beam_search_opt.h index dcb62258..af92fad0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -19,6 +19,7 @@ namespace ppspeech { + struct CTCBeamSearchOptions { // common int blank; @@ -75,4 +76,68 @@ struct CTCBeamSearchOptions { } }; + +// used by u2 model +struct CTCBeamSearchDecoderOptions { + // chunk_size is the frame number of one chunk after subsampling. + // e.g. if subsample rate is 4 and chunk_size = 16, the frames in + // one chunk are 67=16*4 + 3, stride is 64=16*4 + int chunk_size; + int num_left_chunks; + + // final_score = rescoring_weight * rescoring_score + ctc_weight * + // ctc_score; + // rescoring_score = left_to_right_score * (1 - reverse_weight) + + // right_to_left_score * reverse_weight + // Please note the concept of ctc_scores + // in the following two search methods are different. For + // CtcPrefixBeamSerch, + // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a + // max(viterbi) path score + context score So we should carefully set + // ctc_weight accroding to the search methods. + float ctc_weight; + float rescoring_weight; + float reverse_weight; + + // CtcEndpointConfig ctc_endpoint_opts; + + CTCBeamSearchOptions ctc_prefix_search_opts; + + CTCBeamSearchDecoderOptions() + : chunk_size(16), + num_left_chunks(-1), + ctc_weight(0.5), + rescoring_weight(1.0), + reverse_weight(0.0) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "DecoderConfig: "; + opts->Register( + "chunk-size", + &chunk_size, + module + "the frame number of one chunk after subsampling."); + opts->Register("num-left-chunks", + &num_left_chunks, + module + "the left history chunks number."); + opts->Register("ctc-weight", + &ctc_weight, + module + + "ctc weight for rescore. 
final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("rescoring-weight", + &rescoring_weight, + module + + "attention score weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("reverse-weight", + &reverse_weight, + module + + "reverse decoder weight. rescoring_score = " + "left_to_right_score * (1 - reverse_weight) + " + "right_to_left_score * reverse_weight."); + } +}; + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index fd689023..f22bfea2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -1,3 +1,5 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Binbin Zhang (binbzha@qq.com) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +15,12 @@ // limitations under the License. 
+#include "decoder/ctc_prefix_beam_search_decoder.h" #include "base/common.h" #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" -#include "decoder/ctc_prefix_beam_search_decoder.h" #include "utils/math.h" +#include "absl/strings/str_join.h" #ifdef USE_PROFILING #include "paddle/fluid/platform/profiler.h" @@ -29,85 +32,47 @@ namespace ppspeech { CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts) { - InitDecoder(); + Reset(); } -void CTCPrefixBeamSearch::InitDecoder() { +void CTCPrefixBeamSearch::Reset() { num_frame_decoded_ = 0; cur_hyps_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); - abs_time_step_ = 0; + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.b = 0.0f; // log(1) + prefix_score.nb = -kBaseFloatMax; // log(0) + prefix_score.v_b = 0.0f; // log(1) + prefix_score.v_nb = 0.0f; // log(1) + cur_hyps_[empty] = prefix_score; - // empty hyp with Score - std::vector empty; - PrefixScore prefix_score; - prefix_score.b = 0.0f; // log(1) - prefix_score.nb = -kBaseFloatMax; // log(0) - prefix_score.v_b = 0.0f; // log(1) - prefix_score.v_nb = 0.0f; // log(1) - cur_hyps_[empty] = prefix_score; + outputs_.emplace_back(empty); + hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); + } - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.TotalScore()); - times_.emplace_back(empty); - -} +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } -void CTCPrefixBeamSearch::Reset() { - InitDecoder(); -} - -void CTCPrefixBeamSearch::Decode( - std::shared_ptr decodable) { - return; -} - -int32 CTCPrefixBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 
1; } - - -void CTCPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - // const std::vector& start_boundaries = prefix.second.start_boundaries; - // const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - // if (s < start_boundaries.size() && i == start_boundaries[s]){ - // // - // output.emplace_back(context_graph_->start_tag_id()); - // ++s; - // } - - output.emplace_back(input[i]); - - // if (e < end_boundaries.size() && i == end_boundaries[e]){ - // // - // output.emplace_back(context_graph_->end_tag_id()); - // ++e; - // } - } - - outputs_.emplace_back(output); -} void CTCPrefixBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { while (1) { + // forward frame by frame std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); if (flag == false) break; + std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); @@ -117,201 +82,279 @@ void CTCPrefixBeamSearch::AdvanceDecode( static bool PrefixScoreCompare( const std::pair, PrefixScore>& a, const std::pair, PrefixScore>& b) { - // log domain - return a.second.TotalScore() > b.second.TotalScore(); + // log domain + return a.second.TotalScore() > b.second.TotalScore(); } -void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector>& logp) { +void CTCPrefixBeamSearch::AdvanceDecoding( + const std::vector>& logp) { #ifdef USE_PROFILING - RecordEvent event( - "CtcPrefixBeamSearch::AdvanceDecoding", TracerEventType::UserDefined, 1); + RecordEvent event("CtcPrefixBeamSearch::AdvanceDecoding", + TracerEventType::UserDefined, + 1); #endif - if (logp.size() == 0) return; - - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - 
std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; - - // 1. first beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields b(blank ending Score) and - // nb(none blank ending Score) to -inf, respectively. - - if (id == opts_.blank) { - // case 0: *a + => *a, *a + => *a, prefix not - // change - PrefixScore& next_score = next_hyps[prefix]; - next_score.b = LogSumExp(next_score.b, prefix_score.Score() + prob); - - // timestamp, blank is slince, not effact timestamp - next_score.v_b = prefix_score.ViterbiScore() + prob; - next_score.times_b = prefix_score.Times(); - - // Prefix not changed, copy the context from pefix - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - - } else if (!prefix.empty() && id == prefix.back()) { - // case 1: *a + a => *a, prefix not changed - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.nb = LogSumExp(next_score1.nb, prefix_score.nb + prob); - - // timestamp, non-blank symbol effact timestamp - if (next_score1.v_nb < prefix_score.v_nb + prob) { - // compute viterbi Score - next_score1.v_nb = prefix_score.v_nb + prob; - if (next_score1.cur_token_prob < prob) { - // store max token prob - next_score1.cur_token_prob = prob; - // update this timestamp as token appeared here. 
- next_score1.times_nb = prefix_score.times_nb; - assert(next_score1.times_nb.size() > 0); - next_score1.times_nb.back() = abs_time_step_; - } - } - - // Prefix not changed, copy the context from pefix - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // case 2: *a + a => *aa, prefix changed. - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.nb = LogSumExp(next_score2.nb, prefix_score.b + prob); - - // timestamp, non-blank symbol effact timestamp - if (next_score2.v_nb < prefix_score.v_b + prob) { - // compute viterbi Score - next_score2.v_nb = prefix_score.v_b + prob; - // new token added - next_score2.cur_token_prob = prob; - next_score2.times_nb = prefix_score.times_b; - next_score2.times_nb.emplace_back(abs_time_step_); - } - - // Prefix changed, calculate the context Score. - if (context_graph_ && !next_score2.has_context) { - next_score2.UpdateContext( - context_graph_, prefix_score, id, prefix.size()); - next_score2.has_context = true; - } - - } else { - // id != prefix.back() - // case 3: *a + b => *ab, *a +b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.nb = LogSumExp(next_score.nb, prefix_score.Score() + prob); - - // timetamp, non-blank symbol effact timestamp - if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { - next_score.v_nb = prefix_score.ViterbiScore() + prob; - - next_score.cur_token_prob = prob; - next_score.times_nb = prefix_score.Times(); - next_score.times_nb.emplace_back(abs_time_step_); - } - - // Prefix changed, calculate the context Score. 
- if (context_graph_ && !next_score.has_context) { - next_score.UpdateContext( - context_graph_, prefix_score, id, prefix.size()); - next_score.has_context = true; - } - } - } // end for (const auto& it : cur_hyps_) - } // end for (int i = 0; i < topk_index.size(); ++i) - - // 3. second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), - arr.begin() + second_beam_size, - arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. update cur_hyps by next_hyps, and get new result - UpdateHypotheses(arr); - - num_frame_decoded_++; - } // end for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { + const std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> + next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + + // 2. token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will + // insert + // PrefixScore(-inf, -inf) by default, since the default + // constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. 
+ + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, + // prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = + LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = + LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. + next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = num_frame_decoded_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. 
+ std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = + LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = + LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. 
second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr( + next_hyps.begin(), next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + } // end for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) } void CTCPrefixBeamSearch::UpdateHypotheses( const std::vector, PrefixScore>>& hyps) { - cur_hyps_.clear(); - - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - - for (auto& item : hyps) { - cur_hyps_[item.first] = item.second; - - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.TotalScore()); - viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); - times_.emplace_back(item.second.Times()); - } + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } } -void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + const std::vector& start_boundaries = prefix.second.start_boundaries; + const std::vector& end_boundaries = prefix.second.end_boundaries; + + // add tag + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + // if (s < 
start_boundaries.size() && i == start_boundaries[s]){ + // // + // output.emplace_back(context_graph_->start_tag_id()); + // ++s; + // } + + output.emplace_back(input[i]); + + // if (e < end_boundaries.size() && i == end_boundaries[e]){ + // // + // output.emplace_back(context_graph_->end_tag_id()); + // ++e; + // } + } + outputs_.emplace_back(output); +} + +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); +} void CTCPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - assert(hypotheses_.size() == cur_hyps_.size()); - assert(hypotheses_.size() == likelihood_.size()); - - // We should backoff the context Score/state when the context is - // not fully matched at the last time. - for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_score != 0) { - // prefix_score.UpdateContext(context_graph_, prefix_score, 0, - // prefix.size()); + if (context_graph_ == nullptr) return; + + CHECK(hypotheses_.size() == cur_hyps_.size()); + CHECK(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. 
+ for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + prefix_score.UpdateContext(context_graph_, prefix_score, 0, + prefix.size()); + } } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + + std::string CTCPrefixBeamSearch::GetBestPath(int index) { + int n_hyps = Outputs().size(); + CHECK(n_hyps > 0); + CHECK(index < n_hyps); + std::vector one = Outputs()[index]; + return std::string(absl::StrJoin(one, kSpaceSymbol)); + } + + std::string CTCPrefixBeamSearch::GetBestPath() { + return GetBestPath(0); + } + + std::vector> CTCPrefixBeamSearch::GetNBestPath(int n) { + int hyps_size = hypotheses_.size(); + CHECK(hyps_size > 0); + + int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); + + std::vector> n_best; + n_best.reserve(min_n); + + for (int i = 0; i < min_n; i++){ + n_best.emplace_back(Likelihood()[i], GetBestPath(i) ); + } + return n_best; + } + + std::vector> CTCPrefixBeamSearch::GetNBestPath() { + return GetNBestPath(-1); } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); +std::string CTCPrefixBeamSearch::GetFinalBestPath() { + return GetBestPath(); +} + +std::string CTCPrefixBeamSearch::GetPartialResult() { + return GetBestPath(); } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index b67733e8..ba44b0a2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -15,6 +15,7 @@ #pragma once #include 
"decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_result.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" @@ -25,48 +26,37 @@ class CTCPrefixBeamSearch : public DecoderInterface { explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} - void InitDecoder(); + void InitDecoder() override; - void Reset(); + void Reset() override; void AdvanceDecode( - const std::shared_ptr& decodable); + const std::shared_ptr& decodable) override; - std::string GetFinalBestPath(); + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; - std::string GetPartialResult() { - CHECK(false) << "Not implement."; - return {}; - } - - void Decode(std::shared_ptr decodable); - - std::string GetBestPath(); - - std::vector> GetNBestPath(); - - - int NumFrameDecoded(); - - int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + void FinalizeSearch(); - const std::vector& ViterbiLikelihood() const { - return viterbi_likelihood_; - } + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; const std::vector>& Inputs() const { return hypotheses_; } - const std::vector>& Outputs() const { return outputs_; } - const std::vector& Likelihood() const { return likelihood_; } + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } const std::vector>& Times() const { return times_; } private: - void AdvanceDecoding(const std::vector>& logp); + std::string GetBestPath(int index); - void FinalizeSearch(); + void AdvanceDecoding( + const std::vector>& logp); void UpdateOutputs(const std::pair, PrefixScore>& prefix); void UpdateHypotheses( @@ -77,8 +67,6 @@ class CTCPrefixBeamSearch : public DecoderInterface { private: CTCBeamSearchOptions opts_; - int abs_time_step_ = 0; - std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -97,4 +85,29 @@ class 
CTCPrefixBeamSearch : public DecoderInterface { DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; + +class CTCPrefixBeamSearchDecoder : public CTCPrefixBeamSearch { + public: + explicit CTCPrefixBeamSearchDecoder(const CTCBeamSearchDecoderOptions& opts) + : CTCPrefixBeamSearch(opts.ctc_prefix_search_opts), opts_(opts) {} + + ~CTCPrefixBeamSearchDecoder() {} + + private: + CTCBeamSearchDecoderOptions opts_; + + // cache feature + bool start_ = false; // false, this is first frame. + // for continues decoding + int num_frames_ = 0; + int global_frame_offset_ = 0; + const int time_stamp_gap_ = + 100; // timestamp gap between words in a sentence + + // std::unique_ptr ctc_endpointer_; + + int num_frames_in_current_chunk_ = 0; + std::vector result_; +}; + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc new file mode 100644 index 00000000..8927a5f4 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/common.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/u2_nnet.h" +#include "absl/strings/str_split.h" +#include "fst/symbol-table.h" + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_string(vocab_path, "", "vocab path"); + +DEFINE_string(model_path, "", "paddle nnet model"); + +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=3) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=3) module downsampling rate."); + +DEFINE_int32(nnet_decoder_chunk, 16, "paddle nnet forward chunk"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +// test ds2 online decoder by feeding speech feature +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK(FLAGS_result_wspecifier != ""); + CHECK(FLAGS_feature_rspecifier != ""); + CHECK(FLAGS_vocab_path != ""); + CHECK(FLAGS_model_path != ""); + LOG(INFO) << "model path: " << FLAGS_model_path; + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; + fst::SymbolTable* unit_table = fst::SymbolTable::ReadText(FLAGS_vocab_path); + + // nnet + ppspeech::ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + std::shared_ptr nnet( + new ppspeech::U2Nnet(model_opts)); + + // decodeable + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data)); + + // decoder + 
ppspeech::CTCBeamSearchDecoderOptions opts; + opts.chunk_size = 16; + opts.num_left_chunks = -1; + opts.ctc_weight = 0.5; + opts.rescoring_weight = 1.0; + opts.reverse_weight = 0.3; + opts.ctc_prefix_search_opts.blank = 0; + opts.ctc_prefix_search_opts.first_beam_size = 10; + opts.ctc_prefix_search_opts.second_beam_size = 10; + ppspeech::CTCPrefixBeamSearchDecoder decoder(opts); + + + int32 chunk_size = FLAGS_receptive_field_length + + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; + int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + decoder.InitDecoder(); + + kaldi::Timer timer; + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + raw_data->SetDim(feat_dim); + + int32 ori_feature_len = feature.NumRows(); + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; + + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + int32 this_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + this_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (this_chunk_size < receptive_field_length) { + LOG(WARNING) << "utt: " << utt << " skip last " + << this_chunk_size << " frames, expect is " + << receptive_field_length; + break; + } + + + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < this_chunk_size; ++row_id) { + 
kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // forward nnet + decoder.AdvanceDecode(decodable); + } + + decoder.FinalizeSearch(); + + // get 1-best result + std::string result_ints = decoder.GetFinalBestPath(); + std::vector tokenids = absl::StrSplit(result_ints, ppspeech::kSpaceSymbol); + std::string result; + for (int i = 0; i < tokenids.size(); i++){ + result += unit_table->Find(std::stoi(tokenids[i])); + } + + // after process one utt, then reset state. + decodable->Reset(); + decoder.Reset(); + + if (result.empty()) { + // the TokenWriter can not write empty string. + ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << " the result of " << utt << " is " << result; + result_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + LOG(INFO) << "Program cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h new file mode 100644 index 00000000..caa3e37e --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} +}; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index de97f6ad..4d0a21d5 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -18,16 +18,23 @@ namespace ppspeech { TLGDecoder::TLGDecoder(TLGDecoderOptions opts) { fst_.reset(fst::Fst::Read(opts.fst_path)); CHECK(fst_ != nullptr); + word_symbol_table_.reset( fst::SymbolTable::ReadText(opts.word_symbol_table)); + decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts)); + + Reset(); +} + +void TLGDecoder::Reset() { decoder_->InitDecoding(); num_frame_decoded_ = 0; + return; } void TLGDecoder::InitDecoder() { - decoder_->InitDecoding(); - num_frame_decoded_ = 0; + Reset(); } void TLGDecoder::AdvanceDecode( @@ -42,10 +49,7 @@ void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { num_frame_decoded_++; } -void TLGDecoder::Reset() { - InitDecoder(); - return; -} + std::string TLGDecoder::GetPartialResult() { if (num_frame_decoded_ == 0) { @@ -88,4 +92,5 @@ std::string 
TLGDecoder::GetFinalBestPath() { } return words; } + } diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f3ecde73..2f1d6c10 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -42,20 +42,27 @@ class TLGDecoder : public DecoderInterface { void AdvanceDecode( const std::shared_ptr& decodable); - - std::string GetFinalBestPath(); - std::string GetPartialResult(); - - void Decode(); - std::string GetBestPath(); - std::vector> GetNBestPath(); + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; - int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); + protected: + std::string GetBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath(int n) override { + CHECK(false); + return {}; + } private: void AdvanceDecoding(kaldi::DecodableInterface* decodable); diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index 1bbc6b11..fe4e7408 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -28,27 +28,31 @@ class DecoderInterface { virtual void Reset() = 0; + // call AdvanceDecoding virtual void AdvanceDecode( const std::shared_ptr& decodable) = 0; + // call GetBestPath virtual std::string GetFinalBestPath() = 0; virtual std::string GetPartialResult() = 0; - // void Decode(); + protected: + // virtual void AdvanceDecoding(kaldi::DecodableInterface* decodable) = 0; - // std::string GetBestPath(); - // std::vector> GetNBestPath(); + // virtual void Decode() = 0; - // int NumFrameDecoded(); - // int DecodeLikelihoods(const std::vector>& probs, - // std::vector& nbest_words); + virtual std::string GetBestPath() = 0; + virtual std::vector> GetNBestPath() = 0; - protected: - // void 
AdvanceDecoding(kaldi::DecodableInterface* decodable); + virtual std::vector> GetNBestPath(int n) = 0; - // current decoding frame number + // start from one + int NumFrameDecoded() { return num_frame_decoded_ + 1; } + + protected: + // current decoding frame number, abs_time_step_ int32 num_frame_decoded_; }; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 2dd1fa0d..4b30f6b4 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -86,17 +86,6 @@ int main(int argc, char* argv[]) { LOG(INFO) << "utt: " << utt; LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; - // // pad feats - // int32 padding_len = 0; - // if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { - // padding_len = - // chunk_stride - (feature.NumRows() - chunk_size) % - // chunk_stride; - // feature.Resize(feature.NumRows() + padding_len, - // feature.NumCols(), - // kaldi::kCopyData); - // } - int32 frame_idx = 0; int vocab_dim = 0; std::vector> prob_vec; diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 6a13f69b..c218990a 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -68,7 +68,7 @@ void TopK(const std::vector& data, for (int i = k; i < n; i++) { if (pq.top().first < data[i]) { pq.pop(); - pq.emplace_back(data[i], i); + pq.emplace(data[i], i); } } @@ -88,4 +88,9 @@ void TopK(const std::vector& data, } } +template void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) ; + } // namespace ppspeech \ No newline at end of file From 13a7fa9808d0faaa1589e0ef0659c537bd4d5dbb Mon Sep 17 00:00:00 2001 From: "david.95" Date: Fri, 14 Oct 2022 15:37:33 +0800 Subject: [PATCH 14/60] enable chinese words' pinyin specified in text of ssml formats, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 6 +- paddlespeech/t2s/frontend/zh_frontend.py | 156 ++++++++++++++++++++++ paddlespeech/t2s/ssml/xml_processor.py 
| 163 +++++++++++++++++++++++ 3 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 paddlespeech/t2s/ssml/xml_processor.py diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb7..f9d1cd1b 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import math import os +import re from pathlib import Path from typing import Any from typing import Dict @@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import + # remove [W:onnxruntime: xxx] from ort ort.set_default_logger_severity(3) @@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = line.strip().split() + items = re.split(r"\s+", line.strip(), 1) utt_id = items[0] if lang == 'zh': sentence = "".join(items[1:]) @@ -180,7 +182,7 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids( + input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 722eed60..25558780 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os import re +from operator import itemgetter from typing import Dict from typing import List @@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -81,6 +83,7 @@ class Frontend(): g2p_model="g2pW", phone_vocab_path=None, tone_vocab_path=None): + self.mix_ssml_processor = MixTextProcessor() self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" @@ -143,6 +146,7 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) + self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() @@ -281,6 +285,65 @@ class Frontend(): phones_list.append(merge_list) return phones_list + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + phones_list = [] + initials = [] + finals = [] + + words = self._split_word_to_char(words[0]) + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + 
sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + phones = [] + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc: + phones.append(v) + phones_list.append(phones) + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + return phones_list + def _merge_erhua(self, initials: List[str], finals: List[str], @@ -396,6 +459,52 @@ class Frontend(): print("----------------------------") return phonemes + #@an added for ssml pinyin + def get_phonemes_ssml(self, + ssml_inputs: list, + merge_sentences: bool=True, + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: + all_phonemes = [] + for word_pinyin_item in ssml_inputs: + phonemes = [] + sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + phonemes = self._g2p( + sentences, + merge_sentences=merge_sentences, + with_erhua=with_erhua) + else: + # phonemes should be pinyin_spec + phonemes = self._g2p_assign( + sentences, pinyin_spec, merge_sentences=merge_sentences) + + all_phonemes = all_phonemes + phonemes + + if robot: + new_phonemes = [] + for sentence in all_phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = item[:-1] + "1" + new_sentence.append(item) + 
new_phonemes.append(new_sentence) + all_phonemes = new_phonemes + + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(all_phonemes[0]) + print("----------------------------") + return [sum(all_phonemes, [])] + def get_input_ids(self, sentence: str, merge_sentences: bool=True, @@ -405,6 +514,7 @@ class Frontend(): add_blank: bool=False, blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -437,3 +547,49 @@ class Frontend(): if temp_phone_ids: result["phone_ids"] = temp_phone_ids return result + + # @an added for ssml + def get_input_ids_ssml( + self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False, + add_blank: bool=False, + blank_token: str="", + to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + + l_inputs = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( + l_inputs, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: + tone_ids = self._t2id(tones) + if to_tensor: + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + # if use paddle.to_tensor() in onnxruntime, the first time will be too low + if to_tensor: + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids + return result diff --git 
a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py new file mode 100644 index 00000000..54f24f59 --- /dev/null +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +import re +import xml.dom.minidom +import xml.parsers.expat +from xml.dom.minidom import Node +from xml.dom.minidom import parseString +''' +Note: xml 有5种特殊字符, &<>"' +其一,采用特殊标签,将包含特殊字符的字符串封装起来。 +例如: + +其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为: +& & +< < +> > +" " +' ' +例如: +"姓名" + +''' + + +class MixTextProcessor(): + def __repr__(self): + print("@an MixTextProcessor class") + + def get_xml_content(self, mixstr): + '''返回字符串的 xml 内容''' + xmlptn = re.compile(r".*?", re.M | re.S) + ctn = re.search(xmlptn, mixstr) + if ctn: + return ctn.group(0) + else: + return None + + def get_content_split(self, mixstr): + ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为xml 中tag 属性带空格 + ''' + ctlist = [] + # print("Testing:",mixstr[:20]) + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + ctlist.append(in_xml) + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist + + @classmethod + def get_pinyin_split(self, mixstr): + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append([pre_xml, []]) + dom = DomXml(in_xml) + pinyinlist = dom.get_pinyins_for_xml() + ctlist = ctlist + pinyinlist + ctlist.append([after_xml, []]) + else: + ctlist.append([mixstr, []]) + return ctlist + + +class DomXml(): + def __init__(self, xmlstr): + print("Parse xml str:", xmlstr) + self.tdom = parseString(xmlstr) #Document + # print("tdom:",type(self.tdom)) + self.root = self.tdom.documentElement #Element + # print("root:",type(self.root)) + 
self.rnode = self.tdom.childNodes #NodeList + # print("rnode:",type(self.rnode)) + pass + + def get_text(self): + '''返回xml 内容的所有文本内容的 列表''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_xmlchild_list(self): + '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + print(res) + return res + + def get_pinyins_for_xml(self): + '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + t = re.sub(r"\s+", "", x1.value) + res.append([t, []]) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x2.data) + res.append([t, []]) + else: + # print("x2",x2,x2.tagName) + if x2.hasAttribute('pinyin'): + pinyin_value = x2.getAttribute("pinyin") + pinyins = pinyin_value.split(" ") + for x3 in x2.childNodes: + # print('x3',x3) + if isinstance(x3, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x3.data) + res.append([t, pinyins]) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_all_tags(self, tag_name): + '''获取所有的tag 及属性值''' + alltags = self.root.getElementsByTagName(tag_name) + for x in alltags: + if x.hasAttribute('pinyin'): # pinyin + print(x.tagName, 'pinyin', + x.getAttribute('pinyin'), x.firstChild.data) From 86eb718908ea34e3617b76308b1e0fb3f911f1ba Mon Sep 17 00:00:00 2001 From: Hui 
Zhang Date: Fri, 14 Oct 2022 11:31:01 +0000 Subject: [PATCH 15/60] add u2 recg --- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 21 +- speechx/speechx/decoder/common.h | 31 ++- speechx/speechx/decoder/ctc_beam_search_opt.h | 64 ------ .../decoder/ctc_prefix_beam_search_decoder.cc | 14 +- .../decoder/ctc_prefix_beam_search_decoder.h | 45 ++-- .../ctc_prefix_beam_search_decoder_main.cc | 28 +-- .../decoder/ctc_prefix_beam_search_result.h | 41 ---- speechx/speechx/decoder/decoder_itf.h | 4 + speechx/speechx/decoder/param.h | 35 +-- speechx/speechx/decoder/recognizer.cc | 6 + speechx/speechx/decoder/recognizer.h | 13 +- speechx/speechx/decoder/recognizer_main.cc | 29 ++- speechx/speechx/decoder/u2_recognizer.cc | 209 ++++++++++++++++++ speechx/speechx/decoder/u2_recognizer.h | 164 ++++++++++++++ speechx/speechx/decoder/u2_recognizer_main.cc | 137 ++++++++++++ .../frontend/audio/feature_pipeline.cc | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 17 +- speechx/speechx/nnet/ds2_nnet.cc | 1 + speechx/speechx/nnet/ds2_nnet.h | 2 + speechx/speechx/nnet/nnet_itf.h | 9 +- speechx/speechx/nnet/u2_nnet.h | 3 +- .../speechx/protocol/websocket/CMakeLists.txt | 2 - .../websocket/websocket_server_main.cc | 29 ++- 24 files changed, 693 insertions(+), 215 deletions(-) delete mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_result.h create mode 100644 speechx/speechx/decoder/u2_recognizer.cc create mode 100644 speechx/speechx/decoder/u2_recognizer.h create mode 100644 speechx/speechx/decoder/u2_recognizer_main.cc diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index 12297661..24e9fca5 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x set -e . 
path.sh diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 8cf94a10..472d9332 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -9,6 +9,7 @@ add_library(decoder STATIC ctc_prefix_beam_search_decoder.cc ctc_tlg_decoder.cc recognizer.cc + u2_recognizer.cc ) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) @@ -28,10 +29,16 @@ endforeach() # u2 -set(bin_name ctc_prefix_beam_search_decoder_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) -target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) -target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) -target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) \ No newline at end of file +set(TEST_BINS + u2_recognizer_main + ctc_prefix_beam_search_decoder_main +) + +foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endforeach() \ No newline at end of file diff --git a/speechx/speechx/decoder/common.h b/speechx/speechx/decoder/common.h index 52deffac..0ae73277 100644 --- a/speechx/speechx/decoder/common.h +++ b/speechx/speechx/decoder/common.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin 
Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,10 +13,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "base/basic_types.h" +#pragma once + +#include "base/common.h" struct DecoderResult { BaseFloat acoustic_score; std::vector words_idx; - std::vector> time_stamp; + std::vector> time_stamp; +}; + + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} }; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index af92fad0..d21b3abd 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -76,68 +76,4 @@ struct CTCBeamSearchOptions { } }; - -// used by u2 model -struct CTCBeamSearchDecoderOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 67=16*4 + 3, stride is 64=16*4 - int chunk_size; - int num_left_chunks; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * - // ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores - // in the following two search methods are different. 
For - // CtcPrefixBeamSerch, - // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a - // max(viterbi) path score + context score So we should carefully set - // ctc_weight accroding to the search methods. - float ctc_weight; - float rescoring_weight; - float reverse_weight; - - // CtcEndpointConfig ctc_endpoint_opts; - - CTCBeamSearchOptions ctc_prefix_search_opts; - - CTCBeamSearchDecoderOptions() - : chunk_size(16), - num_left_chunks(-1), - ctc_weight(0.5), - rescoring_weight(1.0), - reverse_weight(0.0) {} - - void Register(kaldi::OptionsItf* opts) { - std::string module = "DecoderConfig: "; - opts->Register( - "chunk-size", - &chunk_size, - module + "the frame number of one chunk after subsampling."); - opts->Register("num-left-chunks", - &num_left_chunks, - module + "the left history chunks number."); - opts->Register("ctc-weight", - &ctc_weight, - module + - "ctc weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("rescoring-weight", - &rescoring_weight, - module + - "attention score weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("reverse-weight", - &reverse_weight, - module + - "reverse decoder weight. 
rescoring_score = " - "left_to_right_score * (1 - reverse_weight) + " - "right_to_left_score * reverse_weight."); - } -}; - } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index f22bfea2..ce2d4dc2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -30,8 +30,14 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) +CTCPrefixBeamSearch::CTCPrefixBeamSearch( + const std::string vocab_path, + const CTCBeamSearchOptions& opts) : opts_(opts) { + + unit_table_ = std::shared_ptr(fst::SymbolTable::ReadText(vocab_path)); + CHECK(unit_table_ != nullptr); + Reset(); } @@ -322,7 +328,11 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { CHECK(n_hyps > 0); CHECK(index < n_hyps); std::vector one = Outputs()[index]; - return std::string(absl::StrJoin(one, kSpaceSymbol)); + std::string sentence; + for (int i = 0; i < one.size(); i++){ + sentence += unit_table_->Find(one[i]); + } + return sentence; } std::string CTCPrefixBeamSearch::GetBestPath() { diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index ba44b0a2..2c28bee1 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -15,17 +15,21 @@ #pragma once #include "decoder/ctc_beam_search_opt.h" -#include "decoder/ctc_prefix_beam_search_result.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" +#include "fst/symbol-table.h" + namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderInterface { public: - explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); + explicit CTCPrefixBeamSearch(const 
std::string vocab_path, + const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} + SearchType Type() const { return SearchType::kPrefixBeamSearch; } + void InitDecoder() override; void Reset() override; @@ -38,10 +42,9 @@ class CTCPrefixBeamSearch : public DecoderInterface { void FinalizeSearch(); - protected: - std::string GetBestPath() override; - std::vector> GetNBestPath() override; - std::vector> GetNBestPath(int n) override; + const std::shared_ptr VocabTable() const { + return unit_table_; + } const std::vector>& Inputs() const { return hypotheses_; } const std::vector>& Outputs() const { return outputs_; } @@ -52,6 +55,11 @@ class CTCPrefixBeamSearch : public DecoderInterface { const std::vector>& Times() const { return times_; } + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; + private: std::string GetBestPath(int index); @@ -66,6 +74,7 @@ class CTCPrefixBeamSearch : public DecoderInterface { private: CTCBeamSearchOptions opts_; + std::shared_ptr unit_table_; std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -86,28 +95,4 @@ class CTCPrefixBeamSearch : public DecoderInterface { }; -class CTCPrefixBeamSearchDecoder : public CTCPrefixBeamSearch { - public: - explicit CTCPrefixBeamSearchDecoder(const CTCBeamSearchDecoderOptions& opts) - : CTCPrefixBeamSearch(opts.ctc_prefix_search_opts), opts_(opts) {} - - ~CTCPrefixBeamSearchDecoder() {} - - private: - CTCBeamSearchDecoderOptions opts_; - - // cache feature - bool start_ = false; // false, this is first frame. 
- // for continues decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = - 100; // timestamp gap between words in a sentence - - // std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; -}; - } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 8927a5f4..dd352378 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -55,14 +55,12 @@ int main(int argc, char* argv[]) { CHECK(FLAGS_vocab_path != ""); CHECK(FLAGS_model_path != ""); LOG(INFO) << "model path: " << FLAGS_model_path; + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; - fst::SymbolTable* unit_table = fst::SymbolTable::ReadText(FLAGS_vocab_path); - // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; @@ -75,16 +73,11 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data)); // decoder - ppspeech::CTCBeamSearchDecoderOptions opts; - opts.chunk_size = 16; - opts.num_left_chunks = -1; - opts.ctc_weight = 0.5; - opts.rescoring_weight = 1.0; - opts.reverse_weight = 0.3; - opts.ctc_prefix_search_opts.blank = 0; - opts.ctc_prefix_search_opts.first_beam_size = 10; - opts.ctc_prefix_search_opts.second_beam_size = 10; - ppspeech::CTCPrefixBeamSearchDecoder decoder(opts); + ppspeech::CTCBeamSearchOptions opts; + opts.blank = 0; + opts.first_beam_size = 10; + opts.second_beam_size = 10; + ppspeech::CTCPrefixBeamSearch decoder(FLAGS_vocab_path, opts); int32 chunk_size = FLAGS_receptive_field_length + @@ -150,17 +143,14 @@ int main(int argc, char* 
argv[]) { // forward nnet decoder.AdvanceDecode(decodable); + + LOG(INFO) << "Partial result: " << decoder.GetPartialResult(); } decoder.FinalizeSearch(); // get 1-best result - std::string result_ints = decoder.GetFinalBestPath(); - std::vector tokenids = absl::StrSplit(result_ints, ppspeech::kSpaceSymbol); - std::string result; - for (int i = 0; i < tokenids.size(); i++){ - result += unit_table->Find(std::stoi(tokenids[i])); - } + std::string result = decoder.GetFinalBestPath(); // after process one utt, then reset state. decodable->Reset(); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h deleted file mode 100644 index caa3e37e..00000000 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "base/common.h" - -namespace ppspeech { - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kBaseFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -} // namespace ppspeech diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index fe4e7408..eec9bc3d 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -20,6 +20,10 @@ namespace ppspeech { +enum SearchType { + kPrefixBeamSearch = 0, + kWfstBeamSearch = 1, +}; class DecoderInterface { public: virtual ~DecoderInterface() {} diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 8a5990dc..e0f22d8c 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,12 +19,15 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" + // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. 
True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); + + // feature sliding window DEFINE_int32(receptive_field_length, 7, @@ -33,6 +36,8 @@ DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + + // nnet DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); @@ -89,34 +94,4 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { return opts; } -ModelOptions InitModelOptions() { - ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - return model_opts; -} - -TLGDecoderOptions InitDecoderOptions() { - TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - return decoder_opts; -} - -RecognizerResource InitRecognizerResoure() { - RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = InitFeaturePipelineOptions(); - resource.model_opts = InitModelOptions(); - resource.tlg_opts = InitDecoderOptions(); - return resource; -} - } // namespace ppspeech diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc index 44c3911c..bb9ea187 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/decoder/recognizer.cc @@ -14,6 +14,7 @@ #include "decoder/recognizer.h" + namespace ppspeech { using kaldi::Vector; @@ -23,14 
+24,19 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; + Recognizer::Recognizer(const RecognizerResource& resource) { // resource_ = resource; const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + std::shared_ptr nnet(new PaddleNnet(resource.model_opts)); + BaseFloat ac_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale)); + decoder_.reset(new TLGDecoder(resource.tlg_opts)); + input_finished_ = false; } diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index e47ca433..4965e7a3 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -25,16 +25,11 @@ namespace ppspeech { struct RecognizerResource { - FeaturePipelineOptions feature_pipeline_opts; - ModelOptions model_opts; - TLGDecoderOptions tlg_opts; + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale; - RecognizerResource() - : acoustic_scale(1.0), - feature_pipeline_opts(), - model_opts(), - tlg_opts() {} + kaldi::BaseFloat acoustic_scale{1.0}; }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 05026646..2b497d6e 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -22,6 +22,33 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions model_opts; + 
model_opts.model_path = FLAGS_model_path; + model_opts.param_path = FLAGS_param_path; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; + model_opts.input_names = FLAGS_model_input_names; + model_opts.output_names = FLAGS_model_output_names; + model_opts.subsample_rate = FLAGS_downsampling_rate; + resource.model_opts = model_opts; + + ppspeech::TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + + resource.tlg_opts = decoder_opts; + + return resource; +} + int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -29,7 +56,7 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc new file mode 100644 index 00000000..0ace086c --- /dev/null +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/u2_recognizer.h" +#include "nnet/u2_nnet.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource) { + const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; + feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + + BaseFloat am_scale = resource.acoustic_scale; + decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + + decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + + unit_table_ = decoder_->VocabTable(); + symbol_table_ = unit_table_; + + input_finished_ = false; +} + +void U2Recognizer::Reset() { + global_frame_offset_ = 0; + num_frames_ = 0; + result_.clear(); + + feature_pipeline_->Reset(); + decodable_->Reset(); + decoder_->Reset(); +} + +void U2Recognizer::ResetContinuousDecoding() { + global_frame_offset_ = num_frames_; + num_frames_ = 0; + result_.clear(); + + feature_pipeline_->Reset(); + decodable_->Reset(); + decoder_->Reset(); +} + + +void U2Recognizer::Accept(const VectorBase& waves) { + feature_pipeline_->Accept(waves); +} + + +void U2Recognizer::Decode() { + decoder_->AdvanceDecode(decodable_); +} + +void U2Recognizer::Rescoring() { + // Do attention Rescoring + kaldi::Timer timer; + AttentionRescoring(); + VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; +} + +void U2Recognizer::UpdateResult(bool finish) { + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); + + CHECK_EQ(hypotheses.size(), 
likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. So here we use + // time stamp of the input(e2e model unit), which is more accurate, and it + // requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); + + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); + + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); + + int start = time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ + : 0; + if (j > 0) { + start = (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } + } + + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } + + result_.emplace_back(path); + } + + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } +} + +void U2Recognizer::AttentionRescoring() { + decoder_->FinalizeSearch(); + UpdateResult(true); + + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + kaldi::Timer timer; + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score; + result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + } + + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(1) << "result: " << result_[0].sentence + << " score: " << result_[0].score; +} + +std::string U2Recognizer::GetFinalResult() { + return result_[0].sentence; +} + +std::string U2Recognizer::GetPartialResult() { + return result_[0].sentence; +} + +void U2Recognizer::SetFinished() { + feature_pipeline_->SetFinished(); + input_finished_ = true; +} 
+ + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h new file mode 100644 index 00000000..0947e593 --- /dev/null +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -0,0 +1,164 @@ + + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "decoder/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "decoder/decoder_itf.h" +#include "frontend/audio/feature_pipeline.h" +#include "nnet/decodable.h" + +#include "fst/fstlib.h" +#include "fst/symbol-table.h" + +namespace ppspeech { + + +struct DecodeOptions { + // chunk_size is the frame number of one chunk after subsampling. + // e.g. if subsample rate is 4 and chunk_size = 16, the frames in + // one chunk are 67=16*4 + 3, stride is 64=16*4 + int chunk_size; + int num_left_chunks; + + // final_score = rescoring_weight * rescoring_score + ctc_weight * + // ctc_score; + // rescoring_score = left_to_right_score * (1 - reverse_weight) + + // right_to_left_score * reverse_weight + // Please note the concept of ctc_scores + // in the following two search methods are different. 
For + // CtcPrefixBeamSerch, + // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a + // max(viterbi) path score + context score So we should carefully set + // ctc_weight accroding to the search methods. + float ctc_weight; + float rescoring_weight; + float reverse_weight; + + // CtcEndpointConfig ctc_endpoint_opts; + CTCBeamSearchOptions ctc_prefix_search_opts; + + DecodeOptions() + : chunk_size(16), + num_left_chunks(-1), + ctc_weight(0.5), + rescoring_weight(1.0), + reverse_weight(0.0) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "DecoderConfig: "; + opts->Register( + "chunk-size", + &chunk_size, + module + "the frame number of one chunk after subsampling."); + opts->Register("num-left-chunks", + &num_left_chunks, + module + "the left history chunks number."); + opts->Register("ctc-weight", + &ctc_weight, + module + + "ctc weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("rescoring-weight", + &rescoring_weight, + module + + "attention score weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("reverse-weight", + &reverse_weight, + module + + "reverse decoder weight. 
rescoring_score = " + "left_to_right_score * (1 - reverse_weight) + " + "right_to_left_score * reverse_weight."); + } +}; + + +struct U2RecognizerResource { + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + DecodeOptions decoder_opts{}; + // CTCBeamSearchOptions beam_search_opts; + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; +}; + + +class U2Recognizer { + public: + explicit U2Recognizer(const U2RecognizerResource& resouce); + void Reset(); + void ResetContinuousDecoding(); + + void Accept(const kaldi::VectorBase& waves); + void Decode(); + void Rescoring(); + + + std::string GetFinalResult(); + std::string GetPartialResult(); + + void SetFinished(); + bool IsFinished() { return input_finished_; } + + bool DecodedSomething() const { + return !result_.empty() && !result_[0].sentence.empty(); + } + + + int FrameShiftInMs() const { + // one decoder frame length in ms + return decodable_->Nnet()->SubsamplingRate() * + feature_pipeline_->FrameShift(); + } + + + const std::vector& Result() const { return result_; } + + private: + void AttentionRescoring(); + void UpdateResult(bool finish = false); + + private: + U2RecognizerResource opts_; + + // std::shared_ptr resource_; + // U2RecognizerResource resource_; + std::shared_ptr feature_pipeline_; + std::shared_ptr decodable_; + std::unique_ptr decoder_; + + // e2e unit symbol table + std::shared_ptr unit_table_ = nullptr; + std::shared_ptr symbol_table_ = nullptr; + + std::vector result_; + + // global decoded frame offset + int global_frame_offset_; + // cur decoded frame num + int num_frames_; + // timestamp gap between words in a sentence + const int time_stamp_gap_ = 100; + + bool input_finished_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc new file mode 100644 index 00000000..70bc7d67 --- /dev/null +++ 
b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/u2_recognizer.h" +#include "decoder/param.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/table-types.h" + +DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); +DEFINE_int32(sample_rate, 16000, "sample rate"); + + +ppspeech::U2RecognizerResource InitOpts() { + ppspeech::U2RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + + resource.model_opts = model_opts; + + ppspeech::DecodeOptions decoder_opts; + decoder_opts.chunk_size=16; + decoder_opts.num_left_chunks = -1; + decoder_opts.ctc_weight = 0.5; + decoder_opts.rescoring_weight = 1.0; + decoder_opts.reverse_weight = 0.3; + decoder_opts.ctc_prefix_search_opts.blank = 0; + decoder_opts.ctc_prefix_search_opts.first_beam_size = 10; + decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; + + resource.decoder_opts = decoder_opts; + return resource; +} + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, 
false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + double tot_wav_duration = 0.0; + + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int sample_rate = FLAGS_sample_rate; + float streaming_chunk = FLAGS_streaming_chunk; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + kaldi::Timer timer; + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; + tot_wav_duration += wave_data.Duration(); + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size); + + recognizer.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + recognizer.SetFinished(); + } + recognizer.Decode(); + LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult(); + + // no overlap + sample_offset += cur_chunk_size; + } + // second pass decoding + recognizer.Rescoring(); + + std::string result = recognizer.GetFinalResult(); + + recognizer.Reset(); + + if (result.empty()) { + // the TokenWriter 
can not write empty string. + ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << " the result of " << utt << " is " << result; + + result_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); + LOG(INFO) << "cost:" << elapsed << " sec"; + LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; + LOG(INFO) << "the RTF is: " << elapsed / tot_wav_duration; +} diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9cacff9f..9fc35c95 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,7 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 48f95e3f..613f69c6 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -26,7 +26,6 @@ #include "frontend/audio/normalizer.h" namespace ppspeech { - struct FeaturePipelineOptions { std::string cmvn_file; bool to_float32; // true, only for linear feature @@ -60,7 +59,21 @@ class FeaturePipeline : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual void Reset() { base_extractor_->Reset(); } + const FeaturePipelineOptions& Config() { return opts_; } + + const BaseFloat FrameShift() const { + return opts_.fbank_opts.frame_opts.frame_shift_ms; + } + const BaseFloat FrameLength() const { + return opts_.fbank_opts.frame_opts.frame_length_ms; + } + const 
BaseFloat SampleRate() const { + return opts_.fbank_opts.frame_opts.samp_freq; + } + private: + FeaturePipelineOptions opts_; std::unique_ptr base_extractor_; }; -} + +} // namespace ppspeech diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index c6add03c..8c83f832 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -48,6 +48,7 @@ void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { } PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { + subsampling_rate_ = opts.subsample_rate; paddle_infer::Config config; config.SetModel(opts.model_path, opts.param_path); if (opts.use_gpu) { diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index e8a49c7d..2a53e5f7 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -67,6 +67,7 @@ class PaddleNnet : public NnetInterface { bool IsLogProb() override { return false; } + std::shared_ptr> GetCacheEncoder( const std::string& name); @@ -85,6 +86,7 @@ class PaddleNnet : public NnetInterface { std::map predictor_to_thread_id; std::map cache_names_idx_; std::vector>> cache_encouts_; + ModelOptions opts_; public: diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 2e21ff9b..109f54e0 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -35,6 +35,7 @@ struct ModelOptions { std::string cache_shape; bool enable_fc_padding; bool enable_profile; + int subsample_rate; ModelOptions() : model_path(""), param_path(""), @@ -46,7 +47,8 @@ struct ModelOptions { cache_shape(""), switch_ir_optim(false), enable_fc_padding(false), - enable_profile(false) {} + enable_profile(false), + subsample_rate(0) {} void Register(kaldi::OptionsItf* opts) { opts->Register("model-path", &model_path, "model file path"); @@ -102,9 +104,14 @@ class NnetInterface { // true, nnet output is logprob; otherwise is prob, virtual bool IsLogProb() = 0; + int 
SubsamplingRate() const { return subsampling_rate_; } + // using to get encoder outs. e.g. seq2seq with Attention model. virtual void EncoderOuts( std::vector>* encoder_out) const = 0; + + protected: + int subsampling_rate_{1}; }; } // namespace ppspeech diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 1bac652e..7058ea94 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -30,7 +30,7 @@ class U2NnetBase : public NnetInterface { public: virtual int context() const { return right_context_ + 1; } virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } + virtual int eos() const { return eos_; } virtual int sos() const { return sos_; } virtual int is_bidecoder() const { return is_bidecoder_; } @@ -64,7 +64,6 @@ class U2NnetBase : public NnetInterface { protected: // model specification int right_context_{0}; - int subsampling_rate_{1}; int sos_{0}; int eos_{0}; diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index 0f73fd24..a171d84d 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,5 +1,3 @@ -# project(websocket) - add_library(websocket STATIC websocket_server.cc websocket_client.cc diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 109da96b..9c01a0a1 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -17,11 +17,38 @@ DEFINE_int32(port, 8082, "websocket listening port"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions 
model_opts; + model_opts.model_path = FLAGS_model_path; + model_opts.param_path = FLAGS_param_path; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; + model_opts.input_names = FLAGS_model_input_names; + model_opts.output_names = FLAGS_model_output_names; + model_opts.subsample_rate = FLAGS_downsampling_rate; + resource.model_opts = model_opts; + + ppspeech::TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + + resource.tlg_opts = decoder_opts; + + return resource; +} + int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::WebSocketServer server(FLAGS_port, resource); LOG(INFO) << "Listening at port " << FLAGS_port; From 17ea30e7cac2367e2d7850e38d7db7fb7dd50558 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 05:56:38 +0000 Subject: [PATCH 16/60] u2 recog test main ok --- .../examples/codelab/u2/local/recognizer.sh | 22 ++++++++++++++ speechx/speechx/decoder/param.h | 8 +++-- speechx/speechx/decoder/u2_recognizer.cc | 4 +++ speechx/speechx/decoder/u2_recognizer.h | 5 ++-- speechx/speechx/decoder/u2_recognizer_main.cc | 13 ++++---- speechx/speechx/frontend/audio/cmvn.cc | 30 ++++++++++--------- .../frontend/audio/feature_pipeline.cc | 1 + 7 files changed, 59 insertions(+), 24 deletions(-) create mode 100755 speechx/examples/codelab/u2/local/recognizer.sh diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh new file mode 100755 index 00000000..a7359753 --- /dev/null +++ 
b/speechx/examples/codelab/u2/local/recognizer.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --wav_rspecifier=scp:$data/wav.scp \ + --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index e0f22d8c..1827e82d 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -52,11 +52,12 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_string(vocab_path, "", "nnet vocab path."); // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); @@ -72,13 +73,14 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.frame_opts = frame_opts; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; } else { opts.to_float32 = true; frame_opts.remove_dc_offset = false; diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 0ace086c..8fcc5d79 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -33,12 +33,15 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + CHECK(resource.vocab_path != ""); decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; input_finished_ = false; + + Reset(); } void U2Recognizer::Reset() { @@ -69,6 +72,7 @@ void U2Recognizer::Accept(const VectorBase& waves) { void U2Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); + UpdateResult(false); } void U2Recognizer::Rescoring() { diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 0947e593..a65cae3b 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -92,12 +92,13 @@ struct DecodeOptions { struct U2RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; + FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; - std::string vocab_path{}; }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index 70bc7d67..ab2c6695 100644 
--- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -25,13 +25,16 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::U2RecognizerResource InitOpts() { ppspeech::U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + LOG(INFO) << "feature!"; ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; resource.model_opts = model_opts; + LOG(INFO) << "model!"; ppspeech::DecodeOptions decoder_opts; decoder_opts.chunk_size=16; @@ -44,6 +47,7 @@ ppspeech::U2RecognizerResource InitOpts() { decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; resource.decoder_opts = decoder_opts; + LOG(INFO) << "decoder!"; return resource; } @@ -57,9 +61,6 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; double tot_wav_duration = 0.0; - ppspeech::U2RecognizerResource resource = InitOpts(); - ppspeech::U2Recognizer recognizer(resource); - kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); @@ -71,8 +72,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - kaldi::Timer timer; + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 1ea83aba..5e84a1a1 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -29,7 +29,9 @@ using std::unique_ptr; 
CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { + CHECK(cmvn_file != ""); base_extractor_ = std::move(base_extractor); + bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); @@ -55,11 +57,11 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; + feats->Dim() % dim_ != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' + << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x'; } if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR @@ -67,7 +69,7 @@ void CMVN::Compute(VectorBase* feats) const { << "are supplied."; } - double count = stats_(0, dim); + double count = stats_(0, dim_); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats_, we use a count of one. if (count < 1.0) @@ -77,14 +79,14 @@ void CMVN::Compute(VectorBase* feats) const { if (!var_norm_) { Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); + SubVector mean_stats(stats_.RowData(0), dim_); Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); + // fill the datat of mean_stats in mean_stats_appy whose dim_ is equal + // with the dim_ of feature. 
+ // the dim_ of feats = dim_ * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim_ * idx, + dim_); stats_tmp.CopyFromVec(mean_stats); } offset.AddVec(-1.0 / count, mean_stats_apply); @@ -94,7 +96,7 @@ void CMVN::Compute(VectorBase* feats) const { // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { + for (int32 d = 0; d < dim_; d++) { double mean, offset, scale; mean = stats_(0, d) / count; double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; @@ -111,7 +113,7 @@ void CMVN::Compute(VectorBase* feats) const { for (int32 d_skip = d; d_skip < feats->Dim();) { norm(0, d_skip) = offset; norm(1, d_skip) = scale; - d_skip = d_skip + dim; + d_skip = d_skip + dim_; } } // Apply the normalization. diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9fc35c95..7232efc4 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -32,6 +32,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK(opts.cmvn_file != ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); From 278c7a41a83412f02bc4b0b98832c5076f0940cf Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 14:59:23 +0800 Subject: [PATCH 17/60] add module define to fix ci, test=tts --- paddlespeech/t2s/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 7d93c026..57fe82a9 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,5 +18,6 @@ from . import exps from . import frontend from . import models from . import modules +from . import ssml from . 
import training from . import utils From 616fc4594b2484f12400fb937c4b0ff0e9de4a15 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 08:19:43 +0000 Subject: [PATCH 18/60] refactor options --- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/examples/codelab/u2/local/nnet.sh | 2 +- .../examples/codelab/u2/local/recognizer.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 83 +++++++++++------- .../decoder/ctc_beam_search_decoder_main.cc | 15 ++-- .../ctc_prefix_beam_search_decoder_main.cc | 6 +- speechx/speechx/decoder/ctc_tlg_decoder.h | 32 +++++-- .../speechx/decoder/ctc_tlg_decoder_main.cc | 53 ++---------- speechx/speechx/decoder/param.h | 60 +++++-------- speechx/speechx/decoder/recognizer.h | 14 ++- speechx/speechx/decoder/recognizer_main.cc | 24 +----- speechx/speechx/decoder/u2_recognizer.h | 84 +++++++++--------- speechx/speechx/decoder/u2_recognizer_main.cc | 31 +------ .../speechx/frontend/audio/feature_pipeline.h | 77 +++++++++++++---- speechx/speechx/nnet/ds2_nnet_main.cc | 35 ++------ speechx/speechx/nnet/nnet_itf.h | 85 ++++++++++--------- speechx/speechx/nnet/u2_nnet.h | 1 - speechx/speechx/nnet/u2_nnet_main.cc | 23 ++--- .../websocket/websocket_server_main.cc | 24 +----- 19 files changed, 293 insertions(+), 360 deletions(-) diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index 24e9fca5..c22ad7f0 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -14,7 +14,7 @@ ctc_prefix_beam_search_decoder_main \ --model_path=$model_dir/export.jit \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --vocab_path=$model_dir/unit.txt \ --feature_rspecifier=ark,t:$exp/fbank.ark \ --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh index 78663e9c..4419201c 100755 --- 
a/speechx/examples/codelab/u2/local/nnet.sh +++ b/speechx/examples/codelab/u2/local/nnet.sh @@ -15,7 +15,7 @@ u2_nnet_main \ --feature_rspecifier=ark,t:$exp/fbank.ark \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --acoustic_scale=1.0 \ --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh index a7359753..9f697b45 100755 --- a/speechx/examples/codelab/u2/local/recognizer.sh +++ b/speechx/examples/codelab/u2/local/recognizer.sh @@ -16,7 +16,7 @@ u2_recognizer_main \ --model_path=$model_dir/export.jit \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --vocab_path=$model_dir/unit.txt \ --wav_rspecifier=scp:$data/wav.scp \ --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 472d9332..d06c3529 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -1,44 +1,61 @@ project(decoder) include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) -add_library(decoder STATIC - ctc_decoders/decoder_utils.cpp - ctc_decoders/path_trie.cpp - ctc_decoders/scorer.cpp - ctc_beam_search_decoder.cc - ctc_prefix_beam_search_decoder.cc - ctc_tlg_decoder.cc - recognizer.cc - u2_recognizer.cc + +set(decoder_src ) + +if (USING_DS2) +list(APPEND decoder_src +ctc_decoders/decoder_utils.cpp +ctc_decoders/path_trie.cpp +ctc_decoders/scorer.cpp +ctc_beam_search_decoder.cc +ctc_tlg_decoder.cc +recognizer.cc ) +endif() + +if (USING_U2) + list(APPEND decoder_src + ctc_prefix_beam_search_decoder.cc + u2_recognizer.cc + ) +endif() + +add_library(decoder STATIC ${decoder_src}) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) # test -set(BINS - 
ctc_beam_search_decoder_main - nnet_logprob_decoder_main - recognizer_main - ctc_tlg_decoder_main -) +if (USING_DS2) + set(BINS + ctc_beam_search_decoder_main + nnet_logprob_decoder_main + recognizer_main + ctc_tlg_decoder_main + ) -foreach(bin_name IN LISTS BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) -endforeach() + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() -# u2 -set(TEST_BINS - u2_recognizer_main - ctc_prefix_beam_search_decoder_main -) +if (USING_U2) + set(TEST_BINS + ctc_prefix_beam_search_decoder_main + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() -foreach(bin_name IN LISTS TEST_BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) - 
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) - target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) -endforeach() \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index 7e245e9b..edf9215a 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -31,7 +31,7 @@ DEFINE_string(lm_path, "", "language model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_string( @@ -81,13 +81,8 @@ int main(int argc, char* argv[]) { opts.lm_path = lm_path; ppspeech::CTCBeamSearch decoder(opts); - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_path; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -95,8 +90,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk 
size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index dd352378..7a488bb0 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -30,7 +30,7 @@ DEFINE_string(model_path, "", "paddle nnet model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); @@ -81,8 +81,8 @@ int main(int argc, char* argv[]) { int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 2f1d6c10..76bbcf42 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -20,15 +20,37 @@ #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" + +DECLARE_string(graph_path); +DECLARE_string(word_symbol_table); +DECLARE_int32(max_active); +DECLARE_double(beam); +DECLARE_double(lattice_beam); + namespace ppspeech { struct TLGDecoderOptions { - kaldi::LatticeFasterDecoderConfig opts; + kaldi::LatticeFasterDecoderConfig opts{}; // todo remove later, add into decode resource - std::string word_symbol_table; - std::string fst_path; - - TLGDecoderOptions() : word_symbol_table(""), 
fst_path("") {} + std::string word_symbol_table{}; + std::string fst_path{}; + + static TLGDecoderOptions InitFromFlags(){ + TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + LOG(INFO) << "fst path: " << decoder_opts.fst_path; + LOG(INFO) << "fst symbole table: " << decoder_opts.word_symbol_table; + + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + LOG(INFO) << "LatticeFasterDecoder max active: " << decoder_opts.opts.max_active ; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam ; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " << decoder_opts.opts.lattice_beam ; + + return decoder_opts; + } }; class TLGDecoder : public DecoderInterface { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc index cd1249d8..f262101a 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -19,6 +19,7 @@ #include "frontend/audio/data_cache.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +#include "decoder/param.h" #include "decoder/ctc_tlg_decoder.h" #include "kaldi/util/table-types.h" @@ -26,30 +27,7 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "decoder graph"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) 
downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); + using kaldi::BaseFloat; using kaldi::Matrix; @@ -66,32 +44,16 @@ int main(int argc, char* argv[]) { kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_graph = FLAGS_model_path; - std::string model_params = FLAGS_param_path; - std::string word_symbol_table = FLAGS_word_symbol_table; - std::string graph_path = FLAGS_graph_path; - LOG(INFO) << "model path: " << model_graph; - LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "word symbol path: " << word_symbol_table; - LOG(INFO) << "graph path: " << graph_path; int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts; - opts.word_symbol_table = word_symbol_table; - opts.fst_path = graph_path; - opts.opts.max_active = FLAGS_max_active; + ppspeech::TLGDecoderOptions opts = ppspeech::TLGDecoderOptions::InitFromFlags(); opts.opts.beam = 15.0; opts.opts.lattice_beam = 7.5; ppspeech::TLGDecoder decoder(opts); - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new 
ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -99,12 +61,13 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; + decoder.InitDecoder(); kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1827e82d..5e1120ad 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -17,8 +17,6 @@ #include "base/common.h" #include "decoder/ctc_beam_search_decoder.h" #include "decoder/ctc_tlg_decoder.h" -#include "frontend/audio/feature_pipeline.h" - // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); @@ -27,18 +25,18 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); - // feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, +DEFINE_int32(subsampling_rate, + 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); // nnet +DEFINE_string(vocab_path, "", "nnet vocab path."); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model 
param"); DEFINE_string( @@ -52,10 +50,11 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_string(vocab_path, "", "nnet vocab path."); + // decoder DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); @@ -63,37 +62,20 @@ DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); -namespace ppspeech { - -// todo refactor later -FeaturePipelineOptions InitFeaturePipelineOptions() { - FeaturePipelineOptions opts; - opts.cmvn_file = FLAGS_cmvn_file; - kaldi::FrameExtractionOptions frame_opts; - frame_opts.dither = 0.0; - frame_opts.frame_shift_ms = 10; - opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear"); - if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.frame_opts = frame_opts; - LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; - } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; - } - opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - - return opts; -} -} // namespace ppspeech +// DecodeOptions flags +// DEFINE_int32(chunk_size, -1, "decoding chunk size"); +DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); +DEFINE_double(ctc_weight, + 0.5, + "ctc weight when combining ctc 
score and rescoring score"); +DEFINE_double(rescoring_weight, + 1.0, + "rescoring weight when combining ctc score and rescoring score"); +DEFINE_double(reverse_weight, + 0.3, + "used for bitransformer rescoring. it must be 0.0 if decoder is" + "conventional transformer decoder, and only reverse_weight > 0.0" + "dose the right to left decoder will be calculated and used"); +DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); +DEFINE_int32(blank, 0, "blank id in vocab"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 4965e7a3..51b66673 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -22,14 +22,26 @@ #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +DECLARE_double(acoustic_scale); + namespace ppspeech { struct RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; + + static RecognizerResource InitFromFlags(){ + RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ModelOptions::InitFromFlags(); + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; + + } }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 2b497d6e..662943b5 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -25,27 +25,9 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - - ppspeech::ModelOptions model_opts; - 
model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - model_opts.subsample_rate = FLAGS_downsampling_rate; - resource.model_opts = model_opts; - - ppspeech::TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - - resource.tlg_opts = decoder_opts; - + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index a65cae3b..86bd4821 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -26,15 +26,25 @@ #include "fst/fstlib.h" #include "fst/symbol-table.h" -namespace ppspeech { +DECLARE_int32(nnet_decoder_chunk); +DECLARE_int32(num_left_chunks); +DECLARE_double(ctc_weight); +DECLARE_double(rescoring_weight); +DECLARE_double(reverse_weight); +DECLARE_int32(nbest); +DECLARE_int32(blank); + +DECLARE_double(acoustic_scale); +DECLARE_string(vocab_path); +namespace ppspeech { struct DecodeOptions { // chunk_size is the frame number of one chunk after subsampling. // e.g. 
if subsample rate is 4 and chunk_size = 16, the frames in // one chunk are 67=16*4 + 3, stride is 64=16*4 - int chunk_size; - int num_left_chunks; + int chunk_size{16}; + int num_left_chunks{-1}; // final_score = rescoring_weight * rescoring_score + ctc_weight * // ctc_score; @@ -46,51 +56,27 @@ struct DecodeOptions { // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a // max(viterbi) path score + context score So we should carefully set // ctc_weight accroding to the search methods. - float ctc_weight; - float rescoring_weight; - float reverse_weight; + float ctc_weight{0.0}; + float rescoring_weight{1.0}; + float reverse_weight{0.0}; // CtcEndpointConfig ctc_endpoint_opts; - CTCBeamSearchOptions ctc_prefix_search_opts; - - DecodeOptions() - : chunk_size(16), - num_left_chunks(-1), - ctc_weight(0.5), - rescoring_weight(1.0), - reverse_weight(0.0) {} - - void Register(kaldi::OptionsItf* opts) { - std::string module = "DecoderConfig: "; - opts->Register( - "chunk-size", - &chunk_size, - module + "the frame number of one chunk after subsampling."); - opts->Register("num-left-chunks", - &num_left_chunks, - module + "the left history chunks number."); - opts->Register("ctc-weight", - &ctc_weight, - module + - "ctc weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("rescoring-weight", - &rescoring_weight, - module + - "attention score weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("reverse-weight", - &reverse_weight, - module + - "reverse decoder weight. 
rescoring_score = " - "left_to_right_score * (1 - reverse_weight) + " - "right_to_left_score * reverse_weight."); + CTCBeamSearchOptions ctc_prefix_search_opts{}; + + static DecodeOptions InitFromFlags(){ + DecodeOptions decoder_opts; + decoder_opts.chunk_size=FLAGS_nnet_decoder_chunk; + decoder_opts.num_left_chunks = FLAGS_num_left_chunks; + decoder_opts.ctc_weight = FLAGS_ctc_weight; + decoder_opts.rescoring_weight = FLAGS_rescoring_weight; + decoder_opts.reverse_weight = FLAGS_reverse_weight; + decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank; + decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; + decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; + return decoder_opts; } }; - struct U2RecognizerResource { kaldi::BaseFloat acoustic_scale{1.0}; std::string vocab_path{}; @@ -98,7 +84,17 @@ struct U2RecognizerResource { FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; - // CTCBeamSearchOptions beam_search_opts; + + static U2RecognizerResource InitFromFlags() { + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; +} }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index ab2c6695..b1a7b2e8 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -22,35 +22,6 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); - -ppspeech::U2RecognizerResource InitOpts() { - ppspeech::U2RecognizerResource resource; - resource.vocab_path 
= FLAGS_vocab_path; - resource.acoustic_scale = FLAGS_acoustic_scale; - - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - LOG(INFO) << "feature!"; - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - - resource.model_opts = model_opts; - LOG(INFO) << "model!"; - - ppspeech::DecodeOptions decoder_opts; - decoder_opts.chunk_size=16; - decoder_opts.num_left_chunks = -1; - decoder_opts.ctc_weight = 0.5; - decoder_opts.rescoring_weight = 1.0; - decoder_opts.reverse_weight = 0.3; - decoder_opts.ctc_prefix_search_opts.blank = 0; - decoder_opts.ctc_prefix_search_opts.first_beam_size = 10; - decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; - - resource.decoder_opts = decoder_opts; - LOG(INFO) << "decoder!"; - return resource; -} - int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -72,7 +43,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2RecognizerResource resource = ppspeech::U2RecognizerResource::InitFromFlags(); ppspeech::U2Recognizer recognizer(resource); kaldi::Timer timer; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 613f69c6..38a47433 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -25,26 +25,71 @@ #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +// feature +DECLARE_bool(use_fbank); +DECLARE_int32(num_bins); +DECLARE_string(cmvn_file); + +// feature sliding window +DECLARE_int32(receptive_field_length); +DECLARE_int32(subsampling_rate); +DECLARE_int32(nnet_decoder_chunk); + namespace ppspeech { + struct FeaturePipelineOptions { - std::string cmvn_file; - bool to_float32; // 
true, only for linear feature - bool use_fbank; - LinearSpectrogramOptions linear_spectrogram_opts; - kaldi::FbankOptions fbank_opts; - FeatureCacheOptions feature_cache_opts; - AssemblerOptions assembler_opts; - - FeaturePipelineOptions() - : cmvn_file(""), - to_float32(false), // true, only for linear feature - use_fbank(true), - linear_spectrogram_opts(), - fbank_opts(), - feature_cache_opts(), - assembler_opts() {} + std::string cmvn_file{}; + bool to_float32{false}; // true, only for linear feature + bool use_fbank{true}; + LinearSpectrogramOptions linear_spectrogram_opts{}; + kaldi::FbankOptions fbank_opts{}; + FeatureCacheOptions feature_cache_opts{}; + AssemblerOptions assembler_opts{}; + + static FeaturePipelineOptions InitFromFlags(){ + FeaturePipelineOptions opts; + opts.cmvn_file = FLAGS_cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; + + // frame options + kaldi::FrameExtractionOptions frame_opts; + frame_opts.dither = 0.0; + LOG(INFO) << "dither: " << frame_opts.dither; + frame_opts.frame_shift_ms = 10; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); + if (opts.use_fbank) { + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; + + opts.fbank_opts.frame_opts = frame_opts; + } else { + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + + opts.linear_spectrogram_opts.frame_opts = frame_opts; + } + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + + // assembler opts + opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.assembler_opts.subsampling_rate; + opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; + LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + return opts; + } }; + class FeaturePipeline : public FrontendInterface { public: explicit FeaturePipeline(const FeaturePipelineOptions& opts); diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index 943d7e5f..d8d33e98 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -14,6 +14,7 @@ #include "nnet/ds2_nnet.h" #include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" @@ -21,27 +22,6 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_int32(nnet_decoder_chunk, 
1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -64,13 +44,8 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -78,8 +53,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git 
a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 109f54e0..f8105b7f 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -20,53 +20,54 @@ #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" -namespace ppspeech { +DECLARE_int32(subsampling_rate); +DECLARE_string(model_path); +DECLARE_string(param_path); +DECLARE_string(model_input_names); +DECLARE_string(model_output_names); +DECLARE_string(model_cache_names); +DECLARE_string(model_cache_shapes); +namespace ppspeech { struct ModelOptions { + // common + int subsample_rate{1}; + int thread_num{1}; // predictor thread pool size for ds2; + bool use_gpu{false}; std::string model_path; + std::string param_path; - int thread_num; // predictor thread pool size for ds2; - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - int subsample_rate; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(1), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false), - subsample_rate(0) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("model-param", ¶m_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - 
"paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); + + // ds2 for inference + std::string input_names{}; + std::string output_names{}; + std::string cache_names{}; + std::string cache_shape{}; + bool switch_ir_optim{false}; + bool enable_fc_padding{false}; + bool enable_profile{false}; + + static ModelOptions InitFromFlags(){ + ModelOptions opts; + opts.subsample_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + opts.model_path = FLAGS_model_path; + LOG(INFO) << "model path: " << opts.model_path ; + + opts.param_path = FLAGS_param_path; + LOG(INFO) << "param path: " << opts.param_path ; + + LOG(INFO) << "DS2 param: "; + opts.cache_names = FLAGS_model_cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; + opts.cache_shape = FLAGS_model_cache_shapes; + LOG(INFO) << " cache shape: " << opts.cache_shape; + opts.input_names = FLAGS_model_input_names; + LOG(INFO) << " input names: " << opts.input_names; + opts.output_names = FLAGS_model_output_names; + LOG(INFO) << " output names: " << opts.output_names; + return opts; } }; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 7058ea94..697ac20c 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -17,7 +17,6 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" - #include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 4b30f6b4..adbbf0e8 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -12,28 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "nnet/u2_nnet.h" + #include "base/common.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" +#include "decoder/param.h" +#include "nnet/u2_nnet.h" + DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier"); -DEFINE_string(model_path, "", "paddle nnet model"); - -DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); - using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; @@ -58,13 +50,12 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); int32 chunk_size = - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate + + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + FLAGS_receptive_field_length; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 9c01a0a1..827b164f 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ 
b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -20,27 +20,9 @@ DEFINE_int32(port, 8082, "websocket listening port"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - model_opts.subsample_rate = FLAGS_downsampling_rate; - resource.model_opts = model_opts; - - ppspeech::TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - - resource.tlg_opts = decoder_opts; - + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } From 56a0a02452bb17204cbd5e126200ea2e02fb0be5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 08:26:01 +0000 Subject: [PATCH 19/60] format code --- speechx/speechx/base/basic_types.h | 4 +-- speechx/speechx/base/common.h | 1 - .../decoder/ctc_prefix_beam_search_decoder.h | 1 - speechx/speechx/decoder/ctc_tlg_decoder.h | 11 ++++---- speechx/speechx/decoder/param.h | 3 +-- speechx/speechx/decoder/recognizer.h | 12 ++++----- speechx/speechx/decoder/u2_recognizer.h | 26 +++++++++---------- speechx/speechx/frontend/audio/data_cache.h | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 22 +++++++++------- 
speechx/speechx/frontend/audio/mfcc.h | 1 - speechx/speechx/nnet/ds2_nnet.h | 1 + speechx/speechx/nnet/nnet_itf.h | 16 ++++++------ speechx/speechx/nnet/u2_nnet.h | 1 - .../protocol/websocket/websocket_client.h | 3 +-- .../protocol/websocket/websocket_server.h | 2 -- speechx/speechx/utils/file_utils.h | 2 +- 16 files changed, 53 insertions(+), 55 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 206b7be6..3a648649 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -14,10 +14,10 @@ #pragma once -#include "kaldi/base/kaldi-types.h" - #include +#include "kaldi/base/kaldi-types.h" + typedef float BaseFloat; typedef double double64; diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index b470b9de..97bff966 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -47,6 +47,5 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" - #include "utils/file_utils.h" #include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 2c28bee1..eef8823d 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -17,7 +17,6 @@ #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" - #include "fst/symbol-table.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 76bbcf42..cf8a9b73 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "decoder/decoder_itf.h" - #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" @@ -35,7 +34,7 @@ struct 
TLGDecoderOptions { std::string word_symbol_table{}; std::string fst_path{}; - static TLGDecoderOptions InitFromFlags(){ + static TLGDecoderOptions InitFromFlags() { TLGDecoderOptions decoder_opts; decoder_opts.word_symbol_table = FLAGS_word_symbol_table; decoder_opts.fst_path = FLAGS_graph_path; @@ -45,9 +44,11 @@ struct TLGDecoderOptions { decoder_opts.opts.max_active = FLAGS_max_active; decoder_opts.opts.beam = FLAGS_beam; decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - LOG(INFO) << "LatticeFasterDecoder max active: " << decoder_opts.opts.max_active ; - LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam ; - LOG(INFO) << "LatticeFasterDecoder lattice_beam: " << decoder_opts.opts.lattice_beam ; + LOG(INFO) << "LatticeFasterDecoder max active: " + << decoder_opts.opts.max_active; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " + << decoder_opts.opts.lattice_beam; return decoder_opts; } diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 5e1120ad..1f13bbc0 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -30,7 +30,7 @@ DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); DEFINE_int32(subsampling_rate, - 4, + 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); @@ -62,7 +62,6 @@ DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); - // DecodeOptions flags // DEFINE_int32(chunk_size, -1, "decoding chunk size"); DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 51b66673..0402bcd3 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -32,15 +32,15 @@ struct RecognizerResource { ModelOptions model_opts{}; 
TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - - static RecognizerResource InitFromFlags(){ + + static RecognizerResource InitFromFlags() { RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts = + FeaturePipelineOptions::InitFromFlags(); resource.model_opts = ModelOptions::InitFromFlags(); - resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); - return resource; - + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; } }; diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 86bd4821..f4e91b18 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -21,10 +21,9 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" #include "decoder/decoder_itf.h" #include "frontend/audio/feature_pipeline.h" -#include "nnet/decodable.h" - #include "fst/fstlib.h" #include "fst/symbol-table.h" +#include "nnet/decodable.h" DECLARE_int32(nnet_decoder_chunk); DECLARE_int32(num_left_chunks); @@ -63,9 +62,9 @@ struct DecodeOptions { // CtcEndpointConfig ctc_endpoint_opts; CTCBeamSearchOptions ctc_prefix_search_opts{}; - static DecodeOptions InitFromFlags(){ + static DecodeOptions InitFromFlags() { DecodeOptions decoder_opts; - decoder_opts.chunk_size=FLAGS_nnet_decoder_chunk; + decoder_opts.chunk_size = FLAGS_nnet_decoder_chunk; decoder_opts.num_left_chunks = FLAGS_num_left_chunks; decoder_opts.ctc_weight = FLAGS_ctc_weight; decoder_opts.rescoring_weight = FLAGS_rescoring_weight; @@ -86,15 +85,16 @@ struct U2RecognizerResource { DecodeOptions decoder_opts{}; static U2RecognizerResource InitFromFlags() { - U2RecognizerResource resource; - resource.vocab_path = FLAGS_vocab_path; - resource.acoustic_scale = FLAGS_acoustic_scale; - - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); - 
resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); - return resource; -} + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; + } }; diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 64e9db86..5fafdeb2 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -56,4 +56,4 @@ class DataCache : public FrontendInterface { DISALLOW_COPY_AND_ASSIGN(DataCache); }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 38a47433..d91a70e3 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -46,17 +46,17 @@ struct FeaturePipelineOptions { FeatureCacheOptions feature_cache_opts{}; AssemblerOptions assembler_opts{}; - static FeaturePipelineOptions InitFromFlags(){ + static FeaturePipelineOptions InitFromFlags() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; - LOG(INFO) << "cmvn file: " << opts.cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; // frame options kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; - LOG(INFO) << "dither: " << frame_opts.dither; + LOG(INFO) << "dither: " << frame_opts.dither; frame_opts.frame_shift_ms = 10; - LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; opts.use_fbank = FLAGS_use_fbank; LOG(INFO) << "feature type: " 
<< (opts.use_fbank ? "fbank" : "linear"); if (opts.use_fbank) { @@ -76,15 +76,19 @@ struct FeaturePipelineOptions { opts.linear_spectrogram_opts.frame_opts = frame_opts; } - LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.assembler_opts.subsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; + opts.assembler_opts.receptive_filed_length = + FLAGS_receptive_field_length; + LOG(INFO) << "nnet receptive filed length: " + << opts.assembler_opts.receptive_filed_length; opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "nnet chunk size: " + << opts.assembler_opts.nnet_decoder_chunk; return opts; } }; diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h index 62b0078c..6c1c2f7d 100644 --- a/speechx/speechx/frontend/audio/mfcc.h +++ b/speechx/speechx/frontend/audio/mfcc.h @@ -14,7 +14,6 @@ #pragma once -#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 2a53e5f7..4aeec32f 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once #include + #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/nnet_itf.h" diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index f8105b7f..cc737ce0 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -48,25 +48,25 @@ struct ModelOptions { bool enable_fc_padding{false}; bool enable_profile{false}; - static ModelOptions InitFromFlags(){ + static ModelOptions InitFromFlags() { ModelOptions opts; opts.subsample_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; opts.model_path = FLAGS_model_path; - LOG(INFO) << "model path: " << opts.model_path ; + LOG(INFO) << "model path: " << opts.model_path; opts.param_path = FLAGS_param_path; - LOG(INFO) << "param path: " << opts.param_path ; + LOG(INFO) << "param path: " << opts.param_path; LOG(INFO) << "DS2 param: "; opts.cache_names = FLAGS_model_cache_names; - LOG(INFO) << " cache names: " << opts.cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; opts.cache_shape = FLAGS_model_cache_shapes; - LOG(INFO) << " cache shape: " << opts.cache_shape; + LOG(INFO) << " cache shape: " << opts.cache_shape; opts.input_names = FLAGS_model_input_names; - LOG(INFO) << " input names: " << opts.input_names; + LOG(INFO) << " input names: " << opts.input_names; opts.output_names = FLAGS_model_output_names; - LOG(INFO) << " output names: " << opts.output_names; + LOG(INFO) << " output names: " << opts.output_names; return opts; } }; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 697ac20c..3435bca8 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" - #include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" diff --git 
a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h index 886da292..7ae6d98d 100644 --- a/speechx/speechx/protocol/websocket/websocket_client.h +++ b/speechx/speechx/protocol/websocket/websocket_client.h @@ -13,7 +13,6 @@ // limitations under the License. #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" @@ -54,4 +53,4 @@ class WebSocketClient { websocket::stream ws_{ioc_}; std::unique_ptr t_{nullptr}; }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 009fc42e..8f3360e4 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -15,12 +15,10 @@ #pragma once #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" - #include "decoder/recognizer.h" #include "frontend/audio/feature_pipeline.h" diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/utils/file_utils.h index 8c56c02e..a471e024 100644 --- a/speechx/speechx/utils/file_utils.h +++ b/speechx/speechx/utils/file_utils.h @@ -20,4 +20,4 @@ bool ReadFileToVector(const std::string& filename, std::vector* data); std::string ReadFile2String(const std::string& path); -} +} // namespace ppspeech From 29508f400b23211c9e7380800e2d02c9a16a426f Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:44:29 +0800 Subject: [PATCH 20/60] to fix CI issue, test=tts --- paddlespeech/t2s/ssml/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 paddlespeech/t2s/ssml/__init__.py diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py new file mode 100644 index 00000000..e69de29b 
From f56cc08b18f5fb6fc3254db4dd40ec3597d34f36 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:55:07 +0800 Subject: [PATCH 21/60] add license content, test=tts --- paddlespeech/t2s/ssml/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index e69de29b..abf198b9 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 1067088debd49ba308fc55a8c55d1d04f211ff51 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:18:27 +0800 Subject: [PATCH 22/60] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index abf198b9..f344250d 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .xml_processor import * From 89e9ea69ebb884d5ba13d02c66c29475a153f2ea Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:29:46 +0800 Subject: [PATCH 23/60] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index f344250d..9b4db053 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .xml_processor import * From f295d2d4450099f2cf8b7e2d417a9c9599230563 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 18:00:13 +0800 Subject: [PATCH 24/60] remove useless code --- paddlespeech/t2s/frontend/zh_frontend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 25558780..e3028698 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -146,7 +146,6 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) - self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() From 72c9e973a2bb9d6c8dab603d67a6ae80a73669f7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 10:06:06 +0000 Subject: [PATCH 25/60] add scripts --- .../examples/u2pp_ol/wenetspeech/.gitignore | 3 + .../examples/u2pp_ol/wenetspeech/README.md | 28 +++++++ .../wenetspeech/local/aishell_train_lms.sh | 71 +++++++++++++++++ .../u2pp_ol/wenetspeech/local/decode.sh | 25 ++++++ .../u2pp_ol/wenetspeech/local/feat.sh | 31 ++++++++ .../u2pp_ol/wenetspeech/local/nnet.sh | 23 ++++++ .../u2pp_ol/wenetspeech/local/recognizer.sh | 34 +++++++++ .../u2pp_ol/wenetspeech/local/split_data.sh | 
30 ++++++++ speechx/examples/u2pp_ol/wenetspeech/path.sh | 18 +++++ speechx/examples/u2pp_ol/wenetspeech/run.sh | 76 +++++++++++++++++++ 10 files changed, 339 insertions(+) create mode 100644 speechx/examples/u2pp_ol/wenetspeech/.gitignore create mode 100644 speechx/examples/u2pp_ol/wenetspeech/README.md create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/decode.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/feat.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh create mode 100644 speechx/examples/u2pp_ol/wenetspeech/path.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/run.sh diff --git a/speechx/examples/u2pp_ol/wenetspeech/.gitignore b/speechx/examples/u2pp_ol/wenetspeech/.gitignore new file mode 100644 index 00000000..02c0cc21 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/.gitignore @@ -0,0 +1,3 @@ +data +utils +exp diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md new file mode 100644 index 00000000..a9a4578f --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/README.md @@ -0,0 +1,28 @@ +# u2/u2pp Streaming ASR + +## Testing with Aishell Test Data + +## Download wav and model + +``` +run.sh --stop_stage 0 +``` + +### compute feature + +``` +./run.sh --stage 1 --stop_stage 1 +``` + +### decoding using feature + +``` +./run.sh --stage 2 --stop_stage 2 +``` + +### decoding using wav + + +``` +./run.sh --stage 3 --stop_stage 3 +``` \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh new file mode 100755 index 00000000..544a1f59 --- /dev/null +++ 
b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# To be run from one directory above this script. +. ./path.sh + +nj=40 +text=data/local/lm/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# Check SRILM tools +if ! which ngram-count > /dev/null; then + echo "srilm tools are not found, please download it and install it from: " + echo "http://www.speech.sri.com/projects/srilm/download.html" + echo "Then add the tools to your PATH" + exit 1 +fi + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/lm/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +cleantext=$dir/text.no_oov + +# oov to +# lexicon line: word char0 ... charn +# text line: utt word0 ... wordn -> line: word0 ... wordn +text_dir=$(dirname $text) +split_name=$(basename $text) +./local/split_data.sh $text_dir $text $split_name $nj + +utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \ + cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + \> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1; +cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext + +# compute word counts, sort in descending order +# line: count word +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \ + sort --parallel=`nproc` -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). 
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1; + +# word with +cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist + +# hold out to compute ppl +heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results + +mkdir -p $dir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train + +ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa +ngram -lm $dir/lm.arpa -ppl $dir/heldout \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh new file mode 100755 index 00000000..c17cdbe6 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/decoder.fbank.wolm.log \ +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank.scp \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_decode.ark + +cat $data/split${nj}/*/result_decode.ark > $exp/${label_file} +utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} +tail -n 7 $exp/${wer} \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh new file mode 100755 index 00000000..4341cec8 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ +compute_fbank_main \ + --num_bins 80 \ + --cmvn_file=$exp/cmvn.ark \ + --streaming_chunk=36 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --feature_wspecifier=ark,scp:$data/split${nj}/JOB/fbank.ark,$data/split${nj}/JOB/fbank.scp + +echo "compute fbank feature." 
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh new file mode 100755 index 00000000..4419201c --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." + diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh new file mode 100755 index 00000000..29b50537 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \ +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark + + +cat $data/split${nj}/*/result_recognizer.ark > $exp/${label_file}_recognizer +utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer +echo "recognizer test have finished!!!" +echo "please checkout in ${exp}/${wer}.recognizer" +tail -n 7 $exp/${wer}.recognizer \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh new file mode 100755 index 00000000..faa5c42d --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -eo pipefail + +data=$1 +scp=$2 +split_name=$3 +numsplit=$4 + +# save in $data/split{n} +# $scp to split +# + +if [[ ! $numsplit -gt 0 ]]; then + echo "$0: Invalid num-split argument"; + exit 1; +fi + +directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! 
mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split${numsplit}/$n + done +fi + +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh new file mode 100644 index 00000000..7f32fbce --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -0,0 +1,18 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } + +export LC_AL=C + +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio + +PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh new file mode 100755 index 00000000..12e3af95 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +nj=40 +stage=0 +stop_stage=5 + +. utils/parse_options.sh + +# input +data=data +exp=exp +mkdir -p $exp $data + + +# 1. compile +if [ ! -d ${SPEECHX_BUILD} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + + +ckpt_dir=$data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then + # download model + if [ ! 
-f $ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p $ckpt_dir + pushd $ckpt_dir + + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + + popd + fi + + # test wav scp + if [ ! -f data/wav.scp ]; then + mkdir -p $data + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd + fi + + # aishell wav scp + if [ ! -d $data/test ]; then + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip + unzip aishell_test.zip + popd + + realpath $data/test/*/*.wav > $data/wavlist + awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id + paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp + fi +fi + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/feat.sh +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/decode.sh +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./loca/recognizer.sh +fi \ No newline at end of file From fddcd36fa013ec9bce67e1a95c257d91140faf32 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 19:03:15 +0800 Subject: [PATCH 26/60] format --- .../examples/u2pp_ol/wenetspeech/README.md | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 1 + .../decoder/ctc_beam_search_decoder.cc | 12 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 73 +++--- .../ctc_prefix_beam_search_decoder_main.cc | 17 +- speechx/speechx/decoder/ctc_tlg_decoder.cc | 7 +- .../speechx/decoder/ctc_tlg_decoder_main.cc | 15 +- speechx/speechx/decoder/recognizer.cc | 6 +- speechx/speechx/decoder/recognizer_main.cc | 7 +- speechx/speechx/decoder/u2_recognizer.cc | 231 +++++++++--------- speechx/speechx/decoder/u2_recognizer_main.cc | 7 +- speechx/speechx/frontend/audio/cmvn.cc | 9 +- 
.../frontend/audio/compute_fbank_main.cc | 21 +- .../audio/compute_linear_spectrogram_main.cc | 7 +- .../frontend/audio/feature_pipeline.cc | 5 +- speechx/speechx/nnet/decodable.cc | 18 +- speechx/speechx/nnet/u2_nnet.cc | 22 +- speechx/speechx/nnet/u2_nnet_main.cc | 35 +-- .../websocket/websocket_server_main.cc | 7 +- speechx/speechx/utils/math.cc | 10 +- 20 files changed, 259 insertions(+), 253 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md index a9a4578f..9a8f8af5 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/README.md +++ b/speechx/examples/u2pp_ol/wenetspeech/README.md @@ -25,4 +25,4 @@ run.sh --stop_stage 0 ``` ./run.sh --stage 3 --stop_stage 3 -``` \ No newline at end of file +``` diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 7d99e857..09f9e2fb 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -21,6 +21,7 @@ #include #include #include + #include "base/flags.h" #include "base/log.h" #include "paddle_inference_api.h" diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 3f00ee35..c4b35ff0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -13,9 +13,10 @@ // limitations under the License. 
+#include "decoder/ctc_beam_search_decoder.h" + #include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" -#include "decoder/ctc_beam_search_decoder.h" #include "utils/file_utils.h" namespace ppspeech { @@ -24,10 +25,7 @@ using std::vector; using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) - : opts_(opts), - init_ext_scorer_(nullptr), - space_id_(-1), - root_(nullptr) { + : opts_(opts), init_ext_scorer_(nullptr), space_id_(-1), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { LOG(INFO) << "load the dict failed"; @@ -41,7 +39,7 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(opts_.blank==0); + CHECK(opts_.blank == 0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); @@ -115,7 +113,7 @@ int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, } vector> CTCBeamSearch::GetNBestPath(int n) { - int beam_size = n == -1 ? opts_.beam_size: std::min(n, opts_.beam_size); + int beam_size = n == -1 ? 
opts_.beam_size : std::min(n, opts_.beam_size); return get_beam_search_result(prefixes_, vocabulary_, beam_size); } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index ce2d4dc2..a0fe5b2a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -16,11 +16,12 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" + +#include "absl/strings/str_join.h" #include "base/common.h" #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "utils/math.h" -#include "absl/strings/str_join.h" #ifdef USE_PROFILING #include "paddle/fluid/platform/profiler.h" @@ -30,18 +31,17 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch( - const std::string vocab_path, - const CTCBeamSearchOptions& opts) +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string vocab_path, + const CTCBeamSearchOptions& opts) : opts_(opts) { - - unit_table_ = std::shared_ptr(fst::SymbolTable::ReadText(vocab_path)); + unit_table_ = std::shared_ptr( + fst::SymbolTable::ReadText(vocab_path)); CHECK(unit_table_ != nullptr); Reset(); } -void CTCPrefixBeamSearch::Reset() { +void CTCPrefixBeamSearch::Reset() { num_frame_decoded_ = 0; cur_hyps_.clear(); @@ -65,10 +65,9 @@ void CTCPrefixBeamSearch::Reset() { hypotheses_.emplace_back(empty); likelihood_.emplace_back(prefix_score.TotalScore()); times_.emplace_back(empty); - } - -void CTCPrefixBeamSearch::InitDecoder() { Reset(); } +} +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } void CTCPrefixBeamSearch::AdvanceDecode( @@ -296,9 +295,7 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { - UpdateFinalContext(); -} +void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } void CTCPrefixBeamSearch::UpdateFinalContext() { 
if (context_graph_ == nullptr) return; @@ -311,8 +308,8 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { for (const auto& prefix : hypotheses_) { PrefixScore& prefix_score = cur_hyps_[prefix]; if (prefix_score.context_score != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); + prefix_score.UpdateContext( + context_graph_, prefix_score, 0, prefix.size()); } } std::vector, PrefixScore>> arr(cur_hyps_.begin(), @@ -323,48 +320,44 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { UpdateHypotheses(arr); } - std::string CTCPrefixBeamSearch::GetBestPath(int index) { +std::string CTCPrefixBeamSearch::GetBestPath(int index) { int n_hyps = Outputs().size(); CHECK(n_hyps > 0); CHECK(index < n_hyps); std::vector one = Outputs()[index]; std::string sentence; - for (int i = 0; i < one.size(); i++){ + for (int i = 0; i < one.size(); i++) { sentence += unit_table_->Find(one[i]); } return sentence; - } +} - std::string CTCPrefixBeamSearch::GetBestPath() { - return GetBestPath(0); - } +std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } - std::vector> CTCPrefixBeamSearch::GetNBestPath(int n) { - int hyps_size = hypotheses_.size(); - CHECK(hyps_size > 0); +std::vector> CTCPrefixBeamSearch::GetNBestPath( + int n) { + int hyps_size = hypotheses_.size(); + CHECK(hyps_size > 0); - int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); + int min_n = n == -1 ? 
hypotheses_.size() : std::min(n, hyps_size); - std::vector> n_best; - n_best.reserve(min_n); + std::vector> n_best; + n_best.reserve(min_n); - for (int i = 0; i < min_n; i++){ - n_best.emplace_back(Likelihood()[i], GetBestPath(i) ); - } - return n_best; - } + for (int i = 0; i < min_n; i++) { + n_best.emplace_back(Likelihood()[i], GetBestPath(i)); + } + return n_best; +} - std::vector> CTCPrefixBeamSearch::GetNBestPath() { +std::vector> +CTCPrefixBeamSearch::GetNBestPath() { return GetNBestPath(-1); - } - -std::string CTCPrefixBeamSearch::GetFinalBestPath() { - return GetBestPath(); } -std::string CTCPrefixBeamSearch::GetPartialResult() { - return GetBestPath(); -} +std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } + +std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 7a488bb0..d9cca147 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "absl/strings/str_split.h" #include "base/common.h" #include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" +#include "fst/symbol-table.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/u2_nnet.h" -#include "absl/strings/str_split.h" -#include "fst/symbol-table.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); @@ -64,8 +64,7 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet( - new ppspeech::U2Nnet(model_opts)); + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); // decodeable std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -114,9 +113,9 @@ int main(int argc, char* argv[]) { ori_feature_len - chunk_idx * chunk_stride, chunk_size); } if (this_chunk_size < receptive_field_length) { - LOG(WARNING) << "utt: " << utt << " skip last " - << this_chunk_size << " frames, expect is " - << receptive_field_length; + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; break; } @@ -127,7 +126,7 @@ int main(int argc, char* argv[]) { for (int row_id = 0; row_id < this_chunk_size; ++row_id) { kaldi::SubVector feat_row(feature, start); kaldi::SubVector feature_chunk_row( - feature_chunk.Data() + row_id * feat_dim, feat_dim); + feature_chunk.Data() + row_id * feat_dim, feat_dim); feature_chunk_row.CopyFromVec(feat_row); ++start; @@ -151,7 +150,7 @@ int main(int argc, char* argv[]) { // get 1-best result std::string result = decoder.GetFinalBestPath(); - + // after process one utt, then reset state. 
decodable->Reset(); decoder.Reset(); diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 4d0a21d5..2c2b6d3c 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -33,9 +33,7 @@ void TLGDecoder::Reset() { return; } -void TLGDecoder::InitDecoder() { - Reset(); -} +void TLGDecoder::InitDecoder() { Reset(); } void TLGDecoder::AdvanceDecode( const std::shared_ptr& decodable) { @@ -50,7 +48,6 @@ void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { } - std::string TLGDecoder::GetPartialResult() { if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call @@ -93,4 +90,4 @@ std::string TLGDecoder::GetFinalBestPath() { return words; } -} +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc index f262101a..e9bd8a3f 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -15,14 +15,12 @@ // todo refactor, repalce with gtest #include "base/common.h" - +#include "decoder/ctc_tlg_decoder.h" +#include "decoder/param.h" #include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" -#include "decoder/param.h" -#include "decoder/ctc_tlg_decoder.h" - -#include "kaldi/util/table-types.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); @@ -47,12 +45,13 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + ppspeech::TLGDecoderOptions opts = + ppspeech::TLGDecoderOptions::InitFromFlags(); opts.opts.beam = 15.0; opts.opts.lattice_beam = 7.5; ppspeech::TLGDecoder decoder(opts); - ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + ppspeech::ModelOptions model_opts = 
ppspeech::ModelOptions::InitFromFlags(); std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); @@ -67,7 +66,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; - + decoder.InitDecoder(); kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc index bb9ea187..870aa40a 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/decoder/recognizer.cc @@ -17,12 +17,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; Recognizer::Recognizer(const RecognizerResource& resource) { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 662943b5..8e83b188 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/recognizer.h" #include "decoder/param.h" +#include "decoder/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -25,8 +25,9 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); - resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 8fcc5d79..04712e7b 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -13,18 +13,20 @@ // limitations under the License. 
#include "decoder/u2_recognizer.h" + #include "nnet/u2_nnet.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; -U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource) { +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) + : opts_(resource) { const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); @@ -34,7 +36,8 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); CHECK(resource.vocab_path != ""); - decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + decoder_.reset(new CTCPrefixBeamSearch( + resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; @@ -70,140 +73,140 @@ void U2Recognizer::Accept(const VectorBase& waves) { } -void U2Recognizer::Decode() { - decoder_->AdvanceDecode(decodable_); +void U2Recognizer::Decode() { + decoder_->AdvanceDecode(decodable_); UpdateResult(false); } void U2Recognizer::Rescoring() { - // Do attention Rescoring - kaldi::Timer timer; - AttentionRescoring(); - VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; + // Do attention Rescoring + kaldi::Timer timer; + AttentionRescoring(); + VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; } void U2Recognizer::UpdateResult(bool finish) { - const auto& hypotheses = decoder_->Outputs(); - const auto& inputs = decoder_->Inputs(); - const auto& likelihood = decoder_->Likelihood(); - const auto& times = decoder_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < 
hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (decoder_->Type() == kWfstBeamSearch) { - path.sentence += (" " + word); - } else { - path.sentence += (word); - } - } + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - int offset = global_frame_offset_ * FrameShiftInMs(); + CHECK_EQ(hypotheses.size(), likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. 
So here we + // use time stamp of the input(e2e model unit), which is more accurate, + // and it requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); - const std::vector& input = inputs[i]; - const std::vector time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + int start = + time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - FrameShiftInMs() - : start; + if (j > 0) { + start = + (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = + (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } } - int end = time_stamp[j] * FrameShiftInMs(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - FrameShiftInMs() - : end; - } + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } + result_.emplace_back(path); } - // if (post_processor_ != nullptr) { - // path.sentence = post_processor_->Process(path.sentence, finish); - // } - - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } } void U2Recognizer::AttentionRescoring() { - decoder_->FinalizeSearch(); - UpdateResult(true); - - // No need to do rescoring - if (0.0 == opts_.decoder_opts.rescoring_weight) { - LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; - return; - } - LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; - - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = decoder_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - kaldi::Timer timer; - std::vector rescoring_score; - decodable_->AttentionRescoring( - hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); - VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; - - // combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; i++) { - VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score; - result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + - opts_.decoder_opts.ctc_weight * result_[i].score; - } - - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); - VLOG(1) << "result: " << result_[0].sentence - << " score: " << result_[0].score; -} + decoder_->FinalizeSearch(); + 
UpdateResult(true); -std::string U2Recognizer::GetFinalResult() { - return result_[0].sentence; -} + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + kaldi::Timer timer; + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score; + result_[i].score = + opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + } -std::string U2Recognizer::GetPartialResult() { - return result_[0].sentence; + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(1) << "result: " << result_[0].sentence + << " score: " << result_[0].score; } +std::string U2Recognizer::GetFinalResult() { return result_[0].sentence; } + +std::string U2Recognizer::GetPartialResult() { return result_[0].sentence; } + void U2Recognizer::SetFinished() { feature_pipeline_->SetFinished(); input_finished_ = true; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index b1a7b2e8..9eb0441b 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/u2_recognizer.h" #include "decoder/param.h" +#include "decoder/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -43,7 +43,8 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - ppspeech::U2RecognizerResource resource = ppspeech::U2RecognizerResource::InitFromFlags(); + ppspeech::U2RecognizerResource resource = + ppspeech::U2RecognizerResource::InitFromFlags(); ppspeech::U2Recognizer recognizer(resource); kaldi::Timer timer; @@ -103,7 +104,7 @@ int main(int argc, char* argv[]) { } double elapsed = timer.Elapsed(); - + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); LOG(INFO) << "cost:" << elapsed << " sec"; LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 5e84a1a1..7997e8a7 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -14,17 +14,18 @@ #include "frontend/audio/cmvn.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) @@ -57,7 +58,7 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. 
void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim_ != 0) { KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index 93a6d407..bb7e449f 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -16,16 +16,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); @@ -86,24 +85,27 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) { + for (; !wav_reader.Done() && !wav_info_reader.Done(); + wav_reader.Next(), wav_info_reader.Next()) { const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); const std::string& utt2 = wav_info_reader.Key(); const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); - CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!"; + CHECK(utt == utt2) + << "wav reader and wav info reader using diff rspecifier!!!"; LOG(INFO) << "utt: " << utt; LOG(INFO) << "samples: " << wave_info.SampleCount(); LOG(INFO) << "dur: " << 
wave_info.Duration() << " sec"; - CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) + << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); - + // compute feat chunk by chunk int tot_samples = waveform.Dim(); int sample_offset = 0; @@ -157,7 +159,8 @@ int main(int argc, char* argv[]) { ++cur_idx; } } - LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols(); + LOG(INFO) << "feat shape: " << features.NumRows() << " , " + << features.NumCols(); feat_writer.Write(utt, features); // reset frontend pipeline state diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 889f5663..42693c0c 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -14,16 +14,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 7232efc4..65493e42 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ 
b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,8 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) + : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); @@ -43,4 +44,4 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt new ppspeech::Assembler(opts.assembler_opts, std::move(cache))); } -} // ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index b76c6280..dc971e0f 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -18,8 +18,8 @@ namespace ppspeech { using kaldi::BaseFloat; using kaldi::Matrix; -using std::vector; using kaldi::Vector; +using std::vector; Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, @@ -56,7 +56,6 @@ int32 Decodable::NumIndices() const { return 0; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } - bool Decodable::EnsureFrameHaveComputed(int32 frame) { // decoding frame if (frame >= frames_ready_) { @@ -92,14 +91,15 @@ bool Decodable::AdvanceChunk() { return true; } -bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim) { +bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim) { if (AdvanceChunk() == false) { return false; } int nrows = nnet_out_cache_.NumRows(); - CHECK(nrows == (frames_ready_ - frame_offset_)); - if (nrows <= 0){ + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0) { LOG(WARNING) << "No new nnet out in cache."; return false; } @@ -107,7 +107,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* voc logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); logprobs->CopyRowsFromMat(nnet_out_cache_); - *vocab_dim = nnet_out_cache_.NumCols(); + 
*vocab_dim = nnet_out_cache_.NumCols(); return true; } @@ -140,7 +140,7 @@ BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { BaseFloat logprob = 0.0; int32 frame_idx = frame - frame_offset_; BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); - if (nnet_->IsLogProb()){ + if (nnet_->IsLogProb()) { logprob = nnet_out; } else { logprob = std::log(nnet_out + std::numeric_limits::epsilon()); @@ -158,8 +158,8 @@ void Decodable::Reset() { } void Decodable::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score){ + float reverse_weight, + std::vector* rescoring_score) { nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 71252477..4bafdf83 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -242,7 +242,6 @@ void U2Nnet::ForwardEncoderChunkImpl( const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { - #ifdef USE_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); @@ -349,8 +348,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // current offset in decoder frame // not used in nnet offset_ += chunk_out.shape()[1]; - VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] << " total: " << offset_ ; - + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] + << " total: " << offset_; + // collects encoder outs. 
encoder_outs_.push_back(chunk_out); @@ -706,12 +706,13 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, } -void U2Nnet::EncoderOuts(std::vector>* encoder_out) const { +void U2Nnet::EncoderOuts( + std::vector>* encoder_out) const { // list of (B=1,T,D) int size = encoder_outs_.size(); VLOG(1) << "encoder_outs_ size: " << size; - for (int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; const std::vector shape = item.shape(); CHECK(shape.size() == 3); @@ -719,16 +720,17 @@ void U2Nnet::EncoderOuts(std::vector>* encoder_o const int& T = shape[1]; const int& D = shape[2]; CHECK(B == 1) << "Only support batch one."; - VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; + VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," + << D << ")"; - const float *this_tensor_ptr = item.data(); - for (int j = 0; j < T; j++){ - const float* cur = this_tensor_ptr + j * D; + const float* this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++) { + const float* cur = this_tensor_ptr + j * D; kaldi::Vector out(D); std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); encoder_out->emplace_back(out); } } - } +} } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index adbbf0e8..5039a59a 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -14,11 +14,11 @@ #include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "decoder/param.h" #include "nnet/u2_nnet.h" @@ -46,15 +46,16 @@ int main(int argc, char* argv[]) { LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; - kaldi::SequentialBaseFloatMatrixReader 
feature_reader(FLAGS_feature_rspecifier); + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); - kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); + kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer( + FLAGS_nnet_encoder_outs_wspecifier); ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); - int32 chunk_size = - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + - FLAGS_receptive_field_length; + int32 chunk_size = (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + + FLAGS_receptive_field_length; int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; @@ -92,9 +93,9 @@ int main(int argc, char* argv[]) { ori_feature_len - chunk_idx * chunk_stride, chunk_size); } if (this_chunk_size < receptive_field_length) { - LOG(WARNING) << "utt: " << utt << " skip last " - << this_chunk_size << " frames, expect is " - << receptive_field_length; + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; break; } @@ -123,13 +124,17 @@ int main(int argc, char* argv[]) { kaldi::Vector logprobs; bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim); CHECK(isok == true); - for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; row_idx ++) { + for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; + row_idx++) { kaldi::Vector vec_tmp(vocab_dim); - std::memcpy(vec_tmp.Data(), logprobs.Data() + row_idx*vocab_dim, sizeof(kaldi::BaseFloat) * vocab_dim); + std::memcpy(vec_tmp.Data(), + logprobs.Data() + row_idx * vocab_dim, + sizeof(kaldi::BaseFloat) * vocab_dim); prob_vec.push_back(vec_tmp); } - VLOG(2) << "frame_idx: " << frame_idx << " elapsed: " << timer.Elapsed() << " sec."; + VLOG(2) << 
"frame_idx: " << frame_idx + << " elapsed: " << timer.Elapsed() << " sec."; } // get encoder out @@ -141,7 +146,8 @@ int main(int argc, char* argv[]) { if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) { // the TokenWriter can not write empty string. ++num_err; - LOG(WARNING) << " the nnet prob/encoder_out of " << utt << " is empty"; + LOG(WARNING) << " the nnet prob/encoder_out of " << utt + << " is empty"; continue; } @@ -168,7 +174,8 @@ int main(int argc, char* argv[]) { kaldi::Matrix encoder_outs(nrow, ncol); for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { - encoder_outs(row_idx, col_idx) = encoder_out_vec[row_idx](col_idx); + encoder_outs(row_idx, col_idx) = + encoder_out_vec[row_idx](col_idx); } } nnet_encoder_outs_writer.Write(utt, encoder_outs); diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 827b164f..5c32caf2 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "websocket/websocket_server.h" #include "decoder/param.h" +#include "websocket/websocket_server.h" DEFINE_int32(port, 8082, "websocket listening port"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index c218990a..289470f6 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -16,13 +16,13 @@ #include "utils/math.h" -#include "base/common.h" - #include #include #include #include +#include "base/common.h" + namespace ppspeech { @@ -89,8 +89,8 @@ void TopK(const std::vector& data, } template void TopK(const std::vector& data, - int32_t k, - std::vector* values, - std::vector* indices) ; + int32_t k, + std::vector* values, + std::vector* indices); } // namespace ppspeech \ No newline at end of file From 99b3632d4d904e348e4cf37397538bb0a11bd2a8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:41:09 +0000 Subject: [PATCH 27/60] seprate recognizer; NnetBase as base class --- speechx/speechx/CMakeLists.txt | 6 +++ speechx/speechx/decoder/CMakeLists.txt | 14 ++---- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/nnet/decodable.h | 6 +-- speechx/speechx/nnet/ds2_nnet.h | 2 +- speechx/speechx/nnet/nnet_itf.h | 10 +++-- speechx/speechx/nnet/u2_nnet.cc | 2 +- speechx/speechx/nnet/u2_nnet.h | 6 +-- .../speechx/protocol/websocket/CMakeLists.txt | 2 +- .../protocol/websocket/websocket_server.h | 2 +- speechx/speechx/recognizer/CMakeLists.txt | 45 
+++++++++++++++++++ .../{decoder => recognizer}/recognizer.cc | 2 +- .../{decoder => recognizer}/recognizer.h | 0 .../recognizer_main.cc | 13 +----- .../{decoder => recognizer}/u2_recognizer.cc | 4 +- .../{decoder => recognizer}/u2_recognizer.h | 0 .../u2_recognizer_main.cc | 2 +- 17 files changed, 78 insertions(+), 40 deletions(-) create mode 100644 speechx/speechx/recognizer/CMakeLists.txt rename speechx/speechx/{decoder => recognizer}/recognizer.cc (97%) rename speechx/speechx/{decoder => recognizer}/recognizer.h (100%) rename speechx/speechx/{decoder => recognizer}/recognizer_main.cc (88%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer.cc (98%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer.h (100%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer_main.cc (99%) diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt index c8e21d48..60c18347 100644 --- a/speechx/speechx/CMakeLists.txt +++ b/speechx/speechx/CMakeLists.txt @@ -32,6 +32,12 @@ ${CMAKE_CURRENT_SOURCE_DIR}/decoder ) add_subdirectory(decoder) +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/recognizer +) +add_subdirectory(recognizer) + include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/protocol diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index d06c3529..5bec24a6 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -1,28 +1,24 @@ -project(decoder) - include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) -set(decoder_src ) +set(srcs) if (USING_DS2) -list(APPEND decoder_src +list(APPEND srcs ctc_decoders/decoder_utils.cpp ctc_decoders/path_trie.cpp ctc_decoders/scorer.cpp ctc_beam_search_decoder.cc ctc_tlg_decoder.cc -recognizer.cc ) endif() if (USING_U2) - list(APPEND decoder_src + list(APPEND srcs ctc_prefix_beam_search_decoder.cc - u2_recognizer.cc ) endif() -add_library(decoder STATIC 
${decoder_src}) +add_library(decoder STATIC ${srcs}) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) # test @@ -30,7 +26,6 @@ if (USING_DS2) set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main - recognizer_main ctc_tlg_decoder_main ) @@ -45,7 +40,6 @@ endif() if (USING_U2) set(TEST_BINS ctc_prefix_beam_search_decoder_main - u2_recognizer_main ) foreach(bin_name IN LISTS TEST_BINS) diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index dc971e0f..9bad8ed4 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -21,7 +21,7 @@ using kaldi::Matrix; using kaldi::Vector; using std::vector; -Decodable::Decodable(const std::shared_ptr& nnet, +Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale) : frontend_(frontend), diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 70a16e2c..dd7b329e 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -24,7 +24,7 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable(const std::shared_ptr& nnet, + explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale = 1.0); @@ -63,14 +63,14 @@ class Decodable : public kaldi::DecodableInterface { int32 TokenId2NnetId(int32 token_id); - std::shared_ptr Nnet() { return nnet_; } + std::shared_ptr Nnet() { return nnet_; } // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); private: std::shared_ptr frontend_; - std::shared_ptr nnet_; + std::shared_ptr nnet_; // nnet outputs' cache kaldi::Matrix nnet_out_cache_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 4aeec32f..d1e3ac8c 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -48,7 +48,7 @@ class 
Tensor { std::vector _data; }; -class PaddleNnet : public NnetInterface { +class PaddleNnet : public NnetBase { public: PaddleNnet(const ModelOptions& opts); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index cc737ce0..a504cce5 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -11,8 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - - #pragma once #include "base/basic_types.h" @@ -105,11 +103,15 @@ class NnetInterface { // true, nnet output is logprob; otherwise is prob, virtual bool IsLogProb() = 0; - int SubsamplingRate() const { return subsampling_rate_; } - // using to get encoder outs. e.g. seq2seq with Attention model. virtual void EncoderOuts( std::vector>* encoder_out) const = 0; +}; + + +class NnetBase : public NnetInterface { + public: + int SubsamplingRate() const { return subsampling_rate_; } protected: int subsampling_rate_{1}; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 4bafdf83..c92c96aa 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -193,7 +193,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { // ignore inner states } -std::shared_ptr U2Nnet::Copy() const { +std::shared_ptr U2Nnet::Copy() const { auto asr_model = std::make_shared(*this); // reset inner state for new decoding asr_model->Reset(); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 3435bca8..a37a88f2 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -24,7 +24,7 @@ namespace ppspeech { -class U2NnetBase : public NnetInterface { +class U2NnetBase : public NnetBase { public: virtual int context() const { return right_context_ + 1; } virtual int right_context() const { return right_context_; } @@ -41,7 +41,7 @@ class U2NnetBase : public 
NnetInterface { // start: false, it is the start chunk of one sentence, else true virtual int num_frames_for_chunk(bool start) const; - virtual std::shared_ptr Copy() const = 0; + virtual std::shared_ptr Copy() const = 0; virtual void ForwardEncoderChunk( const std::vector& chunk_feats, @@ -99,7 +99,7 @@ class U2Nnet : public U2NnetBase { std::shared_ptr model() const { return model_; } - std::shared_ptr Copy() const override; + std::shared_ptr Copy() const override; void ForwardEncoderChunkImpl( const std::vector& chunk_feats, diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index a171d84d..cafbbec7 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -2,7 +2,7 @@ add_library(websocket STATIC websocket_server.cc websocket_client.cc ) -target_link_libraries(websocket PUBLIC frontend decoder nnet) +target_link_libraries(websocket PUBLIC frontend nnet decoder recognizer) add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc) target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 8f3360e4..9b05f868 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -19,7 +19,7 @@ #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" -#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" #include "frontend/audio/feature_pipeline.h" namespace beast = boost::beast; // from diff --git a/speechx/speechx/recognizer/CMakeLists.txt b/speechx/speechx/recognizer/CMakeLists.txt new file mode 100644 index 00000000..05078873 --- /dev/null +++ b/speechx/speechx/recognizer/CMakeLists.txt @@ -0,0 +1,45 @@ +set(srcs) + +if 
(USING_DS2) +list(APPEND srcs +recognizer.cc +) +endif() + +if (USING_U2) + list(APPEND srcs + u2_recognizer.cc + ) +endif() + +add_library(recognizer STATIC ${srcs}) +target_link_libraries(recognizer PUBLIC decoder) + +# test +if (USING_DS2) + set(BINS recognizer_main) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) + set(TEST_BINS + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() + diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/recognizer/recognizer.cc similarity index 97% rename from speechx/speechx/decoder/recognizer.cc rename to speechx/speechx/recognizer/recognizer.cc index 870aa40a..c6631813 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/recognizer/recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/recognizer/recognizer.h similarity index 100% rename from speechx/speechx/decoder/recognizer.h rename to speechx/speechx/recognizer/recognizer.h diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc similarity index 88% rename from speechx/speechx/decoder/recognizer_main.cc rename to speechx/speechx/recognizer/recognizer_main.cc index 8e83b188..7c30fe6a 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "decoder/param.h" -#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -22,15 +22,6 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); -ppspeech::RecognizerResource InitRecognizerResoure() { - ppspeech::RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = - ppspeech::FeaturePipelineOptions::InitFromFlags(); - resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); - return resource; -} int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); @@ -39,7 +30,7 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = InitRecognizerResoure(); + ppspeech::RecognizerResource resource = ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/decoder/u2_recognizer.cc 
b/speechx/speechx/recognizer/u2_recognizer.cc similarity index 98% rename from speechx/speechx/decoder/u2_recognizer.cc rename to speechx/speechx/recognizer/u2_recognizer.cc index 04712e7b..75834aa5 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/u2_recognizer.h" +#include "recognizer/u2_recognizer.h" #include "nnet/u2_nnet.h" @@ -30,7 +30,7 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); - std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h similarity index 100% rename from speechx/speechx/decoder/u2_recognizer.h rename to speechx/speechx/recognizer/u2_recognizer.h diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc similarity index 99% rename from speechx/speechx/decoder/u2_recognizer_main.cc rename to speechx/speechx/recognizer/u2_recognizer_main.cc index 9eb0441b..ff848f58 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "decoder/param.h" -#include "decoder/u2_recognizer.h" +#include "recognizer/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" From 043246e16807b51e6a9f29b2da5ff18428614f45 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:54:06 +0000 Subject: [PATCH 28/60] format --- speechx/speechx/protocol/websocket/websocket_server.h | 2 +- speechx/speechx/recognizer/u2_recognizer_main.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 9b05f868..b0dcb3e3 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -19,8 +19,8 @@ #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" -#include "recognizer/recognizer.h" #include "frontend/audio/feature_pipeline.h" +#include "recognizer/recognizer.h" namespace beast = boost::beast; // from namespace http = beast::http; // from diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index ff848f58..38bd5ccc 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -13,9 +13,9 @@ // limitations under the License. 
#include "decoder/param.h" -#include "recognizer/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/u2_recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); From 005d0e17be246f3e867be4250fe890cedf58c205 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:57:09 +0000 Subject: [PATCH 29/60] update path.sh --- speechx/examples/codelab/u2/path.sh | 2 +- speechx/examples/u2pp_ol/wenetspeech/path.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh index 7f32fbce..d0600133 100644 --- a/speechx/examples/codelab/u2/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -12,7 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh index 7f32fbce..d0600133 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/path.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -12,7 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH From 
138b4fe1f083d24d5bdccc461224dcf13ef0706d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 06:59:21 +0000 Subject: [PATCH 30/60] fix cmake paddle flags; more doc info --- speechx/CMakeLists.txt | 8 ++++-- speechx/README.md | 4 +++ speechx/examples/README.md | 30 +++++++++++++++++--- speechx/examples/codelab/README.md | 5 ++-- speechx/examples/codelab/u2/path.sh | 2 +- speechx/examples/u2pp_ol/README.md | 2 +- speechx/examples/u2pp_ol/wenetspeech/path.sh | 2 +- 7 files changed, 41 insertions(+), 12 deletions(-) diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 17e64c04..6255cb2e 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -100,8 +100,9 @@ message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") # paddle include and link option +# -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so execute_process( - COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')" + COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]); out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);" OUTPUT_VARIABLE PADDLE_LINK_FLAGS RESULT_VARIABLE SUCESS) @@ -109,8 +110,9 @@ message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) # paddle compile option +# -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include execute_process( - COMMAND python -c "import paddle ; print(' 
'.join(paddle.sysconfig.get_compile_flags()), end='')" + COMMAND python -c "import paddle; include_dir = paddle.sysconfig.get_include(); print(f\"-I{include_dir}\");" OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) @@ -119,7 +121,7 @@ string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) # for LD_LIBRARY_PATH # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) execute_process( - COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')" + COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);" OUTPUT_VARIABLE PADDLE_LIB_DIRS) message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) diff --git a/speechx/README.md b/speechx/README.md index cc7b13e6..3861edf3 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -35,11 +35,15 @@ bash tools/venv.sh 2. Build `speechx` and `examples`. +For now we using feature under `develop` branch of paddle, so we need install `paddlepaddle` nightly build version. +For example: ``` source venv/bin/activate +python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html ./build.sh ``` + 3. Go to `examples` to have a fun. More details please see `README.md` under `examples`. diff --git a/speechx/examples/README.md b/speechx/examples/README.md index f7f6f9ac..de27bd94 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,20 +1,42 @@ # Examples for SpeechX +> `u2pp_ol` is recommended. + +* `u2pp_ol` - u2++ streaming asr test under `aishell-1` test dataset. 
* `ds2_ol` - ds2 streaming test under `aishell-1` test dataset. + ## How to run -`run.sh` is the entry point. +### Create env + +Using `tools/evn.sh` under `speechx` to create python env. + +``` +bash tools/env.sh +``` + +Source env before play with example. +``` +. venv/bin/activate +``` + +### Play with example + +`run.sh` is the entry point for every example. -Example to play `ds2_ol`: +Example to play `u2pp_ol`: ``` -pushd ds2_ol/aishell -bash run.sh +pushd u2pp_ol/wenetspeech +bash run.sh --stop_stage 4 ``` ## Display Model with [Netron](https://github.com/lutzroeder/netron) +If you have a model, we can using this commnd to show model graph. + +For example: ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md index f89184de..803f25fa 100644 --- a/speechx/examples/codelab/README.md +++ b/speechx/examples/codelab/README.md @@ -1,8 +1,9 @@ # Codelab -## introduction +> The below is for developing and offline testing. +> Do not run it only if you know what it is. -> The below is for developing and offline testing. Do not run it only if you know what it is. 
* nnet * feat * decoder +* u2 diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh index d0600133..ec278bd3 100644 --- a/speechx/examples/codelab/u2/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -14,5 +14,5 @@ export LC_AL=C export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer -PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md index ce01a8fc..838db435 100644 --- a/speechx/examples/u2pp_ol/README.md +++ b/speechx/examples/u2pp_ol/README.md @@ -2,4 +2,4 @@ ## Examples -* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing. +* `wenetspeech` - Streaming Decoding with wenetspeech u2/u2++ model. Using aishell test data for testing. 
diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh index d0600133..ec278bd3 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/path.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -14,5 +14,5 @@ export LC_AL=C export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer -PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH From 36af34b293a18c0fc3b61de3b261b04468cac1b7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 09:00:14 +0000 Subject: [PATCH 31/60] add DecoderBase and license --- speechx/speechx/decoder/ctc_beam_search_decoder.h | 2 +- speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h | 5 ++++- speechx/speechx/decoder/ctc_prefix_beam_search_score.h | 2 ++ speechx/speechx/decoder/ctc_tlg_decoder.h | 2 +- speechx/speechx/decoder/decoder_itf.h | 3 +++ speechx/speechx/nnet/u2_nnet.cc | 3 +++ speechx/speechx/nnet/u2_nnet.h | 2 ++ 7 files changed, 16 insertions(+), 3 deletions(-) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 479754c3..6347bba8 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -23,7 +23,7 @@ namespace ppspeech { -class CTCBeamSearch : public DecoderInterface { +class CTCBeamSearch : public DecoderBase { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h 
b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index eef8823d..ef96ecd9 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc + #pragma once #include "decoder/ctc_beam_search_opt.h" @@ -21,7 +24,7 @@ namespace ppspeech { class ContextGraph; -class CTCPrefixBeamSearch : public DecoderInterface { +class CTCPrefixBeamSearch : public DecoderBase { public: explicit CTCPrefixBeamSearch(const std::string vocab_path, const CTCBeamSearchOptions& opts); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index da2fb80a..908be1d6 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h + #pragma once #include "base/common.h" diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index cf8a9b73..f250ac25 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -54,7 +54,7 @@ struct TLGDecoderOptions { } }; -class TLGDecoder : public DecoderInterface { +class TLGDecoder : public DecoderBase { public: explicit TLGDecoder(TLGDecoderOptions opts); ~TLGDecoder() = default; diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index eec9bc3d..2289b317 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -51,7 +51,10 @@ class DecoderInterface { virtual std::vector> GetNBestPath() = 0; virtual std::vector> GetNBestPath(int n) = 0; +}; +class DecoderBase : public DecoderInterface { + protected: // start from one int NumFrameDecoded() { return num_frame_decoded_ + 1; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index c92c96aa..ff6a4dc3 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -1,3 +1,4 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc + #include "nnet/u2_nnet.h" #ifdef USE_PROFILING diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index a37a88f2..48dd8193 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -1,3 +1,4 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. 
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h #pragma once #include "base/common.h" From 0a8ef58af088d58ae882044640eba5fcb64ccf13 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 09:10:49 +0000 Subject: [PATCH 32/60] remove uesless code --- speechx/speechx/nnet/u2_nnet.cc | 83 +-------------------------------- speechx/speechx/nnet/u2_nnet.h | 27 ++++------- 2 files changed, 10 insertions(+), 100 deletions(-) diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index ff6a4dc3..baae2ce8 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -25,65 +25,6 @@ using paddle::platform::TracerEventType; namespace ppspeech { -int U2NnetBase::num_frames_for_chunk(bool start) const { - int num_needed_frames = 0; // num feat frames - bool first = !start; // start == false is first - - if (chunk_size_ > 0) { - // streaming mode - if (first) { - // first chunk - // 1 decoder frame need `context` feat frames - int context = this->context(); - num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - // after first chunk, we need stride this num frames. - num_needed_frames = chunk_size_ * subsampling_rate_; - } - } else { - // non-streaming mode. feed all feats once. 
- num_needed_frames = std::numeric_limits::max(); - } - - return num_needed_frames; -} - -// cache feats for next chunk -void U2NnetBase::CacheFeature(const std::vector& chunk_feats, - int32 feat_dim) { - // chunk_feats is nframes*feat_dim - const int chunk_size = chunk_feats.size() / feat_dim; - const int cached_feat_size = this->context() - subsampling_rate_; - if (chunk_size >= cached_feat_size) { - cached_feats_.resize(cached_feat_size); - for (int i = 0; i < cached_feat_size; ++i) { - auto start = - chunk_feats.begin() + chunk_size - cached_feat_size + i; - auto end = start + feat_dim; - cached_feats_[i] = std::vector(start, end); - } - } -} - -void U2NnetBase::ForwardEncoderChunk( - const std::vector& chunk_feats, - const int32& feat_dim, - std::vector* ctc_probs, - int32* vocab_dim) { - ctc_probs->clear(); - // int num_frames = cached_feats_.size() + chunk_feats.size(); - int num_frames = chunk_feats.size() / feat_dim; - VLOG(3) << "foward encoder chunk: " << num_frames << " frames"; - VLOG(3) << "context: " << this->context() << " frames"; - - if (num_frames >= this->context()) { - this->ForwardEncoderChunkImpl( - chunk_feats, feat_dim, ctc_probs, vocab_dim); - VLOG(3) << "after forward chunk"; - this->CacheFeature(chunk_feats, feat_dim); - } -} - void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { paddle::jit::utils::InitKernelSignatureMap(); @@ -188,7 +129,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { forward_attention_decoder_ = other.forward_attention_decoder_; ctc_activation_ = other.ctc_activation_; - // offset_ = other.offset_; // TODO: not used in nnets + offset_ = other.offset_; // copy model ptr model_ = other.model_; @@ -204,8 +145,7 @@ std::shared_ptr U2Nnet::Copy() const { } void U2Nnet::Reset() { - // offset_ = 0; - // cached_feats_.clear(); // TODO: not used in nnets + offset_ = 0; att_cache_ = std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); @@ -263,16 +203,6 @@ void U2Nnet::ForwardEncoderChunkImpl( 
paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); float* feats_ptr = feats.mutable_data(); - // for (size_t i = 0; i < cached_feats_.size(); ++i) { - // float* row = feats_ptr + i * feat_dim; - // std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float)); - // } - - // for (size_t i = 0; i < chunk_feats.size(); ++i) { - // float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim; - // std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float)); - // } - // not cache feature in nnet CHECK(cached_feats_.size() == 0); // CHECK_EQ(std::is_same::value, true); @@ -427,15 +357,6 @@ void U2Nnet::ForwardEncoderChunkImpl( float* ctc_log_probs_ptr = ctc_log_probs.data(); - // // vector> - // out_prob->resize(T); - // for (int i = 0; i < T; i++) { - // (*out_prob)[i].resize(D); - // float* dst_ptr = (*out_prob)[i].data(); - // float* src_ptr = ctc_log_probs_ptr + (i * D); - // std::memcpy(dst_ptr, src_ptr, D * sizeof(float)); - // } - // CHECK(std::is_same::value); out_prob->resize(T * D); std::memcpy( out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 48dd8193..6cbc0570 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -28,29 +28,21 @@ namespace ppspeech { class U2NnetBase : public NnetBase { public: - virtual int context() const { return right_context_ + 1; } - virtual int right_context() const { return right_context_; } + virtual int Context() const { return right_context_ + 1; } + virtual int RightContext() const { return right_context_; } - virtual int eos() const { return eos_; } - virtual int sos() const { return sos_; } - virtual int is_bidecoder() const { return is_bidecoder_; } + virtual int EOS() const { return eos_; } + virtual int SOS() const { return sos_; } + virtual int IsBidecoder() const { return is_bidecoder_; } // current offset in decoder frame - virtual int offset() const { 
return offset_; } - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { + virtual int Offset() const { return offset_; } + virtual void SetChunkSize(int chunk_size) { chunk_size_ = chunk_size; } + virtual void SetNumLeftChunks(int num_left_chunks) { num_left_chunks_ = num_left_chunks; } - // start: false, it is the start chunk of one sentence, else true - virtual int num_frames_for_chunk(bool start) const; virtual std::shared_ptr Copy() const = 0; - virtual void ForwardEncoderChunk( - const std::vector& chunk_feats, - const int32& feat_dim, - std::vector* ctc_probs, - int32* vocab_dim); - protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, @@ -58,9 +50,6 @@ class U2NnetBase : public NnetBase { std::vector* ctc_probs, int32* vocab_dim) = 0; - virtual void CacheFeature(const std::vector& chunk_feats, - int32 feat_dim); - protected: // model specification int right_context_{0}; From 050d766915c01a59fd4880dfb263dbc30605944f Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 19 Oct 2022 05:31:18 +0000 Subject: [PATCH 33/60] fix u2pp model --- docs/source/released_model.md | 2 +- paddlespeech/cli/asr/infer.py | 4 ++-- paddlespeech/resource/model_alias.py | 1 - paddlespeech/resource/pretrained_models.py | 26 +++------------------- 4 files changed, 6 insertions(+), 27 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index a2456f1f..586f17c3 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | 
onnx/inference/python | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, 
Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 437f6463..00414336 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -52,7 +52,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='conformer_u2pp_wenetspeech', + default='conformer_u2pp_online_wenetspeech', choices=[ tag[:tag.index('-')] for tag in self.task_resource.pretrained_models.keys() @@ -470,7 +470,7 @@ class ASRExecutor(BaseExecutor): @stats_wrapper def __call__(self, audio_file: os.PathLike, - model: str='conformer_u2pp_wenetspeech', + model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', sample_rate: int=16000, config: os.PathLike=None, diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py index f5ec655b..8e9ecc4b 100644 --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -25,7 +25,6 @@ model_alias = { "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"], "conformer": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"], - "conformer_u2pp": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_u2pp_online": ["paddlespeech.s2t.models.u2:U2Model"], "transformer": ["paddlespeech.s2t.models.u2:U2Model"], "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"], diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 
efd6bb3f..df50a6a9 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -68,32 +68,12 @@ asr_dynamic_pretrained_models = { '', }, }, - "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', - 'md5': - '662b347e1d2131b7a4dc5398365e2134', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10', - 'model': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'params': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'lm_url': - '', - 'lm_md5': - '', - }, - }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz', 'md5': - '3100fc1eac5779486cab859366992d0b', + '62d230c1bf27731192aa9d3b8deca300', 'cfg_path': 'model.yaml', 'ckpt_path': From f9fc32e89ebd82193feceea3bf79bb27b4ee5d80 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 Oct 2022 07:56:57 +0000 Subject: [PATCH 34/60] fix scripts --- .../examples/u2pp_ol/wenetspeech/local/recognizer.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index 29b50537..bf463545 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -6,10 +6,13 @@ set -e data=data exp=exp nj=20 + + mkdir -p $exp ckpt_dir=./data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ aishell_wav_scp=aishell_test.scp +text=$data/test/text ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp 
$nj @@ -27,8 +30,8 @@ u2_recognizer_main \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark -cat $data/split${nj}/*/result_recognizer.ark > $exp/${label_file}_recognizer -utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer +cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer +utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err echo "recognizer test have finished!!!" -echo "please checkout in ${exp}/${wer}.recognizer" -tail -n 7 $exp/${wer}.recognizer \ No newline at end of file +echo "please checkout in $exp/aishell.recognizer.err" +tail -n 7 $exp/aishell.recognizer.err \ No newline at end of file From 7e334ce890a512f067af9a0918632a1c3c45001e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 Oct 2022 12:43:47 +0000 Subject: [PATCH 35/60] fix assembler buf, which not clear cache, and fill zero default --- .../u2pp_ol/wenetspeech/local/recognizer.sh | 2 +- speechx/speechx/frontend/audio/assembler.cc | 50 ++++++++++++++----- speechx/speechx/frontend/audio/assembler.h | 31 ++++++------ speechx/speechx/frontend/audio/audio_cache.cc | 4 ++ speechx/speechx/frontend/audio/audio_cache.h | 4 +- .../speechx/frontend/audio/feature_cache.cc | 3 ++ .../speechx/frontend/audio/feature_cache.h | 10 ++-- speechx/speechx/nnet/u2_nnet.cc | 1 + .../speechx/recognizer/u2_recognizer_main.cc | 6 ++- 9 files changed, 77 insertions(+), 34 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index bf463545..f71a8003 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -5,7 +5,7 @@ set -e data=data exp=exp -nj=20 +nj=40 mkdir -p $exp diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 37eeec80..ff1b1f28 100644 --- 
a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -23,9 +23,11 @@ using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, unique_ptr base_extractor) { + fill_zero_ = opts.fill_zero; frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk; frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length; + cache_size_ = frame_chunk_size_ - frame_chunk_stride_; receptive_filed_length_ = opts.receptive_filed_length; base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -38,14 +40,13 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { - feats->Resize(dim_ * frame_chunk_size_); bool result = Compute(feats); return result; } -// read all data from base_feature_extractor_ into cache_ +// read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { - // compute and feed + // compute and feed frame by frame bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; @@ -54,33 +55,58 @@ bool Assembler::Compute(Vector* feats) { if (IsFinished() == false) return false; break; } + + CHECK(feature.Dim() == dim_); + nframes_ += 1; + VLOG(1) << "nframes: " << nframes_; + feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { + VLOG(1) << "feature_cache less than receptive_filed_lenght. 
" << feature_cache_.size() << ": " << receptive_filed_length_; return false; } - while (feature_cache_.size() < frame_chunk_size_) { - Vector feature(dim_, kaldi::kSetZero); - feature_cache_.push(feature); + + if (fill_zero_){ + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature(dim_, kaldi::kSetZero); + nframes_ += 1; + feature_cache_.push(feature); + } } + int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + feats->Resize(dim_ * this_chunk_size); + int32 counter = 0; - int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; - int32 elem_dim = base_extractor_->Dim(); - while (counter < frame_chunk_size_) { + while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); - int32 start = counter * elem_dim; - feats->Range(start, elem_dim).CopyFromVec(val); - if (frame_chunk_size_ - counter <= cache_size) { + CHECK(val.Dim() == dim_) << val.Dim(); + + int32 start = counter * dim_; + feats->Range(start, dim_).CopyFromVec(val); + + if (this_chunk_size - counter <= cache_size_) { feature_cache_.push(val); } + + // val is reference, so we should pop here feature_cache_.pop(); + counter++; } return result; } + + void Assembler::Reset() { + std::queue> empty; + std::swap(feature_cache_, empty); + nframes_ = 0; + base_extractor_->Reset(); +} + } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 258e61f2..4f165ea8 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -22,14 +22,10 @@ namespace ppspeech { struct AssemblerOptions { // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py // the nnet batch forward - int32 receptive_filed_length; - int32 subsampling_rate; - int32 nnet_decoder_chunk; - - AssemblerOptions() - : receptive_filed_length(1), - subsampling_rate(1), - nnet_decoder_chunk(1) {} + int32 
receptive_filed_length{1}; + int32 subsampling_rate{1}; + int32 nnet_decoder_chunk{1}; + bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -39,29 +35,34 @@ class Assembler : public FrontendInterface { std::unique_ptr base_extractor = NULL); // Feed feats or waves - virtual void Accept(const kaldi::VectorBase& inputs); + void Accept(const kaldi::VectorBase& inputs) override; // feats size = num_frames * feat_dim - virtual bool Read(kaldi::Vector* feats); + bool Read(kaldi::Vector* feats) override; // feat dim - virtual size_t Dim() const { return dim_; } + size_t Dim() const override { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } + void SetFinished() override { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool IsFinished() const override { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } + void Reset() override; private: bool Compute(kaldi::Vector* feats); - int32 dim_; + bool fill_zero_{false}; + + int32 dim_; // feat dim int32 frame_chunk_size_; // window int32 frame_chunk_stride_; // stride + int32 cache_size_; // window - stride int32 receptive_filed_length_; std::queue> feature_cache_; std::unique_ptr base_extractor_; + + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index b7a15acd..71e5d09e 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -83,6 +83,10 @@ bool AudioCache::Read(Vector* waves) { } size_ -= chunk_size; offset_ = (offset_ + chunk_size) % ring_buffer_.size(); + + nsamples_ += chunk_size; + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git 
a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index fc07d4ba..da422daa 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -41,10 +41,11 @@ class AudioCache : public FrontendInterface { virtual bool IsFinished() const { return finished_; } - virtual void Reset() { + void Reset() override { offset_ = 0; size_ = 0; finished_ = false; + nsamples_ = 0; } private: @@ -61,6 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram + int32 nsamples_; // number samples readed. DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 509a98c3..c712e48e 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -73,6 +73,9 @@ bool FeatureCache::Compute() { if (result == false || feature.Dim() == 0) return false; int32 num_chunk = feature.Dim() / dim_; + nframe_ += num_chunk; + VLOG(1) << "nframe computed: " << nframe_; + for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; Vector feature_chunk(dim_); diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b922de12..09d7f7eb 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -51,11 +51,12 @@ class FeatureCache : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { + void Reset() override { + std::queue> empty; + std::swap(cache_, empty); + nframe_ = 0; base_extractor_->Reset(); - while (!cache_.empty()) { - cache_.pop(); - } + VLOG(1) << "feature cache reset: cache size: " << cache_.size(); } private: @@ -74,6 +75,7 @@ class 
FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index baae2ce8..63a8a793 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -153,6 +153,7 @@ void U2Nnet::Reset() { std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); encoder_outs_.clear(); + VLOG(1) << "u2nnet reset"; } // Debug API diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 38bd5ccc..2375586e 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -82,9 +82,13 @@ int main(int argc, char* argv[]) { // no overlap sample_offset += cur_chunk_size; } + CHECK(sample_offset == tot_samples); + + // recognizer.SetFinished(); + // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); From 3ac7ac253f66c46f01aa11be3de95d6177f47107 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 09:29:11 +0800 Subject: [PATCH 36/60] fix review issue,test=tts --- paddlespeech/t2s/ssml/xml_processor.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index 54f24f59..b3912134 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -35,8 +35,8 @@ class MixTextProcessor(): return None def get_content_split(self, mixstr): - ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 - 不能去除空格,因为xml 中tag 属性带空格 + ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为 xml 中tag 属性带空格 ''' ctlist = [] # print("Testing:",mixstr[:20]) @@ -77,17 +77,12 @@ class MixTextProcessor(): class DomXml(): 
def __init__(self, xmlstr): - print("Parse xml str:", xmlstr) self.tdom = parseString(xmlstr) #Document - # print("tdom:",type(self.tdom)) self.root = self.tdom.documentElement #Element - # print("root:",type(self.root)) self.rnode = self.tdom.childNodes #NodeList - # print("rnode:",type(self.rnode)) - pass def get_text(self): - '''返回xml 内容的所有文本内容的 列表''' + '''返回 xml 内容的所有文本内容的列表''' res = [] for x1 in self.rnode: @@ -107,7 +102,7 @@ class DomXml(): return res def get_xmlchild_list(self): - '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + '''返回 xml 内容的列表,包括所有文本内容(不带 tag)''' res = [] for x1 in self.rnode: @@ -127,7 +122,7 @@ class DomXml(): return res def get_pinyins_for_xml(self): - '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + '''返回 xml 内容,字符串和拼音的 list ''' res = [] for x1 in self.rnode: @@ -155,7 +150,7 @@ class DomXml(): return res def get_all_tags(self, tag_name): - '''获取所有的tag 及属性值''' + '''获取所有的 tag 及属性值''' alltags = self.root.getElementsByTagName(tag_name) for x in alltags: if x.hasAttribute('pinyin'): # pinyin From 7d5ae651ce92d0bd953f0de54b81d00cf951b01d Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:07:21 +0800 Subject: [PATCH 37/60] add readme thanks --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49e40624..0abb3fd6 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. 
- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. diff --git a/README_cn.md b/README_cn.md index bf3ff4df..0c3af5dd 100644 --- a/README_cn.md +++ b/README_cn.md @@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。 +- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 
的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From ec1f9edd562275e2d2799c16e36a304bae172e1c Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:11:26 +0800 Subject: [PATCH 38/60] add space after punctions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0abb3fd6..d02ac4c6 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. 
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From da525d346f0a78fc1b6f11db408a5ce1a76c5610 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:17:17 +0000 Subject: [PATCH 39/60] fix uvicorn's version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e551d9fa..3353cdad 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] requirements = { "install": From 63c80121e2c5691145a2bc8c49cf1a2b277c7067 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:33:07 +0000 Subject: [PATCH 40/60] fix uvicorn's bug --- paddlespeech/server/bin/paddlespeech_server.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 10a91d9b..1b1792bd 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -113,7 +113,7 @@ class ServerExecutor(BaseExecutor): """ config = get_config(config_file) if self.init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) + uvicorn.run(app, host=config.host, port=config.port) @cli_server_register( diff --git a/setup.py b/setup.py index 3353cdad..e551d9fa 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] requirements = { "install": From ce153d915e512c5ab38e7791fb733540189ebfb1 Mon Sep 17 00:00:00 
2001 From: tianhao zhang <15600919271@163.com> Date: Thu, 20 Oct 2022 07:54:00 +0000 Subject: [PATCH 41/60] update u2pp result.md --- examples/wenetspeech/asr1/RESULTS.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index f22c652e..cd480163 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -53,3 +53,22 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | + + +## U2PP Streaming Pretrained Model + +Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.057031 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.068826 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.069111 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.059213 | + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.049256 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.052086 |
+| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.052267 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.047198 | From ed0138c6e324a87e31a23138bafe6f878ed8f4e9 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 18:09:41 +0800 Subject: [PATCH 42/60] add condition check if a ssml input and filter space line, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 36 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index f9d1cd1b..41663891 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -105,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = re.split(r"\s+", line.strip(), 1) - utt_id = items[0] - if lang == 'zh': - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + if line.strip() != "": + items = re.split(r"\s+", line.strip(), 1) + utt_id = items[0] + if lang == 'zh': + sentence = "".join(items[1:]) + elif lang == 'en': + sentence = " ".join(items[1:]) + elif lang == 'mix': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) return sentences @@ -182,11 +183,20 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids_ssml( - text, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + input_ids = {} + if text.strip() != "" and re.match(r".*?.*?.*", text, + re.DOTALL): + input_ids = frontend.get_input_ids_ssml( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = frontend.get_input_ids( + text, + 
merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] if get_tone_ids: tone_ids = input_ids["tone_ids"] From 64cb4048a85cdaaf5175bcb511ce23d261bb2f71 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 20 Oct 2022 12:37:38 +0000 Subject: [PATCH 43/60] fix topk bug which cause ctc score diff --- speechx/speechx/utils/math.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 289470f6..959740a0 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -79,10 +79,12 @@ void TopK(const std::vector& data, int cur = values->size() - 1; while (!pq.empty()) { const auto& item = pq.top(); - pq.pop(); - + (*values)[cur] = item.first; (*indices)[cur] = item.second; + + // item if reference, must pop here + pq.pop(); cur--; } @@ -93,4 +95,4 @@ template void TopK(const std::vector& data, std::vector* values, std::vector* indices); -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech From 48271260103d4e44fd6652be5fb4ce3f9695429d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 02:24:36 +0000 Subject: [PATCH 44/60] more log --- .../examples/u2pp_ol/wenetspeech/local/recognizer.sh | 4 ++-- speechx/speechx/recognizer/u2_recognizer.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index f71a8003..f4553f2a 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -5,7 +5,7 @@ set -e data=data exp=exp -nj=40 +nj=20 mkdir -p $exp @@ -34,4 +34,4 @@ cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err echo "recognizer test have 
finished!!!" echo "please checkout in $exp/aishell.recognizer.err" -tail -n 7 $exp/aishell.recognizer.err \ No newline at end of file +tail -n 7 $exp/aishell.recognizer.err diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index f4e91b18..54f4d258 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -72,6 +72,14 @@ struct DecodeOptions { decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank; decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; + LOG(INFO) << "chunk_size: " << decoder_opts.chunk_size; + LOG(INFO) << "num_left_chunks: " << decoder_opts.num_left_chunks; + LOG(INFO) << "ctc_weight: " << decoder_opts.ctc_weight; + LOG(INFO) << "rescoring_weight: " << decoder_opts.rescoring_weight; + LOG(INFO) << "reverse_weight: " << decoder_opts.reverse_weight; + LOG(INFO) << "blank: " << FLAGS_blank; + LOG(INFO) << "first_beam_size: " << FLAGS_nbest; + LOG(INFO) << "second_beam_size: " << FLAGS_nbest; return decoder_opts; } }; @@ -88,6 +96,8 @@ struct U2RecognizerResource { U2RecognizerResource resource; resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); From fc72ab1e074de0385dc795fe4ae05ff0e4691222 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 02:31:08 +0000 Subject: [PATCH 45/60] more debug info --- speechx/build.sh | 2 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 25 ++++++++++++++++--- speechx/speechx/frontend/audio/assembler.cc | 19 +++++++++----- .../speechx/frontend/audio/feature_cache.h | 4 ++- speechx/speechx/nnet/decodable.cc | 11 +++++--- speechx/speechx/nnet/u2_nnet.cc | 5 ++-- 
speechx/speechx/recognizer/u2_recognizer.cc | 6 ++--- .../speechx/recognizer/u2_recognizer_main.cc | 5 +++- 8 files changed, 57 insertions(+), 20 deletions(-) diff --git a/speechx/build.sh b/speechx/build.sh index e0a38675..7655f963 100755 --- a/speechx/build.sh +++ b/speechx/build.sh @@ -20,4 +20,4 @@ fi mkdir -p build cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -cmake --build build +cmake --build build -j diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index a0fe5b2a..04530fb9 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode( // forward frame by frame std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); - if (flag == false) break; + if (flag == false) { + LOG(INFO) << "decoder advance decode exit." << frame_prob.size(); + break; + } std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; } } @@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - + VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++){ + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + } + // 2. 
token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + int cnt = 0; + for (int i = 0; i < hypotheses_.size(); i ++){ + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j ++){ + VLOG(2) << hypotheses_[i][j]; + } + } +} void CTCPrefixBeamSearch::UpdateFinalContext() { if (context_graph_ == nullptr) return; diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index ff1b1f28..afee3a6a 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -52,15 +52,21 @@ bool Assembler::Compute(Vector* feats) { Vector feature; result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - if (IsFinished() == false) return false; - break; + VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + if (IsFinished() == false) { + LOG(INFO) << "finished reading feature. 
cache size: " << feature_cache_.size(); + return false; + } else { + LOG(INFO) << "break"; + break; + } } CHECK(feature.Dim() == dim_); + feature_cache_.push(feature); + nframes_ += 1; VLOG(1) << "nframes: " << nframes_; - - feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { @@ -68,8 +74,7 @@ bool Assembler::Compute(Vector* feats) { return false; } - - if (fill_zero_){ + if (fill_zero_) { while (feature_cache_.size() < frame_chunk_size_) { Vector feature(dim_, kaldi::kSetZero); nframes_ += 1; @@ -79,6 +84,7 @@ bool Assembler::Compute(Vector* feats) { int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); + VLOG(1) << "read " << this_chunk_size << " feat."; int32 counter = 0; while (counter < this_chunk_size) { @@ -97,6 +103,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } + CHECK(feature_cache_.size() == cache_size_ ); return result; } diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index 09d7f7eb..b4ed58ff 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { + LOG(INFO) << "set finished"; // std::unique_lock lock(mutex_); base_extractor_->SetFinished(); - LOG(INFO) << "set finished"; + // read the last chunk data Compute(); // ready_feed_condition_.notify_one(); + LOG(INFO) << "compute last feats done."; } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 9bad8ed4..6956a2cb 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { frames_ready_ += likelihood.NumRows(); } 
-// Decodable::Init(DecodableConfig config) { -//} // return the size of frame have computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } @@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() { Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. + VLOG(1) << "decodable exit;"; return false; } - VLOG(2) << "Forward with " << features.Dim() << " frames."; + VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; // forward feats NnetOut out; @@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == false) { + LOG(INFO) << "framelikehood exit."; return false; } + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; + + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 63a8a793..07e2dde2 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, max_hyps_len = std::max(max_hyps_len, len); hyps_len_ptr[i] = static_cast(len); } + VLOG(2) << 
"max_hyps_len: " << max_hyps_len; paddle::Tensor hyps_tensor = paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); @@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 75834aa5..b4a1257b 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -52,7 +52,6 @@ void U2Recognizer::Reset() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score; + << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; + + VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc 
b/speechx/speechx/recognizer/u2_recognizer_main.cc index 2375586e..bfb37fb8 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -62,6 +62,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "wav len (sample): " << tot_samples; int sample_offset = 0; + int cnt = 0; while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); @@ -77,12 +78,14 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult(); + LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; + cnt++; } CHECK(sample_offset == tot_samples); + VLOG(1) << "num decode: " << cnt; // recognizer.SetFinished(); From 83f885c6ccad46bbc17e5c1d502c9e91417f2c3c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:19:40 +0000 Subject: [PATCH 46/60] fix delete char in wav end bug --- speechx/speechx/frontend/audio/assembler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index afee3a6a..bbd09442 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -105,7 +105,7 @@ bool Assembler::Compute(Vector* feats) { } CHECK(feature_cache_.size() == cache_size_ ); - return result; + return true; } From 4dfb3365f637b28b30f0359dd641f571800eb2a8 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:23:17 +0800 Subject: [PATCH 47/60] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d02ac4c6..4ed1a022 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the 
typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. From 7693bd1812086d2b5d5a19646e704a6155cb1103 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:24:40 +0800 Subject: [PATCH 48/60] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ed1a022..3b26ff9b 100644 --- a/README.md +++ b/README.md @@ -924,7 +924,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). 
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From 9c68c2061e1b595deac62229a2f29f9f0659ff17 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:29:13 +0800 Subject: [PATCH 49/60] Update README_cn.md --- README_cn.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README_cn.md b/README_cn.md index 0c3af5dd..9a454989 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,7 +164,8 @@ ### 近期更新 -- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对ASR任务对wav2vec2.0 的fine-tuning. 
+ - 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 +- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。 - 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech 网页应用](./demos/speech_web)。 - ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。 - ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。 @@ -928,7 +929,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 +- 非常感谢 [david-95](https://github.com/david-95) 修复 TTS 句尾多标点符号出错的问题,贡献补充多条程序和数据。为 TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From 09a735af2449a2205a6006287e6bd1e98b355c37 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:32:47 +0800 Subject: [PATCH 50/60] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md 
index 3b26ff9b..26f13d00 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. @@ -924,8 +924,8 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. -- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). +- Many thanks to [david-95](https://github.com/david-95) for fixing multi-punctuation bug、contributing to multiple program and data, and adding [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. 
+- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving TTS Chinses Frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. 
From 28dafea0e01afa9f5acfbdad2cf93e0aaabd7a7d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:44:44 +0000 Subject: [PATCH 51/60] add fill zero opt for frontend --- speechx/speechx/decoder/param.h | 1 + speechx/speechx/frontend/audio/assembler.cc | 11 +++++------ speechx/speechx/frontend/audio/feature_pipeline.h | 10 +++++++--- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/recognizer/recognizer.h | 2 ++ speechx/speechx/recognizer/u2_recognizer.h | 2 ++ speechx/speechx/recognizer/u2_recognizer_main.cc | 3 --- 7 files changed, 18 insertions(+), 13 deletions(-) diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1f13bbc0..1a332755 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,6 +20,7 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index bbd09442..26a3905b 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -47,17 +47,16 @@ bool Assembler::Read(kaldi::Vector* feats) { // read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { // compute and feed frame by frame - bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; - result = base_extractor_->Read(&feature); + bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - LOG(INFO) << "finished reading feature. 
cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); return false; } else { - LOG(INFO) << "break"; + VLOG(1) << "break"; break; } } @@ -103,7 +102,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } - CHECK(feature_cache_.size() == cache_size_ ); + CHECK(feature_cache_.size() == cache_size_); return true; } diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index d91a70e3..e06995b1 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -27,6 +27,7 @@ // feature DECLARE_bool(use_fbank); +DECLARE_bool(fill_zero); DECLARE_int32(num_bins); DECLARE_string(cmvn_file); @@ -80,15 +81,18 @@ struct FeaturePipelineOptions { // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " - << opts.assembler_opts.subsampling_rate; opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + opts.assembler_opts.fill_zero = FLAGS_fill_zero; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "frontend fill zeros: " + << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 6956a2cb..a7de58b5 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -114,7 +114,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == 
false) { - LOG(INFO) << "framelikehood exit."; + VLOG(1) << "framelikehood exit."; return false; } diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/recognizer/recognizer.h index 0402bcd3..27f1228a 100644 --- a/speechx/speechx/recognizer/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -38,6 +38,8 @@ struct RecognizerResource { resource.acoustic_scale = FLAGS_acoustic_scale; resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = true; + LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ModelOptions::InitFromFlags(); resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 54f4d258..4746d86f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -101,6 +101,8 @@ struct U2RecognizerResource { resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = false; + LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index bfb37fb8..7e59d6cb 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -85,9 +85,6 @@ int main(int argc, char* argv[]) { cnt++; } CHECK(sample_offset == tot_samples); - VLOG(1) << "num decode: " << cnt; - - // recognizer.SetFinished(); // second pass decoding recognizer.Rescoring(); From 850096a3a0d277eabddf292a57e4eecd01d081df Mon Sep 17 
00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:59:23 +0000 Subject: [PATCH 52/60] format code --- .../decoder/ctc_prefix_beam_search_decoder.cc | 25 +++++++++++-------- .../decoder/ctc_prefix_beam_search_decoder.h | 3 ++- .../decoder/ctc_prefix_beam_search_score.h | 3 ++- speechx/speechx/decoder/param.h | 4 ++- speechx/speechx/frontend/audio/assembler.cc | 22 +++++++++------- speechx/speechx/frontend/audio/assembler.h | 5 ++-- speechx/speechx/frontend/audio/audio_cache.cc | 7 +++--- speechx/speechx/frontend/audio/audio_cache.h | 2 +- .../speechx/frontend/audio/feature_cache.cc | 6 ++--- .../speechx/frontend/audio/feature_cache.h | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 3 +-- speechx/speechx/nnet/decodable.cc | 7 ++++-- speechx/speechx/nnet/u2_nnet.cc | 11 +++++--- speechx/speechx/nnet/u2_nnet.h | 3 ++- speechx/speechx/recognizer/u2_recognizer.cc | 7 ++++-- speechx/speechx/recognizer/u2_recognizer.h | 7 +++--- .../speechx/recognizer/u2_recognizer_main.cc | 5 ++-- speechx/speechx/utils/math.cc | 4 +-- 18 files changed, 75 insertions(+), 51 deletions(-) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 04530fb9..2986ea7e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -118,11 +118,13 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; - for (int i = 0; i < topk_score.size(); i++){ - VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + VLOG(2) << "topk: " << num_frame_decoded_ << " " + << *std::max_element(logp_t.begin(), logp_t.end()) << " " + << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++) { + VLOG(2) << "topk: " << 
num_frame_decoded_ << " " << topk_score[i]; } - + // 2. token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -303,15 +305,16 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { - UpdateFinalContext(); - +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; int cnt = 0; - for (int i = 0; i < hypotheses_.size(); i ++){ - VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; - for (int j = 0; j < hypotheses_[i].size(); j ++){ - VLOG(2) << hypotheses_[i][j]; + for (int i = 0; i < hypotheses_.size(); i++) { + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() + << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j++) { + VLOG(2) << hypotheses_[i][j]; } } } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index ef96ecd9..475b4d35 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc #pragma once diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 908be1d6..3547b2b7 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h #pragma once diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1a332755..ebdd7119 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,7 +20,9 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); +DEFINE_bool(fill_zero, + false, + "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 26a3905b..56dfc3aa 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -16,9 +16,9 @@ namespace ppspeech { +using kaldi::BaseFloat; using kaldi::Vector; using kaldi::VectorBase; -using kaldi::BaseFloat; using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, @@ -51,9 +51,11 @@ bool Assembler::Compute(Vector* feats) { Vector feature; bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result + << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. 
cache size: " + << feature_cache_.size(); return false; } else { VLOG(1) << "break"; @@ -69,7 +71,8 @@ bool Assembler::Compute(Vector* feats) { } if (feature_cache_.size() < receptive_filed_length_) { - VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_; + VLOG(1) << "feature_cache less than receptive_filed_lenght. " + << feature_cache_.size() << ": " << receptive_filed_length_; return false; } @@ -81,7 +84,8 @@ bool Assembler::Compute(Vector* feats) { } } - int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + int32 this_chunk_size = + std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); VLOG(1) << "read " << this_chunk_size << " feat."; @@ -89,7 +93,7 @@ bool Assembler::Compute(Vector* feats) { while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); CHECK(val.Dim() == dim_) << val.Dim(); - + int32 start = counter * dim_; feats->Range(start, dim_).CopyFromVec(val); @@ -99,7 +103,7 @@ bool Assembler::Compute(Vector* feats) { // val is reference, so we should pop here feature_cache_.pop(); - + counter++; } CHECK(feature_cache_.size() == cache_size_); @@ -108,11 +112,11 @@ bool Assembler::Compute(Vector* feats) { } - void Assembler::Reset() { +void Assembler::Reset() { std::queue> empty; std::swap(feature_cache_, empty); nframes_ = 0; - base_extractor_->Reset(); + base_extractor_->Reset(); } } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 4f165ea8..72e6f635 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -25,7 +25,8 @@ struct AssemblerOptions { int32 receptive_filed_length{1}; int32 subsampling_rate{1}; int32 nnet_decoder_chunk{1}; - bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_ + bool fill_zero{false}; // 
whether fill zero when last chunk is not equal to + // frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -62,7 +63,7 @@ class Assembler : public FrontendInterface { std::queue> feature_cache_; std::unique_ptr base_extractor_; - int32 nframes_; // num frame computed + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index 71e5d09e..61ef8841 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "frontend/audio/audio_cache.h" + #include "kaldi/base/timer.h" namespace ppspeech { using kaldi::BaseFloat; -using kaldi::VectorBase; using kaldi::Vector; +using kaldi::VectorBase; AudioCache::AudioCache(int buffer_size, bool to_float32) : finished_(false), @@ -85,8 +86,8 @@ bool AudioCache::Read(Vector* waves) { offset_ = (offset_ + chunk_size) % ring_buffer_.size(); nsamples_ += chunk_size; - VLOG(1) << "nsamples readed: " << nsamples_; - + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index da422daa..4708a6e0 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -62,7 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram - int32 nsamples_; // number samples readed. + int32 nsamples_; // number samples readed. 
DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index c712e48e..3f05eae6 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -16,12 +16,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b4ed58ff..bd869225 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -77,7 +77,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - int32 nframe_; // num of feature computed + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index e06995b1..e83a3f31 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -91,8 +91,7 @@ struct FeaturePipelineOptions { << opts.assembler_opts.receptive_filed_length; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; - LOG(INFO) << "frontend fill zeros: " - << opts.assembler_opts.fill_zero; + LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index a7de58b5..11d60d3e 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -79,7 +79,8 @@ bool 
Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; - VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim + << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -127,7 +128,9 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; - VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " + << nnet_out_cache_.NumRows() + << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 07e2dde2..636e2ad4 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc #include "nnet/u2_nnet.h" @@ -129,7 +130,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { forward_attention_decoder_ = other.forward_attention_decoder_; ctc_activation_ = other.ctc_activation_; - offset_ = other.offset_; + offset_ = other.offset_; // copy model ptr model_ = other.model_; @@ -626,8 +627,10 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score + << " r_score: " << r_score + << " reverse_weight: " << reverse_weight + << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 6cbc0570..e548d4c0 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h #pragma once #include "base/common.h" diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index b4a1257b..4ec64665 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -190,12 +190,15 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; + << " ctc_score: " << result_[i].score + << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight + << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; - VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; + VLOG(1) << "hyp: " << result_[0].sentence + << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 4746d86f..9b43b08f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -96,13 +96,14 @@ struct U2RecognizerResource { U2RecognizerResource resource; resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - LOG(INFO) << "vocab path: " << resource.vocab_path; - LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; 
resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); resource.feature_pipeline_opts.assembler_opts.fill_zero = false; - LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; + LOG(INFO) << "u2 need fill zero be false: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 7e59d6cb..c02e1c23 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -78,7 +78,8 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult(); + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; @@ -88,7 +89,7 @@ int main(int argc, char* argv[]) { // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 959740a0..71656cb3 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -79,10 +79,10 @@ void TopK(const std::vector& data, int cur = values->size() - 1; while (!pq.empty()) { const auto& item = pq.top(); - + (*values)[cur] = item.first; (*indices)[cur] = item.second; - + // item if reference, must pop here pq.pop(); From 606e2c237fa7283e1d39c3dbb1cb62d7855a55c4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 03:47:23 +0000 Subject: [PATCH 53/60] fix as comment --- speechx/speechx/decoder/ctc_beam_search_opt.h | 4 ++-- .../decoder/ctc_prefix_beam_search_decoder.cc | 19 ++----------------- 
.../decoder/ctc_prefix_beam_search_decoder.h | 6 +++--- .../ctc_prefix_beam_search_decoder_main.cc | 7 +++---- .../decoder/ctc_prefix_beam_search_score.h | 7 +++++++ speechx/speechx/decoder/ctc_tlg_decoder.h | 4 ++-- 6 files changed, 19 insertions(+), 28 deletions(-) diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index d21b3abd..4a4f817d 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -37,13 +37,13 @@ struct CTCBeamSearchOptions { // u2 int first_beam_size; int second_beam_size; - CTCBeamSearchOptions() + explicit CTCBeamSearchOptions() : blank(0), dict_file("vocab.txt"), lm_path(""), + beam_size(300), alpha(1.9f), beta(5.0), - beam_size(300), cutoff_prob(0.99f), cutoff_top_n(40), num_proc_bsearch(10), diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 2986ea7e..7414d06d 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -31,7 +31,7 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string vocab_path, +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts) : opts_(opts) { unit_table_ = std::shared_ptr( @@ -55,10 +55,7 @@ void CTCPrefixBeamSearch::Reset() { // empty hyp with Score std::vector empty; PrefixScore prefix_score; - prefix_score.b = 0.0f; // log(1) - prefix_score.nb = -kBaseFloatMax; // log(0) - prefix_score.v_b = 0.0f; // log(1) - prefix_score.v_nb = 0.0f; // log(1) + prefix_score.InitEmpty(); cur_hyps_[empty] = prefix_score; outputs_.emplace_back(empty); @@ -287,19 +284,7 @@ void CTCPrefixBeamSearch::UpdateOutputs( int s = 0; int e = 0; for (int i = 0; i < input.size(); ++i) { - // if (s < start_boundaries.size() && i == start_boundaries[s]){ - 
// // - // output.emplace_back(context_graph_->start_tag_id()); - // ++s; - // } - output.emplace_back(input[i]); - - // if (e < end_boundaries.size() && i == end_boundaries[e]){ - // // - // output.emplace_back(context_graph_->end_tag_id()); - // ++e; - // } } outputs_.emplace_back(output); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 475b4d35..a0c2a74e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -27,7 +27,7 @@ namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: - explicit CTCPrefixBeamSearch(const std::string vocab_path, + explicit CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} @@ -77,7 +77,7 @@ class CTCPrefixBeamSearch : public DecoderBase { private: CTCBeamSearchOptions opts_; - std::shared_ptr unit_table_; + std::shared_ptr unit_table_{nullptr}; std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -92,7 +92,7 @@ class CTCPrefixBeamSearch : public DecoderBase { // Outputs contain the hypotheses_ and tags lik: and std::vector> outputs_; - std::shared_ptr context_graph_ = nullptr; + std::shared_ptr context_graph_{nullptr}; DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index d9cca147..69f32686 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -64,12 +64,11 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr nnet = std::make_shared(model_opts); // decodeable - std::shared_ptr 
raw_data(new ppspeech::DataCache()); - std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data)); + std::shared_ptr raw_data = std::make_shared(); + std::shared_ptr decodable = std::make_shared(nnet, raw_data); // decoder ppspeech::CTCBeamSearchOptions opts; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 3547b2b7..76b09e9b 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -73,6 +73,13 @@ struct PrefixScore { int prefix_len) { CHECK(false); } + + void InitEmpty() { + b = 0.0f; // log(1) + nb = -kBaseFloatMax; // log(0) + v_b = 0.0f; // log(1) + v_nb = 0.0f; // log(1) + } }; struct PrefixScoreHash { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f250ac25..0ff1de2a 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -31,8 +31,8 @@ namespace ppspeech { struct TLGDecoderOptions { kaldi::LatticeFasterDecoderConfig opts{}; // todo remove later, add into decode resource - std::string word_symbol_table{}; - std::string fst_path{}; + std::string word_symbol_table; + std::string fst_path; static TLGDecoderOptions InitFromFlags() { TLGDecoderOptions decoder_opts; From 08c432f70a2f328d146a876920b9179543107b3e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 07:19:28 +0000 Subject: [PATCH 54/60] add paddleslim --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e551d9fa..35668bdd 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ base = [ "braceexpand", "pyyaml", "pybind11", + "paddleslim==2.3.4", ] server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] From 8271fcfb0a7837d0cc8fdbe8764dedaa17924cc0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 08:31:42 +0000 Subject: [PATCH 55/60] fix as comment --- 
speechx/CMakeLists.txt | 27 ++++++++++++++++--- speechx/README.md | 4 +-- speechx/cmake/paddleinference.cmake | 9 ------- speechx/examples/codelab/feat/run.sh | 6 ++--- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 10 +++---- speechx/speechx/decoder/ctc_beam_search_opt.h | 5 ++-- .../decoder/ctc_prefix_beam_search_decoder.cc | 6 ++--- .../decoder/ctc_prefix_beam_search_decoder.h | 4 +-- 9 files changed, 42 insertions(+), 31 deletions(-) diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 6255cb2e..978a23d9 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -102,7 +102,16 @@ message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES # paddle include and link option # -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so execute_process( - COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]); out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);" + COMMAND python -c "\ +import os;\ +import paddle;\ +include_dir=paddle.sysconfig.get_include();\ +paddle_dir=os.path.split(include_dir)[0];\ +libs_dir=os.path.join(paddle_dir, 'libs');\ +fluid_dir=os.path.join(paddle_dir, 'fluid');\ +out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\ +out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\ + " OUTPUT_VARIABLE PADDLE_LINK_FLAGS RESULT_VARIABLE SUCESS) @@ -112,7 +121,11 @@ string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) # paddle compile option # -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include execute_process( - COMMAND python -c 
"import paddle; include_dir = paddle.sysconfig.get_include(); print(f\"-I{include_dir}\");" + COMMAND python -c "\ +import paddle; \ +include_dir = paddle.sysconfig.get_include(); \ +print(f\"-I{include_dir}\"); \ + " OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) @@ -121,7 +134,15 @@ string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) # for LD_LIBRARY_PATH # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) execute_process( - COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);" + COMMAND python -c " \ +import os; \ +import paddle; \ +include_dir=paddle.sysconfig.get_include(); \ +paddle_dir=os.path.split(include_dir)[0]; \ +libs_dir=os.path.join(paddle_dir, 'libs'); \ +fluid_dir=os.path.join(paddle_dir, 'fluid'); \ +out=':'.join([libs_dir, fluid_dir]); print(out); \ + " OUTPUT_VARIABLE PADDLE_LIB_DIRS) message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) diff --git a/speechx/README.md b/speechx/README.md index 3861edf3..f744defa 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -9,7 +9,7 @@ We develop under: * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 -> Please using `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. +> Please use `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. > We make sure all things work fun under docker, and recommend using it to develop and deploy. @@ -35,7 +35,7 @@ bash tools/venv.sh 2. Build `speechx` and `examples`. 
-For now we using feature under `develop` branch of paddle, so we need install `paddlepaddle` nightly build version. +For now we are using feature under `develop` branch of paddle, so we need to install `paddlepaddle` nightly build version. For example: ``` source venv/bin/activate diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index 311804d6..d8a9c613 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -1,14 +1,5 @@ set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -# ExternalProject_Add(paddle -# URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz -# URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 -# PREFIX ${paddle_PREFIX_DIR} -# SOURCE_DIR ${paddle_SOURCE_DIR} -# CONFIGURE_COMMAND "" -# BUILD_COMMAND "" -# INSTALL_COMMAND "" -# ) include(FetchContent) FetchContent_Declare( diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 66bd8ae2..5d7612ae 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -42,8 +42,8 @@ mkdir -p $exp_dir export GLOG_logtostderr=1 cmvn_json2kaldi_main \ - --json_file $model_dir/data/mean_std.json \ - --cmvn_write_path $exp_dir/cmvn.ark \ + --json_file=$model_dir/data/mean_std.json \ + --cmvn_write_path=$exp_dir/cmvn.ark \ --binary=false echo "convert json cmvn to kaldi ark." @@ -55,7 +55,7 @@ compute_linear_spectrogram_main \ echo "compute linear spectrogram feature." 
compute_fbank_main \ - --num_bins 161 \ + --num_bins=161 \ --wav_rspecifier=scp:$data_dir/wav.scp \ --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ --cmvn_file=$exp_dir/cmvn.ark diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index c22ad7f0..11c1afe8 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -7,7 +7,7 @@ set -e data=data exp=exp mkdir -p $exp -ckpt_dir=./data/model +ckpt_dir=$data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ ctc_prefix_beam_search_decoder_main \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 5bec24a6..f0fd32ba 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -4,11 +4,11 @@ set(srcs) if (USING_DS2) list(APPEND srcs -ctc_decoders/decoder_utils.cpp -ctc_decoders/path_trie.cpp -ctc_decoders/scorer.cpp -ctc_beam_search_decoder.cc -ctc_tlg_decoder.cc + ctc_decoders/decoder_utils.cpp + ctc_decoders/path_trie.cpp + ctc_decoders/scorer.cpp + ctc_beam_search_decoder.cc + ctc_tlg_decoder.cc ) endif() diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index 4a4f817d..f9e5933c 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -11,12 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#pragma once #include "base/common.h" #include "util/parse-options.h" -#pragma once - namespace ppspeech { @@ -76,4 +75,4 @@ struct CTCBeamSearchOptions { } }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 7414d06d..56867c70 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -107,12 +107,12 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::min(static_cast(logp[0].size()), opts_.first_beam_size); for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { - const std::vector& logp_t = logp[t]; + const std::vector& logp_t = logp[t]; std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; // 1. first beam prune, only select topk candidates - std::vector topk_score; + std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); VLOG(2) << "topk: " << num_frame_decoded_ << " " @@ -367,4 +367,4 @@ std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index a0c2a74e..91977092 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -27,7 +27,7 @@ namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: - explicit CTCPrefixBeamSearch(const std::string& vocab_path, + CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} @@ -98,4 +98,4 @@ class CTCPrefixBeamSearch : public DecoderBase { }; -} 
// namespace ppspeech \ No newline at end of file +} // namespace ppspeech From a6b2a0a697cade73112ab66dd7fef477e44e9577 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 09:28:38 +0000 Subject: [PATCH 56/60] cpplint --- .pre-commit-config.yaml | 9 +++- speechx/speechx/base/basic_types.h | 42 +++++++++---------- speechx/speechx/base/macros.h | 2 +- speechx/speechx/base/thread_pool.h | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 4 +- .../decoder/ctc_beam_search_decoder.cc | 6 +-- .../speechx/decoder/ctc_beam_search_decoder.h | 2 +- .../decoder/ctc_beam_search_decoder_main.cc | 4 +- speechx/speechx/decoder/ctc_beam_search_opt.h | 2 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 6 +-- .../decoder/ctc_prefix_beam_search_decoder.h | 2 +- .../ctc_prefix_beam_search_decoder_main.cc | 17 ++++---- speechx/speechx/decoder/ctc_tlg_decoder.h | 2 +- speechx/speechx/frontend/audio/cmvn.cc | 2 +- .../frontend/audio/compute_fbank_main.cc | 4 +- speechx/speechx/frontend/audio/data_cache.h | 2 +- speechx/speechx/frontend/audio/db_norm.cc | 7 ++-- speechx/speechx/frontend/audio/fbank.cc | 7 ++-- .../frontend/audio/feature_pipeline.cc | 2 +- .../frontend/audio/linear_spectrogram.cc | 7 ++-- speechx/speechx/frontend/audio/mfcc.cc | 7 ++-- speechx/speechx/nnet/ds2_nnet.cc | 9 ++-- speechx/speechx/nnet/ds2_nnet.h | 4 +- speechx/speechx/nnet/ds2_nnet_main.cc | 4 +- speechx/speechx/nnet/u2_nnet.cc | 40 +++++++++--------- speechx/speechx/nnet/u2_nnet.h | 4 +- speechx/speechx/nnet/u2_nnet_main.cc | 6 +-- .../websocket/websocket_client_main.cc | 2 +- speechx/speechx/recognizer/recognizer.h | 3 +- speechx/speechx/recognizer/recognizer_main.cc | 5 ++- speechx/speechx/recognizer/u2_recognizer.cc | 2 +- speechx/speechx/recognizer/u2_recognizer.h | 2 - speechx/speechx/utils/file_utils.cc | 2 +- 33 files changed, 118 insertions(+), 103 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e7ae1fb..0435cfbe 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -50,13 +50,20 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ - exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ #- id: copyright_checker # name: copyright_checker # entry: python .pre-commit-hooks/copyright-check.hook # language: system # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ # exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + - id: cpplint + name: cpplint + description: Static code analysis of C/C++ files + language: python + files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ + entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 3a648649..96bc0ca5 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -22,39 +22,39 @@ typedef float BaseFloat; typedef double double64; typedef signed char int8; -typedef short int16; -typedef int int32; +typedef short int16; // NOLINT +typedef int int32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef long int64; +typedef long int64; // NOLINT #else -typedef long long int64; +typedef long long int64; // NOLINT #endif -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; +typedef unsigned char uint8; // NOLINT +typedef unsigned short uint16; // NOLINT +typedef unsigned int 
uint32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef unsigned long uint64; +typedef unsigned long uint64; // NOLINT #else -typedef unsigned long long uint64; +typedef unsigned long long uint64; // NOLINT #endif typedef signed int char32; -const uint8 kuint8max = ((uint8)0xFF); -const uint16 kuint16max = ((uint16)0xFFFF); -const uint32 kuint32max = ((uint32)0xFFFFFFFF); -const uint64 kuint64max = ((uint64)(0xFFFFFFFFFFFFFFFFLL)); -const int8 kint8min = ((int8)0x80); -const int8 kint8max = ((int8)0x7F); -const int16 kint16min = ((int16)0x8000); -const int16 kint16max = ((int16)0x7FFF); -const int32 kint32min = ((int32)0x80000000); -const int32 kint32max = ((int32)0x7FFFFFFF); -const int64 kint64min = ((int64)(0x8000000000000000LL)); -const int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFFLL)); +const uint8 kuint8max = (static_cast 0xFF); +const uint16 kuint16max = (static_cast 0xFFFF); +const uint32 kuint32max = (static_cast 0xFFFFFFFF); +const uint64 kuint64max = (static_cast(0xFFFFFFFFFFFFFFFFLL)); +const int8 kint8min = (static_cast 0x80); +const int8 kint8max = (static_cast 0x7F); +const int16 kint16min = (static_cast 0x8000); +const int16 kint16max = (static_cast 0x7FFF); +const int32 kint32min = (static_cast 0x80000000); +const int32 kint32max = (static_cast 0x7FFFFFFF); +const int64 kint64min = (static_cast(0x8000000000000000LL)); +const int64 kint64max = (static_cast(0x7FFFFFFFFFFFFFFFLL)); const BaseFloat kBaseFloatMax = std::numeric_limits::max(); const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index faf39373..95608f40 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -26,6 +26,6 @@ namespace ppspeech { #endif // kSpaceSymbol in UTF-8 is: ▁ -const std::string kSpaceSymbol = "\xe2\x96\x81"; +const char[] kSpaceSymbol = "\xe2\x96\x81"; } // namespace ppspeech diff --git 
a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h index ba895f71..6d59dac5 100644 --- a/speechx/speechx/base/thread_pool.h +++ b/speechx/speechx/base/thread_pool.h @@ -35,7 +35,7 @@ class ThreadPool { public: - ThreadPool(size_t); + explicit ThreadPool(size_t); template auto enqueue(F&& f, Args&&... args) -> std::future::type>; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 09f9e2fb..ab7b2cb5 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -64,8 +64,8 @@ void model_forward_test() { ; std::string model_graph = FLAGS_model_path; std::string model_params = FLAGS_param_path; - CHECK(model_graph != ""); - CHECK(model_params != ""); + CHECK_NE(model_graph, ""); + CHECK_NE(model_params, ""); cout << "model path: " << model_graph << endl; cout << "model param path : " << model_params << endl; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index c4b35ff0..6e3a0d13 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -39,12 +39,12 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(opts_.blank == 0); + CHECK_EQ(opts_.blank, 0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary - if ((size_t)space_id_ >= vocabulary_.size()) { + if (static_cast(space_id_) >= vocabulary_.size()) { space_id_ = -2; } } @@ -104,7 +104,7 @@ void CTCBeamSearch::ResetPrefixes() { } int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, - vector& nbest_words) { + const vector& nbest_words) { kaldi::Timer timer; AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " diff --git 
a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 6347bba8..f06d88e3 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -48,7 +48,7 @@ class CTCBeamSearch : public DecoderBase { } int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + const std::vector& nbest_words); private: void ResetPrefixes(); diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index edf9215a..ab0376b6 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -59,8 +59,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index f9e5933c..f4a81b3a 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -36,7 +36,7 @@ struct CTCBeamSearchOptions { // u2 int first_beam_size; int second_beam_size; - explicit CTCBeamSearchOptions() + CTCBeamSearchOptions() : blank(0), dict_file("vocab.txt"), lm_path(""), diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 56867c70..0a0afcd7 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -329,8 +329,8 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { std::string CTCPrefixBeamSearch::GetBestPath(int index) { int n_hyps = Outputs().size(); - 
CHECK(n_hyps > 0); - CHECK(index < n_hyps); + CHECK_GT(n_hyps, 0); + CHECK_LT(index, n_hyps); std::vector one = Outputs()[index]; std::string sentence; for (int i = 0; i < one.size(); i++) { @@ -344,7 +344,7 @@ std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } std::vector> CTCPrefixBeamSearch::GetNBestPath( int n) { int hyps_size = hypotheses_.size(); - CHECK(hyps_size > 0); + CHECK_GT(hyps_size, 0); int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 91977092..5013246a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -28,7 +28,7 @@ class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: CTCPrefixBeamSearch(const std::string& vocab_path, - const CTCBeamSearchOptions& opts); + const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} SearchType Type() const { return SearchType::kPrefixBeamSearch; } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 69f32686..c59b1f2e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -50,10 +50,10 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); - CHECK(FLAGS_vocab_path != ""); - CHECK(FLAGS_model_path != ""); + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); + CHECK_NE(FLAGS_vocab_path, ""); + CHECK_NE(FLAGS_model_path, ""); LOG(INFO) << "model path: " << FLAGS_model_path; LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; @@ -64,11 +64,14 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; 
model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet = std::make_shared(model_opts); + std::shared_ptr nnet = + std::make_shared(model_opts); // decodeable - std::shared_ptr raw_data = std::make_shared(); - std::shared_ptr decodable = std::make_shared(nnet, raw_data); + std::shared_ptr raw_data = + std::make_shared(); + std::shared_ptr decodable = + std::make_shared(nnet, raw_data); // decoder ppspeech::CTCBeamSearchOptions opts; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 0ff1de2a..8be69dad 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -71,7 +71,7 @@ class TLGDecoder : public DecoderBase { std::string GetPartialResult() override; int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + const std::vector& nbest_words); protected: std::string GetBestPath() override { diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 7997e8a7..3d80e001 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -30,7 +30,7 @@ using std::vector; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { - CHECK(cmvn_file != ""); + CHECK_NE(cmvn_file, ""); base_extractor_ = std::move(base_extractor); bool binary; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index bb7e449f..e2b54a8a 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -40,8 +40,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - CHECK(FLAGS_wav_rspecifier.size() > 0); - CHECK(FLAGS_feature_wspecifier.size() > 0); + CHECK_GT(FLAGS_wav_rspecifier.size(), 0); + CHECK_GT(FLAGS_feature_wspecifier.size(), 0); kaldi::SequentialTableReader wav_reader( 
FLAGS_wav_rspecifier); kaldi::SequentialTableReader wav_info_reader( diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 5fafdeb2..5f5cd51b 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -27,7 +27,7 @@ namespace ppspeech { // pre-recorded audio/feature class DataCache : public FrontendInterface { public: - explicit DataCache() { finished_ = false; } + DataCache() { finished_ = false; } // accept waves/feats virtual void Accept(const kaldi::VectorBase& inputs) { diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc index 931e932d..ad79fcc3 100644 --- a/speechx/speechx/frontend/audio/db_norm.cc +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -14,17 +14,18 @@ #include "frontend/audio/db_norm.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index 059abbbd..deabe876 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/fbank.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; FbankComputer::FbankComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 65493e42..2931b96b 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -33,7 +33,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) opts.linear_spectrogram_opts, std::move(data_source))); } - CHECK(opts.cmvn_file != ""); + CHECK_NE(opts.cmvn_file, ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc index 55c03978..d4a2fcc6 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/linear_spectrogram.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/frontend/audio/mfcc.cc index bda1f96d..15f8cb0f 100644 --- a/speechx/speechx/frontend/audio/mfcc.cc +++ b/speechx/speechx/frontend/audio/mfcc.cc @@ -14,6 +14,7 @@ #include "frontend/audio/mfcc.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -21,12 +22,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; Mfcc::Mfcc(const MfccOptions& opts, diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index 8c83f832..22c7f61b 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -13,15 +13,16 @@ // limitations under the License. 
#include "nnet/ds2_nnet.h" + #include "absl/strings/str_split.h" namespace ppspeech { -using std::vector; -using std::string; -using std::shared_ptr; using kaldi::Matrix; using kaldi::Vector; +using std::shared_ptr; +using std::string; +using std::vector; void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { std::vector cache_names; @@ -207,7 +208,7 @@ void PaddleNnet::FeedForward(const Vector& features, // inferences->Resize(row * col); // *inference_dim = col; - out->logprobs.Resize(row*col); + out->logprobs.Resize(row * col); out->vocab_dim = col; output_tensor->CopyToCpu(out->logprobs.Data()); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index d1e3ac8c..420fa177 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -26,7 +26,7 @@ template class Tensor { public: Tensor() {} - Tensor(const std::vector& shape) : _shape(shape) { + explicit Tensor(const std::vector& shape) : _shape(shape) { int neml = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies()); LOG(INFO) << "Tensor neml: " << neml; @@ -50,7 +50,7 @@ class Tensor { class PaddleNnet : public NnetBase { public: - PaddleNnet(const ModelOptions& opts); + explicit PaddleNnet(const ModelOptions& opts); void FeedForward(const kaldi::Vector& features, const int32& feature_dim, diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index d8d33e98..6092b8a4 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "nnet/ds2_nnet.h" #include "base/common.h" #include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); @@ -44,7 +44,7 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 636e2ad4..19cb85fd 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -158,7 +158,7 @@ void U2Nnet::Reset() { } // Debug API -void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { +void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { // encoder_out (T,D) encoder_outs_.clear(); encoder_outs_.push_back(encoder_out); @@ -206,7 +206,7 @@ void U2Nnet::ForwardEncoderChunkImpl( float* feats_ptr = feats.mutable_data(); // not cache feature in nnet - CHECK(cached_feats_.size() == 0); + CHECK_EQ(cached_feats_.size(), 0); // CHECK_EQ(std::is_same::value, true); std::memcpy(feats_ptr, chunk_feats.data(), @@ -247,9 +247,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // call. 
std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; - CHECK(inputs.size() == 4); + CHECK_EQ(inputs.size(), 4); std::vector outputs = forward_encoder_chunk_(inputs); - CHECK(outputs.size() == 3); + CHECK_EQ(outputs.size(), 3); #ifdef USE_GPU paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); @@ -319,9 +319,9 @@ void U2Nnet::ForwardEncoderChunkImpl( inputs.clear(); outputs.clear(); inputs.push_back(chunk_out); - CHECK(inputs.size() == 1); + CHECK_EQ(inputs.size(), 1); outputs = ctc_activation_(inputs); - CHECK(outputs.size() == 1); + CHECK_EQ(outputs.size(), 1); paddle::Tensor ctc_log_probs = outputs[0]; #ifdef TEST_DEBUG @@ -350,9 +350,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // Copy to output, (B=1,T,D) std::vector ctc_log_probs_shape = ctc_log_probs.shape(); - CHECK(ctc_log_probs_shape.size() == 3); + CHECK_EQ(ctc_log_probs_shape.size(), 3); int B = ctc_log_probs_shape[0]; - CHECK(B == 1); + CHECK_EQ(B, 1); int T = ctc_log_probs_shape[1]; int D = ctc_log_probs_shape[2]; *vocab_dim = D; @@ -393,9 +393,9 @@ float U2Nnet::ComputePathScore(const paddle::Tensor& prob, // hyp (U,) float score = 0.0f; std::vector dims = prob.shape(); - CHECK(dims.size() == 3); + CHECK_EQ(dims.size(), 3); VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; - CHECK(dims[0] == 1); + CHECK_EQ(dims[0], 1); int vocab_dim = static_cast(dims[2]); const float* prob_ptr = prob.data(); @@ -520,14 +520,14 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, std::vector inputs{ hyps_tensor, hyps_lens, encoder_out}; std::vector outputs = forward_attention_decoder_(inputs); - CHECK(outputs.size() == 2); + CHECK_EQ(outputs.size(), 2); // (B, Umax, V) paddle::Tensor probs = outputs[0]; std::vector probs_shape = probs.shape(); - CHECK(probs_shape.size() == 3); - CHECK(probs_shape[0] == num_hyps); - CHECK(probs_shape[1] == max_hyps_len); + CHECK_EQ(probs_shape.size(), 3); + CHECK_EQ(probs_shape[0], num_hyps); + 
CHECK_EQ(probs_shape[1], max_hyps_len); #ifdef TEST_DEBUG { @@ -582,13 +582,13 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, paddle::Tensor r_probs = outputs[1]; std::vector r_probs_shape = r_probs.shape(); if (is_bidecoder_ && reverse_weight > 0) { - CHECK(r_probs_shape.size() == 3); - CHECK(r_probs_shape[0] == num_hyps); - CHECK(r_probs_shape[1] == max_hyps_len); + CHECK_EQ(r_probs_shape.size(), 3); + CHECK_EQ(r_probs_shape[0], num_hyps); + CHECK_EQ(r_probs_shape[1], max_hyps_len); } else { // dump r_probs - CHECK(r_probs_shape.size() == 1); - CHECK(r_probs_shape[0] == 1) << r_probs_shape[0]; + CHECK_EQ(r_probs_shape.size(), 1); + CHECK_EQ(r_probs_shape[0], 1) << r_probs_shape[0]; } // compute rescoring score @@ -644,7 +644,7 @@ void U2Nnet::EncoderOuts( for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; const std::vector shape = item.shape(); - CHECK(shape.size() == 3); + CHECK_EQ(shape.size(), 3); const int& B = shape[0]; const int& T = shape[1]; const int& D = shape[2]; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index e548d4c0..23cc0ea3 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -73,7 +73,7 @@ class U2NnetBase : public NnetBase { class U2Nnet : public U2NnetBase { public: - U2Nnet(const ModelOptions& opts); + explicit U2Nnet(const ModelOptions& opts); U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, @@ -108,7 +108,7 @@ class U2Nnet : public U2NnetBase { std::vector* rescoring_score) override; // debug - void FeedEncoderOuts(paddle::Tensor& encoder_out); + void FeedEncoderOuts(const paddle::Tensor& encoder_out); void EncoderOuts( std::vector>* encoder_out) const; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 5039a59a..53fc5554 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -39,9 +39,9 @@ int main(int argc, char* argv[]) 
{ int32 num_done = 0, num_err = 0; - CHECK(FLAGS_feature_rspecifier.size() > 0); - CHECK(FLAGS_nnet_prob_wspecifier.size() > 0); - CHECK(FLAGS_model_path.size() > 0); + CHECK_GT(FLAGS_feature_rspecifier.size(), 0); + CHECK_GT(FLAGS_nnet_prob_wspecifier.size(), 0); + CHECK_GT(FLAGS_model_path.size(), 0); LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; diff --git a/speechx/speechx/protocol/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc index 7ad36e3a..7c5a4f2f 100644 --- a/speechx/speechx/protocol/websocket/websocket_client_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_client_main.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "websocket/websocket_client.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" +#include "websocket/websocket_client.h" DEFINE_string(host, "127.0.0.1", "host of websocket server"); DEFINE_int32(port, 8082, "port of websocket server"); diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/recognizer/recognizer.h index 27f1228a..57d5bb36 100644 --- a/speechx/speechx/recognizer/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -39,7 +39,8 @@ struct RecognizerResource { resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); resource.feature_pipeline_opts.assembler_opts.fill_zero = true; - LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; + LOG(INFO) << "ds2 need fill zero be true: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ModelOptions::InitFromFlags(); resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); return resource; diff --git 
a/speechx/speechx/recognizer/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc index 7c30fe6a..cb0de2d6 100644 --- a/speechx/speechx/recognizer/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "decoder/param.h" -#include "recognizer/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); @@ -30,7 +30,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::RecognizerResource::InitFromFlags(); + ppspeech::RecognizerResource resource = + ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 4ec64665..382f622f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -35,7 +35,7 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); - CHECK(resource.vocab_path != ""); + CHECK_NE(resource.vocab_path, ""); decoder_.reset(new CTCPrefixBeamSearch( resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 9b43b08f..25850863 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -1,5 +1,3 @@ - - // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/speechx/speechx/utils/file_utils.cc b/speechx/speechx/utils/file_utils.cc index e5943e31..c42a642c 100644 --- a/speechx/speechx/utils/file_utils.cc +++ b/speechx/speechx/utils/file_utils.cc @@ -40,4 +40,4 @@ std::string ReadFile2String(const std::string& path) { return std::string((std::istreambuf_iterator(input_file)), std::istreambuf_iterator()); } -} +} // namespace ppspeech From 8ef3b339ea829661e3f0a4da24e77f4f096ada6f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 09:35:27 +0000 Subject: [PATCH 57/60] fix cpplint --- speechx/speechx/base/basic_types.h | 24 ++++++++++++------------ speechx/speechx/base/macros.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 96bc0ca5..3e298b1b 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -43,18 +43,18 @@ typedef unsigned long long uint64; // NOLINT typedef signed int char32; -const uint8 kuint8max = (static_cast 0xFF); -const uint16 kuint16max = (static_cast 0xFFFF); -const uint32 kuint32max = (static_cast 0xFFFFFFFF); -const uint64 kuint64max = (static_cast(0xFFFFFFFFFFFFFFFFLL)); -const int8 kint8min = (static_cast 0x80); -const int8 kint8max = (static_cast 0x7F); -const int16 kint16min = (static_cast 0x8000); -const int16 kint16max = (static_cast 0x7FFF); -const int32 kint32min = (static_cast 0x80000000); -const int32 kint32max = (static_cast 0x7FFFFFFF); -const int64 kint64min = (static_cast(0x8000000000000000LL)); -const int64 kint64max = (static_cast(0x7FFFFFFFFFFFFFFFLL)); +const uint8 kuint8max = static_cast(0xFF); +const uint16 kuint16max = static_cast(0xFFFF); +const uint32 kuint32max = static_cast(0xFFFFFFFF); +const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFLL); +const int8 kint8min = static_cast (0x80); +const int8 kint8max = static_cast (0x7F); +const int16 
kint16min = static_cast (0x8000); +const int16 kint16max = static_cast (0x7FFF); +const int32 kint32min = static_cast(0x80000000); +const int32 kint32max = static_cast(0x7FFFFFFF); +const int64 kint64min = static_cast(0x8000000000000000LL); +const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); const BaseFloat kBaseFloatMax = std::numeric_limits::max(); const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 95608f40..db989812 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -26,6 +26,6 @@ namespace ppspeech { #endif // kSpaceSymbol in UTF-8 is: ▁ -const char[] kSpaceSymbol = "\xe2\x96\x81"; +const char kSpaceSymbo[] = "\xe2\x96\x81"; } // namespace ppspeech From 4dbff16f7062aee8a13bb5fad78b1b5d820f4563 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 11:01:27 +0000 Subject: [PATCH 58/60] fix format --- speechx/speechx/base/basic_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 3e298b1b..2b15a61f 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -47,10 +47,10 @@ const uint8 kuint8max = static_cast(0xFF); const uint16 kuint16max = static_cast(0xFFFF); const uint32 kuint32max = static_cast(0xFFFFFFFF); const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFLL); -const int8 kint8min = static_cast (0x80); -const int8 kint8max = static_cast (0x7F); -const int16 kint16min = static_cast (0x8000); -const int16 kint16max = static_cast (0x7FFF); +const int8 kint8min = static_cast(0x80); +const int8 kint8max = static_cast(0x7F); +const int16 kint16min = static_cast(0x8000); +const int16 kint16max = static_cast(0x7FFF); const int32 kint32min = static_cast(0x80000000); const int32 kint32max = static_cast(0x7FFFFFFF); const int64 kint64min = static_cast(0x8000000000000000LL); From 
aaf39863e03a52ec4c1cfc9e580c4c73d277f3bc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 25 Oct 2022 06:35:20 +0000 Subject: [PATCH 59/60] more info --- .../decoder/ctc_prefix_beam_search_decoder.cc | 2 +- .../speechx/recognizer/u2_recognizer_main.cc | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 0a0afcd7..03a7c133 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -74,7 +74,7 @@ void CTCPrefixBeamSearch::AdvanceDecode( std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); if (flag == false) { - LOG(INFO) << "decoder advance decode exit." << frame_prob.size(); + VLOG(1) << "decoder advance decode exit." << frame_prob.size(); break; } diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index c02e1c23..61330259 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -49,11 +49,13 @@ int main(int argc, char* argv[]) { kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { + kaldi::Timer local_timer; std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); LOG(INFO) << "utt: " << utt; LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; - tot_wav_duration += wave_data.Duration(); + double dur = wave_data.Duration(); + tot_wav_duration += dur; int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), @@ -63,6 +65,7 @@ int main(int argc, char* argv[]) { int sample_offset = 0; int cnt = 0; + while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); @@ -78,8 +81,10 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } 
recognizer.Decode(); - LOG(INFO) << "Pratial result: " << cnt << " " - << recognizer.GetPartialResult(); + if (recognizer.DecodedSomething()) { + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); + } // no overlap sample_offset += cur_chunk_size; @@ -101,7 +106,9 @@ int main(int argc, char* argv[]) { continue; } - LOG(INFO) << " the result of " << utt << " is " << result; + LOG(INFO) << utt << " " << result; + LOG(INFO) << " RTF: " << dur / local_timer.Elapsed() << " dur: " << dur + << " cost: " << local_timer.Elapsed(); result_writer.Write(utt, result); @@ -111,7 +118,7 @@ int main(int argc, char* argv[]) { double elapsed = timer.Elapsed(); LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); - LOG(INFO) << "cost:" << elapsed << " sec"; + LOG(INFO) << "total cost:" << elapsed << " sec"; LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; - LOG(INFO) << "the RTF is: " << elapsed / tot_wav_duration; + LOG(INFO) << "RTF is: " << elapsed / tot_wav_duration; } From b4d1dc1d6526b45b417327a1f4d1a35228c385ca Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 25 Oct 2022 06:53:35 +0000 Subject: [PATCH 60/60] fix rtf compute --- speechx/speechx/recognizer/u2_recognizer_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 61330259..5cb8dbb1 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -107,7 +107,7 @@ int main(int argc, char* argv[]) { } LOG(INFO) << utt << " " << result; - LOG(INFO) << " RTF: " << dur / local_timer.Elapsed() << " dur: " << dur + LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur << " cost: " << local_timer.Elapsed(); result_writer.Write(utt, result);