diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh
index 474642624..c3a17f491 100755
--- a/examples/wenetspeech/asr1/local/test_wav.sh
+++ b/examples/wenetspeech/asr1/local/test_wav.sh
@@ -42,6 +42,7 @@ for type in  attention_rescoring; do
     output_dir=${ckpt_prefix}
     mkdir -p ${output_dir}
     python3 -u ${BIN_DIR}/test_wav.py \
+    --debug True \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --decode_cfg ${decode_config_path} \
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 2e067ab6b..67ef2e53c 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -16,6 +16,8 @@ import os
 import sys
 from pathlib import Path
 
+import distutils
+import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode
@@ -74,6 +76,8 @@ class U2Infer():
             # fbank
             feat = self.preprocessing(audio, **self.preprocess_args)
             logger.info(f"feat shape: {feat.shape}")
+            if self.args.debug:
+                np.savetxt("feat.transform.txt", feat)
 
             ilen = paddle.to_tensor(feat.shape[0])
             xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
@@ -125,6 +129,11 @@ if __name__ == "__main__":
         "--result_file", type=str, help="path of save the asr result")
     parser.add_argument(
         "--audio_file", type=str, help="path of the input audio file")
+    # a bare `import distutils` does not load the `distutils.util` submodule.
+    from distutils.util import strtobool
+    parser.add_argument(
+        "--debug", type=strtobool, default=False,
+        help="for debug.")
     args = parser.parse_args()
 
     config = CfgNode(new_allowed=True)
diff --git a/speechx/.clang-format b/speechx/.clang-format
new file mode 100644
index 000000000..af946a4a9
--- /dev/null
+++ b/speechx/.clang-format
@@ -0,0 +1,29 @@
+# This file is used by clang-format to autoformat paddle source code
+#
+# The clang-format is part of llvm toolchain.
+# LLVM and clang need to be installed to format the source code.
+#
+# The basic usage is,
+#   clang-format -i -style=file PATH/TO/SOURCE/CODE
+#
+# The -style=file option implicitly uses the ".clang-format" file located in
+# one of the parent directories of the source file.
+# The -i option edits the files in place.
+#
+# The document of clang-format is
+#   http://clang.llvm.org/docs/ClangFormat.html
+#   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+Language:        Cpp
+BasedOnStyle:  Google
+IndentWidth:     4
+TabWidth:        4
+ContinuationIndentWidth: 4
+MaxEmptyLinesToKeep: 2
+AccessModifierOffset: -2  # private/protected/public are indented by 2 (IndentWidth - 2) inside a class
+Standard:  Cpp11
+AllowAllParametersOfDeclarationOnNextLine: true
+BinPackParameters: false  # declaration parameters: all on one line, or one per line
+BinPackArguments: false  # call arguments: all on one line, or one per line
+...
+
diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt
index 8307d9920..17e64c04a 100644
--- a/speechx/CMakeLists.txt
+++ b/speechx/CMakeLists.txt
@@ -31,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall
 ###############################################################################
 # Option Configurations
 ###############################################################################
-# option configurations 
 option(TEST_DEBUG "option for debug" OFF)
+option(USE_PROFILING "enable c++ profiling" OFF)
 
+option(USING_U2  "compile u2 model." ON)
+option(USING_DS2 "compile with ds2 model." ON)
+
+option(USING_GPU "u2 compute on GPU." OFF)
 
 ###############################################################################
 # Include third party
@@ -85,6 +89,41 @@ add_dependencies(openfst gflags glog)
 include(paddleinference)
 
 
+# paddle core.so
+find_package(Threads REQUIRED)
+find_package(PythonLibs REQUIRED)
+find_package(Python3 REQUIRED)
+find_package(pybind11 CONFIG)
+
+message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}")
+message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}")
+message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}")
+
+# paddle link options, queried from the paddle package of the `python` found on PATH
+execute_process(
+    COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')"
+    OUTPUT_VARIABLE PADDLE_LINK_FLAGS
+    RESULT_VARIABLE SUCESS)
+if(NOT SUCESS EQUAL 0)
+    message(FATAL_ERROR "failed to query paddle.sysconfig with `python` (result: ${SUCESS}); activate the python env that has paddle installed before running cmake.")
+endif()
+string(STRIP "${PADDLE_LINK_FLAGS}" PADDLE_LINK_FLAGS)
+message(STATUS "PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}")
+
+# paddle compile options; quoted so an empty result does not break string(STRIP)
+execute_process(
+    COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_compile_flags()), end='')"
+    OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
+string(STRIP "${PADDLE_COMPILE_FLAGS}" PADDLE_COMPILE_FLAGS)
+message(STATUS "PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}")
+
+# paddle library dirs joined with ':', for LD_LIBRARY_PATH
+execute_process(
+    COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')"
+    OUTPUT_VARIABLE PADDLE_LIB_DIRS)
+message(STATUS "PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}")
+
+
 ###############################################################################
 # Add local library
 ###############################################################################
diff --git a/speechx/README.md b/speechx/README.md
index cd1cd62c1..cc7b13e6a 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -3,11 +3,14 @@
 ## Environment
 
 We develop under:
+* python - 3.7
 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
 * os - Ubuntu 16.04.7 LTS
 * gcc/g++/gfortran - 8.2.0
 * cmake - 3.16.0
 
+> Please use `tools/venv.sh` to create the python `venv`, then `source venv/bin/activate` before building speechx.
+
 > We make sure all things work fun under docker, and recommend using it to develop and deploy.
 
 * [How to Install Docker](https://docs.docker.com/engine/install/)
@@ -24,13 +27,16 @@ docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --nam
 
 * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
 
+2. Create python environment.
 
-2. Build `speechx` and `examples`.
+```
+bash tools/venv.sh
+```
 
-> Do not source venv.
+2. Build `speechx` and `examples`.
 
 ```
-pushd /path/to/speechx
+source venv/bin/activate
 ./build.sh
 ```
 
diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake
index 66ae47f70..36bebc877 100644
--- a/speechx/cmake/gflags.cmake
+++ b/speechx/cmake/gflags.cmake
@@ -2,10 +2,9 @@ include(FetchContent)
 
 FetchContent_Declare(
   gflags
-  URL      https://github.com/gflags/gflags/archive/v2.2.1.zip
-  URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a
+  URL      https://github.com/gflags/gflags/archive/v2.2.2.zip
+  URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5
 )
-
 FetchContent_MakeAvailable(gflags)
 
 # openfst need
diff --git a/speechx/cmake/gtest.cmake b/speechx/cmake/gtest.cmake
index 7fe397fcb..1ea8ed0b7 100644
--- a/speechx/cmake/gtest.cmake
+++ b/speechx/cmake/gtest.cmake
@@ -1,8 +1,8 @@
 include(FetchContent)
 FetchContent_Declare(
   gtest
-  URL      https://github.com/google/googletest/archive/release-1.10.0.zip
-  URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
+  URL      https://github.com/google/googletest/archive/release-1.11.0.zip
+  URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a
 )
 FetchContent_MakeAvailable(gtest)
 
diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore
new file mode 100644
index 000000000..bbd86a25b
--- /dev/null
+++ b/speechx/examples/codelab/feat/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh
index 3b89d01e9..9d2291743 100644
--- a/speechx/examples/codelab/feat/path.sh
+++ b/speechx/examples/codelab/feat/path.sh
@@ -1,12 +1,12 @@
 # This contains the locations of binarys build required for running the examples.
 
 SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
 
 export LC_AL=C
 
diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh
index 1fa37f981..66bd8ae20 100755
--- a/speechx/examples/codelab/feat/run.sh
+++ b/speechx/examples/codelab/feat/run.sh
@@ -54,4 +54,10 @@ compute_linear_spectrogram_main \
     --cmvn_file=$exp_dir/cmvn.ark
 echo "compute linear spectrogram feature."
 
+compute_fbank_main \
+    --num_bins 161 \
+    --wav_rspecifier=scp:$data_dir/wav.scp \
+    --feature_wspecifier=ark,t:$exp_dir/fbank.ark \
+    --cmvn_file=$exp_dir/cmvn.ark
+echo "compute fbank feature."
 
diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh
index 7d395d648..11c8aef8b 100644
--- a/speechx/examples/codelab/nnet/path.sh
+++ b/speechx/examples/codelab/nnet/path.sh
@@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
 
 export LC_AL=C
 
diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore
new file mode 100644
index 000000000..d6fe69bcd
--- /dev/null
+++ b/speechx/examples/codelab/u2nnet/.gitignore
@@ -0,0 +1,3 @@
+data
+exp
+*log
diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md
new file mode 100644
index 000000000..772a58f0e
--- /dev/null
+++ b/speechx/examples/codelab/u2nnet/README.md
@@ -0,0 +1,3 @@
+# U2 Streaming NNet Test
+
+Used for the u2 streaming nnet inference test.
diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2nnet/path.sh
new file mode 100644
index 000000000..564e9fed1
--- /dev/null
+++ b/speechx/examples/codelab/u2nnet/path.sh
@@ -0,0 +1,19 @@
+# This contains the locations of the built binaries required for running the examples.
+
+unset GREP_OPTIONS
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d "$SPEECHX_BUILD" ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
+
+export LC_ALL=C
+
+SPEECHX_BIN=$SPEECHX_BUILD/nnet
+export PATH=$PATH:$SPEECHX_BIN:$SPEECHX_BUILD/frontend/audio:$TOOLS_BIN
+
+PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')")
+export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh
new file mode 100755
index 000000000..b309bc6f2
--- /dev/null
+++ b/speechx/examples/codelab/u2nnet/run.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+set -x
+set -e
+
+. path.sh
+
+# 1. compile speechx when the build dir (SPEECHX_BUILD, set by path.sh) is missing
+if [ ! -d "${SPEECHX_BUILD}" ]; then
+    pushd "${SPEECHX_ROOT}"
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
+    tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+if [ ! -f data/wav.scp ]; then
+    mkdir -p data
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+
+cmvn_json2kaldi_main \
+    --json_file  $model_dir/mean_std.json \
+    --cmvn_write_path $exp/cmvn.ark \
+    --binary=false
+echo "convert json cmvn to kaldi ark."
+
+compute_fbank_main \
+    --num_bins 80 \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --cmvn_file=$exp/cmvn.ark \
+    --feature_wspecifier=ark,t:$exp/fbank.ark
+echo "compute fbank feature."
+
+u2_nnet_main \
+    --model_path=$model_dir/export.jit \
+    --feature_rspecifier=ark,t:$exp/fbank.ark \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --downsampling_rate=4 \
+    --acoustic_scale=1.0 \
+    --nnet_prob_wspecifier=ark,t:$exp/probs.ark
diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh
new file mode 100755
index 000000000..a5aab6637
--- /dev/null
+++ b/speechx/examples/codelab/u2nnet/valgrind.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Memory check of u2_nnet_main with valgrind; run ./run.sh first, it downloads
+# the model and writes exp/fbank.ark which is read here.
+set +x
+set -e
+
+. ./path.sh
+
+if [ ! -d "${SPEECHX_TOOLS}/valgrind/install" ]; then
+  echo "please install valgrind in the speechx tools dir."
+  exit 1
+fi
+
+exp=exp
+model_dir=./data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
+  u2_nnet_main --model_path=$model_dir/export.jit \
+  --feature_rspecifier=ark,t:$exp/fbank.ark --nnet_prob_wspecifier=ark,t:$exp/probs.valgrind.ark \
+  --nnet_decoder_chunk=16 --receptive_field_length=7 --downsampling_rate=4 --acoustic_scale=1.0
diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md
new file mode 100644
index 000000000..ce01a8fc7
--- /dev/null
+++ b/speechx/examples/u2pp_ol/README.md
@@ -0,0 +1,5 @@
+# U2/U2++ Streaming ASR
+
+## Examples
+
+* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing.    
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h
index dfb148853..90fc96a18 100644
--- a/speechx/speechx/base/common.h
+++ b/speechx/speechx/base/common.h
@@ -34,6 +34,7 @@
 #include <stdexcept>
 #include <string>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
index b0616a7de..c891827a1 100644
--- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
+++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
@@ -17,7 +17,7 @@
 int main(int argc, char* argv[]) {
     // Initialize Google’s logging library.
     google::InitGoogleLogging(argv[0]);
-
+    google::InstallFailureSignalHandler();
     FLAGS_logtostderr = 1;
 
     LOG(INFO) << "Found " << 10 << " cookies";
diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
index 283466dc1..7d99e8571 100644
--- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
+++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
@@ -195,8 +195,11 @@ void model_forward_test() {
 }
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     model_forward_test();
     return 0;
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
index 0383c3ea0..1df935112 100644
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -18,7 +18,6 @@ set(BINS
   tlg_decoder_main
 )
 
-message(STATUS "xxxxxxxxxx: " ${DEPS})
 foreach(bin_name IN LISTS BINS)
   add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
   target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
index e4e5c2afb..445f470f9 100644
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
@@ -53,8 +53,11 @@ using std::vector;
 
 // test ds2 online decoder by feeding speech feature
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     CHECK(FLAGS_result_wspecifier != "");
     CHECK(FLAGS_feature_rspecifier != "");
diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
index 0e249cc6b..e0acbe77b 100644
--- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
+++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
@@ -30,8 +30,11 @@ using std::vector;
 
 // test decoder by feeding nnet posterior probability
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
         FLAGS_nnet_prob_respecifier);
diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc
index 232513539..050266462 100644
--- a/speechx/speechx/decoder/recognizer_main.cc
+++ b/speechx/speechx/decoder/recognizer_main.cc
@@ -23,8 +23,11 @@ DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure();
     ppspeech::Recognizer recognizer(resource);
diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc
index 93f84da3f..b633022a3 100644
--- a/speechx/speechx/decoder/tlg_decoder_main.cc
+++ b/speechx/speechx/decoder/tlg_decoder_main.cc
@@ -55,8 +55,11 @@ using std::vector;
 
 // test TLG decoder by feeding speech feature.
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     kaldi::SequentialBaseFloatMatrixReader feature_reader(
         FLAGS_feature_rspecifier);
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 8ae63256a..050d78bea 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -1,5 +1,3 @@
-project(frontend)
-
 add_library(frontend STATIC
   cmvn.cc
   db_norm.cc
diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
index 0def14660..93bad6886 100644
--- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
+++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
@@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
 using namespace boost::json;  // from <boost/json.hpp>
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
 
diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
index f7a42315f..93a6d4072 100644
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -32,13 +32,21 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(num_bins, 161, "fbank num bins");
+DEFINE_int32(sample_rate, 16000, "sample rate: 16k, 8k.");
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
+    CHECK(FLAGS_wav_rspecifier.size() > 0);
+    CHECK(FLAGS_feature_wspecifier.size() > 0);
     kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
         FLAGS_wav_rspecifier);
+    kaldi::SequentialTableReader<kaldi::WaveInfoHolder> wav_info_reader(
+        FLAGS_wav_rspecifier);
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
 
     int32 num_done = 0, num_err = 0;
@@ -54,6 +62,10 @@ int main(int argc, char* argv[]) {
     opt.frame_opts.frame_shift_ms = 10;
     opt.mel_opts.num_bins = FLAGS_num_bins;
     opt.frame_opts.dither = 0.0;
+    LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms;
+    LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins;
+    LOG(INFO) << "dither: " << opt.frame_opts.dither;
 
     std::unique_ptr<ppspeech::FrontendInterface> fbank(
         new ppspeech::Fbank(opt, std::move(data_source)));
@@ -61,53 +73,73 @@ int main(int argc, char* argv[]) {
     std::unique_ptr<ppspeech::FrontendInterface> cmvn(
         new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank)));
 
-    ppspeech::FeatureCacheOptions feat_cache_opts;
     // the feature cache output feature chunk by chunk.
+    ppspeech::FeatureCacheOptions feat_cache_opts;
     ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
     LOG(INFO) << "fbank: " << true;
     LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
-    int sample_rate = 16000;
+
     float streaming_chunk = FLAGS_streaming_chunk;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sr: " << sample_rate;
-    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    int chunk_sample_size = streaming_chunk * FLAGS_sample_rate;
+    LOG(INFO) << "sr: " << FLAGS_sample_rate;
+    LOG(INFO) << "chunk size (sec): " << streaming_chunk;
     LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
+    for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) {
+        const std::string& utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
-        LOG(INFO) << "process utt: " << utt;
 
+        const std::string& utt2 = wav_info_reader.Key();
+        const kaldi::WaveInfo& wave_info = wav_info_reader.Value();
+
+        CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!";
+        LOG(INFO) << "utt: " << utt;
+        LOG(INFO) << "samples: " << wave_info.SampleCount();
+        LOG(INFO) << "dur: " << wave_info.Duration() << " sec";
+        CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq();
+
+        // load first channel wav
         int32 this_channel = 0;
         kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                     this_channel);
+    
+        // compute feat chunk by chunk
         int tot_samples = waveform.Dim();
-        LOG(INFO) << "wav len (sample): " << tot_samples;
-
         int sample_offset = 0;
         std::vector<kaldi::Vector<BaseFloat>> feats;
         int feature_rows = 0;
         while (sample_offset < tot_samples) {
+            // cur chunk size
             int cur_chunk_size =
                 std::min(chunk_sample_size, tot_samples - sample_offset);
 
+            // get chunk wav
             kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
 
-            kaldi::Vector<BaseFloat> features;
+            // compute feat
             feature_cache.Accept(wav_chunk);
+
+            // send finish signal
             if (cur_chunk_size < chunk_sample_size) {
                 feature_cache.SetFinished();
             }
+
+            // read feat
+            kaldi::Vector<BaseFloat> features;
             bool flag = true;
             do {
                 flag = feature_cache.Read(&features);
-                feats.push_back(features);
-                feature_rows += features.Dim() / feature_cache.Dim();
+                if (flag && features.Dim() != 0) {
+                    feats.push_back(features);
+                    feature_rows += features.Dim() / feature_cache.Dim();
+                }
             } while (flag == true && features.Dim() != 0);
+
+            // forward offset
             sample_offset += cur_chunk_size;
         }
 
@@ -125,14 +157,19 @@ int main(int argc, char* argv[]) {
                 ++cur_idx;
             }
         }
+        LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols();
         feat_writer.Write(utt, features);
+
+        // reset frontend pipeline state
         feature_cache.Reset();
 
         if (num_done % 50 == 0 && num_done != 0)
-            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+            VLOG(2) << "Processed " << num_done << " utterances";
+
         num_done++;
     }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+
+    LOG(INFO) << "Done " << num_done << " utterances, " << num_err
               << " with errors.";
     return (num_done != 0 ? 0 : 1);
 }
diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
index 162c3529d..889f5663d 100644
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -31,8 +31,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
 DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
         FLAGS_wav_rspecifier);
diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt
deleted file mode 100644
index e69de29bb..000000000
diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt
index 565bba3eb..2a1812fdf 100644
--- a/speechx/speechx/nnet/CMakeLists.txt
+++ b/speechx/speechx/nnet/CMakeLists.txt
@@ -1,12 +1,40 @@
-project(nnet)
+set(srcs decodable.cc)
 
-add_library(nnet STATIC
-  decodable.cc
-  ds2_nnet.cc
-)
+if(USING_DS2)
+  list(APPEND srcs ds2_nnet.cc)
+endif()
+
+if(USING_U2)
+  list(APPEND srcs u2_nnet.cc)
+endif()
+
+add_library(nnet STATIC ${srcs})
 target_link_libraries(nnet absl::strings)
 
-set(bin_name ds2_nnet_main)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS})
\ No newline at end of file
+if(USING_U2)
+  target_compile_options(nnet  PUBLIC ${PADDLE_COMPILE_FLAGS})
+  target_include_directories(nnet  PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+  # python/paddle libs are not linked here; u2_nnet_main links PYTHON_LIBRARIES and PADDLE_LINK_FLAGS itself.
+endif()
+
+
+if(USING_DS2)
+  set(bin_name ds2_nnet_main)
+  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+  target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
+
+  target_link_libraries(${bin_name} ${DEPS})
+endif()
+
+# test bin
+if(USING_U2)
+  set(bin_name u2_nnet_main)
+  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+  target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet)
+
+  target_compile_options(${bin_name}  PRIVATE ${PADDLE_COMPILE_FLAGS})
+  target_include_directories(${bin_name}  PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+  target_link_libraries(${bin_name}  ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
+endif()
diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc
index 465f64a94..7780e5ae6 100644
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -30,6 +30,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
       frames_ready_(0),
       acoustic_scale_(acoustic_scale) {}
 
+// for debug
 void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
     nnet_cache_ = likelihood;
     frames_ready_ += likelihood.NumRows();
@@ -41,6 +42,7 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
 // return the size of frame have computed.
 int32 Decodable::NumFramesReady() const { return frames_ready_; }
 
+
 // frame idx is from 0 to frame_ready_ -1;
 bool Decodable::IsLastFrame(int32 frame) {
     bool flag = EnsureFrameHaveComputed(frame);
@@ -72,26 +74,38 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) {
 }
 
 bool Decodable::AdvanceChunk() {
+    // read feats
     Vector<BaseFloat> features;
     if (frontend_ == NULL || frontend_->Read(&features) == false) {
+        // no feat or frontend_ not init.
         return false;
     }
-    int32 nnet_dim = 0;
-    Vector<BaseFloat> inferences;
-    nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim);
-    nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim);
-    nnet_cache_.CopyRowsFromVec(inferences);
 
+    // forward feats
+    int32 vocab_dim = 0;
+    Vector<BaseFloat> probs;
+    nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim);
+
+    // cache nnet outputs
+    nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
+    nnet_cache_.CopyRowsFromVec(probs);
+
+    // update state
     frame_offset_ = frames_ready_;
     frames_ready_ += nnet_cache_.NumRows();
     return true;
 }
 
+// read one frame likelihood
 bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
-    std::vector<BaseFloat> result;
-    if (EnsureFrameHaveComputed(frame) == false) return false;
-    likelihood->resize(nnet_cache_.NumCols());
-    for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) {
+    if (EnsureFrameHaveComputed(frame) == false) {
+        return false;
+    }
+
+    int vocab_size = nnet_cache_.NumCols();
+    likelihood->resize(vocab_size);
+
+    for (int32 idx = 0; idx < vocab_size; ++idx) {
         (*likelihood)[idx] =
             nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_;
     }
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h
index 9555fea79..241d04198 100644
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -27,35 +27,54 @@ class Decodable : public kaldi::DecodableInterface {
     explicit Decodable(const std::shared_ptr<NnetInterface>& nnet,
                        const std::shared_ptr<FrontendInterface>& frontend,
                        kaldi::BaseFloat acoustic_scale = 1.0);
+
     // void Init(DecodableOpts config);
+
+    // nnet logprob output
     virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
+
     virtual bool IsLastFrame(int32 frame);
+
+    // nnet output dim, e.g. vocab size
     virtual int32 NumIndices() const;
-    // not logprob
+
+    // nnet prob output
     virtual bool FrameLikelihood(int32 frame,
                                  std::vector<kaldi::BaseFloat>* likelihood);
+
     virtual int32 NumFramesReady() const;
+
     // for offline test
     void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
+
     void Reset();
+
     bool IsInputFinished() const { return frontend_->IsFinished(); }
+
     bool EnsureFrameHaveComputed(int32 frame);
+
     int32 TokenId2NnetId(int32 token_id);
 
   private:
     bool AdvanceChunk();
+
     std::shared_ptr<FrontendInterface> frontend_;
     std::shared_ptr<NnetInterface> nnet_;
+
+    // nnet outputs' cache
     kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
+
     // the frame is nnet prob frame rather than audio feature frame
     // nnet frame subsample the feature frame
     // eg: 35 frame features output 8 frame inferences
     int32 frame_offset_;
     int32 frames_ready_;
+
     // todo: feature frame mismatch with nnet inference frame
     // so use subsampled_frame
     int32 current_log_post_subsampled_offset_;
     int32 num_chunk_computed_;
+
     kaldi::BaseFloat acoustic_scale_;
 };
 
diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc
index e29042082..943d7e5f2 100644
--- a/speechx/speechx/nnet/ds2_nnet_main.cc
+++ b/speechx/speechx/nnet/ds2_nnet_main.cc
@@ -13,8 +13,7 @@
 // limitations under the License.
 
 #include "nnet/ds2_nnet.h"
-#include "base/flags.h"
-#include "base/log.h"
+#include "base/common.h"
 #include "frontend/audio/assembler.h"
 #include "frontend/audio/data_cache.h"
 #include "kaldi/util/table-types.h"
@@ -49,8 +48,11 @@ using kaldi::Matrix;
 using std::vector;
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     kaldi::SequentialBaseFloatMatrixReader feature_reader(
         FLAGS_feature_rspecifier);
@@ -146,7 +148,7 @@ int main(int argc, char* argv[]) {
         }
         kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
                                                prob_vec[0].Dim());
-        for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
+        for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
             for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
                 result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
             }
diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc
new file mode 100644
index 000000000..67ef0952a
--- /dev/null
+++ b/speechx/speechx/nnet/u2_nnet.cc
@@ -0,0 +1,706 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet/u2_nnet.h"
+
+#ifdef USE_PROFILING
+#include "paddle/fluid/platform/profiler.h"
+using paddle::platform::RecordEvent;
+using paddle::platform::TracerEventType;
+#endif  // end USE_PROFILING
+
+namespace ppspeech {
+
+int U2NnetBase::num_frames_for_chunk(bool start) const {  // feat frames needed
+    int num_needed_frames = 0;  // feature frames required for this chunk
+    bool first = !start;        // first chunk is signalled by start == false
+
+    if (chunk_size_ > 0) {
+        // streaming mode
+        if (first) {
+            // first chunk: the first decoder frame needs `context` feat frames,
+            // every further decoder frame adds `subsampling_rate_` feat frames
+            int context = this->context();
+            num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context;
+        } else {
+            // later chunks advance by chunk_size_ * subsampling_rate_ frames.
+            num_needed_frames = chunk_size_ * subsampling_rate_;
+        }
+    } else {
+        // non-streaming mode: feed all feats at once (no upper bound).
+        num_needed_frames = std::numeric_limits<int>::max();
+    }
+
+    return num_needed_frames;
+}
+
+// cache the last (context - subsampling_rate) feat frames for the next chunk
+void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
+                              int32 feat_dim) {
+    // chunk_feats is flat (nframes*feat_dim): frame k starts at k*feat_dim
+    const int chunk_size = chunk_feats.size() / feat_dim;
+    const int cached_feat_size = this->context() - subsampling_rate_;
+    if (chunk_size >= cached_feat_size) {
+        cached_feats_.resize(cached_feat_size);
+        for (int i = 0; i < cached_feat_size; ++i) {
+            auto start = chunk_feats.begin() +
+                         (chunk_size - cached_feat_size + i) * feat_dim;
+            auto end = start + feat_dim;
+            cached_feats_[i] = std::vector<float>(start, end);
+        }
+    }
+}
+
+void U2NnetBase::ForwardEncoderChunk(  // guarded chunk forward + caching
+    const std::vector<kaldi::BaseFloat>& chunk_feats,
+    int32 feat_dim,
+    std::vector<kaldi::BaseFloat>* ctc_probs,
+    int32* vocab_dim) {
+    ctc_probs->clear();  // output stays empty if the chunk is too short
+    // int num_frames = cached_feats_.size() + chunk_feats.size();
+    int num_frames = chunk_feats.size() / feat_dim;
+    VLOG(3) << "foward encoder chunk: " << num_frames << " frames";
+    VLOG(3) << "context: " << this->context() << " frames";
+    // run the encoder only when at least context() frames are available
+    if (num_frames >= this->context()) {
+        this->ForwardEncoderChunkImpl(
+            chunk_feats, feat_dim, ctc_probs, vocab_dim);
+        VLOG(3) << "after forward chunk";
+        this->CacheFeature(chunk_feats, feat_dim);
+    }
+}
+
+// Load the jit model at `model_path_w_prefix`, read its meta, then warm up.
+void U2Nnet::LoadModel(const std::string& model_path_w_prefix) {
+    paddle::jit::utils::InitKernelSignatureMap();
+    // the device is picked at compile time via USE_GPU
+#ifdef USE_GPU
+    dev_ = phi::GPUPlace();
+#else
+    dev_ = phi::CPUPlace();
+#endif
+    paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_);
+    model_ = std::make_shared<paddle::jit::Layer>(std::move(model));
+    // model meta, read as int attributes of the jit layer
+    subsampling_rate_ = model_->Attribute<int>("subsampling_rate");
+    right_context_ = model_->Attribute<int>("right_context");
+    sos_ = model_->Attribute<int>("sos_symbol");
+    eos_ = model_->Attribute<int>("eos_symbol");
+    is_bidecoder_ = model_->Attribute<int>("is_bidirectional_decoder");
+    // functions exported by the model; all three must be valid
+    forward_encoder_chunk_ = model_->Function("forward_encoder_chunk");
+    forward_attention_decoder_ = model_->Function("forward_attention_decoder");
+    ctc_activation_ = model_->Function("ctc_activation");
+    CHECK(forward_encoder_chunk_.IsValid());
+    CHECK(forward_attention_decoder_.IsValid());
+    CHECK(ctc_activation_.IsValid());
+
+    LOG(INFO) << "Paddle Model Info: ";
+    LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_;
+    LOG(INFO) << "\tright context " << right_context_;
+    LOG(INFO) << "\tsos " << sos_;
+    LOG(INFO) << "\teos " << eos_;
+    LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl;
+
+    Warmup();
+}
+
+void U2Nnet::Warmup() {  // one dummy call per exported model function
+#ifdef USE_PROFILING
+    RecordEvent event("warmup", TracerEventType::UserDefined, 1);
+#endif
+    // warm up encoder chunk + ctc activation with a constant feature chunk
+    {
+#ifdef USE_PROFILING
+        RecordEvent event(
+            "warmup-encoder-ctc", TracerEventType::UserDefined, 1);
+#endif
+        int feat_dim = 80;  // NOTE(review): hard-coded; confirm model feat dim
+        int frame_num = 16 * 4 + 3;  // chunk_size * downsample_rate +
+                                     // (receptive_field - downsample_rate)
+        paddle::Tensor feats = paddle::full(
+            {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32);
+        paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32);
+        paddle::Tensor att_cache =
+            paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
+        paddle::Tensor cnn_cache =
+            paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
+        std::vector<paddle::Tensor> inputs = {
+            feats, offset, /*required_cache_size, */ att_cache, cnn_cache};
+        std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
+        // feed the encoder output of the dummy chunk to the ctc activation
+        auto chunk_out = outputs[0];
+        inputs = {chunk_out};
+        outputs = ctc_activation_(inputs);
+    }
+    // warm up the attention decoder with constant hypotheses
+    {
+#ifdef USE_PROFILING
+        RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1);
+#endif
+        auto hyps =
+            paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace());
+        auto hyps_lens =
+            paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace());
+        auto encoder_out = paddle::ones(
+            {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace());
+
+        std::vector<paddle::experimental::Tensor> inputs{
+            hyps, hyps_lens, encoder_out};
+
+        std::vector<paddle::experimental::Tensor> outputs =
+            forward_attention_decoder_(inputs);
+    }
+    // start from a clean decoding state
+    Reset();
+}
+
+U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {  // load + warm up
+    LoadModel(opts_.model_path);
+}
+
+// shallow copy: shares the model with `other` and copies only its meta
+U2Nnet::U2Nnet(const U2Nnet& other) {
+    // copy meta
+    right_context_ = other.right_context_;
+    subsampling_rate_ = other.subsampling_rate_;
+    sos_ = other.sos_;
+    eos_ = other.eos_;
+    is_bidecoder_ = other.is_bidecoder_;
+    chunk_size_ = other.chunk_size_;
+    num_left_chunks_ = other.num_left_chunks_;
+    // copy the model function handles
+    forward_encoder_chunk_ = other.forward_encoder_chunk_;
+    forward_attention_decoder_ = other.forward_attention_decoder_;
+    ctc_activation_ = other.ctc_activation_;
+
+    //   offset_ = other.offset_; // TODO: not used in nnets
+
+    // share the model: copies the shared_ptr, not the underlying layer
+    model_ = other.model_;
+    // NOTE(review): opts_ and dev_ are not copied either -- confirm intended
+    // inner states (att_cache_, cnn_cache_, encoder_outs_) are not copied
+}
+
+std::shared_ptr<NnetInterface> U2Nnet::Copy() const {
+    auto asr_model = std::make_shared<U2Nnet>(*this);  // shallow copy ctor
+    // give the copy fresh caches so it can start a new decoding
+    asr_model->Reset();
+    return asr_model;
+}
+
+void U2Nnet::Reset() {
+    // offset_ is fed to forward_encoder_chunk_ (see ForwardEncoderChunkImpl),
+    // so every new decoding has to restart it from 0.
+    offset_ = 0;
+    //   cached_feats_.clear(); // TODO: not used in nnets
+
+    // start over with 0-sized attention and cnn caches
+    att_cache_ = paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
+    cnn_cache_ = paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32);
+    encoder_outs_.clear();
+}
+
+// Debug API: replace the collected encoder outputs with `encoder_out`.
+void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
+    // encoder_out (T,D) -- NOTE(review): confirm; rescoring concats on axis 1
+    encoder_outs_.clear();
+    encoder_outs_.push_back(encoder_out);
+}
+
+// Forward one flat feature chunk; `inferences` gets the flat ctc log probs.
+void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
+                         int32 feature_dim,
+                         kaldi::Vector<BaseFloat>* inferences,
+                         int32* inference_dim) {
+    std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
+                                              features.Data() + features.Dim());
+    std::vector<kaldi::BaseFloat> ctc_probs;
+    ForwardEncoderChunkImpl(
+        chunk_feats, feature_dim, &ctc_probs, inference_dim);
+    inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
+    std::memcpy(inferences->Data(),
+                ctc_probs.data(),
+                ctc_probs.size() * sizeof(kaldi::BaseFloat));
+}
+
+// Forward one chunk: encoder -> ctc activation -> flat (T*D) out_prob.
+void U2Nnet::ForwardEncoderChunkImpl(
+    const std::vector<kaldi::BaseFloat>& chunk_feats,
+    int32 feat_dim,
+    std::vector<kaldi::BaseFloat>* out_prob,
+    int32* vocab_dim) {
+#ifdef USE_PROFILING
+    RecordEvent event(
+        "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);
+#endif
+
+    // 1. splice cached_feature, and chunk_feats
+    //  First dimension is B, which is 1.
+    // int num_frames = cached_feats_.size() + chunk_feats.size();
+
+    int num_frames = chunk_feats.size() / feat_dim;
+    VLOG(3) << "num_frames: " << num_frames;
+    VLOG(3) << "feat_dim: " << feat_dim;
+
+    // feats (B=1,T,D)
+    paddle::Tensor feats =
+        paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32);
+    float* feats_ptr = feats.mutable_data<float>();
+
+    // for (size_t i = 0; i < cached_feats_.size(); ++i) {
+    //     float* row = feats_ptr + i * feat_dim;
+    //     std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float));
+    // }
+
+    // for (size_t i = 0; i < chunk_feats.size(); ++i) {
+    //     float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim;
+    //     std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float));
+    // }
+
+    // not cache feature in nnet
+    CHECK(cached_feats_.size() == 0);
+    // CHECK_EQ(std::is_same<float, kaldi::BaseFloat>::value, true);
+    std::memcpy(feats_ptr,
+                chunk_feats.data(),
+                chunk_feats.size() * sizeof(kaldi::BaseFloat));
+
+    VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1]
+            << ", " << feats.shape()[2];
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("feat", std::ios_base::app | std::ios_base::out);
+        path << offset_;
+        std::ofstream feat_fobj(path.str().c_str(), std::ios::out);
+        CHECK(feat_fobj.is_open());
+        // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " "
+        //           << feats.shape()[2] << "\n";
+        for (int i = 0; i < feats.numel(); i++) {
+            feat_fobj << std::setprecision(18) << feats_ptr[i] << " ";
+            if ((i + 1) % feat_dim == 0) {
+                feat_fobj << "\n";
+            }
+        }
+        feat_fobj << "\n";
+    }
+#endif
+
+// Encoder chunk forward
+#ifdef USE_GPU
+    feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false);
+    att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
+    cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false);
+#endif
+
+    int required_cache_size = num_left_chunks_ * chunk_size_;  // -1 * 16
+    // must be scalar, but paddle do not have scalar.
+    paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32);
+    // freeze `required_cache_size` in graph, so not specific it in function
+    // call.
+    std::vector<paddle::Tensor> inputs = {
+        feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_};
+    VLOG(3) << "inputs size: " << inputs.size();
+    CHECK(inputs.size() == 4);
+    std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
+    VLOG(3) << "outputs size: " << outputs.size();
+    CHECK(outputs.size() == 3);
+
+#ifdef USE_GPU
+    paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace());
+    att_cache_ = outputs[1].copy_to(paddle::CPUPlace());
+    cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace());
+#else
+    paddle::Tensor chunk_out = outputs[0];
+    att_cache_ = outputs[1];
+    cnn_cache_ = outputs[2];
+#endif
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_logits",
+                               std::ios_base::app | std::ios_base::out);
+        auto i = offset_ - chunk_out.shape()[1];
+        path << std::max(i, 0L);
+        std::ofstream logits_fobj(path.str().c_str(), std::ios::out);
+        CHECK(logits_fobj.is_open());
+        logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1]
+                    << " " << chunk_out.shape()[2] << "\n";
+        const float* chunk_out_ptr = chunk_out.data<float>();
+        logits_fobj << chunk_out_ptr << std::endl;
+        for (int i = 0; i < chunk_out.numel(); i++) {
+            logits_fobj << chunk_out_ptr[i] << " ";
+        }
+        logits_fobj << "\n";
+    }
+#endif  // end TEST_DEBUG
+
+    // current offset in decoder frames;
+    // it is passed to the next forward_encoder_chunk_ call as `offset`
+    offset_ += chunk_out.shape()[1];
+
+    // collects encoder outs.
+    VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
+    encoder_outs_.push_back(chunk_out);
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_logits_list",
+                               std::ios_base::app | std::ios_base::out);
+        path << offset_ - encoder_outs_[0].shape()[1];
+        std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
+        CHECK(logits_out_fobj.is_open());
+        logits_out_fobj << encoder_outs_[0].shape()[0] << " "
+                        << encoder_outs_[0].shape()[1] << " "
+                        << encoder_outs_[0].shape()[2] << "\n";
+        const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
+        logits_out_fobj << encoder_outs_ptr << std::endl;
+        for (int i = 0; i < encoder_outs_[0].numel(); i++) {
+            logits_out_fobj << encoder_outs_ptr[i] << " ";
+        }
+        logits_out_fobj << "\n";
+    }
+#endif  // end TEST_DEBUG
+
+#ifdef USE_GPU
+
+#error "Not implementation."
+
+#else
+    // compute ctc_activation == log_softmax
+    inputs.clear();
+    outputs.clear();
+    inputs.push_back(chunk_out);
+    CHECK(inputs.size() == 1);
+    outputs = ctc_activation_(inputs);
+    CHECK(outputs.size() == 1);
+    paddle::Tensor ctc_log_probs = outputs[0];
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_logprob",
+                               std::ios_base::app | std::ios_base::out);
+        path << offset_ - chunk_out.shape()[1];
+
+        std::ofstream logprob_fobj(path.str().c_str(), std::ios::out);
+        CHECK(logprob_fobj.is_open());
+        logprob_fobj << ctc_log_probs.shape()[0] << " "
+                     << ctc_log_probs.shape()[1] << " "
+                     << ctc_log_probs.shape()[2] << "\n";
+        const float* logprob_ptr = ctc_log_probs.data<float>();
+        for (int i = 0; i < ctc_log_probs.numel(); i++) {
+            logprob_fobj << logprob_ptr[i] << " ";
+            if ((i + 1) % ctc_log_probs.shape()[2] == 0) {
+                logprob_fobj << "\n";
+            }
+        }
+        logprob_fobj << "\n";
+    }
+#endif  // end TEST_DEBUG
+
+#endif  // end USE_GPU
+
+    // Copy to output, (B=1,T,D)
+    std::vector<int64_t> ctc_log_probs_shape = ctc_log_probs.shape();
+    CHECK(ctc_log_probs_shape.size() == 3);
+    int B = ctc_log_probs_shape[0];
+    CHECK(B == 1);
+    int T = ctc_log_probs_shape[1];
+    int D = ctc_log_probs_shape[2];
+    *vocab_dim = D;
+
+    float* ctc_log_probs_ptr = ctc_log_probs.data<float>();
+
+    // // vector<vector<float>>
+    // out_prob->resize(T);
+    // for (int i = 0; i < T; i++) {
+    //     (*out_prob)[i].resize(D);
+    //     float* dst_ptr = (*out_prob)[i].data();
+    //     float* src_ptr = ctc_log_probs_ptr + (i * D);
+    //     std::memcpy(dst_ptr, src_ptr, D * sizeof(float));
+    // }
+    // CHECK(std::is_same<float, kaldi::BaseFloat>::value);
+    out_prob->resize(T * D);
+    std::memcpy(
+        out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat));
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_logits_list_ctc",
+                               std::ios_base::app | std::ios_base::out);
+        path << offset_ - encoder_outs_[0].shape()[1];
+        std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
+        CHECK(logits_out_fobj.is_open());
+        logits_out_fobj << encoder_outs_[0].shape()[0] << " "
+                        << encoder_outs_[0].shape()[1] << " "
+                        << encoder_outs_[0].shape()[2] << "\n";
+        const float* encoder_outs_ptr = encoder_outs_[0].data<float>();
+        logits_out_fobj << encoder_outs_ptr << std::endl;
+        for (int i = 0; i < encoder_outs_[0].numel(); i++) {
+            logits_out_fobj << encoder_outs_ptr[i] << " ";
+        }
+        logits_out_fobj << "\n";
+    }
+#endif  // end TEST_DEBUG
+
+    return;
+}
+
+float U2Nnet::ComputePathScore(const paddle::Tensor& prob,
+                               const std::vector<int>& hyp,
+                               int eos) {
+    // sum the scores of path `hyp` followed by `eos` in `prob`
+    // prob (1, Umax, V)
+    // hyp (U,)
+    float score = 0.0f;
+    std::vector<int64_t> dims = prob.shape();
+    CHECK(dims.size() == 3);
+    VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2];
+    CHECK(dims[0] == 1);
+    int vocab_dim = static_cast<int>(dims[2]);
+    CHECK(dims[1] > static_cast<int64_t>(hyp.size()));  // rows 0..U are read
+    const float* prob_ptr = prob.data<float>();
+    for (size_t i = 0; i < hyp.size(); ++i) {
+        const float* row = prob_ptr + i * vocab_dim;
+        score += row[hyp[i]];
+    }
+    const float* row = prob_ptr + hyp.size() * vocab_dim;
+    score += row[eos];
+    return score;
+}
+
+// Score every hypothesis in `hyps` with the attention decoder.
+void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                float reverse_weight,
+                                std::vector<float>* rescoring_score) {
+#ifdef USE_PROFILING
+    RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1);
+#endif
+
+    CHECK(rescoring_score != nullptr);
+
+    int num_hyps = hyps.size();
+    rescoring_score->assign(num_hyps, 0.0f);  // zero-fill, drop stale scores
+
+    if (num_hyps == 0) return;
+    VLOG(2) << "num hyps: " << num_hyps;
+
+    if (encoder_outs_.size() == 0) {
+        // no encoder outs
+        std::cerr << "encoder_outs_.size() is zero. Please check it."
+                  << std::endl;
+        return;
+    }
+
+    // prepare input
+    paddle::Tensor hyps_lens =
+        paddle::zeros({num_hyps}, paddle::DataType::INT64);
+    int64_t* hyps_len_ptr = hyps_lens.mutable_data<int64_t>();
+    int max_hyps_len = 0;
+    for (int i = 0; i < num_hyps; ++i) {
+        int len = hyps[i].size() + 1;  // eos
+        max_hyps_len = std::max(max_hyps_len, len);
+        hyps_len_ptr[i] = static_cast<int64_t>(len);
+    }
+
+    paddle::Tensor hyps_tensor =
+        paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64);
+    int64_t* hyps_ptr = hyps_tensor.mutable_data<int64_t>();
+    for (int i = 0; i < num_hyps; ++i) {
+        const std::vector<int>& hyp = hyps[i];
+        int64_t* row = hyps_ptr + max_hyps_len * i;
+        row[0] = sos_;
+        for (size_t j = 0; j < hyp.size(); ++j) {
+            row[j + 1] = hyp[j];
+        }
+    }
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_logits_concat",
+                               std::ios_base::app | std::ios_base::out);
+        for (int j = 0; j < encoder_outs_.size(); j++) {
+            path << j;
+            std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out);
+            CHECK(logits_out_fobj.is_open());
+            logits_out_fobj << encoder_outs_[j].shape()[0] << " "
+                            << encoder_outs_[j].shape()[1] << " "
+                            << encoder_outs_[j].shape()[2] << "\n";
+            const float* encoder_outs_ptr = encoder_outs_[j].data<float>();
+            for (int i = 0; i < encoder_outs_[j].numel(); i++) {
+                logits_out_fobj << encoder_outs_ptr[i] << " ";
+            }
+            logits_out_fobj << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+    // forward attention decoder by hyps and corresponding encoder_outs_
+    paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1);
+    VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_out0",
+                               std::ios_base::app | std::ios_base::out);
+        std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
+        CHECK(encoder_out_fobj.is_open());
+
+        encoder_out_fobj << encoder_outs_[0].shape()[0] << " "
+                         << encoder_outs_[0].shape()[1] << " "
+                         << encoder_outs_[0].shape()[2] << "\n";
+        const float* enc_logprob_ptr = encoder_outs_[0].data<float>();
+
+        size_t size = encoder_outs_[0].numel();
+        for (int i = 0; i < size; i++) {
+            encoder_out_fobj << enc_logprob_ptr[i] << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("encoder_out",
+                               std::ios_base::app | std::ios_base::out);
+        std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out);
+        CHECK(encoder_out_fobj.is_open());
+
+        encoder_out_fobj << encoder_out.shape()[0] << " "
+                         << encoder_out.shape()[1] << " "
+                         << encoder_out.shape()[2] << "\n";
+        const float* enc_logprob_ptr = encoder_out.data<float>();
+
+        size_t size = encoder_out.numel();
+        for (int i = 0; i < size; i++) {
+            encoder_out_fobj << enc_logprob_ptr[i] << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+    std::vector<paddle::experimental::Tensor> inputs{
+        hyps_tensor, hyps_lens, encoder_out};
+    std::vector<paddle::Tensor> outputs = forward_attention_decoder_(inputs);
+    CHECK(outputs.size() == 2);
+
+    // (B, Umax, V)
+    paddle::Tensor probs = outputs[0];
+    std::vector<int64_t> probs_shape = probs.shape();
+    CHECK(probs_shape.size() == 3);
+    CHECK(probs_shape[0] == num_hyps);
+    CHECK(probs_shape[1] == max_hyps_len);
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("decoder_logprob",
+                               std::ios_base::app | std::ios_base::out);
+        std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out);
+        CHECK(dec_logprob_fobj.is_open());
+
+        dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " "
+                         << probs.shape()[2] << "\n";
+        const float* dec_logprob_ptr = probs.data<float>();
+
+        size_t size = probs.numel();
+        for (int i = 0; i < size; i++) {
+            dec_logprob_fobj << dec_logprob_ptr[i] << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("hyps_lens",
+                               std::ios_base::app | std::ios_base::out);
+        std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out);
+        CHECK(hyps_len_fobj.is_open());
+
+        const int64_t* hyps_lens_ptr = hyps_lens.data<int64_t>();
+
+        size_t size = hyps_lens.numel();
+        for (int i = 0; i < size; i++) {
+            hyps_len_fobj << hyps_lens_ptr[i] << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+#ifdef TEST_DEBUG
+    {
+        std::stringstream path("hyps_tensor",
+                               std::ios_base::app | std::ios_base::out);
+        std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out);
+        CHECK(hyps_tensor_fobj.is_open());
+
+        const int64_t* hyps_tensor_ptr = hyps_tensor.data<int64_t>();
+
+        size_t size = hyps_tensor.numel();
+        for (int i = 0; i < size; i++) {
+            hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n";
+        }
+    }
+#endif  // end TEST_DEBUG
+
+    paddle::Tensor r_probs = outputs[1];
+    std::vector<int64_t> r_probs_shape = r_probs.shape();
+    if (is_bidecoder_ && reverse_weight > 0) {
+        CHECK(r_probs_shape.size() == 3);
+        CHECK(r_probs_shape[0] == num_hyps);
+        CHECK(r_probs_shape[1] == max_hyps_len);
+    } else {
+        // placeholder r_probs: expected to be a 1-element tensor
+        CHECK(r_probs_shape.size() == 1);
+        CHECK(r_probs_shape[0] == 1) << r_probs_shape[0];
+    }
+
+    // compute rescoring score
+    using IntArray = paddle::experimental::IntArray;
+    std::vector<paddle::Tensor> probs_v =
+        paddle::experimental::split_with_num(probs, num_hyps, 0);
+    VLOG(2) << "split prob: " << probs_v.size() << " "
+            << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0]
+            << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2];
+    CHECK(static_cast<int>(probs_v.size()) == num_hyps)
+        << ": is " << probs_v.size() << " expect: " << num_hyps;
+
+    std::vector<paddle::Tensor> r_probs_v;
+    if (is_bidecoder_ && reverse_weight > 0) {
+        r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0);
+        CHECK(static_cast<int>(r_probs_v.size()) == num_hyps)
+            << "r_probs_v size: is " << r_probs_v.size()
+            << " expect: " << num_hyps;
+    }
+
+    for (int i = 0; i < num_hyps; ++i) {
+        const std::vector<int>& hyp = hyps[i];
+
+        // left-to-right decoder score
+        float score = 0.0f;
+        score = ComputePathScore(probs_v[i], hyp, eos_);
+
+        // right-to-left decoder score
+        float r_score = 0.0f;
+        if (is_bidecoder_ && reverse_weight > 0) {
+            std::vector<int> r_hyp(hyp.size());
+            std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin());
+            r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_);
+        }
+
+        // combined left-to-right and right-to-left score
+        (*rescoring_score)[i] =
+            score * (1 - reverse_weight) + r_score * reverse_weight;
+        VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score
+                << " reverse_weight: " << reverse_weight;
+    }
+}
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h
new file mode 100644
index 000000000..ddc85b45f
--- /dev/null
+++ b/speechx/speechx/nnet/u2_nnet.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "kaldi/matrix/kaldi-matrix.h"
+
+#include "kaldi/util/options-itf.h"
+#include "nnet/nnet_itf.h"
+
+#include "paddle/extension.h"
+#include "paddle/jit/all.h"
+#include "paddle/phi/api/all.h"
+
+namespace ppspeech {
+
+// Settings for a U2 model; Register() exposes them as command-line options.
+struct U2ModelOptions {
+    std::string model_path{};
+    int thread_num{1};
+    bool use_gpu{false};
+
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("model-path", &model_path, "model file path");
+        opts->Register("thread-num", &thread_num, "thread num");
+        opts->Register("use-gpu", &use_gpu, "if use gpu");
+    }
+};
+
+// Abstract base for U2 nnets: model constants, chunk config, feature cache.
+class U2NnetBase : public NnetInterface {
+  public:
+    virtual int context() const { return right_context_ + 1; }
+    virtual int right_context() const { return right_context_; }
+    virtual int subsampling_rate() const { return subsampling_rate_; }
+    virtual int eos() const { return eos_; }
+    virtual int sos() const { return sos_; }
+    virtual int is_bidecoder() const { return is_bidecoder_; }  // bool as int
+    // current offset; units as documented on offset_ below
+    virtual int offset() const { return offset_; }
+    virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; }
+    virtual void set_num_left_chunks(int num_left_chunks) {
+        num_left_chunks_ = num_left_chunks;
+    }
+    // `start` is false for the first chunk of a sentence and true afterwards.
+    virtual int num_frames_for_chunk(bool start) const;
+    // Subclasses must implement Copy(); presumably it clones the nnet.
+    virtual std::shared_ptr<NnetInterface> Copy() const = 0;
+    // Per-chunk forward; presumably delegates to ForwardEncoderChunkImpl.
+    virtual void ForwardEncoderChunk(
+        const std::vector<kaldi::BaseFloat>& chunk_feats,
+        int32 feat_dim,
+        std::vector<kaldi::BaseFloat>* ctc_probs,
+        int32* vocab_dim);
+    // Pure virtual: writes one score per hypothesis into rescoring_score.
+    virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                    float reverse_weight,
+                                    std::vector<float>* rescoring_score) = 0;
+
+  protected:
+    virtual void ForwardEncoderChunkImpl(
+        const std::vector<kaldi::BaseFloat>& chunk_feats,
+        int32 feat_dim,
+        std::vector<kaldi::BaseFloat>* ctc_probs,
+        int32* vocab_dim) = 0;
+    // presumably stores chunk_feats in cached_feats_ - confirm in the .cc
+    virtual void CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
+                              int32 feat_dim);
+
+  protected:
+    // model specification
+    int right_context_{0};
+    int subsampling_rate_{1};
+
+    int sos_{0};
+    int eos_{0};
+    // gates the right-to-left pass in U2Nnet::AttentionRescoring
+    bool is_bidecoder_{false};
+
+    int chunk_size_{16};  // num of decoder frames. If chunk_size > 0, streaming
+                          // case. Otherwise, non-streaming case
+    int num_left_chunks_{-1};  // -1 means all left chunks
+
+    // asr decoder state
+    int offset_{0};  // current offset in encoder output time stamp. Used by
+                     // position embedding.
+    std::vector<std::vector<float>> cached_feats_{};  // features cache
+};
+
+// Concrete U2NnetBase holding a paddle::jit::Layer model and its caches.
+class U2Nnet : public U2NnetBase {
+  public:
+    U2Nnet(const U2ModelOptions& opts);
+    U2Nnet(const U2Nnet& other);
+    // NnetInterface override operating on flat kaldi vectors.
+    void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
+                     int32 feature_dim,
+                     kaldi::Vector<kaldi::BaseFloat>* inferences,
+                     int32* inference_dim) override;
+
+    void Reset() override;
+
+    void Dim();
+    // presumably a path prefix shared by the exported model files - confirm
+    void LoadModel(const std::string& model_path_w_prefix);
+    void Warmup();
+
+    std::shared_ptr<paddle::jit::Layer> model() const { return model_; }
+
+    std::shared_ptr<NnetInterface> Copy() const override;
+
+    void ForwardEncoderChunkImpl(
+        const std::vector<kaldi::BaseFloat>& chunk_feats,
+        int32 feat_dim,
+        std::vector<kaldi::BaseFloat>* ctc_probs,
+        int32* vocab_dim) override;
+    // Score of `hyp` under `prob`; AttentionRescoring calls it for each pass.
+    float ComputePathScore(const paddle::Tensor& prob,
+                           const std::vector<int>& hyp,
+                           int eos);
+    // Final score: score * (1 - reverse_weight) + r_score * reverse_weight.
+    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                            float reverse_weight,
+                            std::vector<float>* rescoring_score) override;
+
+    // debug only: takes a mutable reference to an encoder output tensor
+    void FeedEncoderOuts(paddle::Tensor& encoder_out);
+
+  private:
+    U2ModelOptions opts_;
+
+    phi::Place dev_;
+    std::shared_ptr<paddle::jit::Layer> model_{nullptr};
+    std::vector<paddle::Tensor> encoder_outs_;
+    // transformer/conformer attention cache
+    paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
+    // conformer-only conv_module cache
+    paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0);
+    // presumably looked up on model_ in LoadModel - confirm in u2_nnet.cc
+    paddle::jit::Function forward_encoder_chunk_;
+    paddle::jit::Function forward_attention_decoder_;
+    paddle::jit::Function ctc_activation_;
+};
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc
new file mode 100644
index 000000000..1a1a5e02d
--- /dev/null
+++ b/speechx/speechx/nnet/u2_nnet_main.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet/u2_nnet.h"
+#include "base/common.h"
+#include "frontend/audio/assembler.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier");
+// main() CHECKs that both specifiers above and model_path are non-empty.
+DEFINE_string(model_path, "", "paddle nnet model");
+// The three int flags below set the chunk size and stride used in main().
+DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk");
+DEFINE_int32(receptive_field_length,
+             7,
+             "receptive field of two CNN(kernel=3) downsampling module.");
+DEFINE_int32(downsampling_rate,
+             4,
+             "two CNN(kernel=3) module downsampling rate.");
+DEFINE_double(acoustic_scale, 1.0, "acoustic scale");  // given to Decodable
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// Runs the U2 model over every utterance read from --feature_rspecifier,
+// feeding the features chunk by chunk, and writes the collected nnet
+// outputs to --nnet_prob_wspecifier as one matrix per utterance.
+//
+// Returns 0 when at least one utterance was written, 1 otherwise.
+int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
+
+    int32 num_done = 0, num_err = 0;
+
+    CHECK(FLAGS_feature_rspecifier.size() > 0);
+    CHECK(FLAGS_nnet_prob_wspecifier.size() > 0);
+    CHECK(FLAGS_model_path.size() > 0);
+    LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier;
+    LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier;
+    LOG(INFO) << "model path: " << FLAGS_model_path;
+    kaldi::SequentialBaseFloatMatrixReader feature_reader(
+        FLAGS_feature_rspecifier);
+    kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
+
+    ppspeech::U2ModelOptions model_opts;
+    model_opts.model_path = FLAGS_model_path;
+
+    // Frames per chunk, and the hop between consecutive chunk starts.
+    int32 chunk_size =
+        (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate +
+        FLAGS_receptive_field_length;
+    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
+    int32 receptive_field_length = FLAGS_receptive_field_length;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
+    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+
+    // decodable is built on top of the nnet and the raw_data feature cache.
+    std::shared_ptr<ppspeech::U2Nnet> nnet(new ppspeech::U2Nnet(model_opts));
+    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
+    kaldi::Timer timer;
+
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+        string utt = feature_reader.Key();
+        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+
+        int nframes = feature.NumRows();
+        int feat_dim = feature.NumCols();
+        raw_data->SetDim(feat_dim);
+        LOG(INFO) << "utt: " << utt;
+        LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim;
+
+        // NOTE: features are not padded; once an utterance has a full chunk,
+        // frames left over after its last full chunk are never sent.
+        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+        int32 frame_idx = 0;
+        std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
+        int32 ori_feature_len = feature.NumRows();
+
+        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            int32 feature_chunk_size = 0;
+            if (ori_feature_len > chunk_idx * chunk_stride) {
+                feature_chunk_size = std::min(
+                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+            }
+            if (feature_chunk_size < receptive_field_length) {
+                LOG(WARNING) << "utt: " << utt << " skip last "
+                             << feature_chunk_size << " frames, expect is "
+                             << receptive_field_length;
+                break;
+            }
+
+            // Size the chunk by the frames really available: an utterance
+            // shorter than chunk_size can still get here, and copying
+            // chunk_size rows from it would read past the end of `feature`.
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(feature_chunk_size *
+                                                          feat_dim);
+            int32 start = chunk_idx * chunk_stride;
+            for (int row_id = 0; row_id < feature_chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
+                    feature_chunk.Data() + row_id * feat_dim, feat_dim);
+
+                feature_chunk_row.CopyFromVec(feat_row);
+                ++start;
+            }
+
+            // feat to frontend pipeline cache
+            raw_data->Accept(feature_chunk);
+
+            // send data finish signal
+            if (chunk_idx == num_chunks - 1) {
+                raw_data->SetFinished();
+            }
+
+            // get nnet outputs
+            vector<kaldi::BaseFloat> prob;
+            while (decodable->FrameLikelihood(frame_idx, &prob)) {
+                kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
+                std::memcpy(vec_tmp.Data(),
+                            prob.data(),
+                            sizeof(kaldi::BaseFloat) * prob.size());
+                prob_vec.push_back(vec_tmp);
+                frame_idx++;
+            }
+        }
+
+        // after process one utt, then reset decoder state.
+        decodable->Reset();
+
+        if (prob_vec.size() == 0) {
+            // no nnet output was produced, so there is no matrix to write.
+            ++num_err;
+            LOG(WARNING) << " the nnet prob of " << utt << " is empty";
+            continue;
+        }
+
+        // write nnet output: one row per collected frame
+        kaldi::MatrixIndexT nrow = prob_vec.size();
+        kaldi::MatrixIndexT ncol = prob_vec[0].Dim();
+        LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol;
+        kaldi::Matrix<kaldi::BaseFloat> result(nrow, ncol);
+        for (int32 row_idx = 0; row_idx < nrow; ++row_idx) {
+            for (int32 col_idx = 0; col_idx < ncol; ++col_idx) {
+                result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
+            }
+        }
+        nnet_out_writer.Write(utt, result);
+
+        ++num_done;
+    }
+
+    double elapsed = timer.Elapsed();
+    LOG(INFO) << " cost:" << elapsed << " sec";
+
+    LOG(INFO) << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt
index 98b2f38b4..71b33daa9 100644
--- a/speechx/speechx/protocol/CMakeLists.txt
+++ b/speechx/speechx/protocol/CMakeLists.txt
@@ -1,3 +1 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
 add_subdirectory(websocket)
diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt
index c3454c399..0f73fd24c 100644
--- a/speechx/speechx/protocol/websocket/CMakeLists.txt
+++ b/speechx/speechx/protocol/websocket/CMakeLists.txt
@@ -1,4 +1,4 @@
-project(websocket)
+# project(websocket)
 
 add_library(websocket STATIC
   websocket_server.cc
diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt
index 95e865744..c1e875be1 100644
--- a/speechx/speechx/utils/CMakeLists.txt
+++ b/speechx/speechx/utils/CMakeLists.txt
@@ -1,4 +1,5 @@
 
 add_library(utils
   file_utils.cc
+  math.cc
 )
\ No newline at end of file
diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc
index 7c3192956..5087ac60b 100644
--- a/speechx/speechx/utils/math.cc
+++ b/speechx/speechx/utils/math.cc
@@ -38,11 +38,11 @@ float LogSumExp(float x, float y) {
 template <typename T>
 struct ValGreaterComp {
     bool operator()(const std::pair<T, int32_t>& lhs,
-                    const std::pair<T, int32_>& rhs) const {
+                    const std::pair<T, int32_t>& rhs) const {
         return lhs.first > rhs.first ||
                (lhs.first == rhs.first && lhs.second < rhs.second);
     }
-}
+};
 
 template <typename T>
 void TopK(const std::vector<T>& data,
diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh
new file mode 100755
index 000000000..3952988c6
--- /dev/null
+++ b/speechx/tools/venv.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -ex
+# Create ./venv with ${PYTHON} unless the directory already exists.
+PYTHON=python3.7
+test -d venv || virtualenv -p ${PYTHON} venv