diff --git a/runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc b/runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
index f54f21fa2..bf912af2e 100644
--- a/runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
+++ b/runtime/engine/asr/decoder/ctc_prefix_beam_search_decoder.cc
@@ -87,9 +87,9 @@ void CTCPrefixBeamSearch::AdvanceDecode(
 
         VLOG(1) << "num_frame_decoded_: " << num_frame_decoded_;
     }
-    VLOG(1) << "AdvanceDecode feat + forward  cost: " << feat_nnet_cost
+    VLOG(2) << "AdvanceDecode feat + forward  cost: " << feat_nnet_cost
             << " sec.";
-    VLOG(1) << "AdvanceDecode search  cost: " << search_cost << " sec.";
+    VLOG(2) << "AdvanceDecode search  cost: " << search_cost << " sec.";
 }
 
 static bool PrefixScoreCompare(
diff --git a/runtime/engine/asr/nnet/nnet_producer.cc b/runtime/engine/asr/nnet/nnet_producer.cc
index 431b70251..529fae656 100644
--- a/runtime/engine/asr/nnet/nnet_producer.cc
+++ b/runtime/engine/asr/nnet/nnet_producer.cc
@@ -46,7 +46,6 @@ void NnetProducer::Acceptlikelihood(
 
 bool NnetProducer::Read(std::vector<kaldi::BaseFloat>* nnet_prob) {
     bool flag = cache_.pop(nnet_prob);
-    VLOG(1) << "nnet cache_ size: " << cache_.size();
     return flag;
 }
 
diff --git a/runtime/engine/asr/nnet/u2_nnet.cc b/runtime/engine/asr/nnet/u2_nnet.cc
index 968b6ceea..9a09514e3 100644
--- a/runtime/engine/asr/nnet/u2_nnet.cc
+++ b/runtime/engine/asr/nnet/u2_nnet.cc
@@ -124,7 +124,15 @@ U2Nnet::U2Nnet(const U2Nnet& other) {
     offset_ = other.offset_;
 
     // copy model ptr
-    model_ = other.model_->Clone();
+    // HACK: Clone() is bypassed (was: model_ = other.model_->Clone();) and the
+    // model is reloaded from other.opts_.model_path instead; fix later.
+    #ifdef WITH_GPU
+        dev_ = phi::GPUPlace();
+    #else
+        dev_ = phi::CPUPlace();
+    #endif
+    paddle::jit::Layer model = paddle::jit::Load(other.opts_.model_path, dev_);
+    model_ = std::make_shared<paddle::jit::Layer>(std::move(model));
     ctc_activation_ = model_->Function("ctc_activation");
     subsampling_rate_ = model_->Attribute<int>("subsampling_rate");
     right_context_ = model_->Attribute<int>("right_context");
@@ -166,6 +174,7 @@ void U2Nnet::Reset() {
         std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));
 
     encoder_outs_.clear();
+    VLOG(1) << "FeedForward cost: " << cost_time_ << " sec. ";
     VLOG(3) << "u2nnet reset";
 }
 
@@ -185,8 +194,10 @@ void U2Nnet::FeedForward(const std::vector<BaseFloat>& features,
     std::vector<kaldi::BaseFloat> ctc_probs;
     ForwardEncoderChunkImpl(
         features, feature_dim, &out->logprobs, &out->vocab_dim);
-    VLOG(1) << "FeedForward cost: " << timer.Elapsed() << " sec. "
+    float forward_chunk_time = timer.Elapsed();
+    VLOG(1) << "FeedForward cost: " << forward_chunk_time << " sec. "
             << features.size() / feature_dim << " frames.";
+    cost_time_ += forward_chunk_time;
 }
 
 
diff --git a/runtime/engine/asr/nnet/u2_nnet.h b/runtime/engine/asr/nnet/u2_nnet.h
index 35a157078..dba5c55e0 100644
--- a/runtime/engine/asr/nnet/u2_nnet.h
+++ b/runtime/engine/asr/nnet/u2_nnet.h
@@ -113,8 +113,8 @@ class U2Nnet : public U2NnetBase {
     void EncoderOuts(
         std::vector<std::vector<kaldi::BaseFloat>>* encoder_out) const;
 
+    ModelOptions opts_; // hack, fix later
   private:
-    ModelOptions opts_;
 
     phi::Place dev_;
     std::shared_ptr<paddle::jit::Layer> model_{nullptr};
@@ -127,6 +127,7 @@ class U2Nnet : public U2NnetBase {
     paddle::jit::Function forward_encoder_chunk_;
     paddle::jit::Function forward_attention_decoder_;
     paddle::jit::Function ctc_activation_;
+    float cost_time_ = 0.0;
 };
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/runtime/engine/asr/recognizer/recognizer_main.cc b/runtime/engine/asr/recognizer/recognizer_main.cc
index 141a44f33..99b7b4dd8 100644
--- a/runtime/engine/asr/recognizer/recognizer_main.cc
+++ b/runtime/engine/asr/recognizer/recognizer_main.cc
@@ -88,7 +88,8 @@ int main(int argc, char* argv[]) {
 
         kaldi::Timer timer;
         recognizer_ptr->AttentionRescoring();
-        tot_attention_rescore_time += timer.Elapsed();
+        float rescore_time = timer.Elapsed();
+        tot_attention_rescore_time += rescore_time;
 
         std::string result = recognizer_ptr->GetFinalResult();
         if (result.empty()) {
@@ -101,7 +102,7 @@ int main(int argc, char* argv[]) {
         tot_decode_time += local_timer.Elapsed();
         LOG(INFO) << utt << " " << result;
         LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur
-                  << " cost: " << local_timer.Elapsed();
+                  << " cost: " << local_timer.Elapsed() << " rescore:" << rescore_time;
 
         result_writer.Write(utt, result);
 
diff --git a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
index 3a3544641..263bca519 100644
--- a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
+++ b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
@@ -67,7 +67,7 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
 #### RTF
 ```
 I0417 08:07:15.300631 75784 recognizer_main.cc:113] total wav duration is: 36108.9 sec
-I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:16353.7 sec
-I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:936.858 sec
-I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.4529
-```
\ No newline at end of file
+I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:10247.7 sec
+I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:908.228 sec
+I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.283
+```
diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh
new file mode 100755
index 000000000..1d60eaff4
--- /dev/null
+++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_fastdeploy.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Decode the wav list with recognizer_main using the ONNX model directory,
+# split across $nj jobs, then score the merged results with
+# utils/compute-wer.py and print the tail of the error report.
+set -e
+
+data=data
+exp=exp
+nj=20
+
+# NOTE(review): presumably parse_options.sh lets --data/--exp/--nj override the
+# defaults above -- confirm. Expansions below are quoted for that reason.
+. utils/parse_options.sh
+
+mkdir -p "$exp"
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/onnx_model/
+aishell_wav_scp=aishell_test.scp
+text=$data/test/text
+
+# Split the wav list into $nj parts; the per-job files are read from
+# $data/split${nj}/<job>/ below.
+./local/split_data.sh "$data" "$data/$aishell_wav_scp" "$aishell_wav_scp" "$nj"
+
+# JOB in the arguments is presumably replaced by run.pl with each job index.
+utils/run.pl JOB=1:"$nj" "$data/split${nj}/JOB/recognizer.fd.log" \
+recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --model_path="$model_dir" \
+    --word_symbol_table="$model_dir/unit.txt" \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --with_onnx_model=true \
+    --wav_rspecifier="scp:$data/split${nj}/JOB/${aishell_wav_scp}" \
+    --result_wspecifier="ark,t:$data/split${nj}/JOB/recognizer.fd.rsl.ark"
+
+
+# Merge the per-job result files, then write the error report next to them.
+cat "$data/split${nj}"/*/recognizer.fd.rsl.ark > "$exp/aishell.recognizer.fd.rsl"
+utils/compute-wer.py --char=1 --v=1 "$text" "$exp/aishell.recognizer.fd.rsl" > "$exp/aishell.recognizer.fd.err"
+echo "recognizer fd test have finished!!!"
+echo "please checkout in $exp/aishell.recognizer.fd.err"
+tail -n 7 "$exp/aishell.recognizer.fd.err"
diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh
new file mode 100755
index 000000000..fb0a19e88
--- /dev/null
+++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst_fastdeploy.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Decode the wav list with recognizer_main using the ONNX model directory and
+# a TLG.fst graph, split across $nj jobs, then score the merged results with
+# utils/compute-wer.py and print the tail of the error report. The graph
+# archive is downloaded when $graph does not exist yet.
+set -e
+
+data=data
+exp=exp
+nj=20
+
+. utils/parse_options.sh
+
+mkdir -p "$exp"
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/onnx_model/
+aishell_wav_scp=aishell_test.scp
+text=$data/test/text
+
+./local/split_data.sh "$data" "$data/$aishell_wav_scp" "$aishell_wav_scp" "$nj"
+
+lang_dir=./data/lang_test/
+graph=$lang_dir/TLG.fst
+word_table=$lang_dir/words.txt
+
+if [ ! -f "$graph" ]; then
+    # Download the prebuilt graph; to build it yourself, see local/run_build_tlg.sh
+    mkdir -p "$lang_dir"
+    pushd "$lang_dir"
+    wget -c https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip
+    unzip tlg.zip
+    popd
+fi
+
+# NOTE(review): --with_onnx_model=true is passed because --model_path is the
+# same onnx_model/ directory recognizer_fastdeploy.sh uses -- confirm.
+utils/run.pl JOB=1:"$nj" "$data/split${nj}/JOB/recognizer_wfst_fd.log" \
+recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --model_path="$model_dir" \
+    --graph_path="$graph" \
+    --word_symbol_table="$word_table" \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --with_onnx_model=true \
+    --wav_rspecifier="scp:$data/split${nj}/JOB/${aishell_wav_scp}" \
+    --rescoring_weight=0.0 \
+    --acoustic_scale=2 \
+    --result_wspecifier="ark,t:$data/split${nj}/JOB/result_recognizer_wfst_fd.ark"
+
+
+# Merge the per-job result files, then write the error report next to them.
+cat "$data/split${nj}"/*/result_recognizer_wfst_fd.ark > "$exp/aishell_recognizer_wfst_fd"
+utils/compute-wer.py --char=1 --v=1 "$text" "$exp/aishell_recognizer_wfst_fd" > "$exp/aishell.recognizer_wfst_fd.err"
+echo "recognizer test have finished!!!"
+echo "please checkout in $exp/aishell.recognizer_wfst_fd.err"
+tail -n 7 "$exp/aishell.recognizer_wfst_fd.err"