From 0ffe1f91143b0489fd38be90747afcbb5e61fedc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 28 Mar 2022 03:35:55 +0000 Subject: [PATCH 01/15] replace kaidi_fbank with paddleaudio --- examples/aishell/asr1/conf/preprocess.yaml | 9 ++-- paddlespeech/s2t/transform/spectrogram.py | 45 ++++++++++++++++++++ paddlespeech/s2t/transform/transformation.py | 1 + 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index f7f4c58d5..a20ff2ab3 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,8 +3,9 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_shift: 160 - win_length: 400 + n_frame_length: 25 + n_frame_shift: 10 + energy_floor: 0.0 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json @@ -23,7 +24,3 @@ process: n_mask: 2 inplace: true replace_with_zero: false - - - - diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 889cd349d..f779b07d4 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,8 +14,11 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +import paddle from python_speech_features import logfbank +import paddleaudio.compliance.kaldi as kaldi + def stft(x, n_fft, @@ -309,6 +312,48 @@ class IStft(): class LogMelSpectrogramKaldi(): + def __init__(self, + fs=16000, + n_mels=80, + n_frame_length=25, + n_frame_shift=10, + energy_floor=0.0, + dither=0.1): + self.fs = fs + self.n_mels = n_mels + self.n_frame_length = n_frame_length + self.n_frame_shift = n_frame_shift + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + dither = self.dither if train else 0.0 + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class LogMelSpectrogramKaldi_decay(): def __init__( self, fs=16000, diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 381b0cdc9..3b433cb0b 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -31,6 +31,7 @@ import_alias = dict( freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", From bc5ae43d3a5ecdb3fbcea92d18a054ddc260816a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 28 Mar 2022 04:49:10 +0000 Subject: [PATCH 02/15] restructure expand in 
length_regulator.py for paddle2onnx, test=tts --- examples/csmsc/tts3/README.md | 4 +++- .../t2s/modules/predictor/length_regulator.py | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 7b803526f..a31203f8a 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -227,7 +227,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) -The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). +The static model can be downloaded here: +[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). +[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 2472c413b..be788e6ed 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -73,15 +73,21 @@ class LengthRegulator(nn.Layer): batch_size, t_enc = paddle.shape(durations) slens = paddle.sum(durations, -1) t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported in paddle - if d >= 1: - M[i, k:k + d, j] = 1 - k += d + t_dec_1 = t_dec + 1 + flatten_duration = paddle.cumsum( + paddle.reshape(durations, [batch_size * t_enc])) + 1 + init = paddle.zeros(t_dec_1) + m_batch = batch_size * t_enc + M = paddle.zeros([t_dec_1, m_batch]) + for i in range(m_batch): + d = flatten_duration[i] + m = paddle.concat( + [paddle.ones(d), paddle.zeros(t_dec_1 - d)], axis=0) + M[:, i] = m - init + init = m + M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc]) + M = M[1:, :, :] + M = paddle.transpose(M, (1, 0, 2)) encodings = paddle.matmul(M, encodings) return encodings From e52fc08c586e1da29bb5ccf1ebaf6b639e21412d Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 28 Mar 2022 05:53:31 +0000 Subject: [PATCH 03/15] update readme, test=doc --- examples/csmsc/tts3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index a31203f8a..08bf2ee91 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -228,8 +228,8 @@ Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) The static model can be downloaded here: 
-[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). -[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) +- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) +- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: From ed490b66cb052c1308117e5e9703d94d8e43239a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 03:20:07 +0000 Subject: [PATCH 04/15] update spectrogram, test=asr --- examples/aishell/asr1/conf/preprocess.yaml | 5 ++-- paddlespeech/s2t/transform/spectrogram.py | 34 ++++++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index a20ff2ab3..d3992cb9f 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,9 +3,8 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_frame_length: 25 - n_frame_shift: 10 - energy_floor: 0.0 + n_shift: 160 + win_length: 400 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index f779b07d4..75787d92b 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -312,17 +312,33 @@ class IStft(): class LogMelSpectrogramKaldi(): - def __init__(self, - fs=16000, - n_mels=80, - n_frame_length=25, - n_frame_shift=10, - energy_floor=0.0, - dither=0.1): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + self.fs = fs self.n_mels = n_mels - self.n_frame_length = n_frame_length - self.n_frame_shift = n_frame_shift + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms self.energy_floor = energy_floor self.dither = dither From 284a5f26c20aa21401f0720ac594d7c5800ef629 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Tue, 29 Mar 2022 17:17:32 +0800 Subject: [PATCH 05/15] fix speechx doc typo --- speechx/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/README.md b/speechx/README.md index 07449f9f9..bb256712e 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -5,7 +5,7 @@ We develop under: * docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 * os - Ubuntu 16.04.7 LTS -* ** gcc/g++/gfortran - 8.2.0 ** +* gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 > We make sure all things work fun under docker, and recommend using it to develop 
and deploy. @@ -24,7 +24,7 @@ nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspac * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nviida-docker`. +* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvida-docker`. 2. Build `speechx` and `examples`. From 678c8369aa6e01063496d5a9992e893ad444012b Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Tue, 29 Mar 2022 17:38:50 +0800 Subject: [PATCH 06/15] fix typo --- speechx/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/README.md b/speechx/README.md index bb256712e..610b88a8f 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -24,7 +24,7 @@ nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspac * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvida-docker`. +* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`. 2. Build `speechx` and `examples`. From f47146af494f510428e9d14702f4b735c88843aa Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 12:09:54 +0000 Subject: [PATCH 07/15] add docstring, test=asr --- paddlespeech/s2t/transform/spectrogram.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 75787d92b..4a65548fe 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -355,7 +355,20 @@ class LogMelSpectrogramKaldi(): dither=self.dither, )) def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
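+                False, eval mode, dither is disabled (set to 0.0).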
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) mat = kaldi.fbank( waveform, From fa160d86e326e7f2a8260f07ccbaa68c28c9defc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 30 Mar 2022 07:57:52 +0000 Subject: [PATCH 08/15] update version of ctcdecoders, test=asr --- third_party/ctc_decoders/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py index 4a11b890d..ce2787e3f 100644 --- a/third_party/ctc_decoders/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -127,7 +127,7 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.1.1', + version='0.2.0', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", From d847fe29cfc83afbe5e4fc6f3717240516600eb7 Mon Sep 17 00:00:00 2001 From: WilliamZhang06 Date: Wed, 30 Mar 2022 16:44:04 +0800 Subject: [PATCH 09/15] added online asr engine , test=doc --- paddlespeech/s2t/frontend/audio.py | 12 + paddlespeech/s2t/frontend/speech.py | 16 + paddlespeech/server/bin/main.py | 10 +- paddlespeech/server/conf/application.yaml | 27 +- .../server/engine/asr/online/__init__.py | 13 + .../server/engine/asr/online/asr_engine.py | 355 ++++++++++++++++++ paddlespeech/server/engine/engine_factory.py | 3 + .../tests/asr/online/microphone_client.py | 154 ++++++++ .../tests/asr/online/websocket_client.py | 115 ++++++ paddlespeech/server/utils/buffer.py | 59 +++ paddlespeech/server/utils/vad.py | 79 ++++ paddlespeech/server/ws/__init__.py | 13 + paddlespeech/server/ws/api.py | 38 ++ paddlespeech/server/ws/asr_socket.py | 106 ++++++ 14 files changed, 996 insertions(+), 4 deletions(-) create mode 100644 paddlespeech/server/engine/asr/online/__init__.py create mode 100644 paddlespeech/server/engine/asr/online/asr_engine.py create mode 100644 paddlespeech/server/tests/asr/online/microphone_client.py create mode 100644 paddlespeech/server/tests/asr/online/websocket_client.py create mode 100644 paddlespeech/server/utils/buffer.py create mode 100644 paddlespeech/server/utils/vad.py create mode 100644 paddlespeech/server/ws/__init__.py create mode 100644 paddlespeech/server/ws/api.py create mode 100644 paddlespeech/server/ws/asr_socket.py diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index d0368cc8d..7f71e5dd9 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -208,6 +208,18 @@ class AudioSegment(): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + @classmethod + def from_pcm(cls, samples, sample_rate): + """Create audio segment from a byte string containing audio samples. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: numpy.ndarray + :param sample_rate: Audio sample rate. + :type sample_rate: int + :return: Audio segment instance. + :rtype: AudioSegment + """ + return cls(samples, sample_rate) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. 
diff --git a/paddlespeech/s2t/frontend/speech.py b/paddlespeech/s2t/frontend/speech.py index 8fd661c92..0340831a6 100644 --- a/paddlespeech/s2t/frontend/speech.py +++ b/paddlespeech/s2t/frontend/speech.py @@ -107,6 +107,22 @@ class SpeechSegment(AudioSegment): return cls(audio.samples, audio.sample_rate, transcript, tokens, token_ids) + @classmethod + def from_pcm(cls, samples, sample_rate, transcript, tokens=None, token_ids=None): + """Create speech segment from pcm on online mode + Args: + samples (numpy.ndarray): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_pcm(samples, sample_rate) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of speech segments together, both diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index de5282993..45ded33d8 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -17,7 +17,8 @@ import uvicorn from fastapi import FastAPI from paddlespeech.server.engine.engine_pool import init_engine_pool -from paddlespeech.server.restful.api import setup_router +from paddlespeech.server.restful.api import setup_router as setup_http_router +from paddlespeech.server.ws.api import setup_router as setup_ws_router from paddlespeech.server.utils.config import get_config app = FastAPI( @@ -35,7 +36,12 @@ def init(config): """ # init api api_list = list(engine.split("_")[0] for engine in config.engine_list) - api_router = setup_router(api_list) + if config.protocol == "websocket": + api_router = setup_ws_router(api_list) + elif config.protocol == "http": + api_router = setup_http_router(api_list) + else: + raise Exception("unsupported protocol") app.include_router(api_router) if not init_engine_pool(config): diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 2b1a05998..40de8e3b2 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -3,13 +3,18 @@ ################################################################################# # SERVER SETTING # ################################################################################# -host: 127.0.0.1 +host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +# protocol: 'http' +# engine_list: ['asr_python', 'tts_python', 'cls_python'] -engine_list: ['asr_python', 'tts_python', 'cls_python'] + +# websocket, http (only choose one). websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] ################################################################################# @@ -48,6 +53,24 @@ asr_inference: summary: True # False -> do not show predictor config +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + ################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: diff --git a/paddlespeech/server/engine/asr/online/__init__.py b/paddlespeech/server/engine/asr/online/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/asr/online/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py new file mode 100644 index 000000000..d5c1aa7bd --- /dev/null +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -0,0 +1,355 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +import os +import time +from typing import Optional +import pickle +import numpy as np +from numpy import float32 +import soundfile + +import paddle +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.speech import SpeechSegment +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.config import get_config +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model + +__all__ = ['ASREngine'] + +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + 'd5e076217cf60486519f72c217d21b9b', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + def _init_from_path(self, + model_type: str='wenetspeech', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + am_predictor_conf: dict=None): + """ + Init model and other resources from a specific path. + """ + + if cfg_path is None or am_model is None or am_params is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + + self.am_model = os.path.join(res_path, + pretrained_models[tag]['model']) + self.am_params = os.path.join(res_path, + pretrained_models[tag]['params']) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
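+        # Load the model yaml config; for DeepSpeech2 models also build the
+        # speech collator and text featurizer and download the n-gram language
+        # model used by the CTC beam search scorer.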
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + raise Exception("wrong type") + else: + raise Exception("wrong type") + + # AM predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + + # decoder + self.decoder = CTCDecoder( + odim=self.config.output_dim, # is in vocab + enc_n_units=self.config.rnn_layer_size * 2, + blank_id=self.config.blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=self.config.get('ctc_grad_norm_type', None)) + + # init decoder + cfg = self.config.decode + decode_batch_size = 1 # for online + self.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + # init state box + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def reset_decoder_and_chunk(self): + """reset decoder and chunk state for an new audio + """ + self.decoder.reset_decoder(batch_size=1) + # init state box, for new audio request + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str): + """decode one chunk + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + model_type (str): online model type + + Returns: + [type]: [description] + """ + if "deepspeech2online" in model_type : + input_names = self.am_predictor.get_input_names() + audio_handle = self.am_predictor.get_input_handle(input_names[0]) + audio_len_handle = self.am_predictor.get_input_handle(input_names[1]) + h_box_handle = self.am_predictor.get_input_handle(input_names[2]) + c_box_handle = self.am_predictor.get_input_handle(input_names[3]) + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(self.chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(self.chunk_state_h_box) + + c_box_handle.reshape(self.chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(self.chunk_state_c_box) + + output_names = self.am_predictor.get_output_names() + output_handle = 
self.am_predictor.get_output_handle(output_names[0]) + output_lens_handle = self.am_predictor.get_output_handle(output_names[1]) + output_state_h_handle = self.am_predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.am_predictor.get_output_handle( + output_names[3]) + + self.am_predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + self.chunk_state_h_box = output_state_h_handle.copy_to_cpu() + self.chunk_state_c_box = output_state_c_handle.copy_to_cpu() + + self.decoder.next(output_chunk_probs, output_chunk_lens) + trans_best, trans_beam = self.decoder.decode() + + return trans_best[0] + + elif "conformer" in model_type or "transformer" in model_type: + raise Exception("invalid model name") + else: + raise Exception("invalid model name") + + def _pcm16to32(self, audio): + """pcm int16 to float32 + + Args: + audio(numpy.array): numpy.int16 + + Returns: + audio(numpy.array): numpy.float32 + """ + if audio.dtype == np.int16: + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def extract_feat(self, samples, sample_rate): + """extract feat + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + # pcm16 -> pcm 32 + samples = self._pcm16to32(samples) + + # read audio + speech_segment = SpeechSegment.from_pcm( + samples, sample_rate, transcript=" ") + # audio augment + self.collate_fn_test.augmentation.transform_audio(speech_segment) + + # extract speech feature + spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize( + speech_segment, self.collate_fn_test.keep_transcription_text) + # CMVN spectrum + if self.collate_fn_test._normalizer: + spectrum = self.collate_fn_test._normalizer.apply(spectrum) + + # spectrum augment + audio = self.collate_fn_test.augmentation.transform_feature(spectrum) + + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + # audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + + x_chunk = audio.numpy() + x_chunk_lens = np.array([audio_len]) + + return x_chunk, x_chunk_lens + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = "" + self.executor = ASRServerExecutor() + self.config = config + + self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) + + logger.info("Initialize ASR server engine successfully.") + return True + + def preprocess(self, samples, sample_rate): + """preprocess + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate) + return x_chunk, x_chunk_lens + + def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1): + """run online engine + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + decoder_chunk_size(int) + """ + self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens, self.config.model_type) + + def postprocess(self): + """postprocess + """ + return self.output + + def reset(self): + """reset engine decoder and inference state + """ + self.executor.reset_decoder_and_chunk() + self.output = "" diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index c39c44cae..2a39fb79b 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -25,6 +25,9 @@ class EngineFactory(object): elif engine_name == 'asr' and engine_type == 'python': from paddlespeech.server.engine.asr.python.asr_engine import ASREngine return ASREngine() + elif engine_name == 'asr' and engine_type == 'online': + from paddlespeech.server.engine.asr.online.asr_engine import ASREngine + return ASREngine() elif engine_name == 'tts' and engine_type == 'inference': from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine return TTSEngine() diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py new file mode 100644 index 000000000..74d457c56 --- /dev/null +++ b/paddlespeech/server/tests/asr/online/microphone_client.py @@ -0,0 +1,154 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +record wave from the mic +""" + +import threading +import pyaudio +import wave +import logging +import asyncio +import websockets +import json +from signal import SIGINT, SIGTERM + + +class ASRAudioHandler(threading.Thread): + def __init__(self, + url="127.0.0.1", + port=8090): + threading.Thread.__init__(self) + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.fileName = "./output.wav" + self.chunk = 5120 + self.format = pyaudio.paInt16 + self.channels = 1 + self.rate = 16000 + self._running = True + self._frames = [] + self.data_backup = [] + + def startrecord(self): + """ + start a new thread to record wave + """ + threading._start_new_thread(self.recording, ()) + + def recording(self): + """ + recording wave + """ + self._running = True + self._frames = [] + p = pyaudio.PyAudio() + stream = p.open(format=self.format, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk) + while(self._running): + data = stream.read(self.chunk) + self._frames.append(data) + self.data_backup.append(data) + + stream.stop_stream() + stream.close() + p.terminate() + + def save(self): + """ + save wave data + """ + p = pyaudio.PyAudio() + wf = wave.open(self.fileName, 'wb') + wf.setnchannels(self.channels) + wf.setsampwidth(p.get_sample_size(self.format)) + wf.setframerate(self.rate) + wf.writeframes(b''.join(self.data_backup)) + wf.close() + p.terminate() + + def stoprecord(self): + """ + stop recording + """ + self._running = False + + async def run(self): + aa = input("是否开始录音? (y/n)") + if aa.strip() == "y": + self.startrecord() + logging.info("*" * 10 + "开始录音,请输入语音") + + async with websockets.connect(self.url) as ws: + # 发送开始指令 + audio_info = json.dumps({ + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send bytes data + logging.info("结束录音请: Ctrl + c。继续请按回车。") + try: + while True: + while len(self._frames) > 0: + await ws.send(self._frames.pop(0)) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + except asyncio.CancelledError: + # quit + # send finished + audio_info = json.dumps({ + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + self.stoprecord() + logging.info("*" * 10 + "录音结束") + self.save() + elif aa.strip() == "n": + exit() + else: + print("无效输入!") + exit() + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + + handler = ASRAudioHandler("127.0.0.1", 8090) + loop = asyncio.get_event_loop() + main_task = asyncio.ensure_future(handler.run()) + for signal in [SIGINT, SIGTERM]: + loop.add_signal_handler(signal, main_task.cancel) + try: + loop.run_until_complete(main_task) + finally: + loop.close() + + logging.info("asr websocket client finished") diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py new file mode 100644 index 000000000..d849ffeac --- /dev/null +++ b/paddlespeech/server/tests/asr/online/websocket_client.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +# -*- coding: UTF-8 -*- + +import argparse +import logging +import time +import os +import json +import wave +import numpy as np +import asyncio +import websockets +import soundfile + + +class ASRAudioHandler: + def __init__(self, + url="127.0.0.1", + port=8090): + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + + def read_wave(self, wavfile_path: str): + samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') + x_len = len(samples) + chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz + chunk_size = 80 * 16 #80ms, sample_rate = 16kHz + + if (x_len - chunk_size) % chunk_stride != 0: + padding_len_x = chunk_stride - (x_len - chunk_size + ) % chunk_stride + else: + padding_len_x = 0 + + padding = np.zeros( + (padding_len_x), dtype=samples.dtype) + padded_x = np.concatenate([samples, padding], axis=0) + + num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[start:end] + yield x_chunk + + async def run(self, wavfile_path: str): + logging.info("send a message to the server") + # 读取音频 + # self.read_wave() + # 发送 websocket 的 handshake 协议头 + async with websockets.connect(self.url) as ws: + # server 端已经接收到 handshake 协议头 + # 发送开始指令 + audio_info = json.dumps({ + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send chunk audio data to engine + for chunk_data in self.read_wave(wavfile_path): + await ws.send(chunk_data.tobytes()) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # finished + audio_info = json.dumps({ + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, sort_keys=True, indent=4, separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + +def main(args): + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + handler = ASRAudioHandler("127.0.0.1", 8090) + loop = asyncio.get_event_loop() + loop.run_until_complete(handler.run(args.wavfile)) + logging.info("asr websocket client finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--wavfile", + action="store", + help="wav file path ", + default="./16_audio.wav") + args = parser.parse_args() + + main(args) diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py new file mode 100644 index 000000000..4c1a3958a --- /dev/null +++ b/paddlespeech/server/utils/buffer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +class ChunkBuffer(object): + def __init__(self, + frame_duration_ms=80, + shift_ms=40, + sample_rate=16000, + sample_width=2): + self.sample_rate = sample_rate + self.frame_duration_ms = frame_duration_ms + self.shift_ms = shift_ms + self.remained_audio = b'' + self.sample_width = sample_width # int16 = 2; float32 = 4 + + def frame_generator(self, audio): + """Generates audio frames from PCM audio data. + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + Yields Frames of the requested duration. + """ + audio = self.remained_audio + audio + self.remained_audio = b'' + + n = int(self.sample_rate * + (self.frame_duration_ms / 1000.0) * self.sample_width) + shift_n = int(self.sample_rate * + (self.shift_ms / 1000.0) * self.sample_width) + offset = 0 + timestamp = 0.0 + duration = (float(n) / self.sample_rate) / self.sample_width + shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width + while offset + n <= len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += shift_duration + offset += shift_n + + self.remained_audio += audio[offset:] diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py new file mode 100644 index 000000000..e9b557171 --- /dev/null +++ b/paddlespeech/server/utils/vad.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
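+# webrtcvad-based voice activity detection for the streaming ASR server.
+#
+# VADAudio buffers raw PCM bytes, cuts them into fixed-length frames, and
+# vad_collector() yields only the frames classified as speech, emitting None
+# between utterances once the ratio of unvoiced frames in the ring buffer
+# exceeds the padding threshold.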
+import collections +import logging + +import webrtcvad + + +class VADAudio(): + def __init__(self, + aggressiveness, + rate, + frame_duration_ms, + sample_width=2, + padding_ms=200, + padding_ratio=0.9): + """Initializes VAD with given aggressivenes and sets up internal queues""" + self.vad = webrtcvad.Vad(aggressiveness) + self.rate = rate + self.sample_width = sample_width + self.frame_duration_ms = frame_duration_ms + self._frame_length = int(rate * (frame_duration_ms / 1000.0) * + self.sample_width) + self._buffer_queue = collections.deque() + self.ring_buffer = collections.deque(maxlen=padding_ms // + frame_duration_ms) + self._ratio = padding_ratio + self.triggered = False + + def add_audio(self, audio): + """Adds new audio to internal queue""" + for x in audio: + self._buffer_queue.append(x) + + def frame_generator(self): + """Generator that yields audio frames of frame_duration_ms""" + while len(self._buffer_queue) > self._frame_length: + frame = bytearray() + for _ in range(self._frame_length): + frame.append(self._buffer_queue.popleft()) + yield bytes(frame) + + def vad_collector(self): + """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. + Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. + Example: (frame, ..., frame, None, frame, ..., frame, None, ...) + |---utterence---| |---utterence---| + """ + for frame in self.frame_generator(): + is_speech = self.vad.is_speech(frame, self.rate) + if not self.triggered: + self.ring_buffer.append((frame, is_speech)) + num_voiced = len( + [f for f, speech in self.ring_buffer if speech]) + if num_voiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = True + for f, s in self.ring_buffer: + yield f + self.ring_buffer.clear() + else: + yield frame + self.ring_buffer.append((frame, is_speech)) + num_unvoiced = len( + [f for f, speech in self.ring_buffer if not speech]) + if num_unvoiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = False + yield None + self.ring_buffer.clear() diff --git a/paddlespeech/server/ws/__init__.py b/paddlespeech/server/ws/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/ws/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py new file mode 100644 index 000000000..10664d114 --- /dev/null +++ b/paddlespeech/server/ws/api.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from fastapi import APIRouter + +from paddlespeech.server.ws.asr_socket import router as asr_router + +_router = APIRouter() + + +def setup_router(api_list: List): + """setup router for fastapi + Args: + api_list (List): [asr, tts] + Returns: + APIRouter + """ + for api_name in api_list: + if api_name == 'asr': + _router.include_router(asr_router) + elif api_name == 'tts': + pass + else: + pass + + return _router diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py new file mode 100644 index 000000000..5cc9472c8 --- /dev/null +++ b/paddlespeech/server/ws/asr_socket.py @@ -0,0 +1,106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import traceback +from typing import Union +import random +import numpy as np +import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.asr.online.asr_engine import ASREngine +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.utils.buffer import ChunkBuffer +from paddlespeech.server.utils.vad import VADAudio + + +router = APIRouter() + +@router.websocket('/ws/asr') +async def websocket_endpoint(websocket: WebSocket): + + await websocket.accept() + + # init buffer + chunk_buffer = ChunkBuffer(sample_width=2) + # init vad + vad = VADAudio(2, 16000, 20) + + try: + while True: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + if "text" in message: + message = json.loads(message["text"]) + if 'signal' not in message: + resp = { + "status": "ok", + "message": "no valid json data" + } + await websocket.send_json(resp) + + if message['signal'] == 'start': + resp = { + "status": "ok", + "signal": "server_ready" + } + # do something at begining here + await websocket.send_json(resp) + elif message['signal'] == 'end': + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # reset single engine for an new connection + asr_engine.reset() + resp = { + "status": "ok", + "signal": "finished" + } + await websocket.send_json(resp) + break + else: + resp = { + "status": "ok", + "message": "no valid json data" + } + await websocket.send_json(resp) + elif "bytes" in message: + message = message["bytes"] + + # vad for input bytes audio + 
vad.add_audio(message) + message = b''.join(f for f in vad.vad_collector() if f is not None) + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + asr_results = "" + frames = chunk_buffer.frame_generator(message) + for frame in frames: + samples = np.frombuffer(frame.bytes, dtype=np.int16) + sample_rate = asr_engine.config.sample_rate + x_chunk, x_chunk_lens = asr_engine.preprocess(samples, sample_rate) + asr_engine.run(x_chunk, x_chunk_lens) + asr_results = asr_engine.postprocess() + + asr_results = asr_engine.postprocess() + resp = {'asr_results': asr_results} + + await websocket.send_json(resp) + except WebSocketDisconnect: + pass From 9602749a3cb166cf57e04af81c069f387364905a Mon Sep 17 00:00:00 2001 From: qingen Date: Wed, 30 Mar 2022 17:29:22 +0800 Subject: [PATCH 10/15] [vec][search] update client image url, test=doc fix #1608 --- demos/README.md | 1 + demos/README_cn.md | 1 + demos/audio_searching/docker-compose.yaml | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/demos/README.md b/demos/README.md index 4482aa191..36e93dbf1 100644 --- a/demos/README.md +++ b/demos/README.md @@ -4,6 +4,7 @@ The directory containes many speech applications in multi scenarios. +* audio searching - mass audio similarity retrieval * audio tagging - multi-label tagging of an audio file * automatic_video_subtitiles - generate subtitles from a video * metaverse - 2D AR with TTS diff --git a/demos/README_cn.md b/demos/README_cn.md index 242b4f070..add6e25f5 100644 --- a/demos/README_cn.md +++ b/demos/README_cn.md @@ -4,6 +4,7 @@ 该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo: +* 声音检索 - 海量音频相似性检索。 * 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。 * 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。 * 元宇宙 - 基于语音合成的 2D 增强现实。 diff --git a/demos/audio_searching/docker-compose.yaml b/demos/audio_searching/docker-compose.yaml index 8916e76fd..16ac054d6 100644 --- a/demos/audio_searching/docker-compose.yaml +++ b/demos/audio_searching/docker-compose.yaml @@ -64,7 +64,7 @@ services: webclient: container_name: audio-webclient - image: qingen1/paddlespeech-audio-search-client:2.3 + image: paddlepaddle/paddlespeech-audio-search-client:2.3 networks: app_net: ipv4_address: 172.16.23.13 From aa0cb236e2f78f3cb1b494650fca4c3a867591bf Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 31 Mar 2022 03:20:05 +0000 Subject: [PATCH 11/15] fix openfst patch --- speechx/cmake/external/openfst.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/external/openfst.cmake index dc9cdff6c..9acf530a1 100644 --- a/speechx/cmake/external/openfst.cmake +++ b/speechx/cmake/external/openfst.cmake @@ -13,7 +13,7 @@ ExternalProject_Add(openfst "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" "LIBS=-lgflags_nothreads -lglog -lpthread" - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} BUILD_COMMAND make -j 4 ) link_directories(${openfst_PREFIX_DIR}/lib) From 21c4132edacd436ceb2709f64ae5cdf2f39b3bd9 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Thu, 31 Mar 2022 15:16:32 +0800 Subject: [PATCH 12/15] Update paddlespeech_client.py --- paddlespeech/server/bin/paddlespeech_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 40f17c63c..413f00872 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -150,7 +150,7 @@ class TTSClientExecutor(BaseExecutor): res = requests.post(url, json.dumps(request)) response_dict = res.json() - if not output: + if output is not None: self.postprocess(response_dict["result"]["audio"], output) return res From f17347f454fd1000ce235e74713c443777c7a79a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 31 Mar 2022 07:21:28 +0000 Subject: [PATCH 13/15] glog to stderr --- demos/audio_searching/src/encode.py | 4 ---- speechx/examples/CMakeLists.txt | 2 ++ speechx/examples/README.md | 3 ++- speechx/examples/decoder/offline_decoder_main.cc | 1 + speechx/examples/decoder/run.sh | 5 ++++- speechx/examples/feat/linear_spectrogram_main.cc | 10 ++++++++++ speechx/examples/feat/run.sh | 1 + 7 files changed, 20 insertions(+), 6 deletions(-) diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py index cf5f29a40..358057841 100644 --- a/demos/audio_searching/src/encode.py +++ b/demos/audio_searching/src/encode.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os - -import librosa import numpy as np -from config import DEFAULT_TABLE from logs import LOGGER from paddlespeech.cli import VectorExecutor diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt index ef0a72b88..7f1543c25 100644 --- a/speechx/examples/CMakeLists.txt +++ b/speechx/examples/CMakeLists.txt @@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) add_subdirectory(feat) add_subdirectory(nnet) add_subdirectory(decoder) + +add_subdirectory(glog) \ No newline at end of file diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 941c4272d..705ca2006 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,8 +1,9 @@ # Examples -* decoder - online decoder to work as offline +* glog - glog usage * feat - mfcc, linear * nnet - ds2 nn +* decoder - online decoder to work as offline ## How to run diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index 44127c73b..eccd7c099 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -63,6 +63,7 @@ int main(int argc, char* argv[]) { int32 chunk_size = 35; decoder.InitDecoder(); + LOG(INFO) << "chunk size: " << chunk_size; for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh index fc5e91824..1e5a678c2 100755 --- a/speechx/examples/decoder/run.sh +++ b/speechx/examples/decoder/run.sh @@ -25,6 +25,9 @@ model_dir=../paddle_asr_model feat_wspecifier=./feats.ark cmvn=./cmvn.ark + +export GLOG_logtostderr=1 + # 3. 
 # 3. run feat
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
@@ -37,4 +40,4 @@ offline_decoder_main \
     --model_path=$model_dir/avg_1.jit.pdmodel \
     --param_path=$model_dir/avg_1.jit.pdparams \
     --dict_file=$model_dir/vocab.txt \
-    --lm_path=$model_dir/avg_1.jit.klm
\ No newline at end of file
+    --lm_path=$model_dir/avg_1.jit.klm
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index 9ed4d6f93..a27db56fd 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -25,6 +25,8 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
 
+#include
+
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@@ -172,6 +174,9 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
+    LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms;
+
     ppspeech::DecibelNormalizerOptions db_norm_opt;
     std::unique_ptr base_feature_extractor(
         new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
@@ -190,6 +195,11 @@ int main(int argc, char* argv[]) {
     int sample_rate = 16000;
     int chunk_sample_size = streaming_chunk * sample_rate;
 
+    LOG(INFO) << "sr:" << sample_rate;
+    LOG(INFO) << "chunk size (s):" << streaming_chunk;
+    LOG(INFO) << "chunk size (sample):" << chunk_sample_size;
+
+
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
diff --git a/speechx/examples/feat/run.sh b/speechx/examples/feat/run.sh
index bd21bd7f4..29c49d325 100755
--- a/speechx/examples/feat/run.sh
+++ b/speechx/examples/feat/run.sh
@@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark
 
 # 3. run feat
+export GLOG_logtostderr=1
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
     --feature_wspecifier=ark,t:$feat_wspecifier \

From 2ec8d608bf1ad5b0be9c36dc4339702271c27a6e Mon Sep 17 00:00:00 2001
From: WilliamZhang06
Date: Thu, 31 Mar 2022 16:06:16 +0800
Subject: [PATCH 14/15] fixed comments, test=doc

---
 paddlespeech/server/conf/application.yaml    | 13 ++--
 paddlespeech/server/conf/ws_application.yaml | 51 ++++++++++++++++
 .../tests/asr/online/microphone_client.py    | 61 +++++++++++--------
 .../tests/asr/online/websocket_client.py     | 56 ++++++++---------
 paddlespeech/server/utils/vad.py             |  7 +--
 paddlespeech/server/ws/asr_socket.py         | 48 +++++++--------
 6 files changed, 142 insertions(+), 94 deletions(-)
 create mode 100644 paddlespeech/server/conf/ws_application.yaml

diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 40de8e3b2..849349c2d 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -3,18 +3,15 @@
 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
-host: 0.0.0.0
+host: 127.0.0.1
 port: 8090
 
 # The task format in the engin_list is: _
 # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-# protocol: 'http'
-# engine_list: ['asr_python', 'tts_python', 'cls_python']
-
-
-# websocket, http (only choose one). websocket only support online engine type.
-protocol: 'websocket'
-engine_list: ['asr_online']
+# protocol = ['websocket', 'http'] (only one can be selected).
+# http only support offline engine type.
+protocol: 'http'
+engine_list: ['asr_python', 'tts_python', 'cls_python']
 
 
 #################################################################################
diff --git a/paddlespeech/server/conf/ws_application.yaml b/paddlespeech/server/conf/ws_application.yaml
new file mode 100644
index 000000000..ef23593ed
--- /dev/null
+++ b/paddlespeech/server/conf/ws_application.yaml
@@ -0,0 +1,51 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 0.0.0.0
+port: 8091
+
+# The task format in the engin_list is: _
+# task choices = ['asr_online', 'tts_online']
+# protocol = ['websocket', 'http'] (only one can be selected).
+# websocket only support online engine type.
+protocol: 'websocket'
+engine_list: ['asr_online']
+
+
+#################################################################################
+#                                ENGINE CONFIG                                  #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: online #######################
+asr_online:
+    model_type: 'deepspeech2online_aishell'
+    am_model: # the pdmodel file of am static model [optional]
+    am_params: # the pdiparams file of am static model [optional]
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path:
+    decode_method:
+    force_yes: True
+
+    am_predictor_conf:
+        device: # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    chunk_buffer_conf:
+        frame_duration_ms: 80
+        shift_ms: 40
+        sample_rate: 16000
+        sample_width: 2
+
+    vad_conf:
+        aggressiveness: 2
+        sample_rate: 16000
+        frame_duration_ms: 20
+        sample_width: 2
+        padding_ms: 200
+        padding_ratio: 0.9
diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py
index 74d457c56..2ceaf6d03 100644
--- a/paddlespeech/server/tests/asr/online/microphone_client.py
+++ b/paddlespeech/server/tests/asr/online/microphone_client.py
@@ -11,25 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 record wave from the mic
 """
-
+import asyncio
+import json
+import logging
 import threading
-import pyaudio
 import wave
-import logging
-import asyncio
+from signal import SIGINT
+from signal import SIGTERM
+
+import pyaudio
 import websockets
-import json
-from signal import SIGINT, SIGTERM
 
 
 class ASRAudioHandler(threading.Thread):
-    def __init__(self,
-                 url="127.0.0.1",
-                 port=8090):
+    def __init__(self, url="127.0.0.1", port=8091):
         threading.Thread.__init__(self)
         self.url = url
         self.port = port
@@ -56,12 +54,13 @@ class ASRAudioHandler(threading.Thread):
         self._running = True
         self._frames = []
         p = pyaudio.PyAudio()
-        stream = p.open(format=self.format,
-                        channels=self.channels,
-                        rate=self.rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-        while(self._running):
+        stream = p.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            input=True,
+            frames_per_buffer=self.chunk)
+        while (self._running):
             data = stream.read(self.chunk)
             self._frames.append(data)
             self.data_backup.append(data)
@@ -97,11 +96,15 @@ class ASRAudioHandler(threading.Thread):
 
         async with websockets.connect(self.url) as ws:
             # 发送开始指令
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "start",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -117,11 +120,15 @@ class ASRAudioHandler(threading.Thread):
             except asyncio.CancelledError:
                 # quit
                 # send finished
-                audio_info = json.dumps({
-                    "name": "test.wav",
-                    "signal": "end",
-                    "nbest": 5
-                }, sort_keys=True, indent=4, separators=(',', ': '))
+                audio_info = json.dumps(
+                    {
+                        "name": "test.wav",
+                        "signal": "end",
+                        "nbest": 5
+                    },
+                    sort_keys=True,
+                    indent=4,
+                    separators=(',', ': '))
                 await ws.send(audio_info)
                 msg = await ws.recv()
                 logging.info("receive msg={}".format(msg))
@@ -141,7 +148,7 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     logging.info("asr websocket client start")
 
-    handler = ASRAudioHandler("127.0.0.1", 8090)
+    handler = ASRAudioHandler("127.0.0.1", 8091)
     loop = asyncio.get_event_loop()
     main_task = asyncio.ensure_future(handler.run())
    for signal in [SIGINT, SIGTERM]:
diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py
index d849ffeac..58b1a452c 100644
--- a/paddlespeech/server/tests/asr/online/websocket_client.py
+++ b/paddlespeech/server/tests/asr/online/websocket_client.py
@@ -11,26 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
-
 import argparse
-import logging
-import time
-import os
+import asyncio
 import json
-import wave
+import logging
+
 import numpy as np
-import asyncio
-import websockets
 import soundfile
+import websockets
 
 
 class ASRAudioHandler:
-    def __init__(self,
-                 url="127.0.0.1",
-                 port=8090):
+    def __init__(self, url="127.0.0.1", port=8090):
         self.url = url
         self.port = port
         self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
@@ -38,17 +32,15 @@ class ASRAudioHandler:
     def read_wave(self, wavfile_path: str):
         samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
         x_len = len(samples)
-        chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz
-        chunk_size = 80 * 16 #80ms, sample_rate = 16kHz
+        chunk_stride = 40 * 16  #40ms, sample_rate = 16kHz
+        chunk_size = 80 * 16  #80ms, sample_rate = 16kHz
 
         if (x_len - chunk_size) % chunk_stride != 0:
-            padding_len_x = chunk_stride - (x_len - chunk_size
-                                            ) % chunk_stride
+            padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
         else:
            padding_len_x = 0
 
-        padding = np.zeros(
-            (padding_len_x), dtype=samples.dtype)
+        padding = np.zeros((padding_len_x), dtype=samples.dtype)
         padded_x = np.concatenate([samples, padding], axis=0)
 
         num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
@@ -68,11 +60,15 @@ class ASRAudioHandler:
         async with websockets.connect(self.url) as ws:
             # server 端已经接收到 handshake 协议头
             # 发送开始指令
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "start",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -84,11 +80,15 @@ class ASRAudioHandler:
                 logging.info("receive msg={}".format(msg))
 
             # finished
-            audio_info = json.dumps({
-                "name": "test.wav",
-                "signal": "end",
-                "nbest": 5
-            }, sort_keys=True, indent=4, separators=(',', ': '))
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "end",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
             logging.info("receive msg={}".format(msg))
@@ -97,7 +97,7 @@ def main(args):
     logging.basicConfig(level=logging.INFO)
     logging.info("asr websocket client start")
-    handler = ASRAudioHandler("127.0.0.1", 8090)
+    handler = ASRAudioHandler("127.0.0.1", 8091)
     loop = asyncio.get_event_loop()
     loop.run_until_complete(handler.run(args.wavfile))
     logging.info("asr websocket client finished")
diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py
index e9b557171..a2dcf68b8 100644
--- a/paddlespeech/server/utils/vad.py
+++ b/paddlespeech/server/utils/vad.py
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
-import logging
 
 import webrtcvad
 
 
 class VADAudio():
     def __init__(self,
-                 aggressiveness,
-                 rate,
-                 frame_duration_ms,
+                 aggressiveness=2,
+                 rate=16000,
+                 frame_duration_ms=20,
                  sample_width=2,
                  padding_ms=200,
                  padding_ratio=0.9):
diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py
index 5cc9472c8..ea19816b6 100644
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -11,35 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import base64
-import traceback
-from typing import Union
-import random
-import numpy as np
 import json
+import numpy as np
 from fastapi import APIRouter
 from fastapi import WebSocket
 from fastapi import WebSocketDisconnect
 from starlette.websockets import WebSocketState as WebSocketState
-from paddlespeech.server.engine.asr.online.asr_engine import ASREngine
 from paddlespeech.server.engine.engine_pool import get_engine_pool
 from paddlespeech.server.utils.buffer import ChunkBuffer
 from paddlespeech.server.utils.vad import VADAudio
-
 router = APIRouter()
+
 
 @router.websocket('/ws/asr')
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
+    engine_pool = get_engine_pool()
+    asr_engine = engine_pool['asr']
     # init buffer
-    chunk_buffer = ChunkBuffer(sample_width=2)
+    chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
+    chunk_buffer = ChunkBuffer(
+        sample_rate=chunk_buffer_conf['sample_rate'],
+        sample_width=chunk_buffer_conf['sample_width'])
     # init vad
-    vad = VADAudio(2, 16000, 20)
+    vad_conf = asr_engine.config.vad_conf
+    vad = VADAudio(
+        aggressiveness=vad_conf['aggressiveness'],
+        rate=vad_conf['sample_rate'],
+        frame_duration_ms=vad_conf['frame_duration_ms'])
 
     try:
         while True:
@@ -50,17 +54,11 @@ async def websocket_endpoint(websocket: WebSocket):
             if "text" in message:
                 message = json.loads(message["text"])
                 if 'signal' not in message:
-                    resp = {
-                        "status": "ok",
-                        "message": "no valid json data"
-                    }
+                    resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
 
                 if message['signal'] == 'start':
-                    resp = {
-                        "status": "ok",
-                        "signal": "server_ready"
-                    }
+                    resp = {"status": "ok", "signal": "server_ready"}
                     # do something at begining here
                     await websocket.send_json(resp)
                 elif message['signal'] == 'end':
@@ -68,24 +66,19 @@ async def websocket_endpoint(websocket: WebSocket):
                     asr_engine = engine_pool['asr']
                     # reset single engine for an new connection
                     asr_engine.reset()
-                    resp = {
-                        "status": "ok",
-                        "signal": "finished"
-                    }
+                    resp = {"status": "ok", "signal": "finished"}
                     await websocket.send_json(resp)
                     break
                 else:
-                    resp = {
-                        "status": "ok",
-                        "message": "no valid json data"
-                    }
+                    resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
 
             elif "bytes" in message:
                 message = message["bytes"]

                 # vad for input bytes audio
                 vad.add_audio(message)
-                message = b''.join(f for f in vad.vad_collector() if f is not None)
+                message = b''.join(f for f in vad.vad_collector()
+                                   if f is not None)
 
                 engine_pool = get_engine_pool()
                 asr_engine = engine_pool['asr']
@@ -94,7 +87,8 @@ async def websocket_endpoint(websocket: WebSocket):
                 for frame in frames:
                     samples = np.frombuffer(frame.bytes, dtype=np.int16)
                     sample_rate = asr_engine.config.sample_rate
-                    x_chunk, x_chunk_lens = asr_engine.preprocess(samples, sample_rate)
+                    x_chunk, x_chunk_lens = asr_engine.preprocess(samples,
+                                                                  sample_rate)
                     asr_engine.run(x_chunk, x_chunk_lens)
                     asr_results = asr_engine.postprocess()
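Before the next patch, a quick worked example (not part of any patch in this series) of how the streaming parameters above fit together: the `chunk_buffer_conf` values in `ws_application.yaml` map directly onto the `chunk_size = 80 * 16` and `chunk_stride = 40 * 16` constants hard-coded in the test clients.

```python
# Worked example: samples and bytes per websocket audio chunk, derived from
# chunk_buffer_conf (frame_duration_ms: 80, shift_ms: 40, sample_rate: 16000,
# sample_width: 2). Values match the constants in websocket_client.py.
SAMPLE_RATE = 16000      # Hz
SAMPLE_WIDTH = 2         # bytes per int16 sample
FRAME_DURATION_MS = 80   # chunk_buffer_conf.frame_duration_ms
SHIFT_MS = 40            # chunk_buffer_conf.shift_ms

chunk_size = SAMPLE_RATE * FRAME_DURATION_MS // 1000   # 1280 samples == 80 * 16
chunk_stride = SAMPLE_RATE * SHIFT_MS // 1000          # 640 samples  == 40 * 16
bytes_per_chunk = chunk_size * SAMPLE_WIDTH            # 2560 bytes per send

assert chunk_size == 80 * 16 and chunk_stride == 40 * 16
print(chunk_size, chunk_stride, bytes_per_chunk)       # 1280 640 2560
```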
From cb66b742ab75287e35915e13b59cf71bf42d4146 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 31 Mar 2022 09:03:01 +0000
Subject: [PATCH 15/15] more comment

---
 .../examples/decoder/offline_decoder_main.cc | 41 ++++++++++++++-----
 speechx/examples/decoder/run.sh              |  2 +-
 speechx/examples/feat/feature-mfcc-test.cc   |  1 -
 .../examples/feat/linear_spectrogram_main.cc | 38 +++++++++--------
 speechx/speechx/frontend/feature_cache.h     |  4 +-
 speechx/speechx/frontend/raw_audio.h         |  3 +-
 6 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc
index eccd7c099..3a94cc947 100644
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -22,11 +22,12 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
 
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(chunk_size, 35, "feat chunk size");
 
 using kaldi::BaseFloat;
@@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
     std::string model_params = FLAGS_param_path;
     std::string dict_file = FLAGS_dict_file;
     std::string lm_path = FLAGS_lm_path;
+    int32 chunk_size = FLAGS_chunk_size;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
 
     int32 num_done = 0, num_err = 0;
 
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
+    // frontend + nnet is decodable
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.params_path = model_params;
@@ -60,34 +63,50 @@ int main(int argc, char* argv[]) {
         new ppspeech::RawDataCache());
     std::shared_ptr decodable(
         new ppspeech::Decodable(nnet, raw_data));
+    LOG(INFO) << "Init decodeable.";
 
-    int32 chunk_size = 35;
+    // init decoder
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+    LOG(INFO) << "Init decoder.";
+
     decoder.InitDecoder();
-    LOG(INFO) << "chunk size: " << chunk_size;
 
     for (; !feature_reader.Done(); feature_reader.Next()) {
         string utt = feature_reader.Key();
         const kaldi::Matrix feature = feature_reader.Value();
+        LOG(INFO) << "utt: " << utt;
+
+        // feat dim
         raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "dim: " << raw_data->Dim();
+
         int32 row_idx = 0;
         int32 num_chunks = feature.NumRows() / chunk_size;
+        LOG(INFO) << "n chunks: " << num_chunks;
         for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            // feat chunk
             kaldi::Vector feature_chunk(chunk_size *
                                         feature.NumCols());
             for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector tmp(feature, row_idx);
+                kaldi::SubVector feat_one_row(feature,
+                                              row_idx);
                 kaldi::SubVector f_chunk_tmp(
                     feature_chunk.Data() + row_id * feature.NumCols(),
                     feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                f_chunk_tmp.CopyFromVec(feat_one_row);
                 row_idx++;
             }
+            // feed to raw cache
             raw_data->Accept(feature_chunk);
             if (chunk_idx == num_chunks - 1) {
                 raw_data->SetFinished();
             }
+            // decode step
             decoder.AdvanceDecode(decodable);
         }
+
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh
index 1e5a678c2..ddda89702 100755
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -28,7 +28,7 @@ cmvn=./cmvn.ark
 
 export GLOG_logtostderr=1
 
-# 3. run feat
+# 3. gen linear feat
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc
index ae32aba9e..48a9e1c29 100644
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@
 
 using namespace kaldi;
 
-
 static void UnitTestReadWave() {
     std::cout << "=== UnitTestReadWave() ===\n";
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index a27db56fd..cde78c4d3 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -151,7 +151,7 @@ void WriteMatrix() {
         cmvn_stats(1, idx) = variance_[idx];
     }
     cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
 }
 
 int main(int argc, char* argv[]) {
@@ -163,51 +163,56 @@ int main(int argc, char* argv[]) {
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
     WriteMatrix();
 
-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
-    // window -->linear_spectrogram --> cmvn
+    int32 num_done = 0, num_err = 0;
+
+    // feature pipeline: wave cache --> decibel_normalizer --> hanning
+    // window -->linear_spectrogram --> global cmvn -> feat cache
+
     // std::unique_ptr data_source(new
     // ppspeech::RawDataCache());
     std::unique_ptr data_source(
         new ppspeech::RawAudioCache());
 
+    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    std::unique_ptr db_norm(
+        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms;
-    LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms;
-
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
-    std::unique_ptr base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
 
     std::unique_ptr linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt,
-                                        std::move(base_feature_extractor)));
+        new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
 
     std::unique_ptr cmvn(
         new ppspeech::CMVN(FLAGS_cmvn_write_path,
                            std::move(linear_spectrogram)));
 
     ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
-    float streaming_chunk = 0.36;
     int sample_rate = 16000;
+    float streaming_chunk = 0.36;
     int chunk_sample_size = streaming_chunk * sample_rate;
-
-    LOG(INFO) << "sr:" << sample_rate;
-    LOG(INFO) << "chunk size (s):" << streaming_chunk;
-    LOG(INFO) << "chunk size (sample):" << chunk_sample_size;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
 
         int32 this_channel = 0;
         kaldi::SubVector waveform(wave_data.Data(), this_channel);
         int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
         int sample_offset = 0;
         std::vector> feats;
         int feature_rows = 0;
@@ -219,6 +224,7 @@ int main(int argc, char* argv[]) {
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
+
             kaldi::Vector features;
             feature_cache.Accept(wav_chunk);
             if (cur_chunk_size < chunk_sample_size) {
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index b6bbdf3cb..f52b9b0f6 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
     // Feed feats or waves
     virtual void Accept(const kaldi::VectorBase& inputs);
 
-    // feats dim = num_frames * feature_dim
+    // feats size = num_frames * feat_dim
     virtual bool Read(kaldi::Vector* feats);
 
-    // feature cache only cache feature which from base extractor
+    // feat dim
     virtual size_t Dim() const { return base_extractor_->Dim(); }
 
     virtual void SetFinished() {
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index ce75c137c..7a28f2c98 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
         data_.Resize(0);
         return true;
     }
-    virtual size_t Dim() const { return dim_; }
+
     virtual void SetFinished() { finished_ = true; }
     virtual bool IsFinished() const { return finished_; }
+    virtual size_t Dim() const { return dim_; }
     void SetDim(int32 dim) { dim_ = dim; }
     virtual void Reset() { finished_ = true; }
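Finally, a rough Python sketch (not part of the patch series; the NumPy array stands in for the Kaldi feature matrix and all names are illustrative) of the chunked decoding loop that offline_decoder_main.cc implements: features are consumed chunk_size frames at a time, and the cache is marked finished on the last chunk so the decoder can flush its state.

```python
import numpy as np

def feed_in_chunks(feature: np.ndarray, chunk_size: int = 35):
    """Yield chunks of `chunk_size` frames, flagging the last one."""
    num_chunks = feature.shape[0] // chunk_size   # leftover frames are dropped,
    for chunk_idx in range(num_chunks):           # just like NumRows() / chunk_size
        chunk = feature[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size]
        yield chunk, chunk_idx == num_chunks - 1

feats = np.zeros((100, 80), dtype=np.float32)     # e.g. 100 frames of 80-dim features
for chunk, is_last in feed_in_chunks(feats):
    # here the C++ loop calls raw_data->Accept(feature_chunk),
    # raw_data->SetFinished() when is_last, and decoder.AdvanceDecode(decodable)
    pass
```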