Merge branch 'develop' into server

4 years ago · 9d20a10b5a
parent d21ccd0287 0cde9f87ab
commit 9d20a10b5a
122 changed files with 48092 additions and 36517 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -50,13 +50,13 @@ repos:
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch).*(\.cpp|\.cc|\.h|\.py)$
+        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
    -   id: copyright_checker
        name: copyright_checker
        entry: python .pre-commit-hooks/copyright-check.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-        exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch).*(\.cpp|\.cc|\.h|\.py)$
+        exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
--- a/demos/audio_searching/README.md
+++ b/demos/audio_searching/README.md
@ -90,7 +90,7 @@ Then to start the system server, and it provides HTTP backend services.

  ```bash
  export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
-  python src/main.py
+  python src/audio_search.py
  ```

  Then you will see the Application is started:
@ -111,7 +111,7 @@ Then to start the system server, and it provides HTTP backend services.
  ```bash
  wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz 
  ```
-  **Note**: If you want to build a quick demo, you can use ./src/test_main.py:download_audio_data function, it downloads 20 audio files , Subsequent results show this collection as an example
+  **Note**: If you want to build a quick demo, you can use the `download_audio_data` function in ./src/test_audio_search.py; it downloads 20 audio files, and the results shown below use this collection as an example.

 - Prepare model(Skip this step if you use the default model.)
  ```bash
@ -123,7 +123,7 @@ Then to start the system server, and it provides HTTP backend services.

    The script downloads the data, loads the paddlespeech model, extracts the embeddings, builds the library, runs a retrieval, and finally deletes the library.
    ```bash
-    python ./src/test_main.py
+    python ./src/test_audio_search.py
    ```

    Output：
--- a/demos/audio_searching/README_cn.md
+++ b/demos/audio_searching/README_cn.md
@ -92,7 +92,7 @@ ffce340b3790  minio/minio:RELEASE.2020-12-03T00-03-10Z  "/usr/bin/docker-ent…"

  ```bash
  export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
-  python src/main.py
+  python src/audio_search.py
  ```

  然后你会看到应用程序启动:
@ -113,7 +113,7 @@ ffce340b3790  minio/minio:RELEASE.2020-12-03T00-03-10Z  "/usr/bin/docker-ent…"
  ```bash
  wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz 
  ```
-  **注**：如果希望快速搭建 demo，可以采用 ./src/test_main.py:download_audio_data 内部的 20 条音频，另外后续结果展示以该集合为例
+  **注**：如果希望快速搭建 demo，可以采用 ./src/test_audio_search.py:download_audio_data 内部的 20 条音频，另外后续结果展示以该集合为例

 - 准备模型（如果使用默认模型，可以跳过此步骤）
  ```bash
@ -124,7 +124,7 @@ ffce340b3790  minio/minio:RELEASE.2020-12-03T00-03-10Z  "/usr/bin/docker-ent…"
 - 脚本测试（推荐）

    ```bash
-    python ./src/test_main.py
+    python ./src/test_audio_search.py
    ```
    注：内部将依次下载数据，加载 paddlespeech 模型，提取 embedding，存储建库，检索，删库

--- a/demos/audio_searching/src/audio_search.py
+++ b/demos/audio_searching/src/audio_search.py
@ -40,7 +40,6 @@ app.add_middleware(
    allow_methods=["*"],
    allow_headers=["*"])

-MODEL = None
 MILVUS_CLI = MilvusHelper()
 MYSQL_CLI = MySQLHelper()

--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-
 from logs import LOGGER
+
 from paddlespeech.cli import VectorExecutor

 vector_executor = VectorExecutor()
--- a/demos/audio_searching/src/mysql_helpers.py
+++ b/demos/audio_searching/src/mysql_helpers.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import sys

+import numpy
 import pymysql
 from config import MYSQL_DB
 from config import MYSQL_HOST
@ -69,7 +70,7 @@ class MySQLHelper():
            sys.exit(1)

    def load_data_to_mysql(self, table_name, data):
-        # Batch insert (Milvus_ids, img_path) to mysql
+        # Batch insert (Milvus_ids, audio_path) to mysql
        self.test_connection()
        sql = "insert into " + table_name + " (milvus_id,audio_path) values (%s,%s);"
        try:
@ -82,7 +83,7 @@ class MySQLHelper():
            sys.exit(1)

    def search_by_milvus_ids(self, ids, table_name):
-        # Get the img_path according to the milvus ids
+        # Get the audio_path according to the milvus ids
        self.test_connection()
        str_ids = str(ids).replace('[', '').replace(']', '')
        sql = "select audio_path from " + table_name + " where milvus_id in (" + str_ids + ") order by field (milvus_id," + str_ids + ");"
@ -120,14 +121,83 @@ class MySQLHelper():
            sys.exit(1)

    def count_table(self, table_name):
-        # Get the number of mysql table
+        # Get the number of spk in mysql table
        self.test_connection()
-        sql = "select count(milvus_id) from " + table_name + ";"
+        sql = "select count(spk_id) from " + table_name + ";"
        try:
            self.cursor.execute(sql)
            results = self.cursor.fetchall()
-            LOGGER.debug(f"MYSQL count table:{table_name}")
+            LOGGER.debug(f"MYSQL count table:{results[0][0]}")
            return results[0][0]
        except Exception as e:
            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
            sys.exit(1)
+
+    def create_mysql_table_vpr(self, table_name):
+        # Make sure the voiceprint table `table_name` exists. It has three TEXT
+        # columns: spk_id, audio_path and embedding. Any failure while executing
+        # the statement is logged and ends the process with exit status 1.
+        self.test_connection()
+        sql = "".join((
+            "create table if not exists ", table_name,
+            "(spk_id TEXT, audio_path TEXT, embedding TEXT);"))
+        try:
+            self.cursor.execute(sql)
+            LOGGER.debug(f"MYSQL create table: {table_name} with sql: {sql}")
+        except Exception as err:
+            LOGGER.error(f"MYSQL ERROR: {err} with sql: {sql}")
+            sys.exit(1)
+
+    def load_data_to_mysql_vpr(self, table_name, data):
+        # Insert one record into `table_name`. `data` supplies the values for
+        # the three %s placeholders (spk_id, audio_path, embedding).
+        # Any failure is logged and ends the process with exit status 1.
+        self.test_connection()
+        sql = "".join((
+            "insert into ", table_name,
+            " (spk_id,audio_path,embedding) values (%s,%s,%s);"))
+        try:
+            self.cursor.execute(sql, data)
+            message = f"MYSQL loads data to table: {table_name} successfully"
+            LOGGER.debug(message)
+        except Exception as err:
+            LOGGER.error(f"MYSQL ERROR: {err} with sql: {sql}")
+            sys.exit(1)
+
+    def list_vpr(self, table_name):
+        # Read every row of `table_name` and return three parallel lists:
+        # (spk_ids, audio_paths, embeddings). Each embedding is a numpy array
+        # of the still-unparsed string fields of the stored text; callers
+        # convert it to a numeric dtype themselves.
+        # Any failure is logged and ends the process with exit status 1.
+        self.test_connection()
+        sql = "".join(("select * from ", table_name, " ;"))
+        try:
+            self.cursor.execute(sql)
+            rows = self.cursor.fetchall()
+            self.conn.commit()
+            spk_ids, audio_paths, embeddings = [], [], []
+            for row in rows:
+                spk_ids.append(row[0])
+                audio_paths.append(row[1])
+                # The embedding column holds bracketed, comma-separated text:
+                # drop the brackets, then split on the commas.
+                stripped = str(row[2]).replace('[', '').replace(']', '')
+                embeddings.append(numpy.array(stripped.split(",")))
+            return spk_ids, audio_paths, embeddings
+        except Exception as err:
+            LOGGER.error(f"MYSQL ERROR: {err} with sql: {sql}")
+            sys.exit(1)
+
+    def search_audio_vpr(self, table_name, spk_id):
+        # Return the audio_path of the first row in `table_name` whose spk_id
+        # equals `spk_id`.
+        # spk_id comes from the HTTP API, so it is bound as a query parameter
+        # instead of being concatenated into the SQL text (SQL injection).
+        # The table name cannot be bound and is still concatenated.
+        # Any failure, including "no matching row", is logged and ends the
+        # process with exit status 1.
+        self.test_connection()
+        sql = "select audio_path from " + table_name + " where spk_id=%s ;"
+        try:
+            self.cursor.execute(sql, (spk_id, ))
+            results = self.cursor.fetchall()
+            LOGGER.debug(
+                f"MYSQL search by spk id {spk_id} to get audio {results[0][0]}.")
+            return results[0][0]
+        except Exception as e:
+            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
+            sys.exit(1)
+
+    def delete_data_vpr(self, table_name, spk_id):
+        # Delete every row of `table_name` whose spk_id equals `spk_id`.
+        # spk_id comes from the HTTP API, so it is bound as a query parameter
+        # instead of being concatenated into the SQL text (SQL injection).
+        # The table name cannot be bound and is still concatenated.
+        # Any failure is logged and ends the process with exit status 1.
+        self.test_connection()
+        sql = "delete from " + table_name + " where spk_id=%s;"
+        try:
+            self.cursor.execute(sql, (spk_id, ))
+            LOGGER.debug(
+                f"MYSQL delete a record {spk_id} in table {table_name}")
+        except Exception as e:
+            LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}")
+            sys.exit(1)
--- a/demos/audio_searching/src/operations/count.py
+++ b/demos/audio_searching/src/operations/count.py
@ -31,3 +31,45 @@ def do_count(table_name, milvus_cli):
    except Exception as e:
        LOGGER.error(f"Error attempting to count table {e}")
        sys.exit(1)
+
+
+def do_count_vpr(table_name, mysql_cli):
+    """
+    Count the speakers stored in MySQL.
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty and returns
+    whatever mysql_cli.count_table reports. Errors are logged and end the
+    process with exit status 1.
+    """
+    table_name = table_name or DEFAULT_TABLE
+    try:
+        return mysql_cli.count_table(table_name)
+    except Exception as err:
+        LOGGER.error(f"Error attempting to count table {err}")
+        sys.exit(1)
+
+
+def do_list(table_name, mysql_cli):
+    """
+    Returns all vpr records in the system as (spk_ids, audio_paths).
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty. The embeddings
+    returned by mysql_cli.list_vpr are discarded. Errors are logged and end
+    the process with exit status 1.
+    """
+    if not table_name:
+        table_name = DEFAULT_TABLE
+    try:
+        spk_ids, audio_paths, _ = mysql_cli.list_vpr(table_name)
+        return spk_ids, audio_paths
+    except Exception as e:
+        # The message previously said "count table" (copy-paste leftover).
+        LOGGER.error(f"Error attempting to list records {e}")
+        sys.exit(1)
+
+
+def do_get(table_name, spk_id, mysql_cli):
+    """
+    Returns the audio path stored for `spk_id`.
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty. Errors are
+    logged and end the process with exit status 1.
+    """
+    if not table_name:
+        table_name = DEFAULT_TABLE
+    try:
+        audio_path = mysql_cli.search_audio_vpr(table_name, spk_id)
+        return audio_path
+    except Exception as e:
+        # The message previously said "count table" (copy-paste leftover).
+        LOGGER.error(f"Error attempting to get audio path {e}")
+        sys.exit(1)
--- a/demos/audio_searching/src/operations/drop.py
+++ b/demos/audio_searching/src/operations/drop.py
@ -32,3 +32,31 @@ def do_drop(table_name, milvus_cli, mysql_cli):
    except Exception as e:
        LOGGER.error(f"Error attempting to drop table: {e}")
        sys.exit(1)
+
+
+def do_drop_vpr(table_name, mysql_cli):
+    """
+    Drop the vpr table in MySQL.
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty and returns "OK"
+    once mysql_cli.delete_table has run. Errors are logged and end the
+    process with exit status 1.
+    """
+    table_name = table_name or DEFAULT_TABLE
+    try:
+        mysql_cli.delete_table(table_name)
+    except Exception as err:
+        LOGGER.error(f"Error attempting to drop table: {err}")
+        sys.exit(1)
+    return "OK"
+
+
+def do_delete(table_name, spk_id, mysql_cli):
+    """
+    Delete the record(s) stored for `spk_id` in MySQL.
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty and returns "OK"
+    on success. Errors are logged and end the process with exit status 1.
+    """
+    if not table_name:
+        table_name = DEFAULT_TABLE
+    try:
+        mysql_cli.delete_data_vpr(table_name, spk_id)
+        return "OK"
+    except Exception as e:
+        # The message previously said "drop table" (copy-paste leftover).
+        LOGGER.error(f"Error attempting to delete record: {e}")
+        sys.exit(1)
--- a/demos/audio_searching/src/operations/load.py
+++ b/demos/audio_searching/src/operations/load.py
@ -82,3 +82,16 @@ def do_load(table_name, audio_dir, milvus_cli, mysql_cli):
    mysql_cli.create_mysql_table(table_name)
    mysql_cli.load_data_to_mysql(table_name, format_data(ids, names))
    return len(ids)
+
+
+def do_enroll(table_name, spk_id, audio_path, mysql_cli):
+    """
+    Enroll one speaker: store (spk_id, audio_path, embedding) in MySQL.
+
+    Falls back to DEFAULT_TABLE when `table_name` is empty, creates the table
+    if needed, and returns "OK". The embedding is stored as its str() text.
+    NOTE(review): list_vpr parses that text by splitting on commas, so this
+    presumably relies on get_audio_embedding returning a plain list - confirm.
+    """
+    table_name = table_name or DEFAULT_TABLE
+    embedding = get_audio_embedding(audio_path)
+    mysql_cli.create_mysql_table_vpr(table_name)
+    mysql_cli.load_data_to_mysql_vpr(
+        table_name, (spk_id, audio_path, str(embedding)))
+    return "OK"
--- a/demos/audio_searching/src/operations/search.py
+++ b/demos/audio_searching/src/operations/search.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import sys

+import numpy
 from config import DEFAULT_TABLE
 from config import TOP_K
 from encode import get_audio_embedding
@ -39,3 +40,26 @@ def do_search(host, table_name, audio_path, milvus_cli, mysql_cli):
    except Exception as e:
        LOGGER.error(f"Error with search: {e}")
        sys.exit(1)
+
+
+def do_search_vpr(host, table_name, audio_path, mysql_cli):
+    """
+    Score the query audio against every enrolled speaker in MySQL.
+
+    Returns (spk_ids, paths, scores): the speaker ids as strings, a download
+    URL on `host` for each enrolled audio file, and the dot product of the
+    query embedding with each stored embedding, multiplied by 100.
+    Errors are logged and end the process with exit status 1.
+    """
+    try:
+        table_name = table_name or DEFAULT_TABLE
+        query = numpy.array(get_audio_embedding(audio_path))
+        spk_ids, paths, vectors = mysql_cli.list_vpr(table_name)
+        scores = [
+            numpy.dot(query, vec.astype(numpy.float64)) for vec in vectors
+        ]
+        spk_ids = [str(spk) for spk in spk_ids]
+        paths = [str(path) for path in paths]
+        for idx, path in enumerate(paths):
+            paths[idx] = "http://" + str(host) + "/data?audio_path=" + str(path)
+            scores[idx] = scores[idx] * 100
+        return spk_ids, paths, scores
+    except Exception as err:
+        LOGGER.error(f"Error with search: {err}")
+        sys.exit(1)
--- a/demos/audio_searching/src/test_audio_search.py
+++ b/demos/audio_searching/src/test_audio_search.py
@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from audio_search import app
 from fastapi.testclient import TestClient
-from main import app

 from utils.utility import download
 from utils.utility import unpack
@ -22,7 +22,7 @@ client = TestClient(app)

 def download_audio_data():
    """
-    download audio data
+    Download audio data
    """
    url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz"
    md5sum = "52ac69316c1aa1fdef84da7dd2c67b39"
@ -64,7 +64,7 @@ def test_count():
    """
    Returns the total number of vectors in the system
    """
-    response = client.get("audio/count")
+    response = client.get("/audio/count")
    assert response.status_code == 200
    assert response.json() == 20

--- a/demos/audio_searching/src/test_vpr_search.py
+++ b/demos/audio_searching/src/test_vpr_search.py
@ -0,0 +1,115 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from fastapi.testclient import TestClient
+from vpr_search import app
+
+from utils.utility import download
+from utils.utility import unpack
+
+client = TestClient(app)
+
+
+def download_audio_data():
+    """
+    Fetch the example audio archive and unpack it into the current directory
+    """
+    archive = download(
+        "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz",
+        "52ac69316c1aa1fdef84da7dd2c67b39", "./")
+    unpack(archive, "./", True)
+
+
+def test_drop():
+    """
+    POST /vpr/drop must answer with HTTP 200
+    """
+    resp = client.post("/vpr/drop")
+    assert resp.status_code == 200
+
+
+def test_enroll_local(spk: str, audio: str):
+    """
+    Enroll ./example_audio/<audio>.wav under speaker id `spk` and expect the
+    success payload
+    """
+    url = "".join([
+        "/vpr/enroll/local?spk_id=", spk,
+        "&audio_path=.%2Fexample_audio%2F", audio, ".wav"
+    ])
+    resp = client.post(url)
+    assert resp.status_code == 200
+    expected = {'status': True, 'msg': "Successfully enroll data!"}
+    assert resp.json() == expected
+
+
+def test_search_local():
+    """
+    Recognise ./example_audio/test.wav against the enrolled speakers
+    """
+    url = "/vpr/recog/local?audio_path=.%2Fexample_audio%2Ftest.wav"
+    resp = client.post(url)
+    assert resp.status_code == 200
+
+
+def test_list():
+    """
+    GET /vpr/list must answer with HTTP 200
+    """
+    resp = client.get("/vpr/list")
+    assert resp.status_code == 200
+
+
+def test_data(spk: str):
+    """
+    Fetch the audio file enrolled for speaker id `spk`
+    """
+    url = "/vpr/data?spk_id=" + spk
+    resp = client.get(url)
+    assert resp.status_code == 200
+
+
+def test_del(spk: str):
+    """
+    Delete the record enrolled for speaker id `spk`
+    """
+    url = "/vpr/del?spk_id=" + spk
+    resp = client.post(url)
+    assert resp.status_code == 200
+
+
+def test_count():
+    """
+    GET /vpr/count must answer with HTTP 200
+    """
+    resp = client.get("/vpr/count")
+    assert resp.status_code == 200
+
+
+if __name__ == "__main__":
+    # End-to-end walk through the vpr API: enroll three speakers, query,
+    # delete one, query again, re-enroll it, query once more, then drop the
+    # table.
+    download_audio_data()
+
+    for spk, audio in (("spk1", "arms_strikes"), ("spk2", "sword_wielding"),
+                       ("spk3", "test")):
+        test_enroll_local(spk, audio)
+    test_list()
+    test_data("spk1")
+    test_count()
+    test_search_local()
+
+    test_del("spk1")
+    test_count()
+    test_search_local()
+
+    test_enroll_local("spk1", "arms_strikes")
+    test_count()
+    test_search_local()
+
+    test_drop()
--- a/demos/audio_searching/src/vpr_search.py
+++ b/demos/audio_searching/src/vpr_search.py
@ -0,0 +1,206 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import uvicorn
+from config import UPLOAD_PATH
+from fastapi import FastAPI
+from fastapi import File
+from fastapi import UploadFile
+from logs import LOGGER
+from mysql_helpers import MySQLHelper
+from operations.count import do_count_vpr
+from operations.count import do_get
+from operations.count import do_list
+from operations.drop import do_delete
+from operations.drop import do_drop_vpr
+from operations.load import do_enroll
+from operations.search import do_search_vpr
+from starlette.middleware.cors import CORSMiddleware
+from starlette.requests import Request
+from starlette.responses import FileResponse
+from starlette.responses import JSONResponse
+
+# FastAPI application serving the voiceprint (vpr) endpoints below.
+app = FastAPI()
+# CORS is left fully open: any origin, method and header, with credentials.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"])
+
+# One MySQL helper instance shared by every request handler in this module.
+MYSQL_CLI = MySQLHelper()
+
+# Create the upload directory (UPLOAD_PATH) at import time if it is missing.
+if not os.path.exists(UPLOAD_PATH):
+    os.makedirs(UPLOAD_PATH)
+    LOGGER.info(f"Mkdir the path: {UPLOAD_PATH}")
+
+
+@app.post('/vpr/enroll')
+async def vpr_enroll(table_name: str=None,
+                     spk_id: str=None,
+                     audio: UploadFile=File(...)):
+    # Enroll the uploaded audio with spk-id into MySQL.
+    # Returns a success payload, or HTTP 400 with the error text on failure.
+    try:
+        # Save the upload data to server.
+        content = await audio.read()
+        # Keep only the last path component of the client-supplied file name
+        # so the upload cannot be written outside UPLOAD_PATH.
+        audio_path = os.path.join(UPLOAD_PATH,
+                                  os.path.basename(audio.filename))
+        with open(audio_path, "wb+") as f:
+            f.write(content)
+        do_enroll(table_name, spk_id, audio_path, MYSQL_CLI)
+        LOGGER.info(f"Successfully enrolled {spk_id} online!")
+        return {'status': True, 'msg': "Successfully enroll data!"}
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.post('/vpr/enroll/local')
+async def vpr_enroll_local(table_name: str=None,
+                           spk_id: str=None,
+                           audio_path: str=None):
+    # Enroll an audio file that already exists on the server with spk-id
+    # into MySQL.
+    # Returns a success payload, or HTTP 400 with the error text on failure.
+    try:
+        do_enroll(table_name, spk_id, audio_path, MYSQL_CLI)
+        LOGGER.info(f"Successfully enrolled {spk_id} locally!")
+        return {'status': True, 'msg': "Successfully enroll data!"}
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.post('/vpr/recog')
+async def vpr_recog(request: Request,
+                    table_name: str=None,
+                    audio: UploadFile=File(...)):
+    # Voice print recognition on an uploaded audio file.
+    # Returns a list of (spk_id, (audio_url, score)) items, or HTTP 400 with
+    # the error text on failure.
+    try:
+        # Save the upload data to server.
+        content = await audio.read()
+        # Keep only the last path component of the client-supplied file name
+        # so the upload cannot be written outside UPLOAD_PATH.
+        query_audio_path = os.path.join(UPLOAD_PATH,
+                                        os.path.basename(audio.filename))
+        with open(query_audio_path, "wb+") as f:
+            f.write(content)
+        host = request.headers['host']
+        spk_ids, paths, scores = do_search_vpr(host, table_name,
+                                               query_audio_path, MYSQL_CLI)
+        for spk_id, path, score in zip(spk_ids, paths, scores):
+            LOGGER.info(f"spk {spk_id}, score {score}, audio path {path}, ")
+        res = dict(zip(spk_ids, zip(paths, scores)))
+        # Sort results by score, highest score first
+        res = sorted(res.items(), key=lambda item: item[1][1], reverse=True)
+        LOGGER.info("Successfully speaker recognition online!")
+        return res
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.post('/vpr/recog/local')
+async def vpr_recog_local(request: Request,
+                          table_name: str=None,
+                          audio_path: str=None):
+    # Voice print recognition on an audio file that already exists on the
+    # server.
+    # Returns a list of (spk_id, (audio_url, score)) items, or HTTP 400 with
+    # the error text on failure.
+    try:
+        host = request.headers['host']
+        spk_ids, paths, scores = do_search_vpr(host, table_name, audio_path,
+                                               MYSQL_CLI)
+        for spk_id, path, score in zip(spk_ids, paths, scores):
+            LOGGER.info(f"spk {spk_id}, score {score}, audio path {path}, ")
+        res = dict(zip(spk_ids, zip(paths, scores)))
+        # Sort results by score, highest score first
+        res = sorted(res.items(), key=lambda item: item[1][1], reverse=True)
+        LOGGER.info("Successfully speaker recognition locally!")
+        return res
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.post('/vpr/del')
+async def vpr_del(table_name: str=None, spk_id: str=None):
+    # Delete a record by spk_id in MySQL.
+    # Returns a success payload, or HTTP 400 with the error text on failure.
+    try:
+        do_delete(table_name, spk_id, MYSQL_CLI)
+        LOGGER.info("Successfully delete a record by spk_id in MySQL")
+        return {'status': True, 'msg': "Successfully delete data!"}
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.get('/vpr/list')
+async def vpr_list(table_name: str=None):
+    # Get all records in MySQL.
+    # Returns (spk_ids, audio_paths), or HTTP 400 with the error text on
+    # failure.
+    try:
+        spk_ids, audio_paths = do_list(table_name, MYSQL_CLI)
+        for spk_id, path in zip(spk_ids, audio_paths):
+            LOGGER.debug(f"spk {spk_id}, audio path {path}")
+        LOGGER.info("Successfully list all records from mysql!")
+        return spk_ids, audio_paths
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.get('/vpr/data')
+async def vpr_data(
+    table_name: str=None,
+    spk_id: str=None, ):
+    # Get the audio file from path by spk_id in MySQL.
+    # Returns the file, or HTTP 400 with the error text on failure.
+    try:
+        audio_path = do_get(table_name, spk_id, MYSQL_CLI)
+        LOGGER.info(f"Successfully get audio path {audio_path}!")
+        return FileResponse(audio_path)
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.get('/vpr/count')
+async def vpr_count(table_name: str=None):
+    # Get the total number of spk in MySQL.
+    # Returns the count, or HTTP 400 with the error text on failure.
+    try:
+        num = do_count_vpr(table_name, MYSQL_CLI)
+        LOGGER.info("Successfully count the number of spk!")
+        return num
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.post('/vpr/drop')
+async def drop_tables(table_name: str=None):
+    # Delete the table of MySQL.
+    # Returns a success payload, or HTTP 400 with the error text on failure.
+    try:
+        do_drop_vpr(table_name, MYSQL_CLI)
+        LOGGER.info("Successfully drop tables in MySQL!")
+        return {'status': True, 'msg': "Successfully drop tables!"}
+    except Exception as e:
+        LOGGER.error(e)
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+@app.get('/data')
+def audio_path(audio_path):
+    # Get the audio file from path.
+    # NOTE(review): `audio_path` comes straight from the query string and is
+    # served as-is, so any file readable by the server process can be
+    # fetched. Consider restricting it to known audio directories.
+    try:
+        LOGGER.info(f"Successfully get audio: {audio_path}")
+        return FileResponse(audio_path)
+    except Exception as e:
+        LOGGER.error(f"get audio error: {e}")
+        # A "(body, 400)" tuple is not a status code in FastAPI: it would be
+        # serialised as a JSON list with HTTP 200. Build the response
+        # explicitly and send the exception as text.
+        return JSONResponse(
+            status_code=400, content={'status': False,
+                                      'msg': str(e)})
+
+
+# When run as a script, serve the app with uvicorn on all interfaces, port 8002.
+if __name__ == '__main__':
+    uvicorn.run(app=app, host='0.0.0.0', port=8002)
--- a/demos/speech_recognition/run.sh
+++ b/demos/speech_recognition/run.sh
@ -7,4 +7,4 @@ paddlespeech asr --input ./zh.wav


 # asr + punc
-paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
--- a/demos/speech_server/README_cn.md
+++ b/demos/speech_server/README_cn.md
@ -85,6 +85,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 - 命令行 (推荐使用)
   ```
   paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+
+   # 流式ASR
+   paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8091 --input ./zh.wav
+
   ```

    使用帮助:
@ -191,7 +195,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee

  ```

-  ### 5. CLS 客户端使用方法
+  ### 6. CLS 客户端使用方法
  **注意：** 初次使用客户端时响应时间会略长
  - 命令行 (推荐使用)
   ```
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@ -37,7 +37,7 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (stati
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
 Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
 TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
 FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)|157MB|
 FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
 FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@ -223,22 +223,28 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ## Pretrained Model
 Pretrained SpeedySpeech models trained with the silence at the edges of the audio removed:
 - [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)
+- [speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)

 The static model can be downloaded here:
 - [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)
 - [speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)

+The ONNX model can be downloaded here:
+- [speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)
+
+
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:|:--------:
-default| 1(gpu) x 11400|0.83655|0.42324|0.03211| 0.38119
+default| 1(gpu) x 11400|0.79532|0.400246|0.030259| 0.36482

 SpeedySpeech checkpoint contains files listed below.
+
 ```text
-speedyspeech_nosil_baker_ckpt_0.5
+speedyspeech_csmsc_ckpt_0.2.0
 ├── default.yaml            # default config used to train speedyspeech
 ├── feats_stats.npy         # statistics used to normalize spectrogram when training speedyspeech
 ├── phone_id_map.txt        # phone vocabulary file when training speedyspeech
-├── snapshot_iter_11400.pdz # model parameters and optimizer states
+├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── tone_id_map.txt         # tone vocabulary file when training speedyspeech
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
@ -249,9 +255,9 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
  --am=speedyspeech_csmsc \
-  --am_config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
-  --am_ckpt=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
-  --am_stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
+  --am_config=speedyspeech_csmsc_ckpt_0.2.0/default.yaml \
+  --am_ckpt=speedyspeech_csmsc_ckpt_0.2.0/snapshot_iter_30600.pdz \
+  --am_stat=speedyspeech_csmsc_ckpt_0.2.0/feats_stats.npy \
  --voc=pwgan_csmsc \
  --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
@ -260,6 +266,6 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
  --text=${BIN_DIR}/../sentences.txt \
  --output_dir=exp/default/test_e2e \
  --inference_dir=exp/default/inference \
-  --phones_dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
-  --tones_dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+  --phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
+  --tones_dict=speedyspeech_csmsc_ckpt_0.2.0/tone_id_map.txt
 ```
--- a/examples/csmsc/tts2/local/ort_predict.sh
+++ b/examples/csmsc/tts2/local/ort_predict.sh
@ -0,0 +1,32 @@
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!
+
+# synthesize from metadata
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../ort_predict.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/onnx_infer_out \
+        --device=cpu \
+        --cpu_threads=2
+fi
+
+# e2e, synthesize from text
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../csmsc_test.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device=cpu \
+        --cpu_threads=2
+fi
--- a/examples/csmsc/tts2/local/paddle2onnx.sh
+++ b/examples/csmsc/tts2/local/paddle2onnx.sh
@ -0,0 +1 @@
+../../tts3/local/paddle2onnx.sh
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@ -40,3 +40,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
 fi
+
+# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
+# we have only tested the following models so far
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+# inference with onnxruntime, use speedyspeech + hifigan by default
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict.sh ${train_output_path}
+fi
--- a/examples/csmsc/tts3/local/ort_predict.sh
+++ b/examples/csmsc/tts3/local/ort_predict.sh
@ -3,7 +3,7 @@ train_output_path=$1
 stage=0
 stop_stage=0

-# only support default_fastspeech2 + hifigan/mb_melgan now!
+# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!

 # synthesize from metadata
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--- a/examples/csmsc/tts3/local/paddle2onnx.sh
+++ b/examples/csmsc/tts3/local/paddle2onnx.sh
@ -19,4 +19,5 @@ paddle2onnx \
    --model_filename ${model}.pdmodel \
    --params_filename ${model}.pdiparams \
    --save_file ${train_output_path}/${output_dir}/${model}.onnx \
+    --opset_version 11 \
    --enable_dev_version ${enable_dev_version}
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@ -133,6 +133,9 @@ The pretrained model can be downloaded here:
 The static model can be downloaded here:
 - [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)

+The ONNX model can be downloaded here:
+- [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 400000|1.948763|0.670098|0.248882
--- a/examples/other/1xt2x/src_deepspeech2x/init.py
+++ b/examples/other/1xt2x/src_deepspeech2x/init.py
@ -26,10 +26,10 @@ from paddlespeech.s2t.utils.log import Log
 #TODO(Hui Zhang): remove  fluid import
 logger = Log(__name__).getlog()

-########### hcak logging #############
+########### hack logging #############
 logger.warn = logger.warning

-########### hcak paddle #############
+########### hack paddle #############
 paddle.half = 'float16'
 paddle.float = 'float32'
 paddle.double = 'float64'
@ -110,7 +110,7 @@ if not hasattr(paddle, 'cat'):
    paddle.cat = cat


-########### hcak paddle.Tensor #############
+########### hack paddle.Tensor #############
 def item(x: paddle.Tensor):
    return x.numpy().item()

@ -353,7 +353,7 @@ if not hasattr(paddle.Tensor, 'tolist'):
    setattr(paddle.Tensor, 'tolist', tolist)


-########### hcak paddle.nn #############
+########### hack paddle.nn #############
 class GLU(nn.Layer):
    """Gated Linear Units (GLU) Layer"""

--- a/paddleaudio/paddleaudio/compliance/librosa.py
+++ b/paddleaudio/paddleaudio/compliance/librosa.py
@ -341,7 +341,7 @@ def stft(x: np.ndarray,
        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
        win_length (Optional[int], optional): The size of window. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".

@ -509,7 +509,7 @@ def melspectrogram(x: np.ndarray,
        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
        to_db (bool, optional): Enable db scale. Defaults to True.
@ -564,7 +564,7 @@ def spectrogram(x: np.ndarray,
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.

--- a/paddleaudio/paddleaudio/features/layers.py
+++ b/paddleaudio/paddleaudio/features/layers.py
@ -42,7 +42,7 @@ class Spectrogram(nn.Layer):
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """
@ -99,7 +99,7 @@ class MelSpectrogram(nn.Layer):
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@ -176,7 +176,7 @@ class LogMelSpectrogram(nn.Layer):
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@ -257,7 +257,7 @@ class MFCC(nn.Layer):
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -84,7 +84,7 @@ setuptools.setup(
    install_requires=[
        'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
        'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos'
-        ],
+    ],
    extras_require={
        'test': [
            'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@ -43,13 +43,13 @@ pretrained_models = {
    # speedyspeech
    "speedyspeech_csmsc-zh": {
        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip',
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
        'md5':
-        '9edce23b1a87f31b814d9477bf52afbc',
+        '6f6fa967b408454b6662c8c00c0027cb',
        'config':
        'default.yaml',
        'ckpt':
-        'snapshot_iter_11400.pdz',
+        'snapshot_iter_30600.pdz',
        'speech_stats':
        'feats_stats.npy',
        'phones_dict':
--- a/paddlespeech/s2t/init.py
+++ b/paddlespeech/s2t/init.py
@ -26,10 +26,10 @@ from paddlespeech.s2t.utils.log import Log
 #TODO(Hui Zhang): remove  fluid import
 logger = Log(__name__).getlog()

-########### hcak logging #############
+########### hack logging #############
 logger.warn = logger.warning

-########### hcak paddle #############
+########### hack paddle #############
 paddle.half = 'float16'
 paddle.float = 'float32'
 paddle.double = 'float64'
@ -110,7 +110,7 @@ if not hasattr(paddle, 'cat'):
    paddle.cat = cat


-########### hcak paddle.Tensor #############
+########### hack paddle.Tensor #############
 def item(x: paddle.Tensor):
    return x.numpy().item()

--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@ -79,7 +79,6 @@ class U2Infer():

            ilen = paddle.to_tensor(feat.shape[0])
            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
-
            decode_config = self.config.decode
            result_transcripts = self.model.decode(
                xs,
@ -129,6 +128,7 @@ if __name__ == "__main__":
    args = parser.parse_args()

    config = CfgNode(new_allowed=True)
+
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+import asyncio
 import base64
 import io
 import json
+import logging
 import os
 import random
 import time
@ -28,6 +30,7 @@ from ..executor import BaseExecutor
 from ..util import cli_client_register
 from ..util import stats_wrapper
 from paddlespeech.cli.log import logger
+from paddlespeech.server.tests.asr.online.websocket_client import ASRAudioHandler
 from paddlespeech.server.utils.audio_process import wav2pcm
 from paddlespeech.server.utils.util import wav2base64

@ -230,6 +233,76 @@ class ASRClientExecutor(BaseExecutor):
        return res


+@cli_client_register(
+    name='paddlespeech_client.asr_online',
+    description='visit asr online service')
+class ASROnlineClientExecutor(BaseExecutor):
+    """Command-line and Python client for the streaming (websocket) ASR service.
+
+    Named differently from the offline `ASRClientExecutor` defined above so
+    that this class does not shadow it at module level.
+    """
+
+    def __init__(self):
+        super(ASROnlineClientExecutor, self).__init__()
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech_client.asr_online', add_help=True)
+        self.parser.add_argument(
+            '--server_ip', type=str, default='127.0.0.1', help='server ip')
+        self.parser.add_argument(
+            '--port', type=int, default=8091, help='server port')
+        self.parser.add_argument(
+            '--input',
+            type=str,
+            default=None,
+            help='Audio file to be recognized',
+            required=True)
+        self.parser.add_argument(
+            '--sample_rate', type=int, default=16000, help='audio sample rate')
+        self.parser.add_argument(
+            '--lang', type=str, default="zh_cn", help='language')
+        self.parser.add_argument(
+            '--audio_format', type=str, default="wav", help='audio format')
+
+    def execute(self, argv: List[str]) -> bool:
+        """Parse `argv`, send one recognition request and log the outcome.
+
+        Args:
+            argv (List[str]): command-line arguments for this sub-command.
+
+        Returns:
+            bool: True when the request completed, False when it raised.
+        """
+        args = self.parser.parse_args(argv)
+
+        try:
+            time_start = time.time()
+            res = self(
+                input=args.input,
+                server_ip=args.server_ip,
+                port=args.port,
+                sample_rate=args.sample_rate,
+                lang=args.lang,
+                audio_format=args.audio_format)
+            time_end = time.time()
+            # `res` is whatever the websocket handler returned, not an HTTP
+            # response object, so it is logged as-is (it has no `.json()`).
+            logger.info(res)
+            logger.info("Response time %f s." % (time_end - time_start))
+            return True
+        except Exception as e:
+            logger.error("Failed to speech recognition.")
+            # Keep the cause visible instead of silently swallowing it.
+            logger.error(e)
+            return False
+
+    @stats_wrapper
+    def __call__(self,
+                 input: str,
+                 server_ip: str="127.0.0.1",
+                 port: int=8091,
+                 sample_rate: int=16000,
+                 lang: str="zh_cn",
+                 audio_format: str="wav"):
+        """
+        Python API to call an executor.
+
+        Sends the audio file `input` to the websocket ASR server at
+        `server_ip`:`port` and returns whatever `ASRAudioHandler.run` returns.
+
+        NOTE(review): `sample_rate`, `lang` and `audio_format` are accepted
+        but are not forwarded to the handler here.
+        """
+        logging.basicConfig(level=logging.INFO)
+        logging.info("asr websocket client start")
+        handler = ASRAudioHandler(server_ip, port)
+        loop = asyncio.get_event_loop()
+        res = loop.run_until_complete(handler.run(input))
+        logging.info("asr websocket client finished")
+        return res
+
+
@cli_client_register(
    name='paddlespeech_client.cls', description='visit cls service')
 class CLSClientExecutor(BaseExecutor):
--- a/paddlespeech/server/conf/ws_application.yaml
+++ b/paddlespeech/server/conf/ws_application.yaml
@ -4,7 +4,7 @@
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 0.0.0.0
-port: 8096
+port: 8090

 # The task format in the engin_list is: <speech task>_<engine type>
 # task choices = ['asr_online', 'tts_online']
--- a/paddlespeech/server/tests/asr/offline/http_client.py
+++ b/paddlespeech/server/tests/asr/offline/http_client.py
--- a/paddlespeech/server/tests/asr/online/README_cn.md
+++ b/paddlespeech/server/tests/asr/online/README_cn.md
@ -0,0 +1,49 @@
+(简体中文|[English](./README.md))
+
+# 语音服务
+
+## 介绍
+本文档介绍如何使用流式ASR的三种不同客户端:网页、麦克风、Python模拟流式服务。 
+
+
+## 使用方法
+### 1. 安装
+请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+推荐使用 **paddlepaddle 2.2.1** 或以上版本。
+你可以从 easy，medium，hard 三种方式中选择一种方式安装 PaddleSpeech。
+
+
+### 2. 准备测试文件
+
+这个 ASR client 的输入应该是一个 WAV 文件（`.wav`），并且采样率必须与模型的采样率相同。
+
+可以下载此 ASR client的示例音频：
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+```
+
+### 3. 流式 ASR 客户端使用方法
+
+- Python模拟流式服务命令行
+   ```
+
+   # 流式ASR
+   paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8091 --input ./zh.wav
+
+   ```
+
+
+- 麦克风
+   ```
+   # 直接调用麦克风设备
+   python microphone_client.py
+
+   ```
+
+
+- 网页
+   ```
+   # 进入web目录后参考相关readme.md
+
+   ```
--- a/paddlespeech/server/tests/asr/online/web/app.py
+++ b/paddlespeech/server/tests/asr/online/web/app.py
@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-
 # Copyright 2021 Mobvoi Inc. All Rights Reserved.
 # Author: zhendong.peng@mobvoi.com (Zhendong Peng)
-
 import argparse

-from flask import Flask, render_template
+from flask import Flask
+from flask import render_template

 parser = argparse.ArgumentParser(description='training your network')
 parser.add_argument('--port', default=19999, type=int, help='port id')
@ -14,9 +13,11 @@ args = parser.parse_args()

 app = Flask(__name__)

+
@app.route('/')
 def index():
    return render_template('index.html')

+
 if __name__ == '__main__':
    app.run(host='0.0.0.0', port=args.port, debug=True)
--- a/paddlespeech/server/tests/asr/online/web/paddle_web_demo.png
+++ b/paddlespeech/server/tests/asr/online/web/paddle_web_demo.png
--- a/paddlespeech/server/tests/asr/online/web/readme.md
+++ b/paddlespeech/server/tests/asr/online/web/readme.md
@ -0,0 +1,18 @@
+# paddlespeech serving 网页Demo
+
+- 感谢[wenet](https://github.com/wenet-e2e/wenet)团队的前端demo代码.
+
+
+## 使用方法
+### 1. 在本地电脑启动网页服务
+   ```
+   python app.py
+
+   ```
+
+### 2. 本地电脑浏览器
+
+在浏览器中输入127.0.0.1:19999 即可看到相关网页Demo。
+
+![图片](./paddle_web_demo.png)
+ 
--- a/paddlespeech/server/tests/asr/online/websocket_client.py
+++ b/paddlespeech/server/tests/asr/online/websocket_client.py
@ -34,22 +34,23 @@ class ASRAudioHandler:
    def read_wave(self, wavfile_path: str):
        samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
        x_len = len(samples)
-        chunk_stride = 40 * 16  #40ms, sample_rate = 16kHz
+        # chunk_stride = 40 * 16  #40ms, sample_rate = 16kHz
        chunk_size = 80 * 16  #80ms, sample_rate = 16kHz

-        if (x_len - chunk_size) % chunk_stride != 0:
-            padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
+        if x_len % chunk_size != 0:
+            padding_len_x = chunk_size - x_len % chunk_size
        else:
            padding_len_x = 0

        padding = np.zeros((padding_len_x), dtype=samples.dtype)
        padded_x = np.concatenate([samples, padding], axis=0)

-        num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
+        assert (x_len + padding_len_x) % chunk_size == 0
+        num_chunk = (x_len + padding_len_x) / chunk_size
        num_chunk = int(num_chunk)

        for i in range(0, num_chunk):
-            start = i * chunk_stride
+            start = i * chunk_size
            end = start + chunk_size
            x_chunk = padded_x[start:end]
            yield x_chunk
@ -80,6 +81,7 @@ class ASRAudioHandler:
                msg = await ws.recv()
                msg = json.loads(msg)
                logging.info("receive msg={}".format(msg))
+
            result = msg
            # finished 
            audio_info = json.dumps(
@ -93,6 +95,7 @@ class ASRAudioHandler:
                separators=(',', ': '))
            await ws.send(audio_info)
            msg = await ws.recv()
+            
            # decode the bytes to str
            msg = json.loads(msg)
            logging.info("receive msg={}".format(msg))
@ -103,7 +106,7 @@ class ASRAudioHandler:
 def main(args):
    logging.basicConfig(level=logging.INFO)
    logging.info("asr websocket client start")
-    handler = ASRAudioHandler("127.0.0.1", 8096)
+    handler = ASRAudioHandler("127.0.0.1", 8090)
    loop = asyncio.get_event_loop()

    # support to process single audio file
--- a/paddlespeech/server/utils/buffer.py
+++ b/paddlespeech/server/utils/buffer.py
@ -24,15 +24,38 @@ class Frame(object):

 class ChunkBuffer(object):
    def __init__(self,
-                 frame_duration_ms=80,
-                 shift_ms=40,
+                 window_n=7,
+                 shift_n=4,
+                 window_ms=20,
+                 shift_ms=10,
                 sample_rate=16000,
                 sample_width=2):
-        self.sample_rate = sample_rate
-        self.frame_duration_ms = frame_duration_ms
+        """audio sample data point buffer
+
+        Args:
+            window_n (int, optional): number of frames in one decode window. Defaults to 7.
+            shift_n (int, optional): number of frames the decode window advances per step. Defaults to 4.
+            window_ms (int, optional): length of a single frame, in ms. Defaults to 20.
+            shift_ms (int, optional): shift between consecutive frames, in ms. Defaults to 10.
+            sample_rate (int, optional): audio sample rate. Defaults to 16000.
+            sample_width (int, optional): bytes per sample point (int16 = 2; float32 = 4). Defaults to 2.
+        """
+        self.window_n = window_n
+        self.shift_n = shift_n
+        self.window_ms = window_ms
        self.shift_ms = shift_ms
-        self.remained_audio = b''
+        self.sample_rate = sample_rate
        self.sample_width = sample_width  # int16 = 2; float32 = 4
+        self.remained_audio = b''
+
+        self.window_sec = float((self.window_n - 1) * self.shift_ms +
+                                self.window_ms) / 1000.0
+        self.shift_sec = float(self.shift_n * self.shift_ms / 1000.0)
+
+        self.window_bytes = int(self.window_sec * self.sample_rate *
+                                self.sample_width)
+        self.shift_bytes = int(self.shift_sec * self.sample_rate *
+                               self.sample_width)

    def frame_generator(self, audio):
        """Generates audio frames from PCM audio data.
@ -43,17 +66,13 @@ class ChunkBuffer(object):
        audio = self.remained_audio + audio
        self.remained_audio = b''

-        n = int(self.sample_rate * (self.frame_duration_ms / 1000.0) *
-                self.sample_width)
-        shift_n = int(self.sample_rate * (self.shift_ms / 1000.0) *
-                      self.sample_width)
        offset = 0
        timestamp = 0.0
-        duration = (float(n) / self.sample_rate) / self.sample_width
-        shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width
-        while offset + n <= len(audio):
-            yield Frame(audio[offset:offset + n], timestamp, duration)
-            timestamp += shift_duration
-            offset += shift_n
+
+        while offset + self.window_bytes <= len(audio):
+            yield Frame(audio[offset:offset + self.window_bytes], timestamp,
+                        self.window_sec)
+            timestamp += self.shift_sec
+            offset += self.shift_bytes

        self.remained_audio += audio[offset:]
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@ -37,7 +37,10 @@ async def websocket_endpoint(websocket: WebSocket):
    # init buffer
    chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
    chunk_buffer = ChunkBuffer(
-        frame_duration_ms=chunk_buffer_conf['frame_duration_ms'],
+        window_n=7,
+        shift_n=4,
+        window_ms=20,
+        shift_ms=10,
        sample_rate=chunk_buffer_conf['sample_rate'],
        sample_width=chunk_buffer_conf['sample_width'])
    # init vad
@ -80,10 +83,6 @@ async def websocket_endpoint(websocket: WebSocket):
            elif "bytes" in message:
                message = message["bytes"]

-                # # vad for input bytes audio
-                # vad.add_audio(message)
-                # message = b''.join(f for f in vad.vad_collector()
-                #                    if f is not None)
                engine_pool = get_engine_pool()
                asr_engine = engine_pool['asr']
                asr_results = ""
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@ -38,9 +38,7 @@ def get_predictor(args, filed='am'):
        config.enable_use_gpu(100, 0)
    elif args.device == "cpu":
        config.disable_gpu()
-    # This line must be commented for fastspeech2, if not, it will OOM
-    if model_name != 'fastspeech2':
-        config.enable_memory_optim()
+    config.enable_memory_optim()
    predictor = inference.create_predictor(config)
    return predictor

--- a/paddlespeech/t2s/exps/ort_predict.py
+++ b/paddlespeech/t2s/exps/ort_predict.py
@ -70,8 +70,15 @@ def ort_predict(args):

    # am warmup
    for T in [27, 38, 54]:
-        data = np.random.randint(1, 266, size=(T, ))
-        am_sess.run(None, {"text": data})
+        am_input_feed = {}
+        if am_name == 'fastspeech2':
+            phone_ids = np.random.randint(1, 266, size=(T, ))
+            am_input_feed.update({'text': phone_ids})
+        elif am_name == 'speedyspeech':
+            phone_ids = np.random.randint(1, 92, size=(T, ))
+            tone_ids = np.random.randint(1, 5, size=(T, ))
+            am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
+        am_sess.run(None, input_feed=am_input_feed)

    # voc warmup
    for T in [227, 308, 544]:
@ -81,14 +88,20 @@ def ort_predict(args):

    N = 0
    T = 0
+    am_input_feed = {}
    for example in test_dataset:
        utt_id = example['utt_id']
-        phone_ids = example["text"]
+        if am_name == 'fastspeech2':
+            phone_ids = example["text"]
+            am_input_feed.update({'text': phone_ids})
+        elif am_name == 'speedyspeech':
+            phone_ids = example["phones"]
+            tone_ids = example["tones"]
+            am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
        with timer() as t:
-            mel = am_sess.run(output_names=None, input_feed={'text': phone_ids})
+            mel = am_sess.run(output_names=None, input_feed=am_input_feed)
            mel = mel[0]
            wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
-
            N += len(wav[0])
            T += t.elapse
            speed = len(wav[0]) / t.elapse
@ -110,9 +123,7 @@ def parse_args():
        '--am',
        type=str,
        default='fastspeech2_csmsc',
-        choices=[
-            'fastspeech2_csmsc',
-        ],
+        choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
        help='Choose acoustic model type of tts task.')

    # voc
--- a/paddlespeech/t2s/exps/ort_predict_e2e.py
+++ b/paddlespeech/t2s/exps/ort_predict_e2e.py
@ -68,39 +68,58 @@ def ort_predict(args):
    # vocoder
    voc_sess = get_sess(args, filed='voc')

+    # frontend warmup
+    # Loading model cost 0.5+ seconds
+    if args.lang == 'zh':
+        frontend.get_input_ids("你好，欢迎使用飞桨框架进行深度学习研究！", merge_sentences=True)
+    else:
+        print("lang should in be 'zh' here!")
+
    # am warmup
    for T in [27, 38, 54]:
-        data = np.random.randint(1, 266, size=(T, ))
-        am_sess.run(None, {"text": data})
+        am_input_feed = {}
+        if am_name == 'fastspeech2':
+            phone_ids = np.random.randint(1, 266, size=(T, ))
+            am_input_feed.update({'text': phone_ids})
+        elif am_name == 'speedyspeech':
+            phone_ids = np.random.randint(1, 92, size=(T, ))
+            tone_ids = np.random.randint(1, 5, size=(T, ))
+            am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
+        am_sess.run(None, input_feed=am_input_feed)

    # voc warmup
    for T in [227, 308, 544]:
        data = np.random.rand(T, 80).astype("float32")
-        voc_sess.run(None, {"logmel": data})
+        voc_sess.run(None, input_feed={"logmel": data})
    print("warm up done!")

-    # frontend warmup
-    # Loading model cost 0.5+ seconds
-    if args.lang == 'zh':
-        frontend.get_input_ids("你好，欢迎使用飞桨框架进行深度学习研究！", merge_sentences=True)
-    else:
-        print("lang should in be 'zh' here!")
-
    N = 0
    T = 0
    merge_sentences = True
+    get_tone_ids = False
+    am_input_feed = {}
+    if am_name == 'speedyspeech':
+        get_tone_ids = True
    for utt_id, sentence in sentences:
        with timer() as t:
            if args.lang == 'zh':
                input_ids = frontend.get_input_ids(
-                    sentence, merge_sentences=merge_sentences)
-
+                    sentence,
+                    merge_sentences=merge_sentences,
+                    get_tone_ids=get_tone_ids)
                phone_ids = input_ids["phone_ids"]
+                if get_tone_ids:
+                    tone_ids = input_ids["tone_ids"]
            else:
                print("lang should in be 'zh' here!")
            # merge_sentences=True here, so we only use the first item of phone_ids
            phone_ids = phone_ids[0].numpy()
-            mel = am_sess.run(output_names=None, input_feed={'text': phone_ids})
+            if am_name == 'fastspeech2':
+                am_input_feed.update({'text': phone_ids})
+            elif am_name == 'speedyspeech':
+                tone_ids = tone_ids[0].numpy()
+                am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
+            mel = am_sess.run(output_names=None, input_feed=am_input_feed)
            mel = mel[0]
            wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})

@ -125,9 +144,7 @@ def parse_args():
        '--am',
        type=str,
        default='fastspeech2_csmsc',
-        choices=[
-            'fastspeech2_csmsc',
-        ],
+        choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -68,13 +68,15 @@ def evaluate(args):
    # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
    if am_name == 'tacotron2':
        merge_sentences = True
+
+    get_tone_ids = False
+    if am_name == 'speedyspeech':
+        get_tone_ids = True
+
    N = 0
    T = 0
    for utt_id, sentence in sentences:
        with timer() as t:
-            get_tone_ids = False
-            if am_name == 'speedyspeech':
-                get_tone_ids = True
            if args.lang == 'zh':
                input_ids = frontend.get_input_ids(
                    sentence,
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -667,8 +667,8 @@ class FastSpeech2(nn.Layer):
            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
                If true, groundtruth of duration, pitch and energy will be used.
            spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None)
-            spk_id(Tensor, optional(int64), optional): Batch of padded spk ids  (1,). (Default value = None)
-            tone_id(Tensor, optional(int64), optional): Batch of padded tone ids  (T,). (Default value = None)
+            spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
+            tone_id(Tensor, optional(int64), optional): tone ids (T,). (Default value = None)

        Returns:

@ -751,7 +751,6 @@ class FastSpeech2(nn.Layer):

        Returns:

-        
        """
        if self.tone_embed_integration_type == "add":
            # apply projection and then add to hidden states
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@ -11,17 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List
+
 import paddle
 from paddle import nn

 from paddlespeech.t2s.modules.nets_utils import initialize
-from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
 from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding


 class ResidualBlock(nn.Layer):
-    def __init__(self, channels, kernel_size, dilation, n=2):
+    def __init__(self,
+                 channels: int=128,
+                 kernel_size: int=3,
+                 dilation: int=3,
+                 n: int=2):
+        """Residual block of stacked dilated 1D convolutions.
+        Args:
+            channels (int, optional): Feature size of the residual output(and also the input).
+            kernel_size (int, optional): Kernel size of the 1D convolution.
+            dilation (int, optional): Dilation of the 1D convolution.
+            n (int): Number of blocks.
+        """
+
        super().__init__()
+        total_pad = (dilation * (kernel_size - 1))
+        begin = total_pad // 2
+        end = total_pad - begin
+        # use explicit padding instead of padding='same', because ONNX doesn't support dilation + 'same' padding
        blocks = [
            nn.Sequential(
                nn.Conv1D(
@ -29,14 +47,20 @@ class ResidualBlock(nn.Layer):
                    channels,
                    kernel_size,
                    dilation=dilation,
-                    padding="same",
-                    data_format="NLC"),
+                    # make sure output T == input T
+                    padding=((0, 0), (0, 0), (begin, end))),
                nn.ReLU(),
-                nn.BatchNorm1D(channels, data_format="NLC"), ) for _ in range(n)
+                nn.BatchNorm1D(channels), ) for _ in range(n)
        ]
        self.blocks = nn.Sequential(*blocks)

-    def forward(self, x):
+    def forward(self, x: paddle.Tensor):
+        """Calculate forward propagation.
+        Args:
+            x(Tensor): Batch of input sequences (B, hidden_size, Tmax).
+        Returns:
+            Tensor: The residual output (B, hidden_size, Tmax).
+        """
        return x + self.blocks(x)


@ -62,7 +86,15 @@ class TextEmbedding(nn.Layer):
                tone_vocab_size, tone_embedding_size, tone_padding_idx)
        self.concat = concat

-    def forward(self, text, tone=None):
+    def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None):
+        """Calculate forward propagation.
+        Args:
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            tone(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+        Returns:
+            Tensor: The embedded output (B, Tmax, embedding_size).
+        """
+
        text_embed = self.text_embedding(text)
        if tone is None:
            return text_embed
@ -75,13 +107,24 @@ class TextEmbedding(nn.Layer):


 class SpeedySpeechEncoder(nn.Layer):
+    """SpeedySpeech encoder module.
+    Args:
+        vocab_size (int): Dimension of the inputs.
+        tone_size (Optional[int]): Number of tones.
+        hidden_size (int): Number of encoder hidden units.
+        kernel_size (int): Kernel size of encoder.
+        dilations (List[int]): Dilations of encoder.
+        spk_num (Optional[int]): Number of speakers. 
+    """
+
    def __init__(self,
-                 vocab_size,
-                 tone_size,
-                 hidden_size,
-                 kernel_size,
-                 dilations,
+                 vocab_size: int,
+                 tone_size: int,
+                 hidden_size: int=128,
+                 kernel_size: int=3,
+                 dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
                 spk_num=None):
+
        super().__init__()
        self.embedding = TextEmbedding(
            vocab_size,
@ -109,34 +152,71 @@ class SpeedySpeechEncoder(nn.Layer):
        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
        self.postnet2 = nn.Sequential(
            nn.ReLU(),
-            nn.BatchNorm1D(hidden_size, data_format="NLC"),
-            nn.Linear(hidden_size, hidden_size), )
-
-    def forward(self, text, tones, spk_id=None):
+            nn.BatchNorm1D(hidden_size), )
+        self.linear = nn.Linear(hidden_size, hidden_size)
+
+    def forward(self,
+                text: paddle.Tensor,
+                tones: paddle.Tensor,
+                spk_id: paddle.Tensor=None):
+        """Encode input sequence.
+        Args:
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+            spk_id(Tensor, optional(int64)): Batch of speaker ids (B,)
+
+        Returns:
+            Tensor: Output tensor (B, Tmax, hidden_size).
+        """
        embedding = self.embedding(text, tones)
        if self.spk_emb:
            embedding += self.spk_emb(spk_id).unsqueeze(1)
        embedding = self.prenet(embedding)
-        x = self.res_blocks(embedding)
+        x = self.res_blocks(embedding.transpose([0, 2, 1])).transpose([0, 2, 1])
+        # (B, T, dim)
        x = embedding + self.postnet1(x)
-        x = self.postnet2(x)
+        x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
+        x = self.linear(x)
        return x


 class DurationPredictor(nn.Layer):
-    def __init__(self, hidden_size):
+    def __init__(self, hidden_size: int=128):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualBlock(hidden_size, 4, 1, n=1),
            ResidualBlock(hidden_size, 3, 1, n=1),
-            ResidualBlock(hidden_size, 1, 1, n=1), nn.Linear(hidden_size, 1))
+            ResidualBlock(hidden_size, 1, 1, n=1), )
+        self.linear = nn.Linear(hidden_size, 1)

-    def forward(self, x):
-        return paddle.squeeze(self.layers(x), -1)
+    def forward(self, x: paddle.Tensor):
+        """Calculate forward propagation.
+        Args:
+            x(Tensor): Batch of input sequences (B, Tmax, hidden_size).
+
+        Returns:
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
+        """
+        x = self.layers(x.transpose([0, 2, 1])).transpose([0, 2, 1])
+        x = self.linear(x)
+        return paddle.squeeze(x, -1)


 class SpeedySpeechDecoder(nn.Layer):
-    def __init__(self, hidden_size, output_size, kernel_size, dilations):
+    def __init__(self,
+                 hidden_size: int=128,
+                 output_size: int=80,
+                 kernel_size: int=3,
+                 dilations: List[int]=[
+                     1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1
+                 ]):
+        """SpeedySpeech decoder module.
+        Args:
+            hidden_size (int): Number of decoder hidden units.
+            kernel_size (int): Kernel size of decoder.
+            output_size (int): Dimension of the outputs.
+            dilations (List[int]): Dilations of decoder.
+        """
        super().__init__()
        res_blocks = [
            ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
@ -144,14 +224,21 @@ class SpeedySpeechDecoder(nn.Layer):
        self.res_blocks = nn.Sequential(*res_blocks)

        self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
-        self.postnet2 = nn.Sequential(
-            ResidualBlock(hidden_size, kernel_size, 1, n=2),
-            nn.Linear(hidden_size, output_size))
+        self.postnet2 = ResidualBlock(hidden_size, kernel_size, 1, n=2)
+        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
-        xx = self.res_blocks(x)
+        """Decode input sequence.
+        Args:
+            x(Tensor): Input tensor (B, time, hidden_size).
+
+        Returns:
+            Tensor: Output tensor (B, time, output_size).
+        """
+        xx = self.res_blocks(x.transpose([0, 2, 1])).transpose([0, 2, 1])
        x = x + self.postnet1(xx)
-        x = self.postnet2(x)
+        x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
+        x = self.linear(x)
        return x


@ -159,17 +246,35 @@ class SpeedySpeech(nn.Layer):
    def __init__(
            self,
            vocab_size,
-            encoder_hidden_size,
-            encoder_kernel_size,
-            encoder_dilations,
-            duration_predictor_hidden_size,
-            decoder_hidden_size,
-            decoder_output_size,
-            decoder_kernel_size,
-            decoder_dilations,
-            tone_size=None,
-            spk_num=None,
-            init_type: str="xavier_uniform", ):
+            encoder_hidden_size: int=128,
+            encoder_kernel_size: int=3,
+            encoder_dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
+            duration_predictor_hidden_size: int=128,
+            decoder_hidden_size: int=128,
+            decoder_output_size: int=80,
+            decoder_kernel_size: int=3,
+            decoder_dilations: List[
+                int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
+            tone_size: int=None,
+            spk_num: int=None,
+            init_type: str="xavier_uniform",
+            positional_dropout_rate: int=0.1):
+        """Initialize SpeedySpeech module.
+        Args:
+            vocab_size (int): Dimension of the inputs.
+            encoder_hidden_size (int): Number of encoder hidden units.
+            encoder_kernel_size (int): Kernel size of encoder.
+            encoder_dilations (List[int]): Dilations of encoder.
+            duration_predictor_hidden_size (int): Number of duration predictor hidden units.
+            decoder_hidden_size (int): Number of decoder hidden units.
+            decoder_kernel_size (int): Kernel size of decoder.
+            decoder_dilations (List[int]): Dilations of decoder.
+            decoder_output_size (int): Dimension of the outputs.
+            tone_size (Optional[int]): Number of tones.
+            spk_num (Optional[int]): Number of speakers. 
+            init_type (str): How to initialize transformer parameters.
+    
+        """
        super().__init__()

        # initialize parameters
@ -181,6 +286,8 @@ class SpeedySpeech(nn.Layer):
        duration_predictor = DurationPredictor(duration_predictor_hidden_size)
        decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
                                      decoder_kernel_size, decoder_dilations)
+        self.position_enc = ScaledPositionalEncoding(encoder_hidden_size,
+                                                     positional_dropout_rate)

        self.encoder = encoder
        self.duration_predictor = duration_predictor
@ -190,7 +297,22 @@ class SpeedySpeech(nn.Layer):

        nn.initializer.set_global_initializer(None)

-    def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
+    def forward(self,
+                text: paddle.Tensor,
+                tones: paddle.Tensor,
+                durations: paddle.Tensor,
+                spk_id: paddle.Tensor=None):
+        """Calculate forward propagation.
+        Args:
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+            tones(Tensor, optional(int64)): Batch of padded tone ids  (B, Tmax).
+            spk_id(Tensor, optional(int64)): Batch of speaker ids (B,)
+
+        Returns:
+            Tensor: Output tensor (B, T_frames, decoder_output_size).
+            Tensor: Predicted durations (B, Tmax).
+        """
        # input of embedding must be int64
        text = paddle.cast(text, 'int64')
        tones = paddle.cast(tones, 'int64')
@ -198,23 +320,30 @@ class SpeedySpeech(nn.Layer):
            spk_id = paddle.cast(spk_id, 'int64')
        durations = paddle.cast(durations, 'int64')
        encodings = self.encoder(text, tones, spk_id)
-
        pred_durations = self.duration_predictor(encodings.detach())
-
        # expand encodings
        durations_to_expand = durations
        encodings = self.length_regulator(encodings, durations_to_expand)
-
+        encodings = self.position_enc(encodings)
        # decode
-        # remove positional encoding here
-        _, t_dec, feature_size = encodings.shape
-        encodings += sinusoid_position_encoding(t_dec, feature_size)
        decoded = self.decoder(encodings)
        return decoded, pred_durations

-    def inference(self, text, tones=None, durations=None, spk_id=None):
-        # text: [T]
-        # tones: [T]
+    def inference(self,
+                  text: paddle.Tensor,
+                  tones: paddle.Tensor=None,
+                  durations: paddle.Tensor=None,
+                  spk_id: paddle.Tensor=None):
+        """Generate the sequence of features given the sequences of characters.
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            tones(Tensor, optional(int64)): tone ids (T,).
+            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+            spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
+
+        Returns:
+            Tensor: logmel (T, decoder_output_size).
+        """
        # input of embedding must be int64
        text = paddle.cast(text, 'int64')
        text = text.unsqueeze(0)
@ -233,10 +362,7 @@ class SpeedySpeech(nn.Layer):
            durations_to_expand = durations
        encodings = self.length_regulator(
            encodings, durations_to_expand, is_inference=True)
-
-        shape = paddle.shape(encodings)
-        t_dec, feature_size = shape[1], shape[2]
-        encodings += sinusoid_position_encoding(t_dec, feature_size)
+        encodings = self.position_enc(encodings)
        decoded = self.decoder(encodings)
        return decoded[0]

--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@ -86,7 +86,7 @@ class LengthRegulator(nn.Layer):
            M[:, i] = m - init
            init = m
        M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc])
-        M = M[1:, :, :]
+        M = M[1:t_dec_1, :, :]
        M = paddle.transpose(M, (1, 0, 2))
        encodings = paddle.matmul(M, encodings)
        return encodings
--- a/paddlespeech/t2s/modules/residual_block.py
+++ b/paddlespeech/t2s/modules/residual_block.py
@ -30,7 +30,7 @@ class WaveNetResidualBlock(nn.Layer):

    Args:
        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
-        residual_channels (int, optional): Feature size of the resiaudl output(and also the input), by default 64
+        residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64
        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
        skip_channels (int, optional): Feature size of the skip output, by default 64
        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@ -347,7 +347,7 @@ class TransformerEncoder(BaseEncoder):
            encoder_type="transformer")

    def forward(self, xs, masks):
-        """Encode input sequence.
+        """Encode input sequence.

        Args:
            xs(Tensor): Input tensor (#batch, time, idim).
@ -355,7 +355,7 @@ class TransformerEncoder(BaseEncoder):

        Returns:
            Tensor: Output tensor (#batch, time, attention_dim).
-            Tensor:Mask tensor (#batch, 1, time).
+            Tensor: Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# Modified from speechbrain(https://github.com/speechbrain/speechbrain)
 """
 This script contains basic functions used for speaker diarization.
 This script has an optional dependency on open source sklearn library.
@ -18,11 +19,11 @@ A few sklearn functions are modified in this script as per requirement.
 """
 import argparse
 import warnings
+from distutils.util import strtobool

 import numpy as np
 import scipy
 import sklearn
-from distutils.util import strtobool
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.sparse.csgraph import laplacian as csgraph_laplacian
--- a/paddlespeech/vector/io/dataset.py
+++ b/paddlespeech/vector/io/dataset.py
@ -13,6 +13,7 @@
 # limitations under the License.
 from dataclasses import dataclass
 from dataclasses import fields
+
 from paddle.io import Dataset

 from paddleaudio import load as load_audio
--- a/paddlespeech/vector/io/dataset_from_json.py
+++ b/paddlespeech/vector/io/dataset_from_json.py
@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
-
 from dataclasses import dataclass
 from dataclasses import fields
+
 from paddle.io import Dataset

 from paddleaudio import load as load_audio
--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/examples/CMakeLists.txt
@ -1,7 +1,4 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

-add_subdirectory(feat)
-add_subdirectory(nnet)
-add_subdirectory(decoder)
-
-add_subdirectory(glog)
+add_subdirectory(ds2_ol)
+add_subdirectory(dev)
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@ -1,17 +1,25 @@
-# Examples
+# Examples for SpeechX
+
+* dev - for speechx developers; used for testing.
+* ngram - used to build an n-gram ARPA LM.
+* ds2_ol - ds2 streaming test on the `aishell-1` test dataset.
+ The entrypoint is `ds2_ol/aishell/run.sh`

-* glog - glog usage
-* feat - mfcc, linear 
-* nnet - ds2 nn
-* decoder - online decoder to work as offline

 ## How to run

 `run.sh` is the entry point.

+Example of running `ds2_ol`:
+Example to play `ds2_ol`:

 ```
-pushd decoder
+pushd ds2_ol/aishell
 bash run.sh
 ```
+
+## Display Model with [Netron](https://github.com/lutzroeder/netron)
+
+```
+pip install netron
+netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host 10.21.55.20
+```
--- a/speechx/examples/aishell/utils
+++ b/speechx/examples/aishell/utils
@ -1 +0,0 @@
-../../../utils
--- a/speechx/examples/decoder/CMakeLists.txt
+++ b/speechx/examples/decoder/CMakeLists.txt
@ -1,18 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_executable(offline_decoder_sliding_chunk_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_sliding_chunk_main.cc)
-target_include_directories(offline_decoder_sliding_chunk_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(offline_decoder_sliding_chunk_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-add_executable(offline_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_main.cc)
-target_include_directories(offline_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(offline_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-add_executable(offline_wfst_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_wfst_decoder_main.cc)
-target_include_directories(offline_wfst_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(offline_wfst_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
-
-add_executable(decoder_test_main ${CMAKE_CURRENT_SOURCE_DIR}/decoder_test_main.cc)
-target_include_directories(decoder_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(decoder_test_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@ -1,121 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "decoder/ctc_beam_search_decoder.h"
-#include "frontend/audio/data_cache.h"
-#include "kaldi/util/table-types.h"
-#include "nnet/decodable.h"
-#include "nnet/paddle_nnet.h"
-
-DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
-DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
-DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
-DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
-DEFINE_string(lm_path, "lm.klm", "language model");
-DEFINE_int32(chunk_size, 35, "feat chunk size");
-
-
-using kaldi::BaseFloat;
-using kaldi::Matrix;
-using std::vector;
-
-// test decoder by feeding speech feature, deprecated.
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_respecifier);
-    std::string model_graph = FLAGS_model_path;
-    std::string model_params = FLAGS_param_path;
-    std::string dict_file = FLAGS_dict_file;
-    std::string lm_path = FLAGS_lm_path;
-    int32 chunk_size = FLAGS_chunk_size;
-    LOG(INFO) << "model path: " << model_graph;
-    LOG(INFO) << "model param: " << model_params;
-    LOG(INFO) << "dict path: " << dict_file;
-    LOG(INFO) << "lm path: " << lm_path;
-    LOG(INFO) << "chunk size (frame): " << chunk_size;
-
-    int32 num_done = 0, num_err = 0;
-
-    // frontend + nnet is decodable
-    ppspeech::ModelOptions model_opts;
-    model_opts.model_path = model_graph;
-    model_opts.params_path = model_params;
-    std::shared_ptr<ppspeech::PaddleNnet> nnet(
-        new ppspeech::PaddleNnet(model_opts));
-    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
-    std::shared_ptr<ppspeech::Decodable> decodable(
-        new ppspeech::Decodable(nnet, raw_data));
-    LOG(INFO) << "Init decodeable.";
-
-    // init decoder
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-    LOG(INFO) << "Init decoder.";
-
-    decoder.InitDecoder();
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();
-        const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
-        LOG(INFO) << "utt: " << utt;
-
-        // feat dim
-        raw_data->SetDim(feature.NumCols());
-        LOG(INFO) << "dim: " << raw_data->Dim();
-
-        int32 row_idx = 0;
-        int32 num_chunks = feature.NumRows() / chunk_size;
-        LOG(INFO) << "n chunks: " << num_chunks;
-        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
-            // feat chunk
-            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
-                                                          feature.NumCols());
-            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> feat_one_row(feature,
-                                                                row_idx);
-                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
-                    feature_chunk.Data() + row_id * feature.NumCols(),
-                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(feat_one_row);
-                row_idx++;
-            }
-            // feed to raw cache
-            raw_data->Accept(feature_chunk);
-            if (chunk_idx == num_chunks - 1) {
-                raw_data->SetFinished();
-            }
-            // decode step
-            decoder.AdvanceDecode(decodable);
-        }
-
-        std::string result;
-        result = decoder.GetFinalBestPath();
-        KALDI_LOG << " the result of " << utt << " is " << result;
-        decodable->Reset();
-        decoder.Reset();
-        ++num_done;
-    }
-
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@ -1,43 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT} 
-    bash build.sh
-    popd
-fi
-
-
-# 2. download model
-if [ ! -d ../paddle_asr_model ]; then
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz
-    tar xzfv paddle_asr_model.tar.gz
-    mv ./paddle_asr_model ../
-    # produce wav scp
-    echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp
-fi
-
-model_dir=../paddle_asr_model
-feat_wspecifier=./feats.ark
-cmvn=./cmvn.ark
-
-
-export GLOG_logtostderr=1
-
-# 3. gen linear feat
-linear_spectrogram_main \
-    --wav_rspecifier=scp:$model_dir/wav.scp \
-    --feature_wspecifier=ark,t:$feat_wspecifier \
-    --cmvn_write_path=$cmvn
-
-# 4. run decoder
-offline_decoder_main \
-    --feature_respecifier=ark:$feat_wspecifier \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --param_path=$model_dir/avg_1.jit.pdparams \
-    --dict_file=$model_dir/vocab.txt \
-    --lm_path=$model_dir/avg_1.jit.klm
--- a/speechx/examples/dev/CMakeLists.txt
+++ b/speechx/examples/dev/CMakeLists.txt
@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_subdirectory(glog)
--- a/speechx/examples/dev/glog/CMakeLists.txt
+++ b/speechx/examples/dev/glog/CMakeLists.txt
--- a/speechx/examples/dev/glog/README.md
+++ b/speechx/examples/dev/glog/README.md
--- a/speechx/examples/dev/glog/glog_logtostderr_test.cc
+++ b/speechx/examples/dev/glog/glog_logtostderr_test.cc
--- a/speechx/examples/dev/glog/glog_test.cc
+++ b/speechx/examples/dev/glog/glog_test.cc
--- a/speechx/examples/dev/glog/path.sh
+++ b/speechx/examples/dev/glog/path.sh
@ -1,14 +1,15 @@
 # This contains the locations of binarys build required for running the examples.

-SPEECHX_ROOT=$PWD/../..
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_ROOT=$PWD/../../../

 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

-export LC_AL=C
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

-SPEECHX_BIN=$SPEECHX_EXAMPLES/nnet
+SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
+
+export LC_AL=C
--- a/speechx/examples/dev/glog/run.sh
+++ b/speechx/examples/dev/glog/run.sh
--- a/speechx/examples/ds2_ol/CMakeLists.txt
+++ b/speechx/examples/ds2_ol/CMakeLists.txt
@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_subdirectory(feat)
+add_subdirectory(nnet)
+add_subdirectory(decoder)
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@ -0,0 +1,11 @@
+# Deepspeech2 Streaming
+
+Please go to `aishell` to test it.
+
+* aishell
+Deepspeech2 streaming decoding on the aishell dataset.
+
+The following are for development and offline testing:
+* nnet
+* feat
+* decoder
--- a/speechx/examples/ds2_ol/aishell/.gitignore
+++ b/speechx/examples/ds2_ol/aishell/.gitignore
@ -0,0 +1,3 @@
+data
+exp
+aishell_*
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@ -0,0 +1,21 @@
+# Aishell - Deepspeech2 Streaming
+
+## CTC Prefix Beam Search w/o LM
+
+```
+Overall -> 16.14 % N=104612 C=88190 S=16110 D=312 I=465
+Mandarin -> 16.14 % N=104612 C=88190 S=16110 D=312 I=465
+Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
+```
+
+## CTC Prefix Beam Search w LM
+
+```
+
+```
+
+## CTC WFST
+
+```
+
+```
--- a/speechx/examples/ds2_ol/aishell/local/split_data.sh
+++ b/speechx/examples/ds2_ol/aishell/local/split_data.sh
--- a/speechx/examples/ds2_ol/aishell/path.sh
+++ b/speechx/examples/ds2_ol/aishell/path.sh
@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.

-SPEECHX_ROOT=$PWD/../..
+SPEECHX_ROOT=$PWD/../../../
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

 export LC_AL=C

-SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat
+SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@ -4,6 +4,9 @@ set -e

 . path.sh

+nj=40
+
+
 # 1. compile
 if [ ! -d ${SPEECHX_EXAMPLES} ]; then
    pushd ${SPEECHX_ROOT} 
@ -11,52 +14,59 @@ if [ ! -d ${SPEECHX_EXAMPLES} ]; then
    popd
 fi

-
-# 2. download model
-if [ ! -d ../paddle_asr_model ]; then
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz
-    tar xzfv paddle_asr_model.tar.gz
-    mv ./paddle_asr_model ../
-    # produce wav scp
-    echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp
-fi
-
+# input
 mkdir -p data
 data=$PWD/data
+
+ckpt_dir=$data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+vocb_dir=$ckpt_dir/data/lang_char/
+
+# output
+mkdir -p exp
+exp=$PWD/exp
+
 aishell_wav_scp=aishell_test.scp
 if [ ! -d $data/test ]; then
+    pushd $data
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    unzip -d $data aishell_test.zip
+    unzip  aishell_test.zip
+    popd
+
    realpath $data/test/*/*.wav > $data/wavlist
    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
 fi

-model_dir=$PWD/aishell_ds2_online_model
-if [ ! -d $model_dir ]; then
-    mkdir -p $model_dir 
-    wget -P $model_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $model_dir
+
+if [ ! -d $ckpt_dir ]; then
+    mkdir -p $ckpt_dir
+    wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
+fi
+
+lm=$data/zh_giga.no_cna_cmn.prune01244.klm
+if [ ! -f $lm ]; then
+    pushd $data
+    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
+    popd
 fi

 # 3. make feature
-aishell_online_model=$model_dir/exp/deepspeech2_online/checkpoints
-lm_model_dir=../paddle_asr_model
 label_file=./aishell_result
 wer=./aishell_wer

-nj=40
 export GLOG_logtostderr=1

-#./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
-
-data=$PWD/data
 # 3. gen linear feat
 cmvn=$PWD/cmvn.ark
-cmvn_json2binary_main --json_file=$model_dir/data/mean_std.json --cmvn_write_path=$cmvn
+cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn

-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat_log \
-linear_spectrogram_without_db_norm_main \
+
+./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
+linear-spectrogram-wo-db-norm-ol \
    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
    --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
    --cmvn_file=$cmvn \
@ -65,31 +75,33 @@ linear_spectrogram_without_db_norm_main \
 text=$data/test/text

 # 4. recognizer
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
-  offline_decoder_sliding_chunk_main \
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
+  ctc-prefix-beam-search-decoder-ol \
    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
-    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --dict_file=$lm_model_dir/vocab.txt \
+    --dict_file=$vocb_dir/vocab.txt \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result

 cat $data/split${nj}/*/result > ${label_file}
-local/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer}
+utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer}

 # 4. decode with lm
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_lm \
-  offline_decoder_sliding_chunk_main \
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
+  ctc-prefix-beam-search-decoder-ol \
    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
-    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-    --dict_file=$lm_model_dir/vocab.txt \
-    --lm_path=$lm_model_dir/avg_1.jit.klm \
+    --dict_file=$vocb_dir/vocab.txt \
+    --lm_path=$lm \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm

+
 cat $data/split${nj}/*/result_lm > ${label_file}_lm
-local/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
+utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
+

 graph_dir=./aishell_graph
 if [ ! -d $ ]; then
@ -97,17 +109,19 @@ if [ ! -d $ ]; then
    unzip -d aishell_graph.zip
 fi

+
 # 5. test TLG decoder
-utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_tlg \
-  offline_wfst_decoder_main \
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
+  wfst-decoder-ol \
    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
-    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
-    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
    --word_symbol_table=$graph_dir/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
     --graph_path=$graph_dir/TLG.fst --max_active=7500 \
    --acoustic_scale=1.2 \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

+
 cat $data/split${nj}/*/result_tlg > ${label_file}_tlg
-local/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
+utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
--- a/speechx/examples/ds2_ol/aishell/utils
+++ b/speechx/examples/ds2_ol/aishell/utils
@ -0,0 +1 @@
+../../../../utils/
--- a/speechx/examples/ds2_ol/decoder/.gitignore
+++ b/speechx/examples/ds2_ol/decoder/.gitignore
@ -0,0 +1,2 @@
+data
+exp
--- a/speechx/examples/ds2_ol/decoder/CMakeLists.txt
+++ b/speechx/examples/ds2_ol/decoder/CMakeLists.txt
@ -0,0 +1,19 @@
+# Decoder example binaries for the DeepSpeech2 streaming model.
+# Each stanza sets `bin_name` and builds one executable from the source file
+# `<bin_name>.cc` in this directory, so binary and source names always match.
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# CTC prefix beam search decoder.
+set(bin_name ctc-prefix-beam-search-decoder-ol)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+
+
+# WFST decoder; the only target here that additionally links kaldi-decoder.
+set(bin_name wfst-decoder-ol)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
+
+
+# Decoder-only test binary (same link set as the CTC prefix beam search target).
+set(bin_name nnet-logprob-decoder-test)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+
--- a/speechx/examples/ds2_ol/decoder/README.md
+++ b/speechx/examples/ds2_ol/decoder/README.md
@ -0,0 +1,12 @@
+# ASR Decoder
+
+ASR Decoder test bins. We use these bins to test the CTC beam search decoder and the WFST decoder.
+
+* nnet-logprob-decoder-test.cc
+feeds nnet output log-probabilities to test the decoder alone.
+
+* ctc-prefix-beam-search-decoder-ol.cc
+feeds streaming audio features and decodes in a streaming manner.
+
+* wfst-decoder-ol.cc
+feeds streaming audio features and decodes with WFST in a streaming manner.
--- a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
+++ b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
@ -34,10 +34,12 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(
+    model_input_names,
+    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
+    "model input names");
 DEFINE_string(model_output_names,
-              "save_infer_model/scale_0.tmp_1,save_infer_model/"
-              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
-              "scale_3.tmp_1",
+              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
              "model output names");
 DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");

@ -50,9 +52,13 @@ int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
    google::InitGoogleLogging(argv[0]);

+    CHECK(FLAGS_result_wspecifier != "");
+    CHECK(FLAGS_feature_rspecifier != "");
+
    kaldi::SequentialBaseFloatMatrixReader feature_reader(
        FLAGS_feature_rspecifier);
    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
+
    std::string model_graph = FLAGS_model_path;
    std::string model_params = FLAGS_param_path;
    std::string dict_file = FLAGS_dict_file;
@ -73,6 +79,7 @@ int main(int argc, char* argv[]) {
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.input_names = FLAGS_model_input_names;
    model_opts.output_names = FLAGS_model_output_names;
    std::shared_ptr<ppspeech::PaddleNnet> nnet(
        new ppspeech::PaddleNnet(model_opts));
--- a/speechx/examples/ds2_ol/decoder/local/model.sh
+++ b/speechx/examples/ds2_ol/decoder/local/model.sh
--- a/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
+++ b/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
--- a/speechx/examples/ds2_ol/decoder/path.sh
+++ b/speechx/examples/ds2_ol/decoder/path.sh
@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.

-SPEECHX_ROOT=$PWD/../..
+SPEECHX_ROOT=$PWD/../../../
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

 export LC_AL=C

-SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat
+SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
--- a/speechx/examples/ds2_ol/decoder/run.sh
+++ b/speechx/examples/ds2_ol/decoder/run.sh
@ -0,0 +1,79 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# input
+mkdir -p data
+data=$PWD/data
+ckpt_dir=$data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+vocb_dir=$ckpt_dir/data/lang_char/
+
+lm=$data/zh_giga.no_cna_cmn.prune01244.klm
+
+# output
+exp_dir=./exp
+mkdir -p $exp_dir
+
+# 2. download model
+if [[ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+if [ ! -f data/wav.scp ]; then
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+# download lm
+if [ ! -f $lm ]; then
+    pushd data
+    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
+    popd
+fi
+
+
+feat_wspecifier=$exp_dir/feats.ark
+cmvn=$exp_dir/cmvn.ark
+
+export GLOG_logtostderr=1
+
+# dump json cmvn to kaldi
+cmvn-json2kaldi \
+    --json_file  $ckpt_dir/data/mean_std.json \
+    --cmvn_write_path $exp_dir/cmvn.ark \
+    --binary=false
+echo "convert json cmvn to kaldi ark."
+
+
+# generate linear feature as streaming
+linear-spectrogram-wo-db-norm-ol \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --feature_wspecifier=ark,t:$feat_wspecifier \
+    --cmvn_file=$exp_dir/cmvn.ark
+echo "compute linear spectrogram feature."
+
+# run ctc beam search decoder as streaming
+ctc-prefix-beam-search-decoder-ol \
+    --result_wspecifier=ark,t:$exp_dir/result.txt \
+    --feature_rspecifier=ark:$feat_wspecifier \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
+    --dict_file=$vocb_dir/vocab.txt \
+    --lm_path=$lm
--- a/speechx/examples/ds2_ol/decoder/valgrind.sh
+++ b/speechx/examples/ds2_ol/decoder/valgrind.sh
--- a/speechx/examples/decoder/offline_wfst_decoder_main.cc
+++ b/speechx/examples/decoder/offline_wfst_decoder_main.cc
@ -28,6 +28,7 @@ DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
+
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "decoder graph");
 DEFINE_int32(receptive_field_length,
--- a/speechx/examples/ds2_ol/feat/.gitignore
+++ b/speechx/examples/ds2_ol/feat/.gitignore
@ -0,0 +1,2 @@
+exp
+data
--- a/speechx/examples/ds2_ol/feat/CMakeLists.txt
+++ b/speechx/examples/ds2_ol/feat/CMakeLists.txt
@ -0,0 +1,12 @@
+# Feature example binaries for the DeepSpeech2 streaming model.
+# Each stanza sets `bin_name` and builds one executable from `<bin_name>.cc`.
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# Linear spectrogram feature extractor.
+set(bin_name linear-spectrogram-wo-db-norm-ol)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
+
+
+# Converter from JSON cmvn statistics to a Kaldi matrix file.
+set(bin_name cmvn-json2kaldi)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
--- a/speechx/examples/ds2_ol/feat/README.md
+++ b/speechx/examples/ds2_ol/feat/README.md
@ -0,0 +1,7 @@
+# Deepspeech2 Streaming Audio Feature
+
+ASR audio feature test bins. We use these bins to test linear/fbank/mfcc ASR features in a streaming manner.
+
+* linear-spectrogram-wo-db-norm-ol.cc
+
+compute linear spectrogram w/o db norm in streaming manner.
--- a/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
+++ b/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
@ -0,0 +1,81 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Note: Do not print/log ondemand object.
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/kaldi-io.h"
+#include "utils/file_utils.h"
+#include "utils/simdjson.h"
+
+DEFINE_string(json_file, "", "cmvn json file");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
+DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
+
+using namespace simdjson;
+
+// Converts cmvn statistics stored as JSON (--json_file) into a 2 x (dim + 1)
+// Kaldi matrix written to --cmvn_write_path, in binary or text form depending
+// on --binary.
+//
+// Matrix layout: row 0 holds "mean_stat" followed by "frame_num" in the last
+// column; row 1 holds "var_stat" (its last column keeps the matrix default).
+//
+// Returns 0 on success and 1 when the JSON cannot be loaded/parsed or when
+// "mean_stat" and "var_stat" have different lengths, so that callers running
+// under `set -e` stop instead of continuing without a cmvn file.
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
+
+    try {
+        padded_string json = padded_string::load(FLAGS_json_file);
+
+        ondemand::parser parser;
+        ondemand::document doc = parser.iterate(json);
+        ondemand::value val = doc;
+
+        // Do not stream an ondemand array (e.g. LOG(INFO) << mean_stat): that
+        // makes simdjson throw simdjson_error("Objects and arrays can only be
+        // iterated when they are first encountered").
+        ondemand::array mean_stat = val["mean_stat"];
+        std::vector<kaldi::BaseFloat> mean_stat_vec;
+        for (double x : mean_stat) {
+            mean_stat_vec.push_back(x);
+        }
+
+        ondemand::array var_stat = val["var_stat"];
+        std::vector<kaldi::BaseFloat> var_stat_vec;
+        for (double x : var_stat) {
+            var_stat_vec.push_back(x);
+        }
+
+        // Both vectors are indexed with the same column index below; a length
+        // mismatch would read past the end of var_stat_vec.
+        if (mean_stat_vec.size() != var_stat_vec.size()) {
+            LOG(ERROR) << "mean_stat size " << mean_stat_vec.size()
+                       << " != var_stat size " << var_stat_vec.size();
+            return 1;
+        }
+
+        kaldi::int32 frame_num = uint64_t(val["frame_num"]);
+        LOG(INFO) << "nframe: " << frame_num;
+
+        size_t mean_size = mean_stat_vec.size();
+        kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
+        for (size_t idx = 0; idx < mean_size; ++idx) {
+            cmvn_stats(0, idx) = mean_stat_vec[idx];
+            cmvn_stats(1, idx) = var_stat_vec[idx];
+        }
+        cmvn_stats(0, mean_size) = frame_num;
+        LOG(INFO) << cmvn_stats;
+
+        kaldi::WriteKaldiObject(
+            cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
+        LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path;
+        LOG(INFO) << "Binary: " << FLAGS_binary;
+    } catch (const simdjson::simdjson_error& err) {
+        // glog severities are INFO/WARNING/ERROR/FATAL.
+        LOG(ERROR) << err.what();
+        return 1;
+    }
+
+    return 0;
+}
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
@ -32,6 +32,7 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
 DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");

+
 int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
    google::InitGoogleLogging(argv[0]);
--- a/speechx/examples/ds2_ol/feat/path.sh
+++ b/speechx/examples/ds2_ol/feat/path.sh
@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.

-SPEECHX_ROOT=$PWD/../..
+SPEECHX_ROOT=$PWD/../../../
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

 export LC_AL=C

-SPEECHX_BIN=$SPEECHX_EXAMPLES/feat
+SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/feat
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
--- a/speechx/examples/ds2_ol/feat/run.sh
+++ b/speechx/examples/ds2_ol/feat/run.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+# Feature extraction smoke test: fetch the model archive and a test wav,
+# convert the model's JSON cmvn statistics to a Kaldi ark, then compute linear
+# spectrogram features for the single utterance listed in data/wav.scp.
+set +x
+set -e
+
+# Defines SPEECHX_ROOT / SPEECHX_EXAMPLES and puts the example bins on PATH.
+. ./path.sh
+
+# 1. compile
+# Build only when the examples build directory does not exist yet.
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+# The presence of the tarball is used as the "already downloaded" marker.
+if [ ! -e data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+# One entry: utterance id "utt1" mapped to the absolute path of zh.wav
+# ($PWD is data/ here because of the pushd).
+if [ ! -f data/wav.scp ]; then
+    mkdir -p data
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+
+# input
+data_dir=./data
+exp_dir=./exp
+model_dir=$data_dir/model/
+
+mkdir -p $exp_dir
+
+
+# 3. run feat
+export GLOG_logtostderr=1
+
+# Write the cmvn statistics as a text-format Kaldi file (--binary=false).
+cmvn-json2kaldi \
+    --json_file  $model_dir/data/mean_std.json \
+    --cmvn_write_path $exp_dir/cmvn.ark \
+    --binary=false
+echo "convert json cmvn to kaldi ark."
+
+
+# Reads the cmvn file written by the previous step; features go to a text ark.
+linear-spectrogram-wo-db-norm-ol \
+    --wav_rspecifier=scp:$data_dir/wav.scp \
+    --feature_wspecifier=ark,t:$exp_dir/feats.ark \
+    --cmvn_file=$exp_dir/cmvn.ark
+echo "compute linear spectrogram feature."
+
+
--- a/speechx/examples/ds2_ol/feat/valgrind.sh
+++ b/speechx/examples/ds2_ol/feat/valgrind.sh
--- a/speechx/examples/ds2_ol/nnet/.gitignore
+++ b/speechx/examples/ds2_ol/nnet/.gitignore
@ -0,0 +1,2 @@
+data
+exp
--- a/speechx/examples/ds2_ol/nnet/CMakeLists.txt
+++ b/speechx/examples/ds2_ol/nnet/CMakeLists.txt
@ -0,0 +1,6 @@
+# NNet example binary for the DeepSpeech2 streaming model; the executable is
+# built from `<bin_name>.cc` in this directory.
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+set(bin_name ds2-model-ol-test)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS})
--- a/speechx/examples/ds2_ol/nnet/README.md
+++ b/speechx/examples/ds2_ol/nnet/README.md
@ -0,0 +1,3 @@
+# Deepspeech2 Streaming NNet Test
+
+Used for testing DS2 streaming nnet inference.
--- a/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
+++ b/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <gflags/gflags.h>
+// deepspeech2 online model info
+
 #include <algorithm>
 #include <fstream>
 #include <functional>
@ -20,21 +21,26 @@
 #include <iterator>
 #include <numeric>
 #include <thread>
+#include "base/flags.h"
+#include "base/log.h"
 #include "paddle_inference_api.h"

 using std::cout;
 using std::endl;

-DEFINE_string(model_path, "avg_1.jit.pdmodel", "xxx.pdmodel");
-DEFINE_string(param_path, "avg_1.jit.pdiparams", "xxx.pdiparams");
+
+DEFINE_string(model_path, "", "xxx.pdmodel");
+DEFINE_string(param_path, "", "xxx.pdiparams");
+DEFINE_int32(chunk_size, 35, "feature chunk size, unit:frame");
+DEFINE_int32(feat_dim, 161, "feature dim");


 void produce_data(std::vector<std::vector<float>>* data);
 void model_forward_test();

 void produce_data(std::vector<std::vector<float>>* data) {
-    int chunk_size = 35;  // chunk_size in frame
-    int col_size = 161;   // feat dim
+    int chunk_size = FLAGS_chunk_size;  // chunk_size in frame
+    int col_size = FLAGS_feat_dim;      // feat dim
    cout << "chunk size: " << chunk_size << endl;
    cout << "feat dim: " << col_size << endl;

@ -57,6 +63,8 @@ void model_forward_test() {
    ;
    std::string model_graph = FLAGS_model_path;
    std::string model_params = FLAGS_param_path;
+    CHECK(model_graph != "");
+    CHECK(model_params != "");
    cout << "model path: " << model_graph << endl;
    cout << "model param path : " << model_params << endl;

@ -106,7 +114,7 @@ void model_forward_test() {
    // state_h
    std::unique_ptr<paddle_infer::Tensor> chunk_state_h_box =
        predictor->GetInputHandle(input_names[2]);
-    std::vector<int> chunk_state_h_box_shape = {3, 1, 1024};
+    std::vector<int> chunk_state_h_box_shape = {5, 1, 1024};
    chunk_state_h_box->Reshape(chunk_state_h_box_shape);
    int chunk_state_h_box_size =
        std::accumulate(chunk_state_h_box_shape.begin(),
@ -119,7 +127,7 @@ void model_forward_test() {
    // state_c
    std::unique_ptr<paddle_infer::Tensor> chunk_state_c_box =
        predictor->GetInputHandle(input_names[3]);
-    std::vector<int> chunk_state_c_box_shape = {3, 1, 1024};
+    std::vector<int> chunk_state_c_box_shape = {5, 1, 1024};
    chunk_state_c_box->Reshape(chunk_state_c_box_shape);
    int chunk_state_c_box_size =
        std::accumulate(chunk_state_c_box_shape.begin(),
@ -187,7 +195,9 @@ void model_forward_test() {
 }

 int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, true);
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
    model_forward_test();
    return 0;
 }
--- a/speechx/examples/ds2_ol/nnet/path.sh
+++ b/speechx/examples/ds2_ol/nnet/path.sh
@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.

-SPEECHX_ROOT=$PWD/../..
+SPEECHX_ROOT=$PWD/../../../
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

 export LC_AL=C

-SPEECHX_BIN=$SPEECHX_EXAMPLES/glog
+SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/nnet
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
--- a/speechx/examples/ds2_ol/nnet/run.sh
+++ b/speechx/examples/ds2_ol/nnet/run.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+# NNet smoke test: fetch the DeepSpeech2 online model archive and run
+# ds2-model-ol-test against the exported model graph and parameters.
+set +x
+set -e
+
+# Defines SPEECHX_ROOT / SPEECHX_EXAMPLES and puts the example bins on PATH.
+. path.sh
+
+# 1. compile
+# Build only when the examples build directory does not exist yet.
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+# The presence of the tarball is used as the "already downloaded" marker.
+if [ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+# NOTE(review): the command at the end of this script is only given
+# --model_path/--param_path; wav.scp does not appear to be consumed here.
+if [ ! -f data/wav.scp ]; then
+    mkdir -p data
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+# Exported inference model lives under the extracted checkpoint directory.
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+
+# Both flags default to "" in the binary and are CHECKed to be non-empty.
+ds2-model-ol-test \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams
+
--- a/speechx/examples/ds2_ol/nnet/valgrind.sh
+++ b/speechx/examples/ds2_ol/nnet/valgrind.sh
--- a/speechx/examples/feat/CMakeLists.txt
+++ b/speechx/examples/feat/CMakeLists.txt
@ -1,18 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-
-add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc)
-target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(mfcc-test kaldi-mfcc)
-
-add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
-target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
-
-add_executable(linear_spectrogram_without_db_norm_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_without_db_norm_main.cc)
-target_include_directories(linear_spectrogram_without_db_norm_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear_spectrogram_without_db_norm_main frontend kaldi-util kaldi-feat-common gflags glog)
-
-add_executable(cmvn_json2binary_main ${CMAKE_CURRENT_SOURCE_DIR}/cmvn_json2binary_main.cc)
-target_include_directories(cmvn_json2binary_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(cmvn_json2binary_main utils kaldi-util kaldi-matrix gflags glog)
--- a/speechx/examples/feat/cmvn_json2binary_main.cc
+++ b/speechx/examples/feat/cmvn_json2binary_main.cc
@ -1,58 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/kaldi-io.h"
-#include "utils/file_utils.h"
-#include "utils/simdjson.h"
-
-DEFINE_string(json_file, "", "cmvn json file");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
-DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
-
-using namespace simdjson;
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    ondemand::parser parser;
-    padded_string json = padded_string::load(FLAGS_json_file);
-    ondemand::document val = parser.iterate(json);
-    ondemand::object doc = val;
-    kaldi::int32 frame_num = uint64_t(doc["frame_num"]);
-    auto mean_stat = doc["mean_stat"];
-    std::vector<kaldi::BaseFloat> mean_stat_vec;
-    for (double x : mean_stat) {
-        mean_stat_vec.push_back(x);
-    }
-    auto var_stat = doc["var_stat"];
-    std::vector<kaldi::BaseFloat> var_stat_vec;
-    for (double x : var_stat) {
-        var_stat_vec.push_back(x);
-    }
-
-    size_t mean_size = mean_stat_vec.size();
-    kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
-    for (size_t idx = 0; idx < mean_size; ++idx) {
-        cmvn_stats(0, idx) = mean_stat_vec[idx];
-        cmvn_stats(1, idx) = var_stat_vec[idx];
-    }
-    cmvn_stats(0, mean_size) = frame_num;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
-    LOG(INFO) << "the json file have write into " << FLAGS_cmvn_write_path;
-    return 0;
-}
--- a/Show More
+++ b/Show More