diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e7ae1fb..0435cfbe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,13 +50,20 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ - exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ #- id: copyright_checker # name: copyright_checker # entry: python .pre-commit-hooks/copyright-check.hook # language: system # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ # exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + - id: cpplint + name: cpplint + description: Static code analysis of C/C++ files + language: python + files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ + entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 47464262..c3a17f49 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -42,6 +42,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ + --debug True \ --ngpu ${ngpu} \ --config ${config_path} \ --decode_cfg ${decode_config_path} \ diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index d12ea364..0df44319 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ 
b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -16,6 +16,8 @@ import os import sys from pathlib import Path +import distutils +import numpy as np import paddle import soundfile from yacs.config import CfgNode @@ -74,6 +76,8 @@ class U2Infer(): # fbank feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + if self.args.debug: + np.savetxt("feat.transform.txt", feat) ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) @@ -126,6 +130,11 @@ if __name__ == "__main__": "--result_file", type=str, help="path of save the asr result") parser.add_argument( "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/setup.py b/setup.py index e551d9fa..35668bdd 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ base = [ "braceexpand", "pyyaml", "pybind11", + "paddleslim==2.3.4", ] server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] diff --git a/speechx/.clang-format b/speechx/.clang-format new file mode 100644 index 00000000..af946a4a --- /dev/null +++ b/speechx/.clang-format @@ -0,0 +1,29 @@ +# This file is used by clang-format to autoformat paddle source code +# +# The clang-format is part of llvm toolchain. +# It need to install llvm and clang to format source code style. +# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. +# The -i means inplace change. 
+# +# The document of clang-format is +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 4 +TabWidth: 4 +ContinuationIndentWidth: 4 +MaxEmptyLinesToKeep: 2 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false +... + diff --git a/speechx/.gitignore b/speechx/.gitignore index e0c61847..9a93805c 100644 --- a/speechx/.gitignore +++ b/speechx/.gitignore @@ -1 +1,2 @@ tools/valgrind* +*log diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 4b5838e5..978a23d9 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -13,7 +13,6 @@ set(CMAKE_CXX_STANDARD 14) set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) # Modules -list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}/external) list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}) include(FetchContent) include(ExternalProject) @@ -32,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall ############################################################################### # Option Configurations ############################################################################### -# option configurations option(TEST_DEBUG "option for debug" OFF) +option(USE_PROFILING "enable c++ profling" OFF) +option(USING_U2 "compile u2 model." ON) +option(USING_DS2 "compile with ds2 model." ON) + +option(USING_GPU "u2 compute on GPU." 
OFF) ############################################################################### # Include third party @@ -83,48 +86,65 @@ add_dependencies(openfst gflags glog) # paddle lib -set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) -set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle - URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz - URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 - PREFIX ${paddle_PREFIX_DIR} - SOURCE_DIR ${paddle_SOURCE_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") - -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") - -##paddle with mkl -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") -include_directories("${MATH_LIB_PATH}/include") -set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") -include_directories("${MKLDNN_PATH}/include") -set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") - -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp - 
${EXTERNAL_LIB}) - +include(paddleinference) + + +# paddle core.so +find_package(Threads REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) +find_package(pybind11 CONFIG) + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") +message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") + +# paddle include and link option +# -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so +execute_process( + COMMAND python -c "\ +import os;\ +import paddle;\ +include_dir=paddle.sysconfig.get_include();\ +paddle_dir=os.path.split(include_dir)[0];\ +libs_dir=os.path.join(paddle_dir, 'libs');\ +fluid_dir=os.path.join(paddle_dir, 'fluid');\ +out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\ +out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\ + " + OUTPUT_VARIABLE PADDLE_LINK_FLAGS + RESULT_VARIABLE SUCESS) + +message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) +string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) + +# paddle compile option +# -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include +execute_process( + COMMAND python -c "\ +import paddle; \ +include_dir = paddle.sysconfig.get_include(); \ +print(f\"-I{include_dir}\"); \ + " + OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) +message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) +string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) + + +# for LD_LIBRARY_PATH +# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) +execute_process( + COMMAND python -c " \ +import os; \ +import paddle; \ 
+include_dir=paddle.sysconfig.get_include(); \ +paddle_dir=os.path.split(include_dir)[0]; \ +libs_dir=os.path.join(paddle_dir, 'libs'); \ +fluid_dir=os.path.join(paddle_dir, 'fluid'); \ +out=':'.join([libs_dir, fluid_dir]); print(out); \ + " + OUTPUT_VARIABLE PADDLE_LIB_DIRS) +message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) ############################################################################### diff --git a/speechx/README.md b/speechx/README.md index cd1cd62c..f744defa 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -3,11 +3,14 @@ ## Environment We develop under: +* python - 3.7 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 +> Please use `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. + > We make sure all things work fun under docker, and recommend using it to develop and deploy. * [How to Install Docker](https://docs.docker.com/engine/install/) @@ -24,16 +27,23 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). +2. Create python environment. -2. Build `speechx` and `examples`. +``` +bash tools/venv.sh +``` -> Do not source venv. +2. Build `speechx` and `examples`. +For now we are using feature under `develop` branch of paddle, so we need to install `paddlepaddle` nightly build version. +For example: ``` -pushd /path/to/speechx +source venv/bin/activate +python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html ./build.sh ``` + 3. Go to `examples` to have a fun. More details please see `README.md` under `examples`. 
diff --git a/speechx/build.sh b/speechx/build.sh index a6eef656..7655f963 100755 --- a/speechx/build.sh +++ b/speechx/build.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -xe # the build script had verified in the paddlepaddle docker image. # please follow the instruction below to install PaddlePaddle image. @@ -17,11 +18,6 @@ fi #rm -rf build mkdir -p build -cd build -cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -#cmake .. - -make -j - -cd - +cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} +cmake --build build -j diff --git a/speechx/cmake/external/absl.cmake b/speechx/cmake/absl.cmake similarity index 100% rename from speechx/cmake/external/absl.cmake rename to speechx/cmake/absl.cmake diff --git a/speechx/cmake/external/boost.cmake b/speechx/cmake/boost.cmake similarity index 100% rename from speechx/cmake/external/boost.cmake rename to speechx/cmake/boost.cmake diff --git a/speechx/cmake/external/eigen.cmake b/speechx/cmake/eigen.cmake similarity index 100% rename from speechx/cmake/external/eigen.cmake rename to speechx/cmake/eigen.cmake diff --git a/speechx/cmake/external/gflags.cmake b/speechx/cmake/external/gflags.cmake deleted file mode 100644 index 66ae47f7..00000000 --- a/speechx/cmake/external/gflags.cmake +++ /dev/null @@ -1,12 +0,0 @@ -include(FetchContent) - -FetchContent_Declare( - gflags - URL https://github.com/gflags/gflags/archive/v2.2.1.zip - URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a -) - -FetchContent_MakeAvailable(gflags) - -# openfst need -include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake new file mode 100644 index 00000000..36bebc87 --- /dev/null +++ b/speechx/cmake/gflags.cmake @@ -0,0 +1,11 @@ +include(FetchContent) + +FetchContent_Declare( + gflags + URL https://github.com/gflags/gflags/archive/v2.2.2.zip + URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 +) 
+FetchContent_MakeAvailable(gflags) + +# openfst need +include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/speechx/cmake/external/glog.cmake b/speechx/cmake/glog.cmake similarity index 100% rename from speechx/cmake/external/glog.cmake rename to speechx/cmake/glog.cmake diff --git a/speechx/cmake/external/gtest.cmake b/speechx/cmake/gtest.cmake similarity index 69% rename from speechx/cmake/external/gtest.cmake rename to speechx/cmake/gtest.cmake index 7fe397fc..1ea8ed0b 100644 --- a/speechx/cmake/external/gtest.cmake +++ b/speechx/cmake/gtest.cmake @@ -1,8 +1,8 @@ include(FetchContent) FetchContent_Declare( gtest - URL https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 + URL https://github.com/google/googletest/archive/release-1.11.0.zip + URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a ) FetchContent_MakeAvailable(gtest) diff --git a/speechx/cmake/external/kenlm.cmake b/speechx/cmake/kenlm.cmake similarity index 100% rename from speechx/cmake/external/kenlm.cmake rename to speechx/cmake/kenlm.cmake diff --git a/speechx/cmake/external/libsndfile.cmake b/speechx/cmake/libsndfile.cmake similarity index 100% rename from speechx/cmake/external/libsndfile.cmake rename to speechx/cmake/libsndfile.cmake diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/openblas.cmake similarity index 88% rename from speechx/cmake/external/openblas.cmake rename to speechx/cmake/openblas.cmake index 5c196527..27e13207 100644 --- a/speechx/cmake/external/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) -set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) +set(OpenBLAS_SOURCE_DIR ${fc_patch}/openblas-src) +set(OpenBLAS_PREFIX ${fc_patch}/openblas-prefix) # 
###################################################################################################################### # OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 @@ -43,6 +43,7 @@ ExternalProject_Add( # https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +message(STATUS "OPENBLAS install dir: ${INSTALL_DIR}") set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) add_library(openblas STATIC IMPORTED) add_dependencies(openblas OPENBLAS) @@ -55,4 +56,6 @@ set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_P # ${CMAKE_INSTALL_LIBDIR} lib # ${CMAKE_INSTALL_INCLUDEDIR} include link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) -include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file +# include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) +# fix for can not find `cblas.h` +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/openblas) \ No newline at end of file diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/openfst.cmake similarity index 100% rename from speechx/cmake/external/openfst.cmake rename to speechx/cmake/openfst.cmake diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake new file mode 100644 index 00000000..d8a9c613 --- /dev/null +++ b/speechx/cmake/paddleinference.cmake @@ -0,0 +1,49 @@ +set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) +set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) + +include(FetchContent) +FetchContent_Declare( + paddle + URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz + URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 + PREFIX ${paddle_PREFIX_DIR} + SOURCE_DIR ${paddle_SOURCE_DIR} + CONFIGURE_COMMAND "" + 
BUILD_COMMAND "" + INSTALL_COMMAND "" +) +FetchContent_MakeAvailable(paddle) + +set(PADDLE_LIB_THIRD_PARTY_PATH "${paddle_SOURCE_DIR}/third_party/install/") + +include_directories("${paddle_SOURCE_DIR}/paddle/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") +link_directories("${paddle_SOURCE_DIR}/paddle/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") + +##paddle with mkl +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") +include_directories("${MATH_LIB_PATH}/include") +set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + +set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") +include_directories("${MKLDNN_PATH}/include") +set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +# global vars +set(DEPS ${paddle_SOURCE_DIR}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash cryptopp + ${EXTERNAL_LIB} CACHE INTERNAL "deps") +message(STATUS "Deps libraries: ${DEPS}") diff --git a/speechx/examples/README.md b/speechx/examples/README.md index f7f6f9ac..de27bd94 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,20 +1,42 @@ # Examples for SpeechX +> `u2pp_ol` is recommended. + +* `u2pp_ol` - u2++ streaming asr test under `aishell-1` test dataset. * `ds2_ol` - ds2 streaming test under `aishell-1` test dataset. 
+ ## How to run -`run.sh` is the entry point. +### Create env + +Using `tools/env.sh` under `speechx` to create python env. + +``` +bash tools/env.sh +``` + +Source the env before playing with an example. +``` +. venv/bin/activate +``` + +### Play with example + +`run.sh` is the entry point for every example. -Example to play `ds2_ol`: +Example to play `u2pp_ol`: ``` -pushd ds2_ol/aishell -bash run.sh +pushd u2pp_ol/wenetspeech +bash run.sh --stop_stage 4 ``` ## Display Model with [Netron](https://github.com/lutzroeder/netron) +If you have a model, we can use this command to show the model graph. + +For example: ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md index f89184de..803f25fa 100644 --- a/speechx/examples/codelab/README.md +++ b/speechx/examples/codelab/README.md @@ -1,8 +1,9 @@ # Codelab -## introduction +> The below is for developing and offline testing. +> Do not run it unless you know what it is. -> The below is for developing and offline testing. Do not run it only if you know what it is. * nnet * feat * decoder +* u2 diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh index a911eb03..1a9e3cd7 100755 --- a/speechx/examples/codelab/decoder/run.sh +++ b/speechx/examples/codelab/decoder/run.sh @@ -69,7 +69,7 @@ compute_linear_spectrogram_main \ echo "compute linear spectrogram feature." 
# run ctc beam search decoder as streaming -ctc_prefix_beam_search_decoder_main \ +ctc_beam_search_decoder_main \ --result_wspecifier=ark,t:$exp_dir/result.txt \ --feature_rspecifier=ark:$feat_wspecifier \ --model_path=$model_dir/avg_1.jit.pdmodel \ diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/codelab/feat/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh index 3b89d01e..9d229174 100644 --- a/speechx/examples/codelab/feat/path.sh +++ b/speechx/examples/codelab/feat/path.sh @@ -1,12 +1,12 @@ # This contains the locations of binarys build required for running the examples. SPEECHX_ROOT=$PWD/../../../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 1fa37f98..5d7612ae 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -42,8 +42,8 @@ mkdir -p $exp_dir export GLOG_logtostderr=1 cmvn_json2kaldi_main \ - --json_file $model_dir/data/mean_std.json \ - --cmvn_write_path $exp_dir/cmvn.ark \ + --json_file=$model_dir/data/mean_std.json \ + --cmvn_write_path=$exp_dir/cmvn.ark \ --binary=false echo "convert json cmvn to kaldi ark." @@ -54,4 +54,10 @@ compute_linear_spectrogram_main \ --cmvn_file=$exp_dir/cmvn.ark echo "compute linear spectrogram feature." 
+compute_fbank_main \ + --num_bins=161 \ + --wav_rspecifier=scp:$data_dir/wav.scp \ + --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute fbank feature." diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh index 7d395d64..11c8aef8 100644 --- a/speechx/examples/codelab/nnet/path.sh +++ b/speechx/examples/codelab/nnet/path.sh @@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/u2/.gitignore b/speechx/examples/codelab/u2/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/speechx/examples/codelab/u2/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/codelab/u2/README.md b/speechx/examples/codelab/u2/README.md new file mode 100644 index 00000000..3c85dc91 --- /dev/null +++ b/speechx/examples/codelab/u2/README.md @@ -0,0 +1 @@ +# u2/u2pp Streaming Test diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh new file mode 100755 index 00000000..11c1afe8 --- /dev/null +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set +x +set -e + +. 
path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=$data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --result_wspecifier=ark,t:$exp/result.ark + +echo "u2 ctc prefix beam search decode." diff --git a/speechx/examples/codelab/u2/local/feat.sh b/speechx/examples/codelab/u2/local/feat.sh new file mode 100755 index 00000000..1eec3aae --- /dev/null +++ b/speechx/examples/codelab/u2/local/feat.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark + +echo "compute fbank feature." diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh new file mode 100755 index 00000000..4419201c --- /dev/null +++ b/speechx/examples/codelab/u2/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." 
+ diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh new file mode 100755 index 00000000..9f697b45 --- /dev/null +++ b/speechx/examples/codelab/u2/local/recognizer.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --wav_rspecifier=scp:$data/wav.scp \ + --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh new file mode 100644 index 00000000..ec278bd3 --- /dev/null +++ b/speechx/examples/codelab/u2/path.sh @@ -0,0 +1,18 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. 
please ensure that the project build successfully"; } + +export LC_AL=C + +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer + +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2/run.sh b/speechx/examples/codelab/u2/run.sh new file mode 100755 index 00000000..d314262b --- /dev/null +++ b/speechx/examples/codelab/u2/run.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +./local/feat.sh + +./local/nnet.sh + +./local/decode.sh diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 82e889ce..794b533f 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -set +x +set -x set -e . path.sh @@ -11,7 +11,7 @@ stop_stage=100 . 
utils/parse_options.sh # 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then +if [ ! -d ${SPEECHX_BUILD} ]; then pushd ${SPEECHX_ROOT} bash build.sh popd @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -103,7 +103,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -135,7 +135,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 72072835..1c3c3e01 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ @@ -102,7 +102,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj 
$data/split${nj}/JOB/recog.fbank.lm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ @@ -133,7 +133,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md new file mode 100644 index 00000000..838db435 --- /dev/null +++ b/speechx/examples/u2pp_ol/README.md @@ -0,0 +1,5 @@ +# U2/U2++ Streaming ASR + +## Examples + +* `wenetspeech` - Streaming Decoding with wenetspeech u2/u2++ model. Using aishell test data for testing. diff --git a/speechx/examples/u2pp_ol/wenetspeech/.gitignore b/speechx/examples/u2pp_ol/wenetspeech/.gitignore new file mode 100644 index 00000000..02c0cc21 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/.gitignore @@ -0,0 +1,3 @@ +data +utils +exp diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md new file mode 100644 index 00000000..9a8f8af5 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/README.md @@ -0,0 +1,28 @@ +# u2/u2pp Streaming ASR + +## Testing with Aishell Test Data + +## Download wav and model + +``` +run.sh --stop_stage 0 +``` + +### compute feature + +``` +./run.sh --stage 1 --stop_stage 1 +``` + +### decoding using feature + +``` +./run.sh --stage 2 --stop_stage 2 +``` + +### decoding using wav + + +``` +./run.sh --stage 3 --stop_stage 3 +``` diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh new file mode 100755 index 
00000000..544a1f59 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# To be run from one directory above this script. +. ./path.sh + +nj=40 +text=data/local/lm/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Check SRILM tools +if ! which ngram-count > /dev/null; then + echo "srilm tools are not found, please download it and install it from: " + echo "http://www.speech.sri.com/projects/srilm/download.html" + echo "Then add the tools to your PATH" + exit 1 +fi + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/lm/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +cleantext=$dir/text.no_oov + +# oov to +# lexicon line: word char0 ... charn +# text line: utt word0 ... wordn -> line: word0 ... wordn +text_dir=$(dirname $text) +split_name=$(basename $text) +./local/split_data.sh $text_dir $text $split_name $nj + +utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \ + cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + \> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1; +cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext + +# compute word counts, sort in descending order +# line: count word +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \ + sort --parallel=`nproc` -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). 
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1; + +# word with +cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist + +# hold out to compute ppl +heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results + +mkdir -p $dir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train + +ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa +ngram -lm $dir/lm.arpa -ppl $dir/heldout \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh new file mode 100755 index 00000000..c17cdbe6 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/decoder.fbank.wolm.log \ +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank.scp \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_decode.ark + +cat $data/split${nj}/*/result_decode.ark > $exp/${label_file} +utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} +tail -n 7 $exp/${wer} \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh new file mode 100755 index 00000000..4341cec8 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ +compute_fbank_main \ + --num_bins 80 \ + --cmvn_file=$exp/cmvn.ark \ + --streaming_chunk=36 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --feature_wspecifier=ark,scp:$data/split${nj}/JOB/fbank.ark,$data/split${nj}/JOB/fbank.scp + +echo "compute fbank feature." 
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh new file mode 100755 index 00000000..4419201c --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." + diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh new file mode 100755 index 00000000..f4553f2a --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 + + +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp +text=$data/test/text + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \ +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark + + +cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer +utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err +echo "recognizer test have finished!!!" +echo "please checkout in $exp/aishell.recognizer.err" +tail -n 7 $exp/aishell.recognizer.err diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh new file mode 100755 index 00000000..faa5c42d --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -eo pipefail + +data=$1 +scp=$2 +split_name=$3 +numsplit=$4 + +# save in $data/split{n} +# $scp to split +# + +if [[ ! $numsplit -gt 0 ]]; then + echo "$0: Invalid num-split argument"; + exit 1; +fi + +directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! 
mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split${numsplit}/$n + done +fi + +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh new file mode 100644 index 00000000..ec278bd3 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -0,0 +1,18 @@ +# This contains the locations of binaries built, required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } + +export LC_ALL=C + +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer + +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh new file mode 100755 index 00000000..12e3af95 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +nj=40 +stage=0 +stop_stage=5 + +. utils/parse_options.sh + +# input +data=data +exp=exp +mkdir -p $exp $data + + +# 1. compile +if [ ! 
-d ${SPEECHX_BUILD} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + + +ckpt_dir=$data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then + # download model + if [ ! -f $ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p $ckpt_dir + pushd $ckpt_dir + + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + + popd + fi + + # test wav scp + if [ ! -f data/wav.scp ]; then + mkdir -p $data + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd + fi + + # aishell wav scp + if [ ! -d $data/test ]; then + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip + unzip aishell_test.zip + popd + + realpath $data/test/*/*.wav > $data/wavlist + awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' 
'{ print $1 }' > $data/utt_id + paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp + fi +fi + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/feat.sh +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/decode.sh +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./local/recognizer.sh +fi \ No newline at end of file diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt index c8e21d48..60c18347 100644 --- a/speechx/speechx/CMakeLists.txt +++ b/speechx/speechx/CMakeLists.txt @@ -32,6 +32,12 @@ ${CMAKE_CURRENT_SOURCE_DIR}/decoder ) add_subdirectory(decoder) +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/recognizer +) +add_subdirectory(recognizer) + include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/protocol diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 206b7be6..2b15a61f 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -14,47 +14,47 @@ #pragma once -#include "kaldi/base/kaldi-types.h" - #include +#include "kaldi/base/kaldi-types.h" + typedef float BaseFloat; typedef double double64; typedef signed char int8; -typedef short int16; -typedef int int32; +typedef short int16; // NOLINT +typedef int int32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef long int64; +typedef long int64; // NOLINT #else -typedef long long int64; +typedef long long int64; // NOLINT #endif -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; +typedef unsigned char uint8; // NOLINT +typedef unsigned short uint16; // NOLINT +typedef unsigned int uint32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef unsigned long uint64; +typedef unsigned long uint64; // NOLINT #else -typedef unsigned long long uint64; +typedef unsigned long long uint64; // NOLINT #endif 
typedef signed int char32; -const uint8 kuint8max = ((uint8)0xFF); -const uint16 kuint16max = ((uint16)0xFFFF); -const uint32 kuint32max = ((uint32)0xFFFFFFFF); -const uint64 kuint64max = ((uint64)(0xFFFFFFFFFFFFFFFFLL)); -const int8 kint8min = ((int8)0x80); -const int8 kint8max = ((int8)0x7F); -const int16 kint16min = ((int16)0x8000); -const int16 kint16max = ((int16)0x7FFF); -const int32 kint32min = ((int32)0x80000000); -const int32 kint32max = ((int32)0x7FFFFFFF); -const int64 kint64min = ((int64)(0x8000000000000000LL)); -const int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFFLL)); +const uint8 kuint8max = static_cast(0xFF); +const uint16 kuint16max = static_cast(0xFFFF); +const uint32 kuint32max = static_cast(0xFFFFFFFF); +const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFLL); +const int8 kint8min = static_cast(0x80); +const int8 kint8max = static_cast(0x7F); +const int16 kint16min = static_cast(0x8000); +const int16 kint16max = static_cast(0x7FFF); +const int32 kint32min = static_cast(0x80000000); +const int32 kint32max = static_cast(0x7FFFFFFF); +const int64 kint64min = static_cast(0x8000000000000000LL); +const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); const BaseFloat kBaseFloatMax = std::numeric_limits::max(); const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index a9303cbb..97bff966 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -14,21 +14,30 @@ #pragma once +#include +#include +#include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include +#include +#include #include #include #include @@ -38,3 +47,5 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" +#include "utils/file_utils.h" +#include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/base/macros.h 
b/speechx/speechx/base/macros.h index d7d5a78d..db989812 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + namespace ppspeech { #ifndef DISALLOW_COPY_AND_ASSIGN @@ -22,4 +25,7 @@ namespace ppspeech { void operator=(const TypeName&) = delete #endif -} // namespace pp_speech \ No newline at end of file +// kSpaceSymbol in UTF-8 is: ▁ +const char kSpaceSymbo[] = "\xe2\x96\x81"; + +} // namespace ppspeech diff --git a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h index ba895f71..6d59dac5 100644 --- a/speechx/speechx/base/thread_pool.h +++ b/speechx/speechx/base/thread_pool.h @@ -35,7 +35,7 @@ class ThreadPool { public: - ThreadPool(size_t); + explicit ThreadPool(size_t); template auto enqueue(F&& f, Args&&... args) -> std::future::type>; diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc index b0616a7d..c891827a 100644 --- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc +++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc @@ -17,7 +17,7 @@ int main(int argc, char* argv[]) { // Initialize Google’s logging library. 
google::InitGoogleLogging(argv[0]); - + google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; LOG(INFO) << "Found " << 10 << " cookies"; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 283466dc..ab7b2cb5 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -21,6 +21,7 @@ #include #include #include + #include "base/flags.h" #include "base/log.h" #include "paddle_inference_api.h" @@ -63,8 +64,8 @@ void model_forward_test() { ; std::string model_graph = FLAGS_model_path; std::string model_params = FLAGS_param_path; - CHECK(model_graph != ""); - CHECK(model_params != ""); + CHECK_NE(model_graph, ""); + CHECK_NE(model_params, ""); cout << "model path: " << model_graph << endl; cout << "model param path : " << model_params << endl; @@ -195,8 +196,11 @@ void model_forward_test() { } int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; model_forward_test(); return 0; diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 1df93511..f0fd32ba 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -1,25 +1,55 @@ -project(decoder) - include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) -add_library(decoder STATIC - ctc_beam_search_decoder.cc + +set(srcs) + +if (USING_DS2) +list(APPEND srcs ctc_decoders/decoder_utils.cpp ctc_decoders/path_trie.cpp ctc_decoders/scorer.cpp + ctc_beam_search_decoder.cc ctc_tlg_decoder.cc - recognizer.cc ) -target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) +endif() -set(BINS - ctc_prefix_beam_search_decoder_main - nnet_logprob_decoder_main - recognizer_main - tlg_decoder_main -) +if (USING_U2) + 
list(APPEND srcs + ctc_prefix_beam_search_decoder.cc + ) +endif() + +add_library(decoder STATIC ${srcs}) +target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) + +# test +if (USING_DS2) + set(BINS + ctc_beam_search_decoder_main + nnet_logprob_decoder_main + ctc_tlg_decoder_main + ) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) + set(TEST_BINS + ctc_prefix_beam_search_decoder_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() -foreach(bin_name IN LISTS BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) -endforeach() diff --git a/speechx/speechx/decoder/common.h b/speechx/speechx/decoder/common.h index 52deffac..0ae73277 100644 --- a/speechx/speechx/decoder/common.h +++ b/speechx/speechx/decoder/common.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,10 +13,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "base/basic_types.h" +#pragma once + +#include "base/common.h" struct DecoderResult { BaseFloat acoustic_score; std::vector words_idx; - std::vector> time_stamp; + std::vector> time_stamp; +}; + + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} }; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 5a12c0b5..6e3a0d13 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+ #include "decoder/ctc_beam_search_decoder.h" -#include "base/basic_types.h" +#include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" #include "utils/file_utils.h" @@ -24,12 +25,7 @@ using std::vector; using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) - : opts_(opts), - init_ext_scorer_(nullptr), - blank_id_(-1), - space_id_(-1), - num_frame_decoded_(0), - root_(nullptr) { + : opts_(opts), init_ext_scorer_(nullptr), space_id_(-1), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { LOG(INFO) << "load the dict failed"; @@ -43,12 +39,12 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - blank_id_ = 0; - auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); + CHECK_EQ(opts_.blank, 0); + auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary - if ((size_t)space_id_ >= vocabulary_.size()) { + if (static_cast(space_id_) >= vocabulary_.size()) { space_id_ = -2; } } @@ -84,8 +80,6 @@ void CTCBeamSearch::Decode( return; } -int32 CTCBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } - // todo rename, refactor void CTCBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { @@ -110,17 +104,21 @@ void CTCBeamSearch::ResetPrefixes() { } int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, - vector& nbest_words) { + const vector& nbest_words) { kaldi::Timer timer; - timer.Reset(); AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " << static_cast(timer.Elapsed()) / 1000.0f; return 0; } +vector> CTCBeamSearch::GetNBestPath(int n) { + int beam_size = n == -1 ? 
opts_.beam_size : std::min(n, opts_.beam_size); + return get_beam_search_result(prefixes_, vocabulary_, beam_size); +} + vector> CTCBeamSearch::GetNBestPath() { - return get_beam_search_result(prefixes_, vocabulary_, opts_.beam_size); + return GetNBestPath(-1); } string CTCBeamSearch::GetBestPath() { @@ -167,7 +165,7 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { continue; } min_cutoff = prefixes_[num_prefixes_ - 1]->score + - std::log(prob[blank_id_]) - + std::log(prob[opts_.blank]) - std::max(0.0, init_ext_scorer_->beta); full_beam = (num_prefixes_ == beam_size); @@ -195,9 +193,9 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { for (size_t i = beam_size; i < prefixes_.size(); ++i) { prefixes_[i]->remove(); } - } // if + } // end if num_frame_decoded_++; - } // for probs_seq + } // end for probs_seq } int32 CTCBeamSearch::SearchOneChar( @@ -215,7 +213,7 @@ int32 CTCBeamSearch::SearchOneChar( break; } - if (c == blank_id_) { + if (c == opts_.blank) { prefix->log_prob_b_cur = log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score); continue; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 9d0a5d14..f06d88e3 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -12,67 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "base/common.h" -#include "decoder/ctc_decoders/path_trie.h" -#include "decoder/ctc_decoders/scorer.h" -#include "kaldi/decoder/decodable-itf.h" -#include "util/parse-options.h" +// used by deepspeech2 #pragma once -namespace ppspeech { +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_decoders/path_trie.h" +#include "decoder/ctc_decoders/scorer.h" +#include "decoder/decoder_itf.h" -struct CTCBeamSearchOptions { - std::string dict_file; - std::string lm_path; - BaseFloat alpha; - BaseFloat beta; - BaseFloat cutoff_prob; - int beam_size; - int cutoff_top_n; - int num_proc_bsearch; - CTCBeamSearchOptions() - : dict_file("vocab.txt"), - lm_path(""), - alpha(1.9f), - beta(5.0), - beam_size(300), - cutoff_prob(0.99f), - cutoff_top_n(40), - num_proc_bsearch(10) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("dict", &dict_file, "dict file "); - opts->Register("lm-path", &lm_path, "language model file"); - opts->Register("alpha", &alpha, "alpha"); - opts->Register("beta", &beta, "beta"); - opts->Register( - "beam-size", &beam_size, "beam size for beam search method"); - opts->Register("cutoff-prob", &cutoff_prob, "cutoff probs"); - opts->Register("cutoff-top-n", &cutoff_top_n, "cutoff top n"); - opts->Register( - "num-proc-bsearch", &num_proc_bsearch, "num proc bsearch"); - } -}; +namespace ppspeech { -class CTCBeamSearch { +class CTCBeamSearch : public DecoderBase { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} + void InitDecoder(); + + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + void Decode(std::shared_ptr decodable); + std::string GetBestPath(); std::vector> GetNBestPath(); + std::vector> GetNBestPath(int n); std::string GetFinalBestPath(); - int NumFrameDecoded(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); - void AdvanceDecode( - 
const std::shared_ptr& decodable); - void Reset(); + const std::vector& nbest_words); private: void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, const std::pair& log_prob_idx, const BaseFloat& min_cutoff); @@ -83,12 +63,11 @@ class CTCBeamSearch { CTCBeamSearchOptions opts_; std::shared_ptr init_ext_scorer_; // todo separate later std::vector vocabulary_; // todo remove later - size_t blank_id_; int space_id_; std::shared_ptr root_; std::vector prefixes_; - int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc similarity index 78% rename from speechx/speechx/decoder/tlg_decoder_main.cc rename to speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index b175ed13..ab0376b6 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -12,29 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// todo refactor, repalce with gtest +// used by deepspeech2 #include "base/flags.h" #include "base/log.h" -#include "decoder/ctc_tlg_decoder.h" +#include "decoder/ctc_beam_search_decoder.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "decoder graph"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); +DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); +DEFINE_string(lm_path, "", "language model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_string( @@ -48,59 +45,59 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; -// test TLG decoder by feeding speech feature. 
+// test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_graph = FLAGS_model_path; + std::string model_path = FLAGS_model_path; std::string model_params = FLAGS_param_path; - std::string word_symbol_table = FLAGS_word_symbol_table; - std::string graph_path = FLAGS_graph_path; - LOG(INFO) << "model path: " << model_graph; + std::string dict_file = FLAGS_dict_file; + std::string lm_path = FLAGS_lm_path; + LOG(INFO) << "model path: " << model_path; LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "word symbol path: " << word_symbol_table; - LOG(INFO) << "graph path: " << graph_path; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts; - opts.word_symbol_table = word_symbol_table; - opts.fst_path = graph_path; - opts.opts.max_active = FLAGS_max_active; - opts.opts.beam = 15.0; - opts.opts.lattice_beam = 7.5; - ppspeech::TLGDecoder decoder(opts); - - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr 
nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + new ppspeech::Decodable(nnet, raw_data)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; decoder.InitDecoder(); + kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); @@ -132,6 +129,7 @@ int main(int argc, char* argv[]) { if (feature_chunk_size < receptive_field_length) break; int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < chunk_size; ++row_id) { kaldi::SubVector tmp(feature, start); kaldi::SubVector f_chunk_tmp( @@ -161,10 +159,9 @@ int main(int argc, char* argv[]) { ++num_done; } - double elapsed = timer.Elapsed(); - KALDI_LOG << " cost:" << elapsed << " s"; - KALDI_LOG << "Done " << num_done << " utterances, " << num_err << " with errors."; + double elapsed = timer.Elapsed(); + KALDI_LOG << " cost:" << elapsed << " s"; return (num_done != 0 ? 0 : 1); } diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h new file mode 100644 index 00000000..f4a81b3a --- /dev/null +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "base/common.h" +#include "util/parse-options.h" + +namespace ppspeech { + + +struct CTCBeamSearchOptions { + // common + int blank; + + // ds2 + std::string dict_file; + std::string lm_path; + int beam_size; + BaseFloat alpha; + BaseFloat beta; + BaseFloat cutoff_prob; + int cutoff_top_n; + int num_proc_bsearch; + + // u2 + int first_beam_size; + int second_beam_size; + CTCBeamSearchOptions() + : blank(0), + dict_file("vocab.txt"), + lm_path(""), + beam_size(300), + alpha(1.9f), + beta(5.0), + cutoff_prob(0.99f), + cutoff_top_n(40), + num_proc_bsearch(10), + first_beam_size(10), + second_beam_size(10) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "Ds2BeamSearchConfig: "; + opts->Register("dict", &dict_file, module + "vocab file path."); + opts->Register( + "lm-path", &lm_path, module + "ngram language model path."); + opts->Register("alpha", &alpha, module + "alpha"); + opts->Register("beta", &beta, module + "beta"); + opts->Register("beam-size", + &beam_size, + module + "beam size for beam search method"); + opts->Register("cutoff-prob", &cutoff_prob, module + "cutoff probs"); + opts->Register("cutoff-top-n", &cutoff_top_n, module + "cutoff top n"); + opts->Register( + "num-proc-bsearch", &num_proc_bsearch, module + "num proc bsearch"); + + opts->Register("blank", &blank, "blank id, default is 0."); + + module = "U2BeamSearchConfig: "; + opts->Register( + "first-beam-size", &first_beam_size, module + "first beam size."); + opts->Register("second-beam-size", + &second_beam_size, + module + "second 
beam size."); + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc new file mode 100644 index 00000000..03a7c133 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -0,0 +1,370 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Binbin Zhang (binbzha@qq.com) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "decoder/ctc_prefix_beam_search_decoder.h" + +#include "absl/strings/str_join.h" +#include "base/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "utils/math.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif + +namespace ppspeech { + +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string& vocab_path, + const CTCBeamSearchOptions& opts) + : opts_(opts) { + unit_table_ = std::shared_ptr( + fst::SymbolTable::ReadText(vocab_path)); + CHECK(unit_table_ != nullptr); + + Reset(); +} + +void CTCPrefixBeamSearch::Reset() { + num_frame_decoded_ = 0; + + cur_hyps_.clear(); + + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); + + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.InitEmpty(); + cur_hyps_[empty] = prefix_score; + + outputs_.emplace_back(empty); + hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); +} + +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } + + +void CTCPrefixBeamSearch::AdvanceDecode( + const std::shared_ptr& decodable) { + while (1) { + // forward frame by frame + std::vector frame_prob; + bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); + if (flag == false) { + VLOG(1) << "decoder advance decode exit." 
<< frame_prob.size(); + break; + } + + std::vector> likelihood; + likelihood.push_back(frame_prob); + AdvanceDecoding(likelihood); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + } +} + +static bool PrefixScoreCompare( + const std::pair, PrefixScore>& a, + const std::pair, PrefixScore>& b) { + // log domain + return a.second.TotalScore() > b.second.TotalScore(); +} + + +void CTCPrefixBeamSearch::AdvanceDecoding( + const std::vector>& logp) { +#ifdef USE_PROFILING + RecordEvent event("CtcPrefixBeamSearch::AdvanceDecoding", + TracerEventType::UserDefined, + 1); +#endif + + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { + const std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> + next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + VLOG(2) << "topk: " << num_frame_decoded_ << " " + << *std::max_element(logp_t.begin(), logp_t.end()) << " " + << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++) { + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + } + + // 2. token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will + // insert + // PrefixScore(-inf, -inf) by default, since the default + // constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. 
+ + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, + // prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = + LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = + LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. + next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = num_frame_decoded_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. 
+ std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = + LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = + LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. 
second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr( + next_hyps.begin(), next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + } // end for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) +} + + +void CTCPrefixBeamSearch::UpdateHypotheses( + const std::vector, PrefixScore>>& hyps) { + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } +} + +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + const std::vector& start_boundaries = prefix.second.start_boundaries; + const std::vector& end_boundaries = prefix.second.end_boundaries; + + // add tag + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + output.emplace_back(input[i]); + } + + outputs_.emplace_back(output); +} + +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + int cnt = 0; + for (int i = 0; i < hypotheses_.size(); i++) { + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() + << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j++) { + VLOG(2) << hypotheses_[i][j]; + } + } +} + +void CTCPrefixBeamSearch::UpdateFinalContext() { + 
if (context_graph_ == nullptr) return; + + CHECK(hypotheses_.size() == cur_hyps_.size()); + CHECK(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. + for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + prefix_score.UpdateContext( + context_graph_, prefix_score, 0, prefix.size()); + } + } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + +std::string CTCPrefixBeamSearch::GetBestPath(int index) { + int n_hyps = Outputs().size(); + CHECK_GT(n_hyps, 0); + CHECK_LT(index, n_hyps); + std::vector one = Outputs()[index]; + std::string sentence; + for (int i = 0; i < one.size(); i++) { + sentence += unit_table_->Find(one[i]); + } + return sentence; +} + +std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } + +std::vector> CTCPrefixBeamSearch::GetNBestPath( + int n) { + int hyps_size = hypotheses_.size(); + CHECK_GT(hyps_size, 0); + + int min_n = n == -1 ? 
hypotheses_.size() : std::min(n, hyps_size); + + std::vector> n_best; + n_best.reserve(min_n); + + for (int i = 0; i < min_n; i++) { + n_best.emplace_back(Likelihood()[i], GetBestPath(i)); + } + return n_best; +} + +std::vector> +CTCPrefixBeamSearch::GetNBestPath() { + return GetNBestPath(-1); +} + +std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } + +std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } + + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h new file mode 100644 index 00000000..5013246a --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc + +#pragma once + +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/decoder_itf.h" +#include "fst/symbol-table.h" + +namespace ppspeech { +class ContextGraph; +class CTCPrefixBeamSearch : public DecoderBase { + public: + CTCPrefixBeamSearch(const std::string& vocab_path, + const CTCBeamSearchOptions& opts); + ~CTCPrefixBeamSearch() {} + + SearchType Type() const { return SearchType::kPrefixBeamSearch; } + + void InitDecoder() override; + + void Reset() override; + + void AdvanceDecode( + const std::shared_ptr& decodable) override; + + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; + + void FinalizeSearch(); + + const std::shared_ptr VocabTable() const { + return unit_table_; + } + + const std::vector>& Inputs() const { return hypotheses_; } + const std::vector>& Outputs() const { return outputs_; } + const std::vector& Likelihood() const { return likelihood_; } + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } + const std::vector>& Times() const { return times_; } + + + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; + + private: + std::string GetBestPath(int index); + + void AdvanceDecoding( + const std::vector>& logp); + + void UpdateOutputs(const std::pair, PrefixScore>& prefix); + void UpdateHypotheses( + const std::vector, PrefixScore>>& prefix); + void UpdateFinalContext(); + + + private: + CTCBeamSearchOptions opts_; + std::shared_ptr unit_table_{nullptr}; + + std::unordered_map, PrefixScore, PrefixScoreHash> + cur_hyps_; + + // n-best list and corresponding likelihood, in sorted order + std::vector> hypotheses_; + std::vector likelihood_; + + std::vector> times_; + std::vector viterbi_likelihood_; + + // Outputs contain the 
hypotheses_ and tags lik: and + std::vector> outputs_; + + std::shared_ptr context_graph_{nullptr}; + + DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); +}; + + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 7cfee06c..c59b1f2e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,40 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -// todo refactor, repalce with gtest - -#include "base/flags.h" -#include "base/log.h" -#include "decoder/ctc_beam_search_decoder.h" +#include "absl/strings/str_split.h" +#include "base/common.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" +#include "fst/symbol-table.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/u2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); -DEFINE_string(lm_path, "", "language model"); +DEFINE_string(vocab_path, "", "vocab path"); + +DEFINE_string(model_path, "", "paddle nnet model"); + DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model 
output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + +DEFINE_int32(nnet_decoder_chunk, 16, "paddle nnet forward chunk"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -53,117 +42,138 @@ using std::vector; // test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); + int32 num_done = 0, num_err = 0; + + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); + CHECK_NE(FLAGS_vocab_path, ""); + CHECK_NE(FLAGS_model_path, ""); + LOG(INFO) << "model path: " << FLAGS_model_path; + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_path = FLAGS_model_path; - std::string model_params = FLAGS_param_path; - std::string dict_file = FLAGS_dict_file; - std::string lm_path = FLAGS_lm_path; - LOG(INFO) << "model path: " << model_path; - LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "dict path: " << dict_file; - LOG(INFO) << "lm path: " << lm_path; - int32 num_done = 0, num_err = 0; + // nnet + ppspeech::ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + std::shared_ptr nnet = + std::make_shared(model_opts); + // decodeable + std::shared_ptr raw_data = + std::make_shared(); + std::shared_ptr decodable = + std::make_shared(nnet, raw_data); + + // decoder ppspeech::CTCBeamSearchOptions opts; - opts.dict_file = dict_file; - opts.lm_path = lm_path; - 
ppspeech::CTCBeamSearch decoder(opts); + opts.blank = 0; + opts.first_beam_size = 10; + opts.second_beam_size = 10; + ppspeech::CTCPrefixBeamSearch decoder(FLAGS_vocab_path, opts); - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_path; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - std::shared_ptr nnet( - new ppspeech::PaddleNnet(model_opts)); - std::shared_ptr raw_data(new ppspeech::DataCache()); - std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; + decoder.InitDecoder(); kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); kaldi::Matrix feature = feature_reader.Value(); - raw_data->SetDim(feature.NumCols()); - LOG(INFO) << "process utt: " << utt; - LOG(INFO) << "rows: " << feature.NumRows(); - LOG(INFO) << "cols: " << feature.NumCols(); - int32 row_idx = 0; - int32 padding_len = 0; + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + raw_data->SetDim(feat_dim); + int32 ori_feature_len = feature.NumRows(); - if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { - padding_len 
= - chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride; - feature.Resize(feature.NumRows() + padding_len, - feature.NumCols(), - kaldi::kCopyData); - } - int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { - kaldi::Vector feature_chunk(chunk_size * - feature.NumCols()); - int32 feature_chunk_size = 0; + int32 this_chunk_size = 0; if (ori_feature_len > chunk_idx * chunk_stride) { - feature_chunk_size = std::min( + this_chunk_size = std::min( ori_feature_len - chunk_idx * chunk_stride, chunk_size); } - if (feature_chunk_size < receptive_field_length) break; + if (this_chunk_size < receptive_field_length) { + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; + break; + } + + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < this_chunk_size; ++row_id) { + kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); - for (int row_id = 0; row_id < chunk_size; ++row_id) { - kaldi::SubVector tmp(feature, start); - kaldi::SubVector f_chunk_tmp( - feature_chunk.Data() + row_id * feature.NumCols(), - feature.NumCols()); - f_chunk_tmp.CopyFromVec(tmp); + feature_chunk_row.CopyFromVec(feat_row); ++start; } + + // feat to frontend pipeline cache raw_data->Accept(feature_chunk); + + // send data finish signal if (chunk_idx == num_chunks - 1) { raw_data->SetFinished(); } + + // forward nnet decoder.AdvanceDecode(decodable); + + LOG(INFO) << "Partial result: " << decoder.GetPartialResult(); } - std::string result; - result = decoder.GetFinalBestPath(); + + decoder.FinalizeSearch(); + + // get 1-best result + std::string result = decoder.GetFinalBestPath(); + 
+ // after process one utt, then reset state. decodable->Reset(); decoder.Reset(); + if (result.empty()) { // the TokenWriter can not write empty string. ++num_err; - KALDI_LOG << " the result of " << utt << " is empty"; + LOG(INFO) << " the result of " << utt << " is empty"; continue; } - KALDI_LOG << " the result of " << utt << " is " << result; + + LOG(INFO) << " the result of " << utt << " is " << result; result_writer.Write(utt, result); + ++num_done; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err - << " with errors."; double elapsed = timer.Elapsed(); - KALDI_LOG << " cost:" << elapsed << " s"; + LOG(INFO) << "Program cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; return (num_done != 0 ? 0 : 1); } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h new file mode 100644 index 00000000..76b09e9b --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -0,0 +1,98 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h + +#pragma once + +#include "base/common.h" +#include "utils/math.h" + +namespace ppspeech { + +class ContextGraph; + +struct PrefixScore { + // decoding, unit in log scale + float b = -kBaseFloatMax; // blank ending score + float nb = -kBaseFloatMax; // none-blank ending score + + // decoding score, sum + float Score() const { return LogSumExp(b, nb); } + + // timestamp, unit in log sclae + float v_b = -kBaseFloatMax; // viterbi blank ending score + float v_nb = -kBaseFloatMax; // niterbi none-blank ending score + float cur_token_prob = -kBaseFloatMax; // prob of current token + std::vector times_b; // times of viterbi blank path + std::vector times_nb; // times of viterbi non-blank path + + + // timestamp score, max + float ViterbiScore() const { return std::max(v_b, v_nb); } + + // get timestamp + const std::vector& Times() const { + return v_b > v_nb ? times_b : times_nb; + } + + // context state + bool has_context = false; + int context_state = 0; + float context_score = 0; + std::vector start_boundaries; + std::vector end_boundaries; + + + // decodign score with context bias + float TotalScore() const { return Score() + context_score; } + + void CopyContext(const PrefixScore& prefix_score) { + context_state = prefix_score.context_state; + context_score = prefix_score.context_score; + start_boundaries = prefix_score.start_boundaries; + end_boundaries = prefix_score.end_boundaries; + } + + void UpdateContext(const std::shared_ptr& constext_graph, + const PrefixScore& prefix_score, + int word_id, + int prefix_len) { + CHECK(false); + } + + void InitEmpty() { + b = 0.0f; // log(1) + nb = -kBaseFloatMax; // log(0) + v_b = 0.0f; // log(1) + v_nb = 0.0f; // log(1) + } +}; + +struct PrefixScoreHash { + // https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector + std::size_t operator()(const std::vector& prefix) const { + 
std::size_t seed = prefix.size(); + for (auto& i : prefix) { + seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +using PrefixWithScoreType = std::pair, PrefixScoreHash>; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 712d27dd..2c2b6d3c 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -18,37 +18,38 @@ namespace ppspeech { TLGDecoder::TLGDecoder(TLGDecoderOptions opts) { fst_.reset(fst::Fst::Read(opts.fst_path)); CHECK(fst_ != nullptr); + word_symbol_table_.reset( fst::SymbolTable::ReadText(opts.word_symbol_table)); + decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts)); - decoder_->InitDecoding(); - frame_decoded_size_ = 0; + + Reset(); } -void TLGDecoder::InitDecoder() { +void TLGDecoder::Reset() { decoder_->InitDecoding(); - frame_decoded_size_ = 0; + num_frame_decoded_ = 0; + return; } +void TLGDecoder::InitDecoder() { Reset(); } + void TLGDecoder::AdvanceDecode( const std::shared_ptr& decodable) { - while (!decodable->IsLastFrame(frame_decoded_size_)) { + while (!decodable->IsLastFrame(num_frame_decoded_)) { AdvanceDecoding(decodable.get()); } } void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { decoder_->AdvanceDecoding(decodable, 1); - frame_decoded_size_++; + num_frame_decoded_++; } -void TLGDecoder::Reset() { - InitDecoder(); - return; -} std::string TLGDecoder::GetPartialResult() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call // BestPathEnd if no frames were decoded.") return std::string(""); @@ -68,7 +69,7 @@ std::string TLGDecoder::GetPartialResult() { } std::string TLGDecoder::GetFinalBestPath() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call // BestPathEnd 
if no frames were decoded.") return std::string(""); @@ -88,4 +89,5 @@ std::string TLGDecoder::GetFinalBestPath() { } return words; } -} + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 1ac46ac6..8be69dad 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -14,37 +14,78 @@ #pragma once -#include "base/basic_types.h" -#include "kaldi/decoder/decodable-itf.h" +#include "base/common.h" +#include "decoder/decoder_itf.h" #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" + +DECLARE_string(graph_path); +DECLARE_string(word_symbol_table); +DECLARE_int32(max_active); +DECLARE_double(beam); +DECLARE_double(lattice_beam); + namespace ppspeech { struct TLGDecoderOptions { - kaldi::LatticeFasterDecoderConfig opts; + kaldi::LatticeFasterDecoderConfig opts{}; // todo remove later, add into decode resource std::string word_symbol_table; std::string fst_path; - TLGDecoderOptions() : word_symbol_table(""), fst_path("") {} + static TLGDecoderOptions InitFromFlags() { + TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + LOG(INFO) << "fst path: " << decoder_opts.fst_path; + LOG(INFO) << "fst symbole table: " << decoder_opts.word_symbol_table; + + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + LOG(INFO) << "LatticeFasterDecoder max active: " + << decoder_opts.opts.max_active; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " + << decoder_opts.opts.lattice_beam; + + return decoder_opts; + } }; -class TLGDecoder { +class TLGDecoder : public DecoderBase { public: explicit TLGDecoder(TLGDecoderOptions opts); + ~TLGDecoder() = default; + void InitDecoder(); - void 
Decode(); - std::string GetBestPath(); - std::vector> GetNBestPath(); - std::string GetFinalBestPath(); - std::string GetPartialResult(); - int NumFrameDecoded(); - int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + void Reset(); + void AdvanceDecode( const std::shared_ptr& decodable); - void Reset(); + + void Decode(); + + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; + + int DecodeLikelihoods(const std::vector>& probs, + const std::vector& nbest_words); + + protected: + std::string GetBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath(int n) override { + CHECK(false); + return {}; + } private: void AdvanceDecoding(kaldi::DecodableInterface* decodable); @@ -52,8 +93,6 @@ class TLGDecoder { std::shared_ptr decoder_; std::shared_ptr> fst_; std::shared_ptr word_symbol_table_; - // the frame size which have decoded starts from 0. - int32 frame_decoded_size_; }; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc new file mode 100644 index 00000000..e9bd8a3f --- /dev/null +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// todo refactor, repalce with gtest + +#include "base/common.h" +#include "decoder/ctc_tlg_decoder.h" +#include "decoder/param.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/ds2_nnet.h" + + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); + + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +// test TLG decoder by feeding speech feature. +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int32 num_done = 0, num_err = 0; + + ppspeech::TLGDecoderOptions opts = + ppspeech::TLGDecoderOptions::InitFromFlags(); + opts.opts.beam = 15.0; + opts.opts.lattice_beam = 7.5; + ppspeech::TLGDecoder decoder(opts); + + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + + std::shared_ptr nnet( + new ppspeech::PaddleNnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + + int32 chunk_size = FLAGS_receptive_field_length + + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + decoder.InitDecoder(); + kaldi::Timer timer; + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + 
kaldi::Matrix feature = feature_reader.Value(); + raw_data->SetDim(feature.NumCols()); + LOG(INFO) << "process utt: " << utt; + LOG(INFO) << "rows: " << feature.NumRows(); + LOG(INFO) << "cols: " << feature.NumCols(); + + int32 row_idx = 0; + int32 padding_len = 0; + int32 ori_feature_len = feature.NumRows(); + if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { + padding_len = + chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride; + feature.Resize(feature.NumRows() + padding_len, + feature.NumCols(), + kaldi::kCopyData); + } + int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + kaldi::Vector feature_chunk(chunk_size * + feature.NumCols()); + int32 feature_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + feature_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (feature_chunk_size < receptive_field_length) break; + + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < chunk_size; ++row_id) { + kaldi::SubVector tmp(feature, start); + kaldi::SubVector f_chunk_tmp( + feature_chunk.Data() + row_id * feature.NumCols(), + feature.NumCols()); + f_chunk_tmp.CopyFromVec(tmp); + ++start; + } + raw_data->Accept(feature_chunk); + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + decoder.AdvanceDecode(decodable); + } + std::string result; + result = decoder.GetFinalBestPath(); + decodable->Reset(); + decoder.Reset(); + if (result.empty()) { + // the TokenWriter can not write empty string. 
+ ++num_err; + KALDI_LOG << " the result of " << utt << " is empty"; + continue; + } + KALDI_LOG << " the result of " << utt << " is " << result; + result_writer.Write(utt, result); + ++num_done; + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << " cost:" << elapsed << " s"; + + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h new file mode 100644 index 00000000..2289b317 --- /dev/null +++ b/speechx/speechx/decoder/decoder_itf.h @@ -0,0 +1,66 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/decoder/decodable-itf.h" + +namespace ppspeech { + +enum SearchType { + kPrefixBeamSearch = 0, + kWfstBeamSearch = 1, +}; +class DecoderInterface { + public: + virtual ~DecoderInterface() {} + + virtual void InitDecoder() = 0; + + virtual void Reset() = 0; + + // call AdvanceDecoding + virtual void AdvanceDecode( + const std::shared_ptr& decodable) = 0; + + // call GetBestPath + virtual std::string GetFinalBestPath() = 0; + + virtual std::string GetPartialResult() = 0; + + protected: + // virtual void AdvanceDecoding(kaldi::DecodableInterface* decodable) = 0; + + // virtual void Decode() = 0; + + virtual std::string GetBestPath() = 0; + + virtual std::vector> GetNBestPath() = 0; + + virtual std::vector> GetNBestPath(int n) = 0; +}; + +class DecoderBase : public DecoderInterface { + protected: + // start from one + int NumFrameDecoded() { return num_frame_decoded_ + 1; } + + protected: + // current decoding frame number, abs_time_step_ + int32 num_frame_decoded_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc index 0e249cc6..e0acbe77 100644 --- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc +++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc @@ -30,8 +30,11 @@ using std::vector; // test decoder by feeding nnet posterior probability int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader likelihood_reader( FLAGS_nnet_prob_respecifier); diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index d6ee2705..ebdd7119 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -17,23 +17,29 @@ #include 
"base/common.h" #include "decoder/ctc_beam_search_decoder.h" #include "decoder/ctc_tlg_decoder.h" -#include "frontend/audio/feature_pipeline.h" // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +DEFINE_bool(fill_zero, + false, + "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); + // feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + + // nnet +DEFINE_string(vocab_path, "", "nnet vocab path."); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( @@ -48,71 +54,30 @@ DEFINE_string(model_cache_names, "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); + // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); -namespace ppspeech { -// todo refactor later -FeaturePipelineOptions InitFeaturePipelineOptions() { - FeaturePipelineOptions opts; - opts.cmvn_file = FLAGS_cmvn_file; - kaldi::FrameExtractionOptions frame_opts; - frame_opts.dither = 0.0; - frame_opts.frame_shift_ms = 10; - opts.use_fbank = FLAGS_use_fbank; - if (opts.use_fbank) { - opts.to_float32 = 
false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.frame_opts = frame_opts; - } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; - } - opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - - return opts; -} - -ModelOptions InitModelOptions() { - ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - return model_opts; -} - -TLGDecoderOptions InitDecoderOptions() { - TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - return decoder_opts; -} -RecognizerResource InitRecognizerResoure() { - RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = InitFeaturePipelineOptions(); - resource.model_opts = InitModelOptions(); - resource.tlg_opts = InitDecoderOptions(); - return resource; -} -} +// DecodeOptions flags +// DEFINE_int32(chunk_size, -1, "decoding chunk size"); +DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); +DEFINE_double(ctc_weight, + 0.5, + "ctc weight when combining ctc score and rescoring score"); +DEFINE_double(rescoring_weight, + 1.0, + "rescoring weight when 
combining ctc score and rescoring score"); +DEFINE_double(reverse_weight, + 0.3, + "used for bitransformer rescoring. it must be 0.0 if decoder is" + "conventional transformer decoder, and only reverse_weight > 0.0" + "dose the right to left decoder will be calculated and used"); +DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); +DEFINE_int32(blank, 0, "blank id in vocab"); diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 8ae63256..050d78be 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -1,5 +1,3 @@ -project(frontend) - add_library(frontend STATIC cmvn.cc db_norm.cc diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 37eeec80..56dfc3aa 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -16,16 +16,18 @@ namespace ppspeech { +using kaldi::BaseFloat; using kaldi::Vector; using kaldi::VectorBase; -using kaldi::BaseFloat; using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, unique_ptr base_extractor) { + fill_zero_ = opts.fill_zero; frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk; frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length; + cache_size_ = frame_chunk_size_ - frame_chunk_stride_; receptive_filed_length_ = opts.receptive_filed_length; base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -38,49 +40,83 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { - feats->Resize(dim_ * frame_chunk_size_); bool result = Compute(feats); return result; } -// read all data from base_feature_extractor_ into cache_ +// read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { - // compute and 
feed - bool result = false; + // compute and feed frame by frame while (feature_cache_.size() < frame_chunk_size_) { Vector feature; - result = base_extractor_->Read(&feature); + bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - if (IsFinished() == false) return false; - break; + VLOG(1) << "result: " << result + << " feature dim: " << feature.Dim(); + if (IsFinished() == false) { + VLOG(1) << "finished reading feature. cache size: " + << feature_cache_.size(); + return false; + } else { + VLOG(1) << "break"; + break; + } } + + CHECK(feature.Dim() == dim_); feature_cache_.push(feature); + + nframes_ += 1; + VLOG(1) << "nframes: " << nframes_; } if (feature_cache_.size() < receptive_filed_length_) { + VLOG(1) << "feature_cache less than receptive_filed_lenght. " + << feature_cache_.size() << ": " << receptive_filed_length_; return false; } - while (feature_cache_.size() < frame_chunk_size_) { - Vector feature(dim_, kaldi::kSetZero); - feature_cache_.push(feature); + if (fill_zero_) { + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature(dim_, kaldi::kSetZero); + nframes_ += 1; + feature_cache_.push(feature); + } } + int32 this_chunk_size = + std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + feats->Resize(dim_ * this_chunk_size); + VLOG(1) << "read " << this_chunk_size << " feat."; + int32 counter = 0; - int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; - int32 elem_dim = base_extractor_->Dim(); - while (counter < frame_chunk_size_) { + while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); - int32 start = counter * elem_dim; - feats->Range(start, elem_dim).CopyFromVec(val); - if (frame_chunk_size_ - counter <= cache_size) { + CHECK(val.Dim() == dim_) << val.Dim(); + + int32 start = counter * dim_; + feats->Range(start, dim_).CopyFromVec(val); + + if (this_chunk_size - counter <= cache_size_) { feature_cache_.push(val); } + + // val is reference, so we should 
pop here feature_cache_.pop(); + counter++; } + CHECK(feature_cache_.size() == cache_size_); - return result; + return true; +} + + +void Assembler::Reset() { + std::queue> empty; + std::swap(feature_cache_, empty); + nframes_ = 0; + base_extractor_->Reset(); } } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 258e61f2..72e6f635 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -22,14 +22,11 @@ namespace ppspeech { struct AssemblerOptions { // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py // the nnet batch forward - int32 receptive_filed_length; - int32 subsampling_rate; - int32 nnet_decoder_chunk; - - AssemblerOptions() - : receptive_filed_length(1), - subsampling_rate(1), - nnet_decoder_chunk(1) {} + int32 receptive_filed_length{1}; + int32 subsampling_rate{1}; + int32 nnet_decoder_chunk{1}; + bool fill_zero{false}; // whether fill zero when last chunk is not equal to + // frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -39,29 +36,34 @@ class Assembler : public FrontendInterface { std::unique_ptr base_extractor = NULL); // Feed feats or waves - virtual void Accept(const kaldi::VectorBase& inputs); + void Accept(const kaldi::VectorBase& inputs) override; // feats size = num_frames * feat_dim - virtual bool Read(kaldi::Vector* feats); + bool Read(kaldi::Vector* feats) override; // feat dim - virtual size_t Dim() const { return dim_; } + size_t Dim() const override { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } + void SetFinished() override { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool IsFinished() const override { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } + void Reset() override; private: 
bool Compute(kaldi::Vector* feats); - int32 dim_; + bool fill_zero_{false}; + + int32 dim_; // feat dim int32 frame_chunk_size_; // window int32 frame_chunk_stride_; // stride + int32 cache_size_; // window - stride int32 receptive_filed_length_; std::queue> feature_cache_; std::unique_ptr base_extractor_; + + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index b7a15acd..61ef8841 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "frontend/audio/audio_cache.h" + #include "kaldi/base/timer.h" namespace ppspeech { using kaldi::BaseFloat; -using kaldi::VectorBase; using kaldi::Vector; +using kaldi::VectorBase; AudioCache::AudioCache(int buffer_size, bool to_float32) : finished_(false), @@ -83,6 +84,10 @@ bool AudioCache::Read(Vector* waves) { } size_ -= chunk_size; offset_ = (offset_ + chunk_size) % ring_buffer_.size(); + + nsamples_ += chunk_size; + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index fc07d4ba..4708a6e0 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -41,10 +41,11 @@ class AudioCache : public FrontendInterface { virtual bool IsFinished() const { return finished_; } - virtual void Reset() { + void Reset() override { offset_ = 0; size_ = 0; finished_ = false; + nsamples_ = 0; } private: @@ -61,6 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram + int32 nsamples_; // number samples readed. 
DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 1ea83aba..3d80e001 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -14,22 +14,25 @@ #include "frontend/audio/cmvn.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { + CHECK_NE(cmvn_file, ""); base_extractor_ = std::move(base_extractor); + bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); @@ -55,11 +58,11 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; + feats->Dim() % dim_ != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' + << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x'; } if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR @@ -67,7 +70,7 @@ void CMVN::Compute(VectorBase* feats) const { << "are supplied."; } - double count = stats_(0, dim); + double count = stats_(0, dim_); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats_, we use a count of one. 
if (count < 1.0) @@ -77,14 +80,14 @@ void CMVN::Compute(VectorBase* feats) const { if (!var_norm_) { Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); + SubVector mean_stats(stats_.RowData(0), dim_); Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); + // fill the datat of mean_stats in mean_stats_appy whose dim_ is equal + // with the dim_ of feature. + // the dim_ of feats = dim_ * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim_ * idx, + dim_); stats_tmp.CopyFromVec(mean_stats); } offset.AddVec(-1.0 / count, mean_stats_apply); @@ -94,7 +97,7 @@ void CMVN::Compute(VectorBase* feats) const { // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { + for (int32 d = 0; d < dim_; d++) { double mean, offset, scale; mean = stats_(0, d) / count; double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; @@ -111,7 +114,7 @@ void CMVN::Compute(VectorBase* feats) const { for (int32 d_skip = d; d_skip < feats->Dim();) { norm(0, d_skip) = offset; norm(1, d_skip) = scale; - d_skip = d_skip + dim; + d_skip = d_skip + dim_; } } // Apply the normalization. 
diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 0def1466..713c9ef1 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)"); using namespace boost::json; // from int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; LOG(INFO) << "cmvn josn path: " << FLAGS_json_file; @@ -44,13 +47,13 @@ int main(int argc, char* argv[]) { for (auto obj : value.as_object()) { if (obj.key() == "mean_stat") { - LOG(INFO) << "mean_stat:" << obj.value(); + VLOG(2) << "mean_stat:" << obj.value(); } if (obj.key() == "var_stat") { - LOG(INFO) << "var_stat: " << obj.value(); + VLOG(2) << "var_stat: " << obj.value(); } if (obj.key() == "frame_num") { - LOG(INFO) << "frame_num: " << obj.value(); + VLOG(2) << "frame_num: " << obj.value(); } } @@ -76,7 +79,7 @@ int main(int argc, char* argv[]) { cmvn_stats(1, idx) = var_stat_vec[idx]; } cmvn_stats(0, mean_size) = frame_num; - LOG(INFO) << cmvn_stats; + VLOG(2) << cmvn_stats; kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary); LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index f7a42315..e2b54a8a 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -16,29 +16,36 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" 
#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(num_bins, 161, "fbank num bins"); +DEFINE_int32(sample_rate, 16000, "sampe rate: 16k, 8k."); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + CHECK_GT(FLAGS_wav_rspecifier.size(), 0); + CHECK_GT(FLAGS_feature_wspecifier.size(), 0); kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); + kaldi::SequentialTableReader wav_info_reader( + FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); int32 num_done = 0, num_err = 0; @@ -54,6 +61,10 @@ int main(int argc, char* argv[]) { opt.frame_opts.frame_shift_ms = 10; opt.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0; + LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms; + LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins; + LOG(INFO) << "dither: " << opt.frame_opts.dither; std::unique_ptr fbank( new ppspeech::Fbank(opt, std::move(data_source))); @@ -61,53 +72,76 @@ int main(int argc, char* argv[]) { std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank))); - ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. 
+ ppspeech::FeatureCacheOptions feat_cache_opts; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); - int sample_rate = 16000; + float streaming_chunk = FLAGS_streaming_chunk; - int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; - LOG(INFO) << "chunk size (s): " << streaming_chunk; + int chunk_sample_size = streaming_chunk * FLAGS_sample_rate; + LOG(INFO) << "sr: " << FLAGS_sample_rate; + LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); + for (; !wav_reader.Done() && !wav_info_reader.Done(); + wav_reader.Next(), wav_info_reader.Next()) { + const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); - LOG(INFO) << "process utt: " << utt; + const std::string& utt2 = wav_info_reader.Key(); + const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); + + CHECK(utt == utt2) + << "wav reader and wav info reader using diff rspecifier!!!"; + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "samples: " << wave_info.SampleCount(); + LOG(INFO) << "dur: " << wave_info.Duration() << " sec"; + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) + << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + + // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); - int tot_samples = waveform.Dim(); - LOG(INFO) << "wav len (sample): " << tot_samples; + // compute feat chunk by chunk + int tot_samples = waveform.Dim(); int sample_offset = 0; std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { + // cur chunk size int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + // get chunk wav kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < 
cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } - kaldi::Vector features; + // compute feat feature_cache.Accept(wav_chunk); + + // send finish signal if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } + + // read feat + kaldi::Vector features; bool flag = true; do { flag = feature_cache.Read(&features); - feats.push_back(features); - feature_rows += features.Dim() / feature_cache.Dim(); + if (flag && features.Dim() != 0) { + feats.push_back(features); + feature_rows += features.Dim() / feature_cache.Dim(); + } } while (flag == true && features.Dim() != 0); + + // forward offset sample_offset += cur_chunk_size; } @@ -125,14 +159,20 @@ int main(int argc, char* argv[]) { ++cur_idx; } } + LOG(INFO) << "feat shape: " << features.NumRows() << " , " + << features.NumCols(); feat_writer.Write(utt, features); + + // reset frontend pipeline state feature_cache.Reset(); if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 
0 : 1); } diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 162c3529..42693c0c 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -14,16 +14,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); @@ -31,8 +30,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 64e9db86..5f5cd51b 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -27,7 +27,7 @@ namespace ppspeech { // pre-recorded audio/feature class DataCache : public FrontendInterface { public: - explicit DataCache() { finished_ = false; } + DataCache() { finished_ = false; } // accept waves/feats virtual void Accept(const kaldi::VectorBase& inputs) { @@ -56,4 +56,4 @@ class DataCache : public FrontendInterface { 
DISALLOW_COPY_AND_ASSIGN(DataCache); }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc index 931e932d..ad79fcc3 100644 --- a/speechx/speechx/frontend/audio/db_norm.cc +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -14,17 +14,18 @@ #include "frontend/audio/db_norm.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index 059abbbd..deabe876 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/fbank.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; FbankComputer::FbankComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 509a98c3..3f05eae6 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -16,12 +16,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { @@ -73,6 +73,9 @@ bool FeatureCache::Compute() { if (result == false || feature.Dim() == 0) return false; int32 num_chunk = feature.Dim() / dim_; + nframe_ += num_chunk; + VLOG(1) << "nframe computed: " << nframe_; + for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; Vector feature_chunk(dim_); diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b922de12..bd869225 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -41,21 +41,24 @@ class FeatureCache : public FrontendInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { + LOG(INFO) << "set finished"; // std::unique_lock lock(mutex_); base_extractor_->SetFinished(); - LOG(INFO) << "set finished"; + // read the last chunk data Compute(); // ready_feed_condition_.notify_one(); + LOG(INFO) << 
"compute last feats done."; } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { + void Reset() override { + std::queue> empty; + std::swap(cache_, empty); + nframe_ = 0; base_extractor_->Reset(); - while (!cache_.empty()) { - cache_.pop(); - } + VLOG(1) << "feature cache reset: cache size: " << cache_.size(); } private: @@ -74,6 +77,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9cacff9f..2931b96b 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,8 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) + : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); @@ -32,6 +33,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK_NE(opts.cmvn_file, ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); @@ -42,4 +44,4 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::Assembler(opts.assembler_opts, std::move(cache))); } -} // ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 48f95e3f..e83a3f31 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -25,27 +25,78 @@ #include "frontend/audio/linear_spectrogram.h" #include 
"frontend/audio/normalizer.h" +// feature +DECLARE_bool(use_fbank); +DECLARE_bool(fill_zero); +DECLARE_int32(num_bins); +DECLARE_string(cmvn_file); + +// feature sliding window +DECLARE_int32(receptive_field_length); +DECLARE_int32(subsampling_rate); +DECLARE_int32(nnet_decoder_chunk); + namespace ppspeech { struct FeaturePipelineOptions { - std::string cmvn_file; - bool to_float32; // true, only for linear feature - bool use_fbank; - LinearSpectrogramOptions linear_spectrogram_opts; - kaldi::FbankOptions fbank_opts; - FeatureCacheOptions feature_cache_opts; - AssemblerOptions assembler_opts; - - FeaturePipelineOptions() - : cmvn_file(""), - to_float32(false), // true, only for linear feature - use_fbank(true), - linear_spectrogram_opts(), - fbank_opts(), - feature_cache_opts(), - assembler_opts() {} + std::string cmvn_file{}; + bool to_float32{false}; // true, only for linear feature + bool use_fbank{true}; + LinearSpectrogramOptions linear_spectrogram_opts{}; + kaldi::FbankOptions fbank_opts{}; + FeatureCacheOptions feature_cache_opts{}; + AssemblerOptions assembler_opts{}; + + static FeaturePipelineOptions InitFromFlags() { + FeaturePipelineOptions opts; + opts.cmvn_file = FLAGS_cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; + + // frame options + kaldi::FrameExtractionOptions frame_opts; + frame_opts.dither = 0.0; + LOG(INFO) << "dither: " << frame_opts.dither; + frame_opts.frame_shift_ms = 10; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); + if (opts.use_fbank) { + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; + + opts.fbank_opts.frame_opts = frame_opts; + } else { + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + + opts.linear_spectrogram_opts.frame_opts = frame_opts; + } + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + + // assembler opts + opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; + opts.assembler_opts.receptive_filed_length = + FLAGS_receptive_field_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + opts.assembler_opts.fill_zero = FLAGS_fill_zero; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; + LOG(INFO) << "nnet receptive filed length: " + << opts.assembler_opts.receptive_filed_length; + LOG(INFO) << "nnet chunk size: " + << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero; + return opts; + } }; + class FeaturePipeline : public FrontendInterface { public: explicit FeaturePipeline(const FeaturePipelineOptions& opts); @@ -60,7 +111,21 @@ class FeaturePipeline : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual void Reset() { base_extractor_->Reset(); } + const FeaturePipelineOptions& Config() { return opts_; } + + const BaseFloat FrameShift() const { + return opts_.fbank_opts.frame_opts.frame_shift_ms; + } + const BaseFloat FrameLength() const { + return opts_.fbank_opts.frame_opts.frame_length_ms; + } + const BaseFloat SampleRate() const { + return opts_.fbank_opts.frame_opts.samp_freq; + } + private: + FeaturePipelineOptions opts_; std::unique_ptr base_extractor_; }; 
-} + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc index 55c03978..d4a2fcc6 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "frontend/audio/linear_spectrogram.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/frontend/audio/mfcc.cc index bda1f96d..15f8cb0f 100644 --- a/speechx/speechx/frontend/audio/mfcc.cc +++ b/speechx/speechx/frontend/audio/mfcc.cc @@ -14,6 +14,7 @@ #include "frontend/audio/mfcc.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -21,12 +22,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; Mfcc::Mfcc(const MfccOptions& opts, diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h index 62b0078c..6c1c2f7d 100644 --- a/speechx/speechx/frontend/audio/mfcc.h +++ b/speechx/speechx/frontend/audio/mfcc.h @@ -14,7 +14,6 @@ #pragma once -#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" diff --git a/speechx/speechx/kaldi/decoder/decodable-itf.h 
b/speechx/speechx/kaldi/decoder/decodable-itf.h index b8ce9143..a7c12588 100644 --- a/speechx/speechx/kaldi/decoder/decodable-itf.h +++ b/speechx/speechx/kaldi/decoder/decodable-itf.h @@ -101,7 +101,9 @@ namespace kaldi { */ class DecodableInterface { public: - /// Returns the log likelihood, which will be negated in the decoder. + virtual ~DecodableInterface() {} + + /// Returns the log likelihood(logprob), which will be negated in the decoder. /// The "frame" starts from zero. You should verify that NumFramesReady() > /// frame /// before calling this. @@ -143,11 +145,12 @@ class DecodableInterface { /// this is for compatibility with OpenFst). virtual int32 NumIndices() const = 0; + /// Returns the likelihood(prob), which will be positive in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. virtual bool FrameLikelihood( int32 frame, std::vector* likelihood) = 0; - - - virtual ~DecodableInterface() {} }; /// @} } // namespace Kaldi diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index c325ce75..43566616 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -1,14 +1,39 @@ -project(nnet) +set(srcs decodable.cc) -add_library(nnet STATIC - decodable.cc - paddle_nnet.cc -) +if(USING_DS2) + list(APPEND srcs ds2_nnet.cc) +endif() + +if(USING_U2) + list(APPEND srcs u2_nnet.cc) +endif() + +add_library(nnet STATIC ${srcs}) target_link_libraries(nnet absl::strings) -set(bin_name nnet_forward_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) +if(USING_U2) + 
target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) + target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +endif() + + +if(USING_DS2) + set(bin_name ds2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_link_libraries(${bin_name} ${DEPS}) +endif() +# test bin +if(USING_U2) + set(bin_name u2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 465f64a9..11d60d3e 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -18,10 +18,10 @@ namespace ppspeech { using kaldi::BaseFloat; using kaldi::Matrix; -using std::vector; using kaldi::Vector; +using std::vector; -Decodable::Decodable(const std::shared_ptr& nnet, +Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale) : frontend_(frontend), @@ -30,17 +30,17 @@ Decodable::Decodable(const std::shared_ptr& nnet, frames_ready_(0), acoustic_scale_(acoustic_scale) {} +// for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { - nnet_cache_ = likelihood; + nnet_out_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); } -// Decodable::Init(DecodableConfig config) { -//} // return the size of frame have computed. 
int32 Decodable::NumFramesReady() const { return frames_ready_; } + // frame idx is from 0 to frame_ready_ -1; bool Decodable::IsLastFrame(int32 frame) { bool flag = EnsureFrameHaveComputed(frame); @@ -53,18 +53,9 @@ int32 Decodable::NumIndices() const { return 0; } // id. int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } -BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_cache_.NumCols()); - CHECK_LE(frame, frames_ready_); - int32 frame_idx = frame - frame_offset_; - // the nnet output is prob ranther than log prob - // the index - 1, because the ilabel - return acoustic_scale_ * - std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + - std::numeric_limits::min()); -} bool Decodable::EnsureFrameHaveComputed(int32 frame) { + // decoding frame if (frame >= frames_ready_) { return AdvanceChunk(); } @@ -72,38 +63,112 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + kaldi::Timer timer; + // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { + // no feat or frontend_ not init. + VLOG(1) << "decodable exit;"; return false; } - int32 nnet_dim = 0; - Vector inferences; - nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim); - nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim); - nnet_cache_.CopyRowsFromVec(inferences); + VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; + + // forward feats + NnetOut out; + nnet_->FeedForward(features, frontend_->Dim(), &out); + int32& vocab_dim = out.vocab_dim; + Vector& logprobs = out.logprobs; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim + << " decoder frames."; + // cache nnet outputs + nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(logprobs); + + // update state, decoding frame. 
frame_offset_ = frames_ready_; - frames_ready_ += nnet_cache_.NumRows(); + frames_ready_ += nnet_out_cache_.NumRows(); + VLOG(2) << "Forward feat chunk cost: " << timer.Elapsed() << " sec."; + return true; +} + +bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim) { + if (AdvanceChunk() == false) { + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0) { + LOG(WARNING) << "No new nnet out in cache."; + return false; + } + + logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); + logprobs->CopyRowsFromMat(nnet_out_cache_); + + *vocab_dim = nnet_out_cache_.NumCols(); return true; } +// read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { - std::vector result; - if (EnsureFrameHaveComputed(frame) == false) return false; - likelihood->resize(nnet_cache_.NumCols()); - for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) { + if (EnsureFrameHaveComputed(frame) == false) { + VLOG(1) << "framelikehood exit."; + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + int vocab_size = nnet_out_cache_.NumCols(); + likelihood->resize(vocab_size); + + for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = - nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; + nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; + + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " + << nnet_out_cache_.NumRows() + << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } +BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + CHECK_LE(index, nnet_out_cache_.NumCols()); + CHECK_LE(frame, frames_ready_); + + // the nnet output is prob rather than log prob + // the index - 1, because the ilabel + BaseFloat logprob = 0.0; + int32 
frame_idx = frame - frame_offset_; + BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); + if (nnet_->IsLogProb()) { + logprob = nnet_out; + } else { + logprob = std::log(nnet_out + std::numeric_limits::epsilon()); + } + CHECK(!std::isnan(logprob) && !std::isinf(logprob)); + return acoustic_scale_ * logprob; +} + void Decodable::Reset() { if (frontend_ != nullptr) frontend_->Reset(); if (nnet_ != nullptr) nnet_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_cache_.Resize(0, 0); + nnet_out_cache_.Resize(0, 0); +} + +void Decodable::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { + nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 9555fea7..dd7b329e 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -24,38 +24,68 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable(const std::shared_ptr& nnet, + explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale = 1.0); + // void Init(DecodableOpts config); + + // nnet logprob output, used by wfst virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); - virtual bool IsLastFrame(int32 frame); - virtual int32 NumIndices() const; - // not logprob + + // nnet output virtual bool FrameLikelihood(int32 frame, std::vector* likelihood); + + // forward nnet with feats + bool AdvanceChunk(); + + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score); + + virtual bool IsLastFrame(int32 frame); + + // nnet output dim, e.g. 
vocab size + virtual int32 NumIndices() const; + virtual int32 NumFramesReady() const; - // for offline test - void Acceptlikelihood(const kaldi::Matrix& likelihood); + void Reset(); + bool IsInputFinished() const { return frontend_->IsFinished(); } + bool EnsureFrameHaveComputed(int32 frame); + int32 TokenId2NnetId(int32 token_id); + std::shared_ptr Nnet() { return nnet_; } + + // for offline test + void Acceptlikelihood(const kaldi::Matrix& likelihood); + private: - bool AdvanceChunk(); std::shared_ptr frontend_; - std::shared_ptr nnet_; - kaldi::Matrix nnet_cache_; + std::shared_ptr nnet_; + + // nnet outputs' cache + kaldi::Matrix nnet_out_cache_; + // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame // eg: 35 frame features output 8 frame inferences int32 frame_offset_; int32 frames_ready_; + // todo: feature frame mismatch with nnet inference frame // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc similarity index 94% rename from speechx/speechx/nnet/paddle_nnet.cc rename to speechx/speechx/nnet/ds2_nnet.cc index 881a82f5..22c7f61b 100644 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" + #include "absl/strings/str_split.h" namespace ppspeech { -using std::vector; -using std::string; -using std::shared_ptr; using kaldi::Matrix; using kaldi::Vector; +using std::shared_ptr; +using std::string; +using std::vector; void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { std::vector cache_names; @@ -48,6 +49,7 @@ void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { } PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { + subsampling_rate_ = opts.subsample_rate; paddle_infer::Config config; config.SetModel(opts.model_path, opts.param_path); if (opts.use_gpu) { @@ -143,9 +145,8 @@ shared_ptr> PaddleNnet::GetCacheEncoder(const string& name) { } void PaddleNnet::FeedForward(const Vector& features, - int32 feature_dim, - Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { paddle_infer::Predictor* predictor = GetPredictor(); int feat_row = features.Dim() / feature_dim; @@ -203,9 +204,13 @@ void PaddleNnet::FeedForward(const Vector& features, std::vector output_shape = output_tensor->shape(); int32 row = output_shape[1]; int32 col = output_shape[2]; - inferences->Resize(row * col); - *inference_dim = col; - output_tensor->CopyToCpu(inferences->Data()); + + + // inferences->Resize(row * col); + // *inference_dim = col; + out->logprobs.Resize(row * col); + out->vocab_dim = col; + output_tensor->CopyToCpu(out->logprobs.Data()); ReleasePredictor(predictor); } diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/ds2_nnet.h similarity index 50% rename from speechx/speechx/nnet/paddle_nnet.h rename to speechx/speechx/nnet/ds2_nnet.h index e2b3d5bc..420fa177 100644 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -13,64 +13,20 @@ // limitations under the License. 
#pragma once #include + #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" #include "nnet/nnet_itf.h" #include "paddle_inference_api.h" namespace ppspeech { -struct ModelOptions { - std::string model_path; - std::string param_path; - int thread_num; // predictor thread pool size - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(2), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("model-param", ¶m_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - "paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); - } -}; template class Tensor { public: Tensor() {} - Tensor(const std::vector& shape) : _shape(shape) { + explicit Tensor(const std::vector& shape) : _shape(shape) { int neml = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies()); LOG(INFO) << "Tensor neml: " << neml; @@ -92,21 +48,35 @@ class Tensor { std::vector _data; }; -class PaddleNnet : public NnetInterface { +class 
PaddleNnet : public NnetBase { public: - PaddleNnet(const ModelOptions& opts); + explicit PaddleNnet(const ModelOptions& opts); - virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim); + void FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) override; + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override { + VLOG(2) << "deepspeech2 not has AttentionRescoring."; + } void Dim(); - virtual void Reset(); + + void Reset() override; + + bool IsLogProb() override { return false; } + + std::shared_ptr> GetCacheEncoder( const std::string& name); + void InitCacheEncouts(const ModelOptions& opts); + void EncoderOuts(std::vector>* encoder_out) + const override {} + private: paddle_infer::Predictor* GetPredictor(); int ReleasePredictor(paddle_infer::Predictor* predictor); @@ -117,6 +87,7 @@ class PaddleNnet : public NnetInterface { std::map predictor_to_thread_id; std::map cache_names_idx_; std::vector>> cache_encouts_; + ModelOptions opts_; public: diff --git a/speechx/speechx/nnet/nnet_forward_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc similarity index 77% rename from speechx/speechx/nnet/nnet_forward_main.cc rename to speechx/speechx/nnet/ds2_nnet_main.cc index 0d4ea8ff..6092b8a4 100644 --- a/speechx/speechx/nnet/nnet_forward_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,45 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "base/flags.h" -#include "base/log.h" +#include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); @@ -62,13 +44,8 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = 
FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -76,8 +53,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; @@ -146,7 +123,7 @@ int main(int argc, char* argv[]) { } kaldi::Matrix result(prob_vec.size(), prob_vec[0].Dim()); - for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { + for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) { result(row_idx, col_idx) = prob_vec[row_idx](col_idx); } diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index ac040fba..a504cce5 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -11,24 +11,110 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- - #pragma once #include "base/basic_types.h" #include "kaldi/base/kaldi-types.h" #include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +DECLARE_int32(subsampling_rate); +DECLARE_string(model_path); +DECLARE_string(param_path); +DECLARE_string(model_input_names); +DECLARE_string(model_output_names); +DECLARE_string(model_cache_names); +DECLARE_string(model_cache_shapes); namespace ppspeech { +struct ModelOptions { + // common + int subsample_rate{1}; + int thread_num{1}; // predictor thread pool size for ds2; + bool use_gpu{false}; + std::string model_path; + + std::string param_path; + + // ds2 for inference + std::string input_names{}; + std::string output_names{}; + std::string cache_names{}; + std::string cache_shape{}; + bool switch_ir_optim{false}; + bool enable_fc_padding{false}; + bool enable_profile{false}; + + static ModelOptions InitFromFlags() { + ModelOptions opts; + opts.subsample_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + opts.model_path = FLAGS_model_path; + LOG(INFO) << "model path: " << opts.model_path; + + opts.param_path = FLAGS_param_path; + LOG(INFO) << "param path: " << opts.param_path; + + LOG(INFO) << "DS2 param: "; + opts.cache_names = FLAGS_model_cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; + opts.cache_shape = FLAGS_model_cache_shapes; + LOG(INFO) << " cache shape: " << opts.cache_shape; + opts.input_names = FLAGS_model_input_names; + LOG(INFO) << " input names: " << opts.input_names; + opts.output_names = FLAGS_model_output_names; + LOG(INFO) << " output names: " << opts.output_names; + return opts; + } +}; + +struct NnetOut { + // nnet out. maybe logprob or prob. Almost time this is logprob. + kaldi::Vector logprobs; + int32 vocab_dim; + + // nnet state. Only using in Attention model. 
+ std::vector> encoder_outs; + + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} +}; + + class NnetInterface { public: + virtual ~NnetInterface() {} + + // forward feat with nnet. + // nnet do not cache feats, feats cached by frontend. + // nnet cache model state, i.e. encoder_outs, att_cache, cnn_cache, + // frame_offset. virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) = 0; + const int32& feature_dim, + NnetOut* out) = 0; + + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + + // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; - virtual ~NnetInterface() {} + + // true, nnet output is logprob; otherwise is prob, + virtual bool IsLogProb() = 0; + + // using to get encoder outs. e.g. seq2seq with Attention model. + virtual void EncoderOuts( + std::vector>* encoder_out) const = 0; +}; + + +class NnetBase : public NnetInterface { + public: + int SubsamplingRate() const { return subsampling_rate_; } + + protected: + int subsampling_rate_{1}; }; } // namespace ppspeech diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc new file mode 100644 index 00000000..19cb85fd --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -0,0 +1,665 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc + +#include "nnet/u2_nnet.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif // end USE_PROFILING + +namespace ppspeech { + + +void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { + paddle::jit::utils::InitKernelSignatureMap(); + +#ifdef USE_GPU + dev_ = phi::GPUPlace(); +#else + dev_ = phi::CPUPlace(); +#endif + paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); + model_ = std::make_shared(std::move(model)); + + subsampling_rate_ = model_->Attribute("subsampling_rate"); + right_context_ = model_->Attribute("right_context"); + sos_ = model_->Attribute("sos_symbol"); + eos_ = model_->Attribute("eos_symbol"); + is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); + + forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); + forward_attention_decoder_ = model_->Function("forward_attention_decoder"); + ctc_activation_ = model_->Function("ctc_activation"); + CHECK(forward_encoder_chunk_.IsValid()); + CHECK(forward_attention_decoder_.IsValid()); + CHECK(ctc_activation_.IsValid()); + + LOG(INFO) << "Paddle Model Info: "; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; + + Warmup(); +} + +void U2Nnet::Warmup() { +#ifdef USE_PROFILING + RecordEvent event("warmup", TracerEventType::UserDefined, 1); +#endif + + { +#ifdef USE_PROFILING + RecordEvent event( + "warmup-encoder-ctc", TracerEventType::UserDefined, 1); +#endif + int feat_dim = 80; + int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + + // 
(receptive_field - downsample_rate) + paddle::Tensor feats = paddle::full( + {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); + paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); + paddle::Tensor att_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + paddle::Tensor cnn_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; + std::vector outputs = forward_encoder_chunk_(inputs); + + auto chunk_out = outputs[0]; + inputs = std::move(std::vector({chunk_out})); + outputs = ctc_activation_(inputs); + } + + { +#ifdef USE_PROFILING + RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); +#endif + auto hyps = + paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); + auto hyps_lens = + paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); + auto encoder_out = paddle::ones( + {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); + + std::vector inputs{ + hyps, hyps_lens, encoder_out}; + + std::vector outputs = + forward_attention_decoder_(inputs); + } + + Reset(); +} + +U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) { + LoadModel(opts_.model_path); +} + +// shallow copy +U2Nnet::U2Nnet(const U2Nnet& other) { + // copy meta + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidecoder_ = other.is_bidecoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + + forward_encoder_chunk_ = other.forward_encoder_chunk_; + forward_attention_decoder_ = other.forward_attention_decoder_; + ctc_activation_ = other.ctc_activation_; + + offset_ = other.offset_; + + // copy model ptr + model_ = other.model_; + + // ignore inner states +} + +std::shared_ptr U2Nnet::Copy() const { + auto asr_model = std::make_shared(*this); + // reset inner state for new decoding + 
asr_model->Reset(); + return asr_model; +} + +void U2Nnet::Reset() { + offset_ = 0; + + att_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + cnn_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + + encoder_outs_.clear(); + VLOG(1) << "u2nnet reset"; +} + +// Debug API +void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { + // encoder_out (T,D) + encoder_outs_.clear(); + encoder_outs_.push_back(encoder_out); +} + + +void U2Nnet::FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) { + std::vector chunk_feats(features.Data(), + features.Data() + features.Dim()); + + std::vector ctc_probs; + ForwardEncoderChunkImpl( + chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim); + + out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(out->logprobs.Data(), + ctc_probs.data(), + ctc_probs.size() * sizeof(kaldi::BaseFloat)); +} + + +void U2Nnet::ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + const int32& feat_dim, + std::vector* out_prob, + int32* vocab_dim) { +#ifdef USE_PROFILING + RecordEvent event( + "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); +#endif + + // 1. splice cached_feature, and chunk_feats + // First dimension is B, which is 1. 
+ // int num_frames = cached_feats_.size() + chunk_feats.size(); + + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "num_frames: " << num_frames; + VLOG(3) << "feat_dim: " << feat_dim; + + // feats (B=1,T,D) + paddle::Tensor feats = + paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); + float* feats_ptr = feats.mutable_data(); + + // not cache feature in nnet + CHECK_EQ(cached_feats_.size(), 0); + // CHECK_EQ(std::is_same::value, true); + std::memcpy(feats_ptr, + chunk_feats.data(), + chunk_feats.size() * sizeof(kaldi::BaseFloat)); + + VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] + << ", " << feats.shape()[2]; + +#ifdef TEST_DEBUG + { + std::stringstream path("feat", std::ios_base::app | std::ios_base::out); + path << offset_; + std::ofstream feat_fobj(path.str().c_str(), std::ios::out); + CHECK(feat_fobj.is_open()); + // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " + // << feats.shape()[2] << "\n"; + for (int i = 0; i < feats.numel(); i++) { + feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; + if ((i + 1) % feat_dim == 0) { + feat_fobj << "\n"; + } + } + feat_fobj << "\n"; + } +#endif + +// Encoder chunk forward +#ifdef USE_GPU + feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); + att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); + cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); +#endif + + int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 + // must be scalar, but paddle do not have scalar. + paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); + // freeze `required_cache_size` in graph, so not specific it in function + // call. 
+ std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; + CHECK_EQ(inputs.size(), 4); + std::vector outputs = forward_encoder_chunk_(inputs); + CHECK_EQ(outputs.size(), 3); + +#ifdef USE_GPU + paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); + att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); + cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); +#else + paddle::Tensor chunk_out = outputs[0]; + att_cache_ = outputs[1]; + cnn_cache_ = outputs[2]; +#endif + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits", + std::ios_base::app | std::ios_base::out); + auto i = offset_ - chunk_out.shape()[1]; + path << std::max(i, 0L); + std::ofstream logits_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_fobj.is_open()); + logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] + << " " << chunk_out.shape()[2] << "\n"; + const float* chunk_out_ptr = chunk_out.data(); + logits_fobj << chunk_out_ptr << std::endl; + for (int i = 0; i < chunk_out.numel(); i++) { + logits_fobj << chunk_out_ptr[i] << " "; + } + logits_fobj << "\n"; + } +#endif // end TEST_DEBUG + + // current offset in decoder frame + // not used in nnet + offset_ += chunk_out.shape()[1]; + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] + << " total: " << offset_; + + + // collects encoder outs. 
+ encoder_outs_.push_back(chunk_out); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#ifdef USE_GPU + +#error "Not implementation." + +#else + // compute ctc_activation == log_softmax + inputs.clear(); + outputs.clear(); + inputs.push_back(chunk_out); + CHECK_EQ(inputs.size(), 1); + outputs = ctc_activation_(inputs); + CHECK_EQ(outputs.size(), 1); + paddle::Tensor ctc_log_probs = outputs[0]; + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logprob", + std::ios_base::app | std::ios_base::out); + path << offset_ - chunk_out.shape()[1]; + + std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(logprob_fobj.is_open()); + logprob_fobj << ctc_log_probs.shape()[0] << " " + << ctc_log_probs.shape()[1] << " " + << ctc_log_probs.shape()[2] << "\n"; + const float* logprob_ptr = ctc_log_probs.data(); + for (int i = 0; i < ctc_log_probs.numel(); i++) { + logprob_fobj << logprob_ptr[i] << " "; + if ((i + 1) % ctc_log_probs.shape()[2] == 0) { + logprob_fobj << "\n"; + } + } + logprob_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#endif // end USE_GPU + + // Copy to output, (B=1,T,D) + std::vector ctc_log_probs_shape = ctc_log_probs.shape(); + CHECK_EQ(ctc_log_probs_shape.size(), 3); + int B = ctc_log_probs_shape[0]; + CHECK_EQ(B, 1); + int T = 
ctc_log_probs_shape[1]; + int D = ctc_log_probs_shape[2]; + *vocab_dim = D; + + float* ctc_log_probs_ptr = ctc_log_probs.data(); + + out_prob->resize(T * D); + std::memcpy( + out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list_ctc", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + + return; +} + +float U2Nnet::ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos) { + // sum `hyp` path scores in `prob` + // prob (1, Umax, V) + // hyp (U,) + float score = 0.0f; + std::vector dims = prob.shape(); + CHECK_EQ(dims.size(), 3); + VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; + CHECK_EQ(dims[0], 1); + int vocab_dim = static_cast(dims[2]); + + const float* prob_ptr = prob.data(); + for (size_t i = 0; i < hyp.size(); ++i) { + const float* row = prob_ptr + i * vocab_dim; + score += row[hyp[i]]; + } + const float* row = prob_ptr + hyp.size() * vocab_dim; + score += row[eos]; + return score; +} + + +void U2Nnet::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { +#ifdef USE_PROFILING + RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); +#endif + + CHECK(rescoring_score != nullptr); + + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) return; + VLOG(2) << "num 
hyps: " << num_hyps; + + if (encoder_outs_.size() == 0) { + // no encoder outs + std::cerr << "encoder_outs_.size() is zero. Please check it." + << std::endl; + return; + } + + // prepare input + paddle::Tensor hyps_lens = + paddle::zeros({num_hyps}, paddle::DataType::INT64); + int64_t* hyps_len_ptr = hyps_lens.mutable_data(); + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int len = hyps[i].size() + 1; // eos + max_hyps_len = std::max(max_hyps_len, len); + hyps_len_ptr[i] = static_cast(len); + } + VLOG(2) << "max_hyps_len: " << max_hyps_len; + + paddle::Tensor hyps_tensor = + paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); + int64_t* hyps_ptr = hyps_tensor.mutable_data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + int64_t* row = hyps_ptr + max_hyps_len * i; + row[0] = sos_; + for (size_t j = 0; j < hyp.size(); ++j) { + row[j + 1] = hyp[j]; + } + } + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_concat", + std::ios_base::app | std::ios_base::out); + for (int j = 0; j < encoder_outs_.size(); j++) { + path << j; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[j].shape()[0] << " " + << encoder_outs_[j].shape()[1] << " " + << encoder_outs_[j].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[j].data(); + for (int i = 0; i < encoder_outs_[j].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } + } +#endif // end TEST_DEBUG + + // forward attention decoder by hyps and correspoinding encoder_outs_ + paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out0", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + 
CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_outs_[0].data(); + + size_t size = encoder_outs_[0].numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_out.shape()[0] << " " + << encoder_out.shape()[1] << " " + << encoder_out.shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_out.data(); + + size_t size = encoder_out.numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + std::vector inputs{ + hyps_tensor, hyps_lens, encoder_out}; + std::vector outputs = forward_attention_decoder_(inputs); + CHECK_EQ(outputs.size(), 2); + + // (B, Umax, V) + paddle::Tensor probs = outputs[0]; + std::vector probs_shape = probs.shape(); + CHECK_EQ(probs_shape.size(), 3); + CHECK_EQ(probs_shape[0], num_hyps); + CHECK_EQ(probs_shape[1], max_hyps_len); + +#ifdef TEST_DEBUG + { + std::stringstream path("decoder_logprob", + std::ios_base::app | std::ios_base::out); + std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(dec_logprob_fobj.is_open()); + + dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " + << probs.shape()[2] << "\n"; + const float* dec_logprob_ptr = probs.data(); + + size_t size = probs.numel(); + for (int i = 0; i < size; i++) { + dec_logprob_fobj << dec_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_lens", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_len_fobj(path.str().c_str(), 
std::ios::out); + CHECK(hyps_len_fobj.is_open()); + + const int64_t* hyps_lens_ptr = hyps_lens.data(); + + size_t size = hyps_lens.numel(); + for (int i = 0; i < size; i++) { + hyps_len_fobj << hyps_lens_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_tensor", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_tensor_fobj.is_open()); + + const int64_t* hyps_tensor_ptr = hyps_tensor.data(); + + size_t size = hyps_tensor.numel(); + for (int i = 0; i < size; i++) { + hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + paddle::Tensor r_probs = outputs[1]; + std::vector r_probs_shape = r_probs.shape(); + if (is_bidecoder_ && reverse_weight > 0) { + CHECK_EQ(r_probs_shape.size(), 3); + CHECK_EQ(r_probs_shape[0], num_hyps); + CHECK_EQ(r_probs_shape[1], max_hyps_len); + } else { + // dump r_probs + CHECK_EQ(r_probs_shape.size(), 1); + CHECK_EQ(r_probs_shape[0], 1) << r_probs_shape[0]; + } + + // compute rescoring score + using IntArray = paddle::experimental::IntArray; + std::vector probs_v = + paddle::experimental::split_with_num(probs, num_hyps, 0); + VLOG(2) << "split prob: " << probs_v.size() << " " + << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] + << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; + CHECK(static_cast(probs_v.size()) == num_hyps) + << ": is " << probs_v.size() << " expect: " << num_hyps; + + std::vector r_probs_v; + if (is_bidecoder_ && reverse_weight > 0) { + r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); + CHECK(static_cast(r_probs_v.size()) == num_hyps) + << "r_probs_v size: is " << r_probs_v.size() + << " expect: " << num_hyps; + } + + for (int i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + + // left-to-right decoder score + float score = 0.0f; + score = ComputePathScore(probs_v[i], hyp, eos_); + + // 
 right-to-left decoder score + float r_score = 0.0f; + if (is_bidecoder_ && reverse_weight > 0) { + std::vector r_hyp(hyp.size()); + std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); + r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); + } + + // combined left-to-right and right-to-left score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score + << " r_score: " << r_score + << " reverse_weight: " << reverse_weight + << " final score: " << (*rescoring_score)[i]; + } +} + + +void U2Nnet::EncoderOuts( + std::vector>* encoder_out) const { + // list of (B=1,T,D) + int size = encoder_outs_.size(); + VLOG(1) << "encoder_outs_ size: " << size; + + for (int i = 0; i < size; i++) { + const paddle::Tensor& item = encoder_outs_[i]; + const std::vector shape = item.shape(); + CHECK_EQ(shape.size(), 3); + const int& B = shape[0]; + const int& T = shape[1]; + const int& D = shape[2]; + CHECK(B == 1) << "Only support batch one."; + VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," + << D << ")"; + + const float* this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++) { + const float* cur = this_tensor_ptr + j * D; + kaldi::Vector out(D); + std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); + encoder_out->emplace_back(out); + } + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h new file mode 100644 index 00000000..23cc0ea3 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.h @@ -0,0 +1,132 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h +#pragma once + +#include "base/common.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "nnet/nnet_itf.h" +#include "paddle/extension.h" +#include "paddle/jit/all.h" +#include "paddle/phi/api/all.h" + +namespace ppspeech { + + +class U2NnetBase : public NnetBase { + public: + virtual int Context() const { return right_context_ + 1; } + virtual int RightContext() const { return right_context_; } + + virtual int EOS() const { return eos_; } + virtual int SOS() const { return sos_; } + virtual int IsBidecoder() const { return is_bidecoder_; } + // current offset in decoder frame + virtual int Offset() const { return offset_; } + virtual void SetChunkSize(int chunk_size) { chunk_size_ = chunk_size; } + virtual void SetNumLeftChunks(int num_left_chunks) { + num_left_chunks_ = num_left_chunks; + } + + virtual std::shared_ptr Copy() const = 0; + + protected: + virtual void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + const int32& feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) = 0; + + protected: + // model specification + int right_context_{0}; + + int sos_{0}; + int eos_{0}; + + bool is_bidecoder_{false}; + + int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming + // case. Otherwise, none streaming case + int num_left_chunks_{-1}; // -1 means all left chunks + + // asr decoder state, not used in nnet + int offset_{0}; // current offset in encoder output time stamp. Used by + // position embedding. 
+ std::vector> cached_feats_{}; // features cache +}; + + +class U2Nnet : public U2NnetBase { + public: + explicit U2Nnet(const ModelOptions& opts); + U2Nnet(const U2Nnet& other); + + void FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) override; + + void Reset() override; + + bool IsLogProb() override { return true; } + + void Dim(); + + void LoadModel(const std::string& model_path_w_prefix); + void Warmup(); + + std::shared_ptr model() const { return model_; } + + std::shared_ptr Copy() const override; + + void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + const int32& feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) override; + + float ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + + // debug + void FeedEncoderOuts(const paddle::Tensor& encoder_out); + + void EncoderOuts( + std::vector>* encoder_out) const; + + private: + ModelOptions opts_; + + phi::Place dev_; + std::shared_ptr model_{nullptr}; + std::vector encoder_outs_; + // transformer/conformer attention cache + paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + // conformer-only conv_module cache + paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + + paddle::jit::Function forward_encoder_chunk_; + paddle::jit::Function forward_attention_decoder_; + paddle::jit::Function ctc_activation_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc new file mode 100644 index 00000000..53fc5554 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "base/common.h" +#include "decoder/param.h" +#include "frontend/audio/assembler.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/u2_nnet.h" + + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); +DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK_GT(FLAGS_feature_rspecifier.size(), 0); + CHECK_GT(FLAGS_nnet_prob_wspecifier.size(), 0); + CHECK_GT(FLAGS_model_path.size(), 0); + LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; + LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; + LOG(INFO) << "model path: " << FLAGS_model_path; + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer( + FLAGS_nnet_encoder_outs_wspecifier); + + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + + int32 chunk_size = (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + + FLAGS_receptive_field_length; + int32 
chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + kaldi::Timer timer; + + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + int32 frame_idx = 0; + int vocab_dim = 0; + std::vector> prob_vec; + std::vector> encoder_out_vec; + int32 ori_feature_len = feature.NumRows(); + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; + + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + int32 this_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + this_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (this_chunk_size < receptive_field_length) { + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; + break; + } + + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < this_chunk_size; ++row_id) { + kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + 
raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // get nnet outputs + kaldi::Timer timer; + kaldi::Vector logprobs; + bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim); + CHECK(isok == true); + for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; + row_idx++) { + kaldi::Vector vec_tmp(vocab_dim); + std::memcpy(vec_tmp.Data(), + logprobs.Data() + row_idx * vocab_dim, + sizeof(kaldi::BaseFloat) * vocab_dim); + prob_vec.push_back(vec_tmp); + } + + VLOG(2) << "frame_idx: " << frame_idx + << " elapsed: " << timer.Elapsed() << " sec."; + } + + // get encoder out + decodable->Nnet()->EncoderOuts(&encoder_out_vec); + + // after process one utt, then reset decoder state. + decodable->Reset(); + + if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) { + // the TokenWriter can not write empty string. + ++num_err; + LOG(WARNING) << " the nnet prob/encoder_out of " << utt + << " is empty"; + continue; + } + + { + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix nnet_out(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + nnet_out(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, nnet_out); + } + + + { + // writer nnet encoder outs + kaldi::MatrixIndexT nrow = encoder_out_vec.size(); + kaldi::MatrixIndexT ncol = encoder_out_vec[0].Dim(); + LOG(INFO) << "nnet encoder outs shape: " << nrow << ", " << ncol; + kaldi::Matrix encoder_outs(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + encoder_outs(row_idx, col_idx) = + encoder_out_vec[row_idx](col_idx); + } + } + nnet_encoder_outs_writer.Write(utt, encoder_outs); + } + + ++num_done; + } + + + double elapsed 
= timer.Elapsed(); + LOG(INFO) << "Program cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt index 98b2f38b..71b33daa 100644 --- a/speechx/speechx/protocol/CMakeLists.txt +++ b/speechx/speechx/protocol/CMakeLists.txt @@ -1,3 +1 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - add_subdirectory(websocket) diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index c3454c39..cafbbec7 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,10 +1,8 @@ -project(websocket) - add_library(websocket STATIC websocket_server.cc websocket_client.cc ) -target_link_libraries(websocket PUBLIC frontend decoder nnet) +target_link_libraries(websocket PUBLIC frontend nnet decoder recognizer) add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc) target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h index 886da292..7ae6d98d 100644 --- a/speechx/speechx/protocol/websocket/websocket_client.h +++ b/speechx/speechx/protocol/websocket/websocket_client.h @@ -13,7 +13,6 @@ // limitations under the License. 
#include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" @@ -54,4 +53,4 @@ class WebSocketClient { websocket::stream ws_{ioc_}; std::unique_ptr t_{nullptr}; }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/protocol/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc index 7ad36e3a..7c5a4f2f 100644 --- a/speechx/speechx/protocol/websocket/websocket_client_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_client_main.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "websocket/websocket_client.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" +#include "websocket/websocket_client.h" DEFINE_string(host, "127.0.0.1", "host of websocket server"); DEFINE_int32(port, 8082, "port of websocket server"); diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 009fc42e..b0dcb3e3 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -15,14 +15,12 @@ #pragma once #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" - -#include "decoder/recognizer.h" #include "frontend/audio/feature_pipeline.h" +#include "recognizer/recognizer.h" namespace beast = boost::beast; // from namespace http = beast::http; // from diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 109da96b..5c32caf2 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ 
b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -12,16 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "websocket/websocket_server.h" #include "decoder/param.h" +#include "websocket/websocket_server.h" DEFINE_int32(port, 8082, "websocket listening port"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + return resource; +} + int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::WebSocketServer server(FLAGS_port, resource); LOG(INFO) << "Listening at port " << FLAGS_port; diff --git a/speechx/speechx/recognizer/CMakeLists.txt b/speechx/speechx/recognizer/CMakeLists.txt new file mode 100644 index 00000000..05078873 --- /dev/null +++ b/speechx/speechx/recognizer/CMakeLists.txt @@ -0,0 +1,45 @@ +set(srcs) + +if (USING_DS2) +list(APPEND srcs +recognizer.cc +) +endif() + +if (USING_U2) + list(APPEND srcs + u2_recognizer.cc + ) +endif() + +add_library(recognizer STATIC ${srcs}) +target_link_libraries(recognizer PUBLIC decoder) + +# test +if (USING_DS2) + set(BINS recognizer_main) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) 
+ set(TEST_BINS + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() + diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/recognizer/recognizer.cc similarity index 96% rename from speechx/speechx/decoder/recognizer.cc rename to speechx/speechx/recognizer/recognizer.cc index 44c3911c..c6631813 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/recognizer/recognizer.cc @@ -12,25 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" + namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; + Recognizer::Recognizer(const RecognizerResource& resource) { // resource_ = resource; const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + std::shared_ptr nnet(new PaddleNnet(resource.model_opts)); + BaseFloat ac_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale)); + decoder_.reset(new TLGDecoder(resource.tlg_opts)); + input_finished_ = false; } diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/recognizer/recognizer.h similarity index 64% rename from speechx/speechx/decoder/recognizer.h rename to speechx/speechx/recognizer/recognizer.h index 35e1e167..57d5bb36 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -20,21 +20,31 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" + +DECLARE_double(acoustic_scale); namespace ppspeech { struct RecognizerResource { - FeaturePipelineOptions feature_pipeline_opts; - ModelOptions model_opts; - TLGDecoderOptions tlg_opts; + kaldi::BaseFloat acoustic_scale{1.0}; + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale; - RecognizerResource() - : acoustic_scale(1.0), - feature_pipeline_opts(), - model_opts(), - tlg_opts() {} + + static RecognizerResource InitFromFlags() { + RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = + 
FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = true; + LOG(INFO) << "ds2 need fill zero be true: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; + resource.model_opts = ModelOptions::InitFromFlags(); + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; + } }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc similarity index 93% rename from speechx/speechx/decoder/recognizer_main.cc rename to speechx/speechx/recognizer/recognizer_main.cc index 23251353..cb0de2d6 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -12,21 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/recognizer.h" #include "decoder/param.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); + int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = + ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc new file mode 100644 index 00000000..382f622f --- /dev/null +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -0,0 +1,219 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "recognizer/u2_recognizer.h" + +#include "nnet/u2_nnet.h" + +namespace ppspeech { + +using kaldi::BaseFloat; +using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; +using std::unique_ptr; +using std::vector; + +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) + : opts_(resource) { + const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; + feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + + BaseFloat am_scale = resource.acoustic_scale; + decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + + CHECK_NE(resource.vocab_path, ""); + decoder_.reset(new CTCPrefixBeamSearch( + resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + + unit_table_ = decoder_->VocabTable(); + symbol_table_ = unit_table_; + + input_finished_ = false; + + Reset(); +} + +void U2Recognizer::Reset() { + global_frame_offset_ = 0; + num_frames_ = 0; + result_.clear(); + + decodable_->Reset(); + decoder_->Reset(); +} + +void U2Recognizer::ResetContinuousDecoding() { + global_frame_offset_ = num_frames_; + num_frames_ = 0; + result_.clear(); + + decodable_->Reset(); + decoder_->Reset(); +} + + +void U2Recognizer::Accept(const VectorBase& waves) { + feature_pipeline_->Accept(waves); +} + + +void U2Recognizer::Decode() { 
+ decoder_->AdvanceDecode(decodable_); + UpdateResult(false); +} + +void U2Recognizer::Rescoring() { + // Do attention Rescoring + kaldi::Timer timer; + AttentionRescoring(); + VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; +} + +void U2Recognizer::UpdateResult(bool finish) { + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); + + CHECK_EQ(hypotheses.size(), likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. So here we + // use time stamp of the input(e2e model unit), which is more accurate, + // and it requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); + + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); + + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); + + int start = + time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ + : 0; + if (j > 0) { + start = + (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? 
(time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = + (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } + } + + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } + + result_.emplace_back(path); + } + + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } +} + +void U2Recognizer::AttentionRescoring() { + decoder_->FinalizeSearch(); + UpdateResult(true); + + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + kaldi::Timer timer; + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score + << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight + << " ctc_weight: " << opts_.decoder_opts.ctc_weight; + result_[i].score = + opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + + VLOG(1) << "hyp: " << result_[0].sentence + << " score: " << 
result_[0].score; + } + + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(1) << "result: " << result_[0].sentence + << " score: " << result_[0].score; +} + +std::string U2Recognizer::GetFinalResult() { return result_[0].sentence; } + +std::string U2Recognizer::GetPartialResult() { return result_[0].sentence; } + +void U2Recognizer::SetFinished() { + feature_pipeline_->SetFinished(); + input_finished_ = true; +} + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h new file mode 100644 index 00000000..25850863 --- /dev/null +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "decoder/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "decoder/decoder_itf.h" +#include "frontend/audio/feature_pipeline.h" +#include "fst/fstlib.h" +#include "fst/symbol-table.h" +#include "nnet/decodable.h" + +DECLARE_int32(nnet_decoder_chunk); +DECLARE_int32(num_left_chunks); +DECLARE_double(ctc_weight); +DECLARE_double(rescoring_weight); +DECLARE_double(reverse_weight); +DECLARE_int32(nbest); +DECLARE_int32(blank); + +DECLARE_double(acoustic_scale); +DECLARE_string(vocab_path); + +namespace ppspeech { + +struct DecodeOptions { + // chunk_size is the frame number of one chunk after subsampling. + // e.g. if subsample rate is 4 and chunk_size = 16, the frames in + // one chunk are 67=16*4 + 3, stride is 64=16*4 + int chunk_size{16}; + int num_left_chunks{-1}; + + // final_score = rescoring_weight * rescoring_score + ctc_weight * + // ctc_score; + // rescoring_score = left_to_right_score * (1 - reverse_weight) + + // right_to_left_score * reverse_weight + // Please note the concept of ctc_scores + // in the following two search methods are different. For + // CtcPrefixBeamSearch, + // it's a sum(prefix) score + context score For CtcWfstBeamSearch, it's a + // max(viterbi) path score + context score So we should carefully set + // ctc_weight according to the search methods. 
+ float ctc_weight{0.0}; + float rescoring_weight{1.0}; + float reverse_weight{0.0}; + + // CtcEndpointConfig ctc_endpoint_opts; + CTCBeamSearchOptions ctc_prefix_search_opts{}; + + static DecodeOptions InitFromFlags() { + DecodeOptions decoder_opts; + decoder_opts.chunk_size = FLAGS_nnet_decoder_chunk; + decoder_opts.num_left_chunks = FLAGS_num_left_chunks; + decoder_opts.ctc_weight = FLAGS_ctc_weight; + decoder_opts.rescoring_weight = FLAGS_rescoring_weight; + decoder_opts.reverse_weight = FLAGS_reverse_weight; + decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank; + decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; + decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; + LOG(INFO) << "chunk_size: " << decoder_opts.chunk_size; + LOG(INFO) << "num_left_chunks: " << decoder_opts.num_left_chunks; + LOG(INFO) << "ctc_weight: " << decoder_opts.ctc_weight; + LOG(INFO) << "rescoring_weight: " << decoder_opts.rescoring_weight; + LOG(INFO) << "reverse_weight: " << decoder_opts.reverse_weight; + LOG(INFO) << "blank: " << FLAGS_blank; + LOG(INFO) << "first_beam_size: " << FLAGS_nbest; + LOG(INFO) << "second_beam_size: " << FLAGS_nbest; + return decoder_opts; + } +}; + +struct U2RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; + + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + DecodeOptions decoder_opts{}; + + static U2RecognizerResource InitFromFlags() { + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; + + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = false; + LOG(INFO) << "u2 need fill zero be false: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; + 
resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; + } +}; + + +class U2Recognizer { + public: + explicit U2Recognizer(const U2RecognizerResource& resouce); + void Reset(); + void ResetContinuousDecoding(); + + void Accept(const kaldi::VectorBase& waves); + void Decode(); + void Rescoring(); + + + std::string GetFinalResult(); + std::string GetPartialResult(); + + void SetFinished(); + bool IsFinished() { return input_finished_; } + + bool DecodedSomething() const { + return !result_.empty() && !result_[0].sentence.empty(); + } + + + int FrameShiftInMs() const { + // one decoder frame length in ms + return decodable_->Nnet()->SubsamplingRate() * + feature_pipeline_->FrameShift(); + } + + + const std::vector& Result() const { return result_; } + + private: + void AttentionRescoring(); + void UpdateResult(bool finish = false); + + private: + U2RecognizerResource opts_; + + // std::shared_ptr resource_; + // U2RecognizerResource resource_; + std::shared_ptr feature_pipeline_; + std::shared_ptr decodable_; + std::unique_ptr decoder_; + + // e2e unit symbol table + std::shared_ptr unit_table_ = nullptr; + std::shared_ptr symbol_table_ = nullptr; + + std::vector result_; + + // global decoded frame offset + int global_frame_offset_; + // cur decoded frame num + int num_frames_; + // timestamp gap between words in a sentence + const int time_stamp_gap_ = 100; + + bool input_finished_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc new file mode 100644 index 00000000..5cb8dbb1 --- /dev/null +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/param.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/table-types.h" +#include "recognizer/u2_recognizer.h" + +DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); +DEFINE_int32(sample_rate, 16000, "sample rate"); + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + double tot_wav_duration = 0.0; + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int sample_rate = FLAGS_sample_rate; + float streaming_chunk = FLAGS_streaming_chunk; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + ppspeech::U2RecognizerResource resource = + ppspeech::U2RecognizerResource::InitFromFlags(); + ppspeech::U2Recognizer recognizer(resource); + + kaldi::Timer timer; + for (; !wav_reader.Done(); wav_reader.Next()) { + kaldi::Timer local_timer; + std::string utt = wav_reader.Key(); + const 
kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; + double dur = wave_data.Duration(); + tot_wav_duration += dur; + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + int cnt = 0; + + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size); + + recognizer.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + recognizer.SetFinished(); + } + recognizer.Decode(); + if (recognizer.DecodedSomething()) { + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); + } + + // no overlap + sample_offset += cur_chunk_size; + cnt++; + } + CHECK(sample_offset == tot_samples); + + // second pass decoding + recognizer.Rescoring(); + + std::string result = recognizer.GetFinalResult(); + + recognizer.Reset(); + + if (result.empty()) { + // the TokenWriter can not write empty string. 
+ ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << utt << " " << result; + LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur + << " cost: " << local_timer.Elapsed(); + + result_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); + LOG(INFO) << "total cost:" << elapsed << " sec"; + LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; + LOG(INFO) << "RTF is: " << elapsed / tot_wav_duration; +} diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt index 95e86574..c1e875be 100644 --- a/speechx/speechx/utils/CMakeLists.txt +++ b/speechx/speechx/utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(utils file_utils.cc + math.cc ) \ No newline at end of file diff --git a/speechx/speechx/utils/file_utils.cc b/speechx/speechx/utils/file_utils.cc index e5943e31..c42a642c 100644 --- a/speechx/speechx/utils/file_utils.cc +++ b/speechx/speechx/utils/file_utils.cc @@ -40,4 +40,4 @@ std::string ReadFile2String(const std::string& path) { return std::string((std::istreambuf_iterator(input_file)), std::istreambuf_iterator()); } -} +} // namespace ppspeech diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/utils/file_utils.h index 8c56c02e..a471e024 100644 --- a/speechx/speechx/utils/file_utils.h +++ b/speechx/speechx/utils/file_utils.h @@ -20,4 +20,4 @@ bool ReadFileToVector(const std::string& filename, std::vector* data); std::string ReadFile2String(const std::string& path); -} +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc new file mode 100644 index 00000000..71656cb3 --- /dev/null +++ b/speechx/speechx/utils/math.cc @@ -0,0 +1,98 @@ + +// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "utils/math.h" + +#include +#include +#include +#include + +#include "base/common.h" + + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y) { + if (x <= -kBaseFloatMax) return y; + if (y <= -kBaseFloatMax) return x; + float max = std::max(x, y); + return max + std::log(std::exp(x - max) + std::exp(y - max)); +} + +// greater compare for smallest priority_queue +template +struct ValGreaterComp { + bool operator()(const std::pair& lhs, + const std::pair& rhs) const { + return lhs.first > rhs.first || + (lhs.first == rhs.first && lhs.second < rhs.second); + } +}; + +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); + + // smallest heap, (val, idx) + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++) { + smallest_heap.emplace_back(data[i], i); + } + + // smallest priority_queue + std::priority_queue, + std::vector>, + ValGreaterComp> + pq(ValGreaterComp(), std::move(smallest_heap)); + + // top k + for (int i = k; i < n; i++) { + if (pq.top().first < data[i]) { + pq.pop(); + pq.emplace(data[i], i); + } + } + + values->resize(min_k_n); + indices->resize(min_k_n); + + // from largest to smallest + int cur = values->size() - 1; + while (!pq.empty()) { + const auto& item = pq.top(); + + (*values)[cur] = item.first; + (*indices)[cur] = item.second; + + // 
item is a reference, must pop here + pq.pop(); + + cur--; + } +} + +template void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices); + +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h new file mode 100644 index 00000000..7c863b00 --- /dev/null +++ b/speechx/speechx/utils/math.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y); + +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices); + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/tools/clang-format.sh b/speechx/tools/clang-format.sh new file mode 100755 index 00000000..30f636ff --- /dev/null +++ b/speechx/tools/clang-format.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +find speechx -name '*.c' -o -name '*.h' -not -path "*kaldi*" | xargs -I{} clang-format -i {} diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh new file mode 100755 index 00000000..3952988c --- /dev/null +++ b/speechx/tools/venv.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -ex + +PYTHON=python3.7 +test -d venv || virtualenv -p ${PYTHON} venv