From 532b620454486d1b7b29f66ac1617ee8555d0006 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 03:17:28 +0000 Subject: [PATCH 01/60] refactor speechx cmake --- speechx/CMakeLists.txt | 44 +------------------ speechx/build.sh | 10 ++--- speechx/cmake/{external => }/absl.cmake | 0 speechx/cmake/{external => }/boost.cmake | 0 speechx/cmake/{external => }/eigen.cmake | 0 speechx/cmake/{external => }/gflags.cmake | 0 speechx/cmake/{external => }/glog.cmake | 0 speechx/cmake/{external => }/gtest.cmake | 0 speechx/cmake/{external => }/kenlm.cmake | 0 speechx/cmake/{external => }/libsndfile.cmake | 0 speechx/cmake/{external => }/openblas.cmake | 5 ++- speechx/cmake/{external => }/openfst.cmake | 0 speechx/cmake/paddleinference.cmake | 42 ++++++++++++++++++ 13 files changed, 50 insertions(+), 51 deletions(-) rename speechx/cmake/{external => }/absl.cmake (100%) rename speechx/cmake/{external => }/boost.cmake (100%) rename speechx/cmake/{external => }/eigen.cmake (100%) rename speechx/cmake/{external => }/gflags.cmake (100%) rename speechx/cmake/{external => }/glog.cmake (100%) rename speechx/cmake/{external => }/gtest.cmake (100%) rename speechx/cmake/{external => }/kenlm.cmake (100%) rename speechx/cmake/{external => }/libsndfile.cmake (100%) rename speechx/cmake/{external => }/openblas.cmake (92%) rename speechx/cmake/{external => }/openfst.cmake (100%) create mode 100644 speechx/cmake/paddleinference.cmake diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 4b5838e5..8307d992 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -13,7 +13,6 @@ set(CMAKE_CXX_STANDARD 14) set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) # Modules -list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}/external) list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}) include(FetchContent) include(ExternalProject) @@ -83,48 +82,7 @@ add_dependencies(openfst gflags glog) # paddle lib -set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) -set(paddle_PREFIX_DIR 
${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle - URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz - URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 - PREFIX ${paddle_PREFIX_DIR} - SOURCE_DIR ${paddle_SOURCE_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") - -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") - -##paddle with mkl -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") -include_directories("${MATH_LIB_PATH}/include") -set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") -include_directories("${MKLDNN_PATH}/include") -set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") - -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp - ${EXTERNAL_LIB}) - +include(paddleinference) ############################################################################### diff --git a/speechx/build.sh b/speechx/build.sh index a6eef656..e0a38675 100755 --- a/speechx/build.sh +++ 
b/speechx/build.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -xe # the build script had verified in the paddlepaddle docker image. # please follow the instruction below to install PaddlePaddle image. @@ -17,11 +18,6 @@ fi #rm -rf build mkdir -p build -cd build -cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -#cmake .. - -make -j - -cd - +cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} +cmake --build build diff --git a/speechx/cmake/external/absl.cmake b/speechx/cmake/absl.cmake similarity index 100% rename from speechx/cmake/external/absl.cmake rename to speechx/cmake/absl.cmake diff --git a/speechx/cmake/external/boost.cmake b/speechx/cmake/boost.cmake similarity index 100% rename from speechx/cmake/external/boost.cmake rename to speechx/cmake/boost.cmake diff --git a/speechx/cmake/external/eigen.cmake b/speechx/cmake/eigen.cmake similarity index 100% rename from speechx/cmake/external/eigen.cmake rename to speechx/cmake/eigen.cmake diff --git a/speechx/cmake/external/gflags.cmake b/speechx/cmake/gflags.cmake similarity index 100% rename from speechx/cmake/external/gflags.cmake rename to speechx/cmake/gflags.cmake diff --git a/speechx/cmake/external/glog.cmake b/speechx/cmake/glog.cmake similarity index 100% rename from speechx/cmake/external/glog.cmake rename to speechx/cmake/glog.cmake diff --git a/speechx/cmake/external/gtest.cmake b/speechx/cmake/gtest.cmake similarity index 100% rename from speechx/cmake/external/gtest.cmake rename to speechx/cmake/gtest.cmake diff --git a/speechx/cmake/external/kenlm.cmake b/speechx/cmake/kenlm.cmake similarity index 100% rename from speechx/cmake/external/kenlm.cmake rename to speechx/cmake/kenlm.cmake diff --git a/speechx/cmake/external/libsndfile.cmake b/speechx/cmake/libsndfile.cmake similarity index 100% rename from speechx/cmake/external/libsndfile.cmake rename to speechx/cmake/libsndfile.cmake diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/openblas.cmake similarity index 92% rename from 
speechx/cmake/external/openblas.cmake rename to speechx/cmake/openblas.cmake index 5c196527..a8c3dd2d 100644 --- a/speechx/cmake/external/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -43,6 +43,7 @@ ExternalProject_Add( # https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +message(STATUS "OPENBLAS install dir: ${INSTALL_DIR}") set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) add_library(openblas STATIC IMPORTED) add_dependencies(openblas OPENBLAS) @@ -55,4 +56,6 @@ set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_P # ${CMAKE_INSTALL_LIBDIR} lib # ${CMAKE_INSTALL_INCLUDEDIR} include link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) -include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file +# include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) +# fix for can not find `cblas.h` +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/openblas) \ No newline at end of file diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/openfst.cmake similarity index 100% rename from speechx/cmake/external/openfst.cmake rename to speechx/cmake/openfst.cmake diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake new file mode 100644 index 00000000..d1f71f00 --- /dev/null +++ b/speechx/cmake/paddleinference.cmake @@ -0,0 +1,42 @@ +set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) +set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) +ExternalProject_Add(paddle + URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz + URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 + PREFIX ${paddle_PREFIX_DIR} + SOURCE_DIR ${paddle_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" +) + 
+set(PADDLE_LIB ${fc_patch}/paddle-lib) +include_directories("${PADDLE_LIB}/paddle/include") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") +link_directories("${PADDLE_LIB}/paddle/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") + +##paddle with mkl +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") +include_directories("${MATH_LIB_PATH}/include") +set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) +set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") +include_directories("${MKLDNN_PATH}/include") +set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +# global vars +set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE FORCE "DEPS") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash cryptopp + ${EXTERNAL_LIB} CACHE FORCE "DEPS") From b621b5b97489a76d5d48b0eb3900a955b7eefa11 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 06:05:35 +0000 Subject: [PATCH 02/60] add math and macros --- speechx/cmake/paddleinference.cmake | 7 ++- speechx/speechx/base/common.h | 3 + speechx/speechx/base/macros.h | 9 ++- speechx/speechx/decoder/CMakeLists.txt | 1 + speechx/speechx/decoder/param.h | 5 +- speechx/speechx/utils/math.cc | 82 ++++++++++++++++++++++++++ speechx/speechx/utils/math.h | 28 +++++++++ 7 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 
speechx/speechx/utils/math.cc create mode 100644 speechx/speechx/utils/math.h diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index d1f71f00..957e423c 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -22,6 +22,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") ##paddle with mkl set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") @@ -29,14 +30,16 @@ set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") include_directories("${MATH_LIB_PATH}/include") set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") include_directories("${MKLDNN_PATH}/include") set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) set(EXTERNAL_LIB "-lrt -ldl -lpthread") # global vars -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE FORCE "DEPS") +set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags protobuf xxhash cryptopp - ${EXTERNAL_LIB} CACHE FORCE "DEPS") + ${EXTERNAL_LIB} CACHE INTERNAL "deps") +message(STATUS "Deps libraries: ${DEPS}") diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index a9303cbb..778c06d7 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -38,3 +38,6 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" + +#include "utils/file_utils.h" +#include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 
d7d5a78d..14332a80 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + namespace ppspeech { #ifndef DISALLOW_COPY_AND_ASSIGN @@ -22,4 +25,8 @@ namespace ppspeech { void operator=(const TypeName&) = delete #endif -} // namespace pp_speech \ No newline at end of file +constexpr float kFloatMax = std::numeric_limits::max(); + +const std::string kSpaceSymbol = "\xe2\x96\x81"; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 1df93511..0383c3ea 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -18,6 +18,7 @@ set(BINS tlg_decoder_main ) +message(STATUS "xxxxxxxxxx: " ${DEPS}) foreach(bin_name IN LISTS BINS) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index d6ee2705..ed895aed 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -56,7 +56,9 @@ DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); + namespace ppspeech { + // todo refactor later FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; @@ -115,4 +117,5 @@ RecognizerResource InitRecognizerResoure() { resource.tlg_opts = InitDecoderOptions(); return resource; } -} + +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc new file mode 100644 index 00000000..fe5c7118 --- /dev/null +++ b/speechx/speechx/utils/math.cc @@ -0,0 +1,82 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "utils/math.h" + +#include "base/common.h" + +#include +#include +#include +#include + + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y) { + if (x <= -kFloatMax) return y; + if (y <= -kFloatMax) return x; + float max = std::max(x, y); + return max + std::log(std::exp(x - max) + std::exp(y - max)); +} + +// greater compare for smallest priority_queue +template +struct ValGreaterComp { + bool operator()(const std::pair& lhs, const std::pair& rhs) const { + return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); + } +} + +template +void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); + + // smallest heap, (val, idx) + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++){ + smallest_heap.emplace_back(data[i], i); + } + + // smallest priority_queue + std::priority_queue, std::vector>, ValGreaterComp> pq(ValGreaterComp(), std::move(smallest_heap)); + + // top k + for (int i = k ; i < n; i++){ + if (pq.top().first < data[i]){ + pq.pop(); + pq.emplace_back(data[i], i); + } + } + + values->resize(min_k_n); + indices->resize(min_k_n); + + // from largest to samllest + int cur = values->size() - 1; + while(!pq.empty()){ + const auto& item = pq.top(); + pq.pop(); + + (*values)[cur] = item.first; + (*indices)[cur] = item.second; + + cur--; + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h new file mode 
100644 index 00000000..452bf089 --- /dev/null +++ b/speechx/speechx/utils/math.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y); + +template +void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices); + +} // namespace ppspeech \ No newline at end of file From 75c578804d6738b40a644a1c38c18a40f0252eed Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 07:54:32 +0000 Subject: [PATCH 03/60] using FetchContent_Declare for paddleinference --- speechx/cmake/openblas.cmake | 4 ++-- speechx/cmake/paddleinference.cmake | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/speechx/cmake/openblas.cmake b/speechx/cmake/openblas.cmake index a8c3dd2d..27e13207 100644 --- a/speechx/cmake/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) -set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) +set(OpenBLAS_SOURCE_DIR ${fc_patch}/openblas-src) +set(OpenBLAS_PREFIX ${fc_patch}/openblas-prefix) # ###################################################################################################################### # OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 diff --git 
a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index 957e423c..311804d6 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -1,6 +1,18 @@ set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle +# ExternalProject_Add(paddle +# URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz +# URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 +# PREFIX ${paddle_PREFIX_DIR} +# SOURCE_DIR ${paddle_SOURCE_DIR} +# CONFIGURE_COMMAND "" +# BUILD_COMMAND "" +# INSTALL_COMMAND "" +# ) + +include(FetchContent) +FetchContent_Declare( + paddle URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 PREFIX ${paddle_PREFIX_DIR} @@ -9,10 +21,11 @@ ExternalProject_Add(paddle BUILD_COMMAND "" INSTALL_COMMAND "" ) +FetchContent_MakeAvailable(paddle) + +set(PADDLE_LIB_THIRD_PARTY_PATH "${paddle_SOURCE_DIR}/third_party/install/") -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${paddle_SOURCE_DIR}/paddle/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") @@ -20,7 +33,7 @@ include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") +link_directories("${paddle_SOURCE_DIR}/paddle/lib") 
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") @@ -37,7 +50,7 @@ set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) set(EXTERNAL_LIB "-lrt -ldl -lpthread") # global vars -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") +set(DEPS ${paddle_SOURCE_DIR}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags protobuf xxhash cryptopp From e1fc57deb1454c926c8925fba040ada210183168 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 9 Oct 2022 09:29:37 +0000 Subject: [PATCH 04/60] add math and rename ds2 nnet --- speechx/speechx/base/common.h | 5 +++ .../ctc_prefix_beam_search_decoder_main.cc | 2 +- speechx/speechx/decoder/recognizer.h | 2 +- speechx/speechx/decoder/tlg_decoder_main.cc | 2 +- speechx/speechx/nnet/CMakeLists.txt | 8 ++-- .../nnet/{paddle_nnet.cc => ds2_nnet.cc} | 2 +- .../nnet/{paddle_nnet.h => ds2_nnet.h} | 0 ...{nnet_forward_main.cc => ds2_nnet_main.cc} | 2 +- speechx/speechx/utils/math.cc | 37 ++++++++++++------- speechx/speechx/utils/math.h | 9 +++-- 10 files changed, 42 insertions(+), 27 deletions(-) rename speechx/speechx/nnet/{paddle_nnet.cc => ds2_nnet.cc} (99%) rename speechx/speechx/nnet/{paddle_nnet.h => ds2_nnet.h} (100%) rename speechx/speechx/nnet/{nnet_forward_main.cc => ds2_nnet_main.cc} (99%) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 778c06d7..dfb14885 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -14,19 +14,24 @@ #pragma once +#include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc 
index 7cfee06c..e4e5c2af 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 35e1e167..e47ca433 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -20,7 +20,7 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc index b175ed13..93f84da3 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/tlg_decoder_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index c325ce75..565bba3e 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -2,13 +2,11 @@ project(nnet) add_library(nnet STATIC decodable.cc - paddle_nnet.cc + ds2_nnet.cc ) target_link_libraries(nnet absl::strings) -set(bin_name nnet_forward_main) +set(bin_name ds2_nnet_main) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} 
${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) - - +target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) \ No newline at end of file diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc similarity index 99% rename from speechx/speechx/nnet/paddle_nnet.cc rename to speechx/speechx/nnet/ds2_nnet.cc index 881a82f5..a89c0f20 100644 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" #include "absl/strings/str_split.h" namespace ppspeech { diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/ds2_nnet.h similarity index 100% rename from speechx/speechx/nnet/paddle_nnet.h rename to speechx/speechx/nnet/ds2_nnet.h diff --git a/speechx/speechx/nnet/nnet_forward_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc similarity index 99% rename from speechx/speechx/nnet/nnet_forward_main.cc rename to speechx/speechx/nnet/ds2_nnet_main.cc index 0d4ea8ff..e2904208 100644 --- a/speechx/speechx/nnet/nnet_forward_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "nnet/ds2_nnet.h" #include "base/flags.h" #include "base/log.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index fe5c7118..7c319295 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -1,4 +1,5 @@ +// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,10 +18,10 @@ #include "base/common.h" -#include #include -#include +#include #include +#include namespace ppspeech { @@ -36,28 +37,36 @@ float LogSumExp(float x, float y) { // greater compare for smallest priority_queue template struct ValGreaterComp { - bool operator()(const std::pair& lhs, const std::pair& rhs) const { - return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); + bool operator()(const std::pair& lhs, + const std::pair& rhs) const { + return lhs.first > rhs.first || + (lhs.first == rhs.first && lhs.second < rhs.second); } } -template -void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices) { - int n = data.size(); - int min_k_n = std::min(k, n); +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); // smallest heap, (val, idx) - std::vector> smallest_heap; - for (int i = 0; i < min_k_n; i++){ + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++) { smallest_heap.emplace_back(data[i], i); } // smallest priority_queue - std::priority_queue, std::vector>, ValGreaterComp> pq(ValGreaterComp(), std::move(smallest_heap)); + 
std::priority_queue, + std::vector>, + ValGreaterComp> + pq(ValGreaterComp(), std::move(smallest_heap)); // top k - for (int i = k ; i < n; i++){ - if (pq.top().first < data[i]){ + for (int i = k; i < n; i++) { + if (pq.top().first < data[i]) { pq.pop(); pq.emplace_back(data[i], i); } @@ -68,7 +77,7 @@ void TopK(const std::vector& data, int32_t k, std::vector* values, std::ve // from largest to samllest int cur = values->size() - 1; - while(!pq.empty()){ + while (!pq.empty()) { const auto& item = pq.top(); pq.pop(); diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h index 452bf089..7c863b00 100644 --- a/speechx/speechx/utils/math.h +++ b/speechx/speechx/utils/math.h @@ -14,15 +14,18 @@ #pragma once -#include #include +#include namespace ppspeech { // Sum in log scale float LogSumExp(float x, float y); -template -void TopK(const std::vector& data, int32_t k, std::vector* values, std::vector* indices); +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices); } // namespace ppspeech \ No newline at end of file From 290c23b9d72b82785aba0e3fe010e461adba9888 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 10 Oct 2022 08:39:51 +0000 Subject: [PATCH 05/60] add u2 nnet, u2 nnet main, codelab, and can compile --- examples/wenetspeech/asr1/local/test_wav.sh | 1 + paddlespeech/s2t/exps/u2/bin/test_wav.py | 9 + speechx/.clang-format | 29 + speechx/CMakeLists.txt | 41 +- speechx/README.md | 12 +- speechx/cmake/gflags.cmake | 5 +- speechx/cmake/gtest.cmake | 4 +- speechx/examples/codelab/feat/.gitignore | 2 + speechx/examples/codelab/feat/path.sh | 4 +- speechx/examples/codelab/feat/run.sh | 6 + speechx/examples/codelab/nnet/path.sh | 2 +- speechx/examples/codelab/u2nnet/.gitignore | 3 + speechx/examples/codelab/u2nnet/README.md | 3 + speechx/examples/codelab/u2nnet/path.sh | 19 + speechx/examples/codelab/u2nnet/run.sh | 59 ++ speechx/examples/codelab/u2nnet/valgrind.sh | 21 + 
speechx/examples/u2pp_ol/README.md | 5 + speechx/speechx/base/common.h | 1 + .../codelab/glog/glog_logtostderr_main.cc | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 3 + speechx/speechx/decoder/CMakeLists.txt | 1 - .../ctc_prefix_beam_search_decoder_main.cc | 3 + .../decoder/nnet_logprob_decoder_main.cc | 3 + speechx/speechx/decoder/recognizer_main.cc | 3 + speechx/speechx/decoder/tlg_decoder_main.cc | 3 + speechx/speechx/frontend/audio/CMakeLists.txt | 2 - .../frontend/audio/cmvn_json2kaldi_main.cc | 3 + .../frontend/audio/compute_fbank_main.cc | 67 +- .../audio/compute_linear_spectrogram_main.cc | 3 + speechx/speechx/model/CMakeLists.txt | 0 speechx/speechx/nnet/CMakeLists.txt | 46 +- speechx/speechx/nnet/decodable.cc | 32 +- speechx/speechx/nnet/decodable.h | 21 +- speechx/speechx/nnet/ds2_nnet_main.cc | 8 +- speechx/speechx/nnet/u2_nnet.cc | 706 ++++++++++++++++++ speechx/speechx/nnet/u2_nnet.h | 157 ++++ speechx/speechx/nnet/u2_nnet_main.cc | 180 +++++ speechx/speechx/protocol/CMakeLists.txt | 2 - .../speechx/protocol/websocket/CMakeLists.txt | 2 +- speechx/speechx/utils/CMakeLists.txt | 1 + speechx/speechx/utils/math.cc | 4 +- speechx/tools/venv.sh | 5 + 42 files changed, 1425 insertions(+), 58 deletions(-) create mode 100644 speechx/.clang-format create mode 100644 speechx/examples/codelab/feat/.gitignore create mode 100644 speechx/examples/codelab/u2nnet/.gitignore create mode 100644 speechx/examples/codelab/u2nnet/README.md create mode 100644 speechx/examples/codelab/u2nnet/path.sh create mode 100755 speechx/examples/codelab/u2nnet/run.sh create mode 100755 speechx/examples/codelab/u2nnet/valgrind.sh create mode 100644 speechx/examples/u2pp_ol/README.md delete mode 100644 speechx/speechx/model/CMakeLists.txt create mode 100644 speechx/speechx/nnet/u2_nnet.cc create mode 100644 speechx/speechx/nnet/u2_nnet.h create mode 100644 speechx/speechx/nnet/u2_nnet_main.cc create mode 100755 speechx/tools/venv.sh diff --git 
a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 47464262..c3a17f49 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -42,6 +42,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ + --debug True \ --ngpu ${ngpu} \ --config ${config_path} \ --decode_cfg ${decode_config_path} \ diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 2e067ab6..67ef2e53 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -16,6 +16,8 @@ import os import sys from pathlib import Path +import distutils +import numpy as np import paddle import soundfile from yacs.config import CfgNode @@ -74,6 +76,8 @@ class U2Infer(): # fbank feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + if self.args.debug: + np.savetxt("feat.transform.txt", feat) ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) @@ -125,6 +129,11 @@ if __name__ == "__main__": "--result_file", type=str, help="path of save the asr result") parser.add_argument( "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/speechx/.clang-format b/speechx/.clang-format new file mode 100644 index 00000000..af946a4a --- /dev/null +++ b/speechx/.clang-format @@ -0,0 +1,29 @@ +# This file is used by clang-format to autoformat paddle source code +# +# The clang-format is part of llvm toolchain. +# It need to install llvm and clang to format source code style. 
+# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. +# The -i means inplace change. +# +# The document of clang-format is +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 4 +TabWidth: 4 +ContinuationIndentWidth: 4 +MaxEmptyLinesToKeep: 2 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false +... + diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 8307d992..17e64c04 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -31,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall ############################################################################### # Option Configurations ############################################################################### -# option configurations option(TEST_DEBUG "option for debug" OFF) +option(USE_PROFILING "enable c++ profling" OFF) +option(USING_U2 "compile u2 model." ON) +option(USING_DS2 "compile with ds2 model." ON) + +option(USING_GPU "u2 compute on GPU." 
OFF) ############################################################################### # Include third party @@ -85,6 +89,41 @@ add_dependencies(openfst gflags glog) include(paddleinference) +# paddle core.so +find_package(Threads REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) +find_package(pybind11 CONFIG) + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") +message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") + +# paddle include and link option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')" + OUTPUT_VARIABLE PADDLE_LINK_FLAGS + RESULT_VARIABLE SUCESS) + +message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) +string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) + +# paddle compile option +execute_process( + COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_compile_flags()), end='')" + OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) +message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) +string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) + + +# for LD_LIBRARY_PATH +# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) +execute_process( + COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')" + OUTPUT_VARIABLE PADDLE_LIB_DIRS) +message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) + + ############################################################################### # Add local library ############################################################################### diff --git a/speechx/README.md b/speechx/README.md index cd1cd62c..cc7b13e6 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -3,11 +3,14 @@ ## Environment We 
develop under: +* python - 3.7 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7` * os - Ubuntu 16.04.7 LTS * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 +> Please using `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. + > We make sure all things work fun under docker, and recommend using it to develop and deploy. * [How to Install Docker](https://docs.docker.com/engine/install/) @@ -24,13 +27,16 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). +2. Create python environment. -2. Build `speechx` and `examples`. +``` +bash tools/venv.sh +``` -> Do not source venv. +2. Build `speechx` and `examples`. ``` -pushd /path/to/speechx +source venv/bin/activate ./build.sh ``` diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake index 66ae47f7..36bebc87 100644 --- a/speechx/cmake/gflags.cmake +++ b/speechx/cmake/gflags.cmake @@ -2,10 +2,9 @@ include(FetchContent) FetchContent_Declare( gflags - URL https://github.com/gflags/gflags/archive/v2.2.1.zip - URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a + URL https://github.com/gflags/gflags/archive/v2.2.2.zip + URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5 ) - FetchContent_MakeAvailable(gflags) # openfst need diff --git a/speechx/cmake/gtest.cmake b/speechx/cmake/gtest.cmake index 7fe397fc..1ea8ed0b 100644 --- a/speechx/cmake/gtest.cmake +++ b/speechx/cmake/gtest.cmake @@ -1,8 +1,8 @@ include(FetchContent) FetchContent_Declare( gtest - URL https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 + URL https://github.com/google/googletest/archive/release-1.11.0.zip + URL_HASH 
SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a ) FetchContent_MakeAvailable(gtest) diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore new file mode 100644 index 00000000..bbd86a25 --- /dev/null +++ b/speechx/examples/codelab/feat/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh index 3b89d01e..9d229174 100644 --- a/speechx/examples/codelab/feat/path.sh +++ b/speechx/examples/codelab/feat/path.sh @@ -1,12 +1,12 @@ # This contains the locations of binarys build required for running the examples. SPEECHX_ROOT=$PWD/../../../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 1fa37f98..66bd8ae2 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -54,4 +54,10 @@ compute_linear_spectrogram_main \ --cmvn_file=$exp_dir/cmvn.ark echo "compute linear spectrogram feature." +compute_fbank_main \ + --num_bins 161 \ + --wav_rspecifier=scp:$data_dir/wav.scp \ + --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ + --cmvn_file=$exp_dir/cmvn.ark +echo "compute fbank feature." 
diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh index 7d395d64..11c8aef8 100644 --- a/speechx/examples/codelab/nnet/path.sh +++ b/speechx/examples/codelab/nnet/path.sh @@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx SPEECHX_TOOLS=$SPEECHX_ROOT/tools TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } export LC_AL=C diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore new file mode 100644 index 00000000..d6fe69bc --- /dev/null +++ b/speechx/examples/codelab/u2nnet/.gitignore @@ -0,0 +1,3 @@ +data +exp +*log diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md new file mode 100644 index 00000000..772a58f0 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/README.md @@ -0,0 +1,3 @@ +# Deepspeech2 Streaming NNet Test + +Using for ds2 streaming nnet inference test. diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2nnet/path.sh new file mode 100644 index 00000000..564e9fed --- /dev/null +++ b/speechx/examples/codelab/u2nnet/path.sh @@ -0,0 +1,19 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. 
please ensure that the project build successfully"; } + +export LC_AL=C + +SPEECHX_BIN=$SPEECHX_BUILD/nnet +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN + +PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh new file mode 100755 index 00000000..b309bc6f --- /dev/null +++ b/speechx/examples/codelab/u2nnet/run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + +# 2. download model +if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p data/model + pushd data/model + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + popd +fi + +# produce wav scp +if [ ! -f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false +echo "convert json cmvn to kaldi ark." + +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark +echo "compute fbank feature." 
+ +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_prob_wspecifier=ark,t:$exp/probs.ark diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh new file mode 100755 index 00000000..a5aab663 --- /dev/null +++ b/speechx/examples/codelab/u2nnet/valgrind.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# this script is for memory check, so please run ./run.sh first. + +set +x +set -e + +. ./path.sh + +if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then + echo "please install valgrind in the speechx tools dir.\n" + exit 1 +fi + +ckpt_dir=./data/model +model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ + +valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ + ds2_model_test_main \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdparams diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md new file mode 100644 index 00000000..ce01a8fc --- /dev/null +++ b/speechx/examples/u2pp_ol/README.md @@ -0,0 +1,5 @@ +# U2/U2++ Streaming ASR + +## Examples + +* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing. diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index dfb14885..90fc96a1 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc index b0616a7d..c891827a 100644 --- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc +++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc @@ -17,7 +17,7 @@ int main(int argc, char* argv[]) { // Initialize Google’s logging library. 
google::InitGoogleLogging(argv[0]); - + google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; LOG(INFO) << "Found " << 10 << " cookies"; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 283466dc..7d99e857 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -195,8 +195,11 @@ void model_forward_test() { } int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; model_forward_test(); return 0; diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 0383c3ea..1df93511 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -18,7 +18,6 @@ set(BINS tlg_decoder_main ) -message(STATUS "xxxxxxxxxx: " ${DEPS}) foreach(bin_name IN LISTS BINS) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index e4e5c2af..445f470f 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -53,8 +53,11 @@ using std::vector; // test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; CHECK(FLAGS_result_wspecifier != ""); CHECK(FLAGS_feature_rspecifier != ""); diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc 
b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc index 0e249cc6..e0acbe77 100644 --- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc +++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc @@ -30,8 +30,11 @@ using std::vector; // test decoder by feeding nnet posterior probability int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader likelihood_reader( FLAGS_nnet_prob_respecifier); diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 23251353..05026646 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -23,8 +23,11 @@ DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); ppspeech::Recognizer recognizer(resource); diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc index 93f84da3..b633022a 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/tlg_decoder_main.cc @@ -55,8 +55,11 @@ using std::vector; // test TLG decoder by feeding speech feature. 
int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 8ae63256..050d78be 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -1,5 +1,3 @@ -project(frontend) - add_library(frontend STATIC cmvn.cc db_norm.cc diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 0def1466..93bad688 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)"); using namespace boost::json; // from int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; LOG(INFO) << "cmvn josn path: " << FLAGS_json_file; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index f7a42315..93a6d407 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -32,13 +32,21 @@ DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(num_bins, 161, "fbank num bins"); +DEFINE_int32(sample_rate, 16000, "sampe rate: 16k, 8k."); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); 
gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + CHECK(FLAGS_wav_rspecifier.size() > 0); + CHECK(FLAGS_feature_wspecifier.size() > 0); kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); + kaldi::SequentialTableReader wav_info_reader( + FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); int32 num_done = 0, num_err = 0; @@ -54,6 +62,10 @@ int main(int argc, char* argv[]) { opt.frame_opts.frame_shift_ms = 10; opt.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0; + LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms; + LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins; + LOG(INFO) << "dither: " << opt.frame_opts.dither; std::unique_ptr fbank( new ppspeech::Fbank(opt, std::move(data_source))); @@ -61,53 +73,73 @@ int main(int argc, char* argv[]) { std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank))); - ppspeech::FeatureCacheOptions feat_cache_opts; // the feature cache output feature chunk by chunk. 
+ ppspeech::FeatureCacheOptions feat_cache_opts; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); - int sample_rate = 16000; + float streaming_chunk = FLAGS_streaming_chunk; - int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; - LOG(INFO) << "chunk size (s): " << streaming_chunk; + int chunk_sample_size = streaming_chunk * FLAGS_sample_rate; + LOG(INFO) << "sr: " << FLAGS_sample_rate; + LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); + for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) { + const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); - LOG(INFO) << "process utt: " << utt; + const std::string& utt2 = wav_info_reader.Key(); + const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); + + CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!"; + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "samples: " << wave_info.SampleCount(); + LOG(INFO) << "dur: " << wave_info.Duration() << " sec"; + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + + // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); + + // compute feat chunk by chunk int tot_samples = waveform.Dim(); - LOG(INFO) << "wav len (sample): " << tot_samples; - int sample_offset = 0; std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { + // cur chunk size int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + // get chunk wav kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = 
waveform(sample_offset + i); } - kaldi::Vector features; + // compute feat feature_cache.Accept(wav_chunk); + + // send finish signal if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } + + // read feat + kaldi::Vector features; bool flag = true; do { flag = feature_cache.Read(&features); - feats.push_back(features); - feature_rows += features.Dim() / feature_cache.Dim(); + if (flag && features.Dim() != 0) { + feats.push_back(features); + feature_rows += features.Dim() / feature_cache.Dim(); + } } while (flag == true && features.Dim() != 0); + + // forward offset sample_offset += cur_chunk_size; } @@ -125,14 +157,19 @@ int main(int argc, char* argv[]) { ++cur_idx; } } + LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols(); feat_writer.Write(utt, features); + + // reset frontend pipeline state feature_cache.Reset(); if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 
0 : 1); } diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 162c3529..889f5663 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -31,8 +31,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 565bba3e..2a1812fd 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -1,12 +1,40 @@ -project(nnet) +set(srcs decodable.cc) -add_library(nnet STATIC - decodable.cc - ds2_nnet.cc -) +if(USING_DS2) + list(APPEND srcs ds2_nnet.cc) +endif() + +if(USING_U2) + list(APPEND srcs u2_nnet.cc) +endif() + +add_library(nnet STATIC ${srcs}) target_link_libraries(nnet absl::strings) -set(bin_name ds2_nnet_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) \ No newline at end of file +if(USING_U2) + target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) + target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() + + +if(USING_DS2) + set(bin_name 
ds2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_link_libraries(${bin_name} ${DEPS}) +endif() + +# test bin +if(USING_U2) + set(bin_name u2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 465f64a9..7780e5ae 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -30,6 +30,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, frames_ready_(0), acoustic_scale_(acoustic_scale) {} +// for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { nnet_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); @@ -41,6 +42,7 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { // return the size of frame have computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } + // frame idx is from 0 to frame_ready_ -1; bool Decodable::IsLastFrame(int32 frame) { bool flag = EnsureFrameHaveComputed(frame); @@ -72,26 +74,38 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { + // no feat or frontend_ not init. 
return false; } - int32 nnet_dim = 0; - Vector inferences; - nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim); - nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim); - nnet_cache_.CopyRowsFromVec(inferences); + // forward feats + int32 vocab_dim = 0; + Vector probs; + nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + + // cache nnet outupts + nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_cache_.CopyRowsFromVec(probs); + + // update state frame_offset_ = frames_ready_; frames_ready_ += nnet_cache_.NumRows(); return true; } +// read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { - std::vector result; - if (EnsureFrameHaveComputed(frame) == false) return false; - likelihood->resize(nnet_cache_.NumCols()); - for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + int vocab_size = nnet_cache_.NumCols(); + likelihood->resize(vocab_size); + + for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; } diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 9555fea7..241d0419 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -27,35 +27,54 @@ class Decodable : public kaldi::DecodableInterface { explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale = 1.0); + // void Init(DecodableOpts config); + + // nnet logprob output virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); + virtual bool IsLastFrame(int32 frame); + + // nnet output dim, e.g. 
vocab size virtual int32 NumIndices() const; - // not logprob + + // nnet prob output virtual bool FrameLikelihood(int32 frame, std::vector* likelihood); + virtual int32 NumFramesReady() const; + // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); + void Reset(); + bool IsInputFinished() const { return frontend_->IsFinished(); } + bool EnsureFrameHaveComputed(int32 frame); + int32 TokenId2NnetId(int32 token_id); private: bool AdvanceChunk(); + std::shared_ptr frontend_; std::shared_ptr nnet_; + + // nnet outputs' cache kaldi::Matrix nnet_cache_; + // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame // eg: 35 frame features output 8 frame inferences int32 frame_offset_; int32 frames_ready_; + // todo: feature frame mismatch with nnet inference frame // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index e2904208..943d7e5f 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -13,8 +13,7 @@ // limitations under the License. 
#include "nnet/ds2_nnet.h" -#include "base/flags.h" -#include "base/log.h" +#include "base/common.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" @@ -49,8 +48,11 @@ using kaldi::Matrix; using std::vector; int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); @@ -146,7 +148,7 @@ int main(int argc, char* argv[]) { } kaldi::Matrix result(prob_vec.size(), prob_vec[0].Dim()); - for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { + for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) { result(row_idx, col_idx) = prob_vec[row_idx](col_idx); } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc new file mode 100644 index 00000000..67ef0952 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -0,0 +1,706 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "nnet/u2_nnet.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif // end USE_PROFILING + +namespace ppspeech { + +int U2NnetBase::num_frames_for_chunk(bool start) const { + int num_needed_frames = 0; // num feat frames + bool first = !start; // start == false is first + + if (chunk_size_ > 0) { + // streaming mode + if (first) { + // first chunk + // 1 decoder frame need `context` feat frames + int context = this->context(); + num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context; + } else { + // after first chunk, we need stride this num frames. + num_needed_frames = chunk_size_ * subsampling_rate_; + } + } else { + // non-streaming mode. feed all feats once. + num_needed_frames = std::numeric_limits::max(); + } + + return num_needed_frames; +} + +// cache feats for next chunk +void U2NnetBase::CacheFeature(const std::vector& chunk_feats, + int32 feat_dim) { + // chunk_feats is nframes*feat_dim + const int chunk_size = chunk_feats.size() / feat_dim; + const int cached_feat_size = this->context() - subsampling_rate_; + if (chunk_size >= cached_feat_size) { + cached_feats_.resize(cached_feat_size); + for (int i = 0; i < cached_feat_size; ++i) { + auto start = + chunk_feats.begin() + chunk_size - cached_feat_size + i; + auto end = start + feat_dim; + cached_feats_[i] = std::vector(start, end); + } + } +} + +void U2NnetBase::ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) { + ctc_probs->clear(); + // int num_frames = cached_feats_.size() + chunk_feats.size(); + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "foward encoder chunk: " << num_frames << " frames"; + VLOG(3) << "context: " << this->context() << " frames"; + + if (num_frames >= this->context()) { + this->ForwardEncoderChunkImpl( + chunk_feats, feat_dim, ctc_probs, vocab_dim); + VLOG(3) << 
"after forward chunk"; + this->CacheFeature(chunk_feats, feat_dim); + } +} + + +void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { + paddle::jit::utils::InitKernelSignatureMap(); + +#ifdef USE_GPU + dev_ = phi::GPUPlace(); +#else + dev_ = phi::CPUPlace(); +#endif + paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); + model_ = std::make_shared(std::move(model)); + + subsampling_rate_ = model_->Attribute("subsampling_rate"); + right_context_ = model_->Attribute("right_context"); + sos_ = model_->Attribute("sos_symbol"); + eos_ = model_->Attribute("eos_symbol"); + is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); + + forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); + forward_attention_decoder_ = model_->Function("forward_attention_decoder"); + ctc_activation_ = model_->Function("ctc_activation"); + CHECK(forward_encoder_chunk_.IsValid()); + CHECK(forward_attention_decoder_.IsValid()); + CHECK(ctc_activation_.IsValid()); + + LOG(INFO) << "Paddle Model Info: "; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; + + Warmup(); +} + +void U2Nnet::Warmup() { +#ifdef USE_PROFILING + RecordEvent event("warmup", TracerEventType::UserDefined, 1); +#endif + + { +#ifdef USE_PROFILING + RecordEvent event( + "warmup-encoder-ctc", TracerEventType::UserDefined, 1); +#endif + int feat_dim = 80; + int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + + // (receptive_field - downsample_rate) + paddle::Tensor feats = paddle::full( + {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); + paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); + paddle::Tensor att_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + paddle::Tensor cnn_cache = + paddle::zeros({0, 0, 0, 0}, 
paddle::DataType::FLOAT32); + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; + std::vector outputs = forward_encoder_chunk_(inputs); + + auto chunk_out = outputs[0]; + inputs = std::move(std::vector({chunk_out})); + outputs = ctc_activation_(inputs); + } + + { +#ifdef USE_PROFILING + RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); +#endif + auto hyps = + paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); + auto hyps_lens = + paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); + auto encoder_out = paddle::ones( + {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); + + std::vector inputs{ + hyps, hyps_lens, encoder_out}; + + std::vector outputs = + forward_attention_decoder_(inputs); + } + + Reset(); +} + +U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) { + LoadModel(opts_.model_path); +} + +// shallow copy +U2Nnet::U2Nnet(const U2Nnet& other) { + // copy meta + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidecoder_ = other.is_bidecoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + + forward_encoder_chunk_ = other.forward_encoder_chunk_; + forward_attention_decoder_ = other.forward_attention_decoder_; + ctc_activation_ = other.ctc_activation_; + + // offset_ = other.offset_; // TODO: not used in nnets + + // copy model ptr + model_ = other.model_; + + // ignore inner states +} + +std::shared_ptr U2Nnet::Copy() const { + auto asr_model = std::make_shared(*this); + // reset inner state for new decoding + asr_model->Reset(); + return asr_model; +} + +void U2Nnet::Reset() { + // offset_ = 0; + // cached_feats_.clear(); // TODO: not used in nnets + + att_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + cnn_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + + 
encoder_outs_.clear(); +} + +// Debug API +void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { + // encoder_out (T,D) + encoder_outs_.clear(); + encoder_outs_.push_back(encoder_out); +} + + +void U2Nnet::FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) { + std::vector chunk_feats(features.Data(), + features.Data() + features.Dim()); + std::vector ctc_probs; + ForwardEncoderChunkImpl( + chunk_feats, feature_dim, &ctc_probs, inference_dim); + inferences->Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(inferences->Data(), + ctc_probs.data(), + ctc_probs.size() * sizeof(kaldi::BaseFloat)); +} + + +void U2Nnet::ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* out_prob, + int32* vocab_dim) { +#ifdef USE_PROFILING + RecordEvent event( + "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); +#endif + + // 1. splice cached_feature, and chunk_feats + // First dimension is B, which is 1. 
+ // int num_frames = cached_feats_.size() + chunk_feats.size(); + + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "num_frames: " << num_frames; + VLOG(3) << "feat_dim: " << feat_dim; + + // feats (B=1,T,D) + paddle::Tensor feats = + paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); + float* feats_ptr = feats.mutable_data(); + + // for (size_t i = 0; i < cached_feats_.size(); ++i) { + // float* row = feats_ptr + i * feat_dim; + // std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float)); + // } + + // for (size_t i = 0; i < chunk_feats.size(); ++i) { + // float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim; + // std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float)); + // } + + // not cache feature in nnet + CHECK(cached_feats_.size() == 0); + // CHECK_EQ(std::is_same::value, true); + std::memcpy(feats_ptr, + chunk_feats.data(), + chunk_feats.size() * sizeof(kaldi::BaseFloat)); + + VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] + << ", " << feats.shape()[2]; + +#ifdef TEST_DEBUG + { + std::stringstream path("feat", std::ios_base::app | std::ios_base::out); + path << offset_; + std::ofstream feat_fobj(path.str().c_str(), std::ios::out); + CHECK(feat_fobj.is_open()); + // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " + // << feats.shape()[2] << "\n"; + for (int i = 0; i < feats.numel(); i++) { + feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; + if ((i + 1) % feat_dim == 0) { + feat_fobj << "\n"; + } + } + feat_fobj << "\n"; + } +#endif + +// Endocer chunk forward +#ifdef USE_GPU + feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); + att_cache_ = att_cache_.copy_to(paddle::GPUPlace()), /*blocking*/ false; + cnn_cache_ = cnn_cache_.copy_to(Paddle::GPUPlace(), /*blocking*/ false); +#endif + + int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 + // must be scalar, but paddle do not have scalar. 
+ paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); + // freeze `required_cache_size` in graph, so not specific it in function + // call. + std::vector inputs = { + feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; + VLOG(3) << "inputs size: " << inputs.size(); + CHECK(inputs.size() == 4); + std::vector outputs = forward_encoder_chunk_(inputs); + VLOG(3) << "outputs size: " << outputs.size(); + CHECK(outputs.size() == 3); + +#ifdef USE_GPU + paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); + att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); + cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); +#else + paddle::Tensor chunk_out = outputs[0]; + att_cache_ = outputs[1]; + cnn_cache_ = outputs[2]; +#endif + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits", + std::ios_base::app | std::ios_base::out); + auto i = offset_ - chunk_out.shape()[1]; + path << std::max(i, 0L); + std::ofstream logits_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_fobj.is_open()); + logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] + << " " << chunk_out.shape()[2] << "\n"; + const float* chunk_out_ptr = chunk_out.data(); + logits_fobj << chunk_out_ptr << std::endl; + for (int i = 0; i < chunk_out.numel(); i++) { + logits_fobj << chunk_out_ptr[i] << " "; + } + logits_fobj << "\n"; + } +#endif // end TEST_DEBUG + + // current offset in decoder frame + // not used in nnet + offset_ += chunk_out.shape()[1]; + + // collects encoder outs. 
+ VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + encoder_outs_.push_back(chunk_out); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#ifdef USE_GPU + +#error "Not implementation." + +#else + // compute ctc_activation == log_softmax + inputs.clear(); + outputs.clear(); + inputs.push_back(chunk_out); + CHECK(inputs.size() == 1); + outputs = ctc_activation_(inputs); + CHECK(outputs.size() == 1); + paddle::Tensor ctc_log_probs = outputs[0]; + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logprob", + std::ios_base::app | std::ios_base::out); + path << offset_ - chunk_out.shape()[1]; + + std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(logprob_fobj.is_open()); + logprob_fobj << ctc_log_probs.shape()[0] << " " + << ctc_log_probs.shape()[1] << " " + << ctc_log_probs.shape()[2] << "\n"; + const float* logprob_ptr = ctc_log_probs.data(); + for (int i = 0; i < ctc_log_probs.numel(); i++) { + logprob_fobj << logprob_ptr[i] << " "; + if ((i + 1) % ctc_log_probs.shape()[2] == 0) { + logprob_fobj << "\n"; + } + } + logprob_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#endif // end USE_GPU + + // Copy to output, (B=1,T,D) + std::vector ctc_log_probs_shape = ctc_log_probs.shape(); + CHECK(ctc_log_probs_shape.size() == 3); + int B = ctc_log_probs_shape[0]; + CHECK(B == 1); + int T = ctc_log_probs_shape[1]; + 
int D = ctc_log_probs_shape[2]; + *vocab_dim = D; + + float* ctc_log_probs_ptr = ctc_log_probs.data(); + + // // vector> + // out_prob->resize(T); + // for (int i = 0; i < T; i++) { + // (*out_prob)[i].resize(D); + // float* dst_ptr = (*out_prob)[i].data(); + // float* src_ptr = ctc_log_probs_ptr + (i * D); + // std::memcpy(dst_ptr, src_ptr, D * sizeof(float)); + // } + // CHECK(std::is_same::value); + out_prob->resize(T * D); + std::memcpy( + out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list_ctc", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + + return; +} + +float U2Nnet::ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos) { + // sum `hyp` path scores in `prob` + // prob (1, Umax, V) + // hyp (U,) + float score = 0.0f; + std::vector dims = prob.shape(); + CHECK(dims.size() == 3); + VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; + CHECK(dims[0] == 1); + int vocab_dim = static_cast(dims[2]); + + const float* prob_ptr = prob.data(); + for (size_t i = 0; i < hyp.size(); ++i) { + const float* row = prob_ptr + i * vocab_dim; + score += row[hyp[i]]; + } + const float* row = prob_ptr + hyp.size() * vocab_dim; + score += row[eos]; + return score; +} + + +void U2Nnet::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { 
+#ifdef USE_PROFILING + RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); +#endif + + CHECK(rescoring_score != nullptr); + + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) return; + VLOG(2) << "num hyps: " << num_hyps; + + if (encoder_outs_.size() == 0) { + // no encoder outs + std::cerr << "encoder_outs_.size() is zero. Please check it." + << std::endl; + return; + } + + // prepare input + paddle::Tensor hyps_lens = + paddle::zeros({num_hyps}, paddle::DataType::INT64); + int64_t* hyps_len_ptr = hyps_lens.mutable_data(); + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int len = hyps[i].size() + 1; // eos + max_hyps_len = std::max(max_hyps_len, len); + hyps_len_ptr[i] = static_cast(len); + } + + paddle::Tensor hyps_tensor = + paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); + int64_t* hyps_ptr = hyps_tensor.mutable_data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + int64_t* row = hyps_ptr + max_hyps_len * i; + row[0] = sos_; + for (size_t j = 0; j < hyp.size(); ++j) { + row[j + 1] = hyp[j]; + } + } + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_concat", + std::ios_base::app | std::ios_base::out); + for (int j = 0; j < encoder_outs_.size(); j++) { + path << j; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[j].shape()[0] << " " + << encoder_outs_[j].shape()[1] << " " + << encoder_outs_[j].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[j].data(); + for (int i = 0; i < encoder_outs_[j].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } + } +#endif // end TEST_DEBUG + + // forward attention decoder by hyps and correspoinding encoder_outs_ + paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); + VLOG(2) << "encoder_outs_ size: " << 
encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out0", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_outs_[0].data(); + + size_t size = encoder_outs_[0].numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_out.shape()[0] << " " + << encoder_out.shape()[1] << " " + << encoder_out.shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_out.data(); + + size_t size = encoder_out.numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + std::vector inputs{ + hyps_tensor, hyps_lens, encoder_out}; + std::vector outputs = forward_attention_decoder_(inputs); + CHECK(outputs.size() == 2); + + // (B, Umax, V) + paddle::Tensor probs = outputs[0]; + std::vector probs_shape = probs.shape(); + CHECK(probs_shape.size() == 3); + CHECK(probs_shape[0] == num_hyps); + CHECK(probs_shape[1] == max_hyps_len); + +#ifdef TEST_DEBUG + { + std::stringstream path("decoder_logprob", + std::ios_base::app | std::ios_base::out); + std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(dec_logprob_fobj.is_open()); + + dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " + << probs.shape()[2] << "\n"; + const float* dec_logprob_ptr = probs.data(); + + size_t size = probs.numel(); + for (int i = 0; i < size; i++) { + dec_logprob_fobj << dec_logprob_ptr[i] 
<< "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_lens", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_len_fobj.is_open()); + + const int64_t* hyps_lens_ptr = hyps_lens.data(); + + size_t size = hyps_lens.numel(); + for (int i = 0; i < size; i++) { + hyps_len_fobj << hyps_lens_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_tensor", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_tensor_fobj.is_open()); + + const int64_t* hyps_tensor_ptr = hyps_tensor.data(); + + size_t size = hyps_tensor.numel(); + for (int i = 0; i < size; i++) { + hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + paddle::Tensor r_probs = outputs[1]; + std::vector r_probs_shape = r_probs.shape(); + if (is_bidecoder_ && reverse_weight > 0) { + CHECK(r_probs_shape.size() == 3); + CHECK(r_probs_shape[0] == num_hyps); + CHECK(r_probs_shape[1] == max_hyps_len); + } else { + // dump r_probs + CHECK(r_probs_shape.size() == 1); + CHECK(r_probs_shape[0] == 1) << r_probs_shape[0]; + } + + // compute rescoring score + using IntArray = paddle::experimental::IntArray; + std::vector probs_v = + paddle::experimental::split_with_num(probs, num_hyps, 0); + VLOG(2) << "split prob: " << probs_v.size() << " " + << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] + << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; + CHECK(static_cast(probs_v.size()) == num_hyps) + << ": is " << probs_v.size() << " expect: " << num_hyps; + + std::vector r_probs_v; + if (is_bidecoder_ && reverse_weight > 0) { + r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); + CHECK(static_cast(r_probs_v.size()) == num_hyps) + << "r_probs_v size: is " << r_probs_v.size() + << " expect: " << num_hyps; + } + + 
for (int i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + + // left-to-right decoder score + float score = 0.0f; + score = ComputePathScore(probs_v[i], hyp, eos_); + + // right-to-left decoder score + float r_score = 0.0f; + if (is_bidecoder_ && reverse_weight > 0) { + std::vector r_hyp(hyp.size()); + std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); + r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); + } + + // combinded left-to-right and right-to-lfet score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight; + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h new file mode 100644 index 00000000..ddc85b45 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.h @@ -0,0 +1,157 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/matrix/kaldi-matrix.h" + +#include "kaldi/util/options-itf.h" +#include "nnet/nnet_itf.h" + +#include "paddle/extension.h" +#include "paddle/jit/all.h" +#include "paddle/phi/api/all.h" + +namespace ppspeech { + +struct U2ModelOptions { + std::string model_path; + int thread_num; + bool use_gpu; + U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("model-path", &model_path, "model file path"); + opts->Register("thread-num", &thread_num, "thread num"); + opts->Register("use-gpu", &use_gpu, "if use gpu"); + } +}; + + +class U2NnetBase : public NnetInterface { + public: + virtual int context() const { return right_context_ + 1; } + virtual int right_context() const { return right_context_; } + virtual int subsampling_rate() const { return subsampling_rate_; } + virtual int eos() const { return eos_; } + virtual int sos() const { return sos_; } + virtual int is_bidecoder() const { return is_bidecoder_; } + // current offset in decoder frame + virtual int offset() const { return offset_; } + virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } + virtual void set_num_left_chunks(int num_left_chunks) { + num_left_chunks_ = num_left_chunks; + } + // start: false, it is the start chunk of one sentence, else true + virtual int num_frames_for_chunk(bool start) const; + + virtual std::shared_ptr Copy() const = 0; + + virtual void ForwardEncoderChunk( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim); + + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + + protected: + virtual void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) = 0; + + virtual void CacheFeature(const std::vector& chunk_feats, + int32 feat_dim); + + protected: + 
// model specification + int right_context_{0}; + int subsampling_rate_{1}; + + int sos_{0}; + int eos_{0}; + + bool is_bidecoder_{false}; + + int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming + // case. Otherwise, none streaming case + int num_left_chunks_{-1}; // -1 means all left chunks + + // asr decoder state + int offset_{0}; // current offset in encoder output time stamp. Used by + // position embedding. + std::vector> cached_feats_{}; // features cache +}; + + +class U2Nnet : public U2NnetBase { + public: + U2Nnet(const U2ModelOptions& opts); + U2Nnet(const U2Nnet& other); + + void FeedForward(const kaldi::Vector& features, + int32 feature_dim, + kaldi::Vector* inferences, + int32* inference_dim) override; + + void Reset() override; + + void Dim(); + + void LoadModel(const std::string& model_path_w_prefix); + void Warmup(); + + std::shared_ptr model() const { return model_; } + + std::shared_ptr Copy() const override; + + void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + int32 feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) override; + + float ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + + // debug + void FeedEncoderOuts(paddle::Tensor& encoder_out); + + private: + U2ModelOptions opts_; + + phi::Place dev_; + std::shared_ptr model_{nullptr}; + std::vector encoder_outs_; + // transformer/conformer attention cache + paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + // conformer-only conv_module cache + paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + + paddle::jit::Function forward_encoder_chunk_; + paddle::jit::Function forward_attention_decoder_; + paddle::jit::Function ctc_activation_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc 
b/speechx/speechx/nnet/u2_nnet_main.cc new file mode 100644 index 00000000..1a1a5e02 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "nnet/u2_nnet.h" +#include "base/common.h" +#include "frontend/audio/assembler.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); + +DEFINE_string(model_path, "", "paddle nnet model"); + +DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk"); +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=3) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=3) module downsampling rate."); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK(FLAGS_feature_rspecifier.size() > 0); + CHECK(FLAGS_nnet_prob_wspecifier.size() > 0); + CHECK(FLAGS_model_path.size() > 0); + LOG(INFO) << "input 
rspecifier: " << FLAGS_feature_rspecifier; + LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; + LOG(INFO) << "model path: " << FLAGS_model_path; + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + + ppspeech::U2ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + + int32 chunk_size = + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate + + FLAGS_receptive_field_length; + int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + kaldi::Timer timer; + + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + // // pad feats + // int32 padding_len = 0; + // if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { + // padding_len = + // chunk_stride - (feature.NumRows() - chunk_size) % + // chunk_stride; + // feature.Resize(feature.NumRows() + padding_len, + // feature.NumCols(), + // kaldi::kCopyData); + // } + + int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + int32 frame_idx = 0; + std::vector> prob_vec; + int32 ori_feature_len = feature.NumRows(); + + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + kaldi::Vector 
feature_chunk(chunk_size * + feat_dim); + + int32 feature_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + feature_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (feature_chunk_size < receptive_field_length) { + LOG(WARNING) << "utt: " << utt << " skip last " + << feature_chunk_size << " frames, expect is " + << receptive_field_length; + break; + } + + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < chunk_size; ++row_id) { + kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // get nnet outputs + vector prob; + while (decodable->FrameLikelihood(frame_idx, &prob)) { + kaldi::Vector vec_tmp(prob.size()); + std::memcpy(vec_tmp.Data(), + prob.data(), + sizeof(kaldi::BaseFloat) * prob.size()); + prob_vec.push_back(vec_tmp); + frame_idx++; + } + } + + // after process one utt, then reset decoder state. + decodable->Reset(); + + if (prob_vec.size() == 0) { + // the TokenWriter can not write empty string. 
+ ++num_err; + LOG(WARNING) << " the nnet prob of " << utt << " is empty"; + continue; + } + + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix result(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + result(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + LOG(INFO) << " cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt index 98b2f38b..71b33daa 100644 --- a/speechx/speechx/protocol/CMakeLists.txt +++ b/speechx/speechx/protocol/CMakeLists.txt @@ -1,3 +1 @@ -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - add_subdirectory(websocket) diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index c3454c39..0f73fd24 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,4 +1,4 @@ -project(websocket) +# project(websocket) add_library(websocket STATIC websocket_server.cc diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt index 95e86574..c1e875be 100644 --- a/speechx/speechx/utils/CMakeLists.txt +++ b/speechx/speechx/utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(utils file_utils.cc + math.cc ) \ No newline at end of file diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 7c319295..5087ac60 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -38,11 +38,11 @@ float LogSumExp(float x, float y) { template struct ValGreaterComp { bool 
operator()(const std::pair& lhs, - const std::pair& rhs) const { + const std::pair& rhs) const { return lhs.first > rhs.first || (lhs.first == rhs.first && lhs.second < rhs.second); } -} +}; template void TopK(const std::vector& data, diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh new file mode 100755 index 00000000..3952988c --- /dev/null +++ b/speechx/tools/venv.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -ex + +PYTHON=python3.7 +test -d venv || virtualenv -p ${PYTHON} venv From cd1ced4ea0f9f85835a63b7afd2b47f8f14a963f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 06:43:07 +0000 Subject: [PATCH 06/60] add nnetout struct --- speechx/examples/ds2_ol/aishell/run.sh | 4 ++-- speechx/speechx/nnet/CMakeLists.txt | 1 - speechx/speechx/nnet/decodable.cc | 25 +++++++++++++------------ speechx/speechx/nnet/decodable.h | 2 +- speechx/speechx/nnet/ds2_nnet.cc | 15 +++++++++------ speechx/speechx/nnet/ds2_nnet.h | 5 ++--- speechx/speechx/nnet/nnet_itf.h | 17 ++++++++++++++--- speechx/speechx/nnet/u2_nnet.cc | 18 ++++++++++-------- speechx/speechx/nnet/u2_nnet.h | 15 ++++++++------- 9 files changed, 59 insertions(+), 43 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 82e889ce..a29be17b 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -set +x +set -x set -e . path.sh @@ -11,7 +11,7 @@ stop_stage=100 . utils/parse_options.sh # 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then +if [ ! 
-d ${SPEECHX_BUILD} ]; then pushd ${SPEECHX_ROOT} bash build.sh popd diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 2a1812fd..43566616 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings) if(USING_U2) target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 7780e5ae..40fac182 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, // for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { - nnet_cache_ = likelihood; + nnet_out_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); } @@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_cache_.NumCols()); + CHECK_LE(index, nnet_out_cache_.NumCols()); CHECK_LE(frame, frames_ready_); int32 frame_idx = frame - frame_offset_; // the nnet output is prob ranther than log prob // the index - 1, because the ilabel return acoustic_scale_ * - std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + + std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) + std::numeric_limits::min()); } @@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() { } // forward feats - int32 vocab_dim = 0; - Vector probs; - nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + NnetOut out; + nnet_->FeedForward(features, frontend_->Dim(), &out); + int32& vocab_dim = out.vocab_dim; + Vector& probs = out.logprobs; // cache nnet outupts - 
nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); - nnet_cache_.CopyRowsFromVec(probs); + nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(probs); // update state frame_offset_ = frames_ready_; - frames_ready_ += nnet_cache_.NumRows(); + frames_ready_ += nnet_out_cache_.NumRows(); return true; } @@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return false; } - int vocab_size = nnet_cache_.NumCols(); + int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = - nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; + nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; } return true; } @@ -117,7 +118,7 @@ void Decodable::Reset() { if (nnet_ != nullptr) nnet_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_cache_.Resize(0, 0); + nnet_out_cache_.Resize(0, 0); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 241d0419..8786e4f2 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr nnet_; // nnet outputs' cache - kaldi::Matrix nnet_cache_; + kaldi::Matrix nnet_out_cache_; // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index a89c0f20..c6add03c 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -143,9 +143,8 @@ shared_ptr> PaddleNnet::GetCacheEncoder(const string& name) { } void PaddleNnet::FeedForward(const Vector& features, - int32 feature_dim, - Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { paddle_infer::Predictor* predictor = GetPredictor(); int 
feat_row = features.Dim() / feature_dim; @@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector& features, std::vector output_shape = output_tensor->shape(); int32 row = output_shape[1]; int32 col = output_shape[2]; - inferences->Resize(row * col); - *inference_dim = col; - output_tensor->CopyToCpu(inferences->Data()); + + + // inferences->Resize(row * col); + // *inference_dim = col; + out->logprobs.Resize(row*col); + out->vocab_dim = col; + output_tensor->CopyToCpu(out->logprobs.Data()); ReleasePredictor(predictor); } diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index e2b3d5bc..717bdb72 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface { PaddleNnet(const ModelOptions& opts); virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim); + const int32& feature_dim, + NnetOut* out); void Dim(); virtual void Reset(); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index ac040fba..12fe3c27 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -21,12 +21,23 @@ namespace ppspeech { +struct NnetOut{ + // nnet out, maybe logprob or prob + kaldi::Vector logprobs; + int32 vocab_dim; + + // nnet state. Only using in Attention model. 
+ std::vector> encoder_outs; + + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} +}; + + class NnetInterface { public: virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) = 0; + const int32& feature_dim, + NnetOut* out) = 0; virtual void Reset() = 0; virtual ~NnetInterface() {} }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 67ef0952..26d7da8f 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector& chunk_feats, void U2NnetBase::ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) { ctc_probs->clear(); @@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { void U2Nnet::FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { std::vector chunk_feats(features.Data(), features.Data() + features.Dim()); + std::vector ctc_probs; ForwardEncoderChunkImpl( - chunk_feats, feature_dim, &ctc_probs, inference_dim); - inferences->Resize(ctc_probs.size(), kaldi::kSetZero); - std::memcpy(inferences->Data(), + chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim); + + out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(out->logprobs.Data(), ctc_probs.data(), ctc_probs.size() * sizeof(kaldi::BaseFloat)); } @@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector& features, void U2Nnet::ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { + #ifdef USE_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 
ddc85b45..87442959 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface { virtual void ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim); @@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface { protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) = 0; @@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface { // case. Otherwise, none streaming case int num_left_chunks_{-1}; // -1 means all left chunks - // asr decoder state + // asr decoder state, not used in nnet int offset_{0}; // current offset in encoder output time stamp. Used by // position embedding. std::vector> cached_feats_{}; // features cache @@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase { U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) override; + const int32& feature_dim, + NnetOut* out) override; void Reset() override; @@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase { void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) override; @@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); + const std::vector& EncoderOuts() const {return encoder_outs_; } + private: U2ModelOptions opts_; From a75abc1828e46e27ed368b61a6ee4ab7639eaec7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 07:51:40 +0000 Subject: [PATCH 07/60] fix u2 nnet out frames num --- speechx/.gitignore | 1 + speechx/speechx/nnet/nnet_itf.h | 14 +++++++------- speechx/speechx/nnet/u2_nnet.cc | 6 +++--- speechx/speechx/nnet/u2_nnet.h | 4 +++- speechx/speechx/nnet/u2_nnet_main.cc 
| 18 +++++++++--------- speechx/tools/clang-format.sh | 3 +++ 6 files changed, 26 insertions(+), 20 deletions(-) create mode 100755 speechx/tools/clang-format.sh diff --git a/speechx/.gitignore b/speechx/.gitignore index e0c61847..9a93805c 100644 --- a/speechx/.gitignore +++ b/speechx/.gitignore @@ -1 +1,2 @@ tools/valgrind* +*log diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 12fe3c27..b98f5ebd 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -21,15 +21,15 @@ namespace ppspeech { -struct NnetOut{ - // nnet out, maybe logprob or prob - kaldi::Vector logprobs; - int32 vocab_dim; +struct NnetOut { + // nnet out, maybe logprob or prob + kaldi::Vector logprobs; + int32 vocab_dim; - // nnet state. Only using in Attention model. - std::vector> encoder_outs; + // nnet state. Only using in Attention model. + std::vector> encoder_outs; - NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 26d7da8f..ddb815d2 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -313,10 +313,8 @@ void U2Nnet::ForwardEncoderChunkImpl( // call. std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; - VLOG(3) << "inputs size: " << inputs.size(); CHECK(inputs.size() == 4); std::vector outputs = forward_encoder_chunk_(inputs); - VLOG(3) << "outputs size: " << outputs.size(); CHECK(outputs.size() == 3); #ifdef USE_GPU @@ -351,10 +349,12 @@ void U2Nnet::ForwardEncoderChunkImpl( // current offset in decoder frame // not used in nnet offset_ += chunk_out.shape()[1]; + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] << " total: " << offset_ ; + // collects encoder outs. 
- VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); encoder_outs_.push_back(chunk_out); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); #ifdef TEST_DEBUG { diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 87442959..775a078a 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -137,7 +137,9 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); - const std::vector& EncoderOuts() const {return encoder_outs_; } + const std::vector& EncoderOuts() const { + return encoder_outs_; + } private: U2ModelOptions opts_; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 1a1a5e02..b602ac4d 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -95,29 +95,29 @@ int main(int argc, char* argv[]) { // kaldi::kCopyData); // } - int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; int32 frame_idx = 0; std::vector> prob_vec; int32 ori_feature_len = feature.NumRows(); + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { - kaldi::Vector feature_chunk(chunk_size * - feat_dim); - - int32 feature_chunk_size = 0; + int32 this_chunk_size = 0; if (ori_feature_len > chunk_idx * chunk_stride) { - feature_chunk_size = std::min( + this_chunk_size = std::min( ori_feature_len - chunk_idx * chunk_stride, chunk_size); } - if (feature_chunk_size < receptive_field_length) { + if (this_chunk_size < receptive_field_length) { LOG(WARNING) << "utt: " << utt << " skip last " - << feature_chunk_size << " frames, expect is " + << this_chunk_size << " frames, expect is " << receptive_field_length; break; } + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); int32 start = chunk_idx * chunk_stride; - for (int row_id = 0; row_id < chunk_size; ++row_id) { + for 
(int row_id = 0; row_id < this_chunk_size; ++row_id) { kaldi::SubVector feat_row(feature, start); kaldi::SubVector feature_chunk_row( feature_chunk.Data() + row_id * feat_dim, feat_dim); diff --git a/speechx/tools/clang-format.sh b/speechx/tools/clang-format.sh new file mode 100755 index 00000000..30f636ff --- /dev/null +++ b/speechx/tools/clang-format.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +find speechx -name '*.c' -o -name '*.h' -not -path "*kaldi*" | xargs -I{} clang-format -i {} From 5cc874e1c3e6015e2c73fc9ca098a650aa4ef730 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 11 Oct 2022 09:51:07 +0000 Subject: [PATCH 08/60] u2 nnet get encoder out and align with py --- speechx/examples/codelab/u2nnet/run.sh | 7 +++- speechx/speechx/nnet/decodable.h | 2 + speechx/speechx/nnet/ds2_nnet.h | 14 +++++-- speechx/speechx/nnet/nnet_itf.h | 14 ++++++- speechx/speechx/nnet/u2_nnet.cc | 26 +++++++++++++ speechx/speechx/nnet/u2_nnet.h | 5 +-- speechx/speechx/nnet/u2_nnet_main.cc | 51 +++++++++++++++++++------- 7 files changed, 96 insertions(+), 23 deletions(-) diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2nnet/run.sh index b309bc6f..704653e7 100755 --- a/speechx/examples/codelab/u2nnet/run.sh +++ b/speechx/examples/codelab/u2nnet/run.sh @@ -40,6 +40,7 @@ cmvn_json2kaldi_main \ --json_file $model_dir/mean_std.json \ --cmvn_write_path $exp/cmvn.ark \ --binary=false + echo "convert json cmvn to kaldi ark." compute_fbank_main \ @@ -47,6 +48,7 @@ compute_fbank_main \ --wav_rspecifier=scp:$data/wav.scp \ --cmvn_file=$exp/cmvn.ark \ --feature_wspecifier=ark,t:$exp/fbank.ark + echo "compute fbank feature." u2_nnet_main \ @@ -56,4 +58,7 @@ u2_nnet_main \ --receptive_field_length=7 \ --downsampling_rate=4 \ --acoustic_scale=1.0 \ - --nnet_prob_wspecifier=ark,t:$exp/probs.ark + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark + +echo "u2 nnet decode." 
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 8786e4f2..39b38dc1 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -55,6 +55,8 @@ class Decodable : public kaldi::DecodableInterface { int32 TokenId2NnetId(int32 token_id); + std::shared_ptr Nnet() { return nnet_; } + private: bool AdvanceChunk(); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 717bdb72..80be6927 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -96,16 +96,22 @@ class PaddleNnet : public NnetInterface { public: PaddleNnet(const ModelOptions& opts); - virtual void FeedForward(const kaldi::Vector& features, - const int32& feature_dim, - NnetOut* out); + void FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) override; void Dim(); - virtual void Reset(); + + void Reset() override; + std::shared_ptr> GetCacheEncoder( const std::string& name); + void InitCacheEncouts(const ModelOptions& opts); + void EncoderOuts(std::vector>* encoder_out) + const override {} + private: paddle_infer::Predictor* GetPredictor(); int ReleasePredictor(paddle_infer::Predictor* predictor); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index b98f5ebd..5dde72a8 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -22,7 +22,7 @@ namespace ppspeech { struct NnetOut { - // nnet out, maybe logprob or prob + // nnet out. maybe logprob or prob. Almost time this is logprob. kaldi::Vector logprobs; int32 vocab_dim; @@ -35,11 +35,21 @@ struct NnetOut { class NnetInterface { public: + virtual ~NnetInterface() {} + + // forward feat with nnet. + // nnet do not cache feats, feats cached by frontend. + // nnet cache model outputs, i.e. logprobs/encoder_outs. virtual void FeedForward(const kaldi::Vector& features, const int32& feature_dim, NnetOut* out) = 0; + + // reset nnet state, e.g. 
nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; - virtual ~NnetInterface() {} + + // using to get encoder outs. e.g. seq2seq with Attention model. + virtual void EncoderOuts( + std::vector>* encoder_out) const = 0; }; } // namespace ppspeech diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index ddb815d2..74f8cf78 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -705,4 +705,30 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, } } + +void U2Nnet::EncoderOuts(std::vector>* encoder_out) const { + // list of (B=1,T,D) + int size = encoder_outs_.size(); + VLOG(1) << "encoder_outs_ size: " << size; + + for (int i = 0; i < size; i++){ + const paddle::Tensor& item = encoder_outs_[i]; + const std::vector shape = item.shape(); + CHECK(shape.size() == 3); + const int& B = shape[0]; + const int& T = shape[1]; + const int& D = shape[2]; + CHECK(B == 1) << "Only support batch one."; + VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; + + const float *this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++){ + const float* cur = this_tensor_ptr + j * D; + kaldi::Vector out(D); + std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); + encoder_out->emplace_back(out); + } + } + } + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 775a078a..8ce45f43 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -137,9 +137,8 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); - const std::vector& EncoderOuts() const { - return encoder_outs_; - } + void EncoderOuts( + std::vector>* encoder_out) const; private: U2ModelOptions opts_; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index b602ac4d..fb9fec23 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc 
+++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -21,6 +21,7 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); +DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier"); DEFINE_string(model_path, "", "paddle nnet model"); @@ -52,9 +53,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; - kaldi::SequentialBaseFloatMatrixReader feature_reader( - FLAGS_feature_rspecifier); + + kaldi::SequentialBaseFloatMatrixReader feature_reader(FLAGS_feature_rspecifier); kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); + kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); ppspeech::U2ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; @@ -97,6 +99,7 @@ int main(int argc, char* argv[]) { int32 frame_idx = 0; std::vector> prob_vec; + std::vector> encoder_out_vec; int32 ori_feature_len = feature.NumRows(); int32 num_chunks = feature.NumRows() / chunk_stride + 1; LOG(INFO) << "num_chunks: " << num_chunks; @@ -144,29 +147,51 @@ int main(int argc, char* argv[]) { prob_vec.push_back(vec_tmp); frame_idx++; } + + } + // get encoder out + decodable->Nnet()->EncoderOuts(&encoder_out_vec); + // after process one utt, then reset decoder state. decodable->Reset(); - if (prob_vec.size() == 0) { + if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) { // the TokenWriter can not write empty string. 
++num_err; - LOG(WARNING) << " the nnet prob of " << utt << " is empty"; + LOG(WARNING) << " the nnet prob/encoder_out of " << utt << " is empty"; continue; } - // writer nnet output - kaldi::MatrixIndexT nrow = prob_vec.size(); - kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); - LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; - kaldi::Matrix result(nrow, ncol); - for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { - for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { - result(row_idx, col_idx) = prob_vec[row_idx](col_idx); + { + // writer nnet output + kaldi::MatrixIndexT nrow = prob_vec.size(); + kaldi::MatrixIndexT ncol = prob_vec[0].Dim(); + LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol; + kaldi::Matrix nnet_out(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + nnet_out(row_idx, col_idx) = prob_vec[row_idx](col_idx); + } + } + nnet_out_writer.Write(utt, nnet_out); + } + + + { + // writer nnet encoder outs + kaldi::MatrixIndexT nrow = encoder_out_vec.size(); + kaldi::MatrixIndexT ncol = encoder_out_vec[0].Dim(); + LOG(INFO) << "nnet encoder outs shape: " << nrow << ", " << ncol; + kaldi::Matrix encoder_outs(nrow, ncol); + for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { + for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { + encoder_outs(row_idx, col_idx) = encoder_out_vec[row_idx](col_idx); + } } + nnet_encoder_outs_writer.Write(utt, encoder_outs); } - nnet_out_writer.Write(utt, result); ++num_done; } From 6987751ff82415d3ff211c1624c315520d88aba2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 05:59:37 +0000 Subject: [PATCH 09/60] fix LogLikelihood and add AdvanceChunk --- speechx/speechx/base/common.h | 1 + .../frontend/audio/cmvn_json2kaldi_main.cc | 8 +-- speechx/speechx/kaldi/decoder/decodable-itf.h | 11 ++-- speechx/speechx/nnet/decodable.cc | 64 +++++++++++++++---- speechx/speechx/nnet/decodable.h | 6 +- speechx/speechx/nnet/ds2_nnet.h | 2 + 
speechx/speechx/nnet/nnet_itf.h | 6 +- speechx/speechx/nnet/u2_nnet.h | 2 + speechx/speechx/nnet/u2_nnet_main.cc | 20 +++--- 9 files changed, 87 insertions(+), 33 deletions(-) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 90fc96a1..70b11b69 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc index 93bad688..713c9ef1 100644 --- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc +++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc @@ -47,13 +47,13 @@ int main(int argc, char* argv[]) { for (auto obj : value.as_object()) { if (obj.key() == "mean_stat") { - LOG(INFO) << "mean_stat:" << obj.value(); + VLOG(2) << "mean_stat:" << obj.value(); } if (obj.key() == "var_stat") { - LOG(INFO) << "var_stat: " << obj.value(); + VLOG(2) << "var_stat: " << obj.value(); } if (obj.key() == "frame_num") { - LOG(INFO) << "frame_num: " << obj.value(); + VLOG(2) << "frame_num: " << obj.value(); } } @@ -79,7 +79,7 @@ int main(int argc, char* argv[]) { cmvn_stats(1, idx) = var_stat_vec[idx]; } cmvn_stats(0, mean_size) = frame_num; - LOG(INFO) << cmvn_stats; + VLOG(2) << cmvn_stats; kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary); LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path; diff --git a/speechx/speechx/kaldi/decoder/decodable-itf.h b/speechx/speechx/kaldi/decoder/decodable-itf.h index b8ce9143..a7c12588 100644 --- a/speechx/speechx/kaldi/decoder/decodable-itf.h +++ b/speechx/speechx/kaldi/decoder/decodable-itf.h @@ -101,7 +101,9 @@ namespace kaldi { */ class DecodableInterface { public: - /// Returns the log likelihood, which will be negated in the decoder. 
+ virtual ~DecodableInterface() {} + + /// Returns the log likelihood(logprob), which will be negated in the decoder. /// The "frame" starts from zero. You should verify that NumFramesReady() > /// frame /// before calling this. @@ -143,11 +145,12 @@ class DecodableInterface { /// this is for compatibility with OpenFst). virtual int32 NumIndices() const = 0; + /// Returns the likelihood(prob), which will be postive in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. virtual bool FrameLikelihood( int32 frame, std::vector* likelihood) = 0; - - - virtual ~DecodableInterface() {} }; /// @} } // namespace Kaldi diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 40fac182..1483949b 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -55,18 +55,10 @@ int32 Decodable::NumIndices() const { return 0; } // id. int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } -BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_out_cache_.NumCols()); - CHECK_LE(frame, frames_ready_); - int32 frame_idx = frame - frame_offset_; - // the nnet output is prob ranther than log prob - // the index - 1, because the ilabel - return acoustic_scale_ * - std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) + - std::numeric_limits::min()); -} + bool Decodable::EnsureFrameHaveComputed(int32 frame) { + // decoding frame if (frame >= frames_ready_) { return AdvanceChunk(); } @@ -74,26 +66,48 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + kaldi::Timer timer; // read feats Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. 
return false; } + VLOG(2) << "Forward with " << features.Dim() << " frames."; // forward feats NnetOut out; nnet_->FeedForward(features, frontend_->Dim(), &out); int32& vocab_dim = out.vocab_dim; - Vector& probs = out.logprobs; + Vector& logprobs = out.logprobs; // cache nnet outupts - nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); - nnet_out_cache_.CopyRowsFromVec(probs); + nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(logprobs); - // update state + // update state, decoding frame. frame_offset_ = frames_ready_; frames_ready_ += nnet_out_cache_.NumRows(); + VLOG(2) << "Forward feat chunk cost: " << timer.Elapsed() << " sec."; + return true; +} + +bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim) { + if (AdvanceChunk() == false) { + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0){ + LOG(WARNING) << "No new nnet out in cache."; + return false; + } + + logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); + logprobs->CopyRowsFromMat(nnet_out_cache_); + + *vocab_dim = nnet_out_cache_.NumCols(); return true; } @@ -113,6 +127,28 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return true; } +BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + CHECK_LE(index, nnet_out_cache_.NumCols()); + CHECK_LE(frame, frames_ready_); + + // the nnet output is prob ranther than log prob + // the index - 1, because the ilabel + BaseFloat logprob = 0.0; + int32 frame_idx = frame - frame_offset_; + BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); + if (nnet_->IsLogProb()){ + logprob = nnet_out; + } else { + logprob = std::log(nnet_out + std::numeric_limits::epsilon()); + } + CHECK(!std::isnan(logprob) && !std::isinf(logprob)); + return acoustic_scale_ * logprob; +} + void 
Decodable::Reset() { if (frontend_ != nullptr) frontend_->Reset(); if (nnet_ != nullptr) nnet_->Reset(); diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 39b38dc1..1ee6afbf 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -57,9 +57,13 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr Nnet() { return nnet_; } - private: + // forward nnet with feats bool AdvanceChunk(); + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim); + private: std::shared_ptr frontend_; std::shared_ptr nnet_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 80be6927..9e2cb77b 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -104,6 +104,8 @@ class PaddleNnet : public NnetInterface { void Reset() override; + bool IsLogProb() override { return false; } + std::shared_ptr> GetCacheEncoder( const std::string& name); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 5dde72a8..d05aabea 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -39,7 +39,8 @@ class NnetInterface { // forward feat with nnet. // nnet do not cache feats, feats cached by frontend. - // nnet cache model outputs, i.e. logprobs/encoder_outs. + // nnet cache model state, i.e. encoder_outs, att_cache, cnn_cache, + // frame_offset. virtual void FeedForward(const kaldi::Vector& features, const int32& feature_dim, NnetOut* out) = 0; @@ -47,6 +48,9 @@ class NnetInterface { // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; + // true, nnet output is logprob; otherwise is prob, + virtual bool IsLogProb() = 0; + // using to get encoder outs. e.g. seq2seq with Attention model. 
virtual void EncoderOuts( std::vector>* encoder_out) const = 0; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 8ce45f43..4ecbac26 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -111,6 +111,8 @@ class U2Nnet : public U2NnetBase { void Reset() override; + bool IsLogProb() override { return true; } + void Dim(); void LoadModel(const std::string& model_path_w_prefix); diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index fb9fec23..0c5aed54 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -98,6 +98,7 @@ int main(int argc, char* argv[]) { // } int32 frame_idx = 0; + int vocab_dim = 0; std::vector> prob_vec; std::vector> encoder_out_vec; int32 ori_feature_len = feature.NumRows(); @@ -138,17 +139,17 @@ int main(int argc, char* argv[]) { } // get nnet outputs - vector prob; - while (decodable->FrameLikelihood(frame_idx, &prob)) { - kaldi::Vector vec_tmp(prob.size()); - std::memcpy(vec_tmp.Data(), - prob.data(), - sizeof(kaldi::BaseFloat) * prob.size()); + kaldi::Timer timer; + kaldi::Vector logprobs; + bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim); + CHECK(isok == true); + for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; row_idx ++) { + kaldi::Vector vec_tmp(vocab_dim); + std::memcpy(vec_tmp.Data(), logprobs.Data() + row_idx*vocab_dim, sizeof(kaldi::BaseFloat) * vocab_dim); prob_vec.push_back(vec_tmp); - frame_idx++; } - + VLOG(2) << "frame_idx: " << frame_idx << " elapsed: " << timer.Elapsed() << " sec."; } // get encoder out @@ -196,8 +197,9 @@ int main(int argc, char* argv[]) { ++num_done; } + double elapsed = timer.Elapsed(); - LOG(INFO) << " cost:" << elapsed << " sec"; + LOG(INFO) << "Program cost:" << elapsed << " sec"; LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; From 5c8725e8cdc25b9fe7e697f1cde0b79449f8a652 Mon Sep 17 00:00:00 2001 From: Hui 
Zhang Date: Wed, 12 Oct 2022 06:34:09 +0000 Subject: [PATCH 10/60] unify model opts; add attention rescore in decodable; rename ds2 ctc beam search --- speechx/examples/codelab/decoder/run.sh | 2 +- speechx/examples/ds2_ol/aishell/run.sh | 4 +- speechx/examples/ds2_ol/aishell/run_fbank.sh | 4 +- speechx/speechx/decoder/CMakeLists.txt | 2 +- .../speechx/decoder/ctc_beam_search_decoder.h | 2 + ...ain.cc => ctc_beam_search_decoder_main.cc} | 2 +- .../speechx/decoder/ctc_prefix_beam_search.cc | 0 speechx/speechx/decoder/param.h | 1 + speechx/speechx/nnet/decodable.cc | 6 +++ speechx/speechx/nnet/decodable.h | 31 ++++++----- speechx/speechx/nnet/ds2_nnet.h | 51 +++---------------- speechx/speechx/nnet/nnet_itf.h | 51 +++++++++++++++++++ speechx/speechx/nnet/u2_nnet.cc | 2 +- speechx/speechx/nnet/u2_nnet.h | 24 ++------- speechx/speechx/nnet/u2_nnet_main.cc | 2 +- 15 files changed, 96 insertions(+), 88 deletions(-) rename speechx/speechx/decoder/{ctc_prefix_beam_search_decoder_main.cc => ctc_beam_search_decoder_main.cc} (99%) create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search.cc diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh index a911eb03..1a9e3cd7 100755 --- a/speechx/examples/codelab/decoder/run.sh +++ b/speechx/examples/codelab/decoder/run.sh @@ -69,7 +69,7 @@ compute_linear_spectrogram_main \ echo "compute linear spectrogram feature." 
# run ctc beam search decoder as streaming -ctc_prefix_beam_search_decoder_main \ +ctc_beam_search_decoder_main \ --result_wspecifier=ark,t:$exp_dir/result.txt \ --feature_rspecifier=ark:$feat_wspecifier \ --model_path=$model_dir/avg_1.jit.pdmodel \ diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index a29be17b..e5fccc03 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -103,7 +103,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 72072835..88ed6287 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ @@ -102,7 +102,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \ - 
ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 1df93511..8d04a997 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -12,7 +12,7 @@ add_library(decoder STATIC target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) set(BINS - ctc_prefix_beam_search_decoder_main + ctc_beam_search_decoder_main nnet_logprob_decoder_main recognizer_main tlg_decoder_main diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 9d0a5d14..19dbf2f6 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// used by deepspeech2 + #include "base/common.h" #include "decoder/ctc_decoders/path_trie.h" #include "decoder/ctc_decoders/scorer.h" diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc similarity index 99% rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc rename to speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index 445f470f..7e245e9b 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// todo refactor, repalce with gtest +// used by deepspeech2 #include "base/flags.h" #include "base/log.h" diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search.cc b/speechx/speechx/decoder/ctc_prefix_beam_search.cc new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index ed895aed..8a5990dc 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -67,6 +67,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 1483949b..b76c6280 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -157,4 +157,10 @@ void Decodable::Reset() { nnet_out_cache_.Resize(0, 0); } +void Decodable::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score){ + nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); +} + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 1ee6afbf..bfb75067 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -30,23 +30,31 @@ class Decodable : public kaldi::DecodableInterface { // void Init(DecodableOpts config); - // nnet logprob output + // nnet logprob output, used by wfst virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); + // nnet output + virtual bool FrameLikelihood(int32 frame, + std::vector* likelihood); + + // forward nnet with feats + bool AdvanceChunk(); + + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim); + + void 
AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score); + virtual bool IsLastFrame(int32 frame); // nnet output dim, e.g. vocab size virtual int32 NumIndices() const; - // nnet prob output - virtual bool FrameLikelihood(int32 frame, - std::vector* likelihood); - virtual int32 NumFramesReady() const; - // for offline test - void Acceptlikelihood(const kaldi::Matrix& likelihood); - void Reset(); bool IsInputFinished() const { return frontend_->IsFinished(); } @@ -57,11 +65,8 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr Nnet() { return nnet_; } - // forward nnet with feats - bool AdvanceChunk(); - // forward nnet with feats, and get nnet output - bool AdvanceChunk(kaldi::Vector* logprobs, - int* vocab_dim); + // for offline test + void Acceptlikelihood(const kaldi::Matrix& likelihood); private: std::shared_ptr frontend_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 9e2cb77b..cd1648b4 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -15,56 +15,11 @@ #include #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" #include "nnet/nnet_itf.h" #include "paddle_inference_api.h" namespace ppspeech { -struct ModelOptions { - std::string model_path; - std::string param_path; - int thread_num; // predictor thread pool size - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(2), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - 
opts->Register("model-param", ¶m_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - "paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); - } -}; template class Tensor { @@ -100,6 +55,12 @@ class PaddleNnet : public NnetInterface { const int32& feature_dim, NnetOut* out) override; + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override { + VLOG(2) << "deepspeech2 not has AttentionRescoring."; + } + void Dim(); void Reset() override; diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index d05aabea..2e21ff9b 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -18,9 +18,56 @@ #include "base/basic_types.h" #include "kaldi/base/kaldi-types.h" #include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" namespace ppspeech { + +struct ModelOptions { + std::string model_path; + std::string param_path; + int thread_num; // predictor thread pool size for ds2; + bool use_gpu; + bool switch_ir_optim; + std::string input_names; + std::string output_names; + std::string cache_names; + std::string cache_shape; + bool enable_fc_padding; + bool enable_profile; + ModelOptions() + : model_path(""), + param_path(""), + thread_num(1), + use_gpu(false), + input_names(""), + output_names(""), + cache_names(""), + cache_shape(""), + switch_ir_optim(false), + enable_fc_padding(false), 
+ enable_profile(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("model-path", &model_path, "model file path"); + opts->Register("model-param", ¶m_path, "params model file path"); + opts->Register("thread-num", &thread_num, "thread num"); + opts->Register("use-gpu", &use_gpu, "if use gpu"); + opts->Register("input-names", &input_names, "paddle input names"); + opts->Register("output-names", &output_names, "paddle output names"); + opts->Register("cache-names", &cache_names, "cache names"); + opts->Register("cache-shape", &cache_shape, "cache shape"); + opts->Register("switch-ir-optiom", + &switch_ir_optim, + "paddle SwitchIrOptim option"); + opts->Register("enable-fc-padding", + &enable_fc_padding, + "paddle EnableFCPadding option"); + opts->Register( + "enable-profile", &enable_profile, "paddle EnableProfile option"); + } +}; + struct NnetOut { // nnet out. maybe logprob or prob. Almost time this is logprob. kaldi::Vector logprobs; @@ -45,6 +92,10 @@ class NnetInterface { const int32& feature_dim, NnetOut* out) = 0; + virtual void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) = 0; + // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. 
virtual void Reset() = 0; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 74f8cf78..71252477 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -166,7 +166,7 @@ void U2Nnet::Warmup() { Reset(); } -U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) { +U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) { LoadModel(opts_.model_path); } diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 4ecbac26..1bac652e 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -17,28 +17,14 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" -#include "nnet/nnet_itf.h" +#include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" #include "paddle/phi/api/all.h" namespace ppspeech { -struct U2ModelOptions { - std::string model_path; - int thread_num; - bool use_gpu; - U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - } -}; - class U2NnetBase : public NnetInterface { public: @@ -65,10 +51,6 @@ class U2NnetBase : public NnetInterface { std::vector* ctc_probs, int32* vocab_dim); - virtual void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) = 0; - protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, @@ -102,7 +84,7 @@ class U2NnetBase : public NnetInterface { class U2Nnet : public U2NnetBase { public: - U2Nnet(const U2ModelOptions& opts); + U2Nnet(const ModelOptions& opts); U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, @@ -143,7 +125,7 @@ class U2Nnet : public U2NnetBase { std::vector>* encoder_out) const; private: - 
U2ModelOptions opts_; + ModelOptions opts_; phi::Place dev_; std::shared_ptr model_{nullptr}; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 0c5aed54..2dd1fa0d 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); - ppspeech::U2ModelOptions model_opts; + ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; int32 chunk_size = From bc1b6c2e7c2e9c61702f60d4dd44a101e79da679 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 09:00:54 +0000 Subject: [PATCH 11/60] refactor ctc opts, extract decoder interface, add ctc beamsearch score --- speechx/examples/ds2_ol/aishell/run.sh | 2 +- speechx/examples/ds2_ol/aishell/run_fbank.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 2 +- .../decoder/ctc_beam_search_decoder.cc | 10 +-- .../speechx/decoder/ctc_beam_search_decoder.h | 69 ++++++---------- speechx/speechx/decoder/ctc_beam_search_opt.h | 78 +++++++++++++++++++ .../speechx/decoder/ctc_prefix_beam_search.cc | 0 .../decoder/ctc_prefix_beam_search_decoder.cc | 13 ++++ .../decoder/ctc_prefix_beam_search_decoder.h | 64 +++++++++++++++ .../decoder/ctc_prefix_beam_search_score.h | 68 ++++++++++++++++ speechx/speechx/decoder/ctc_tlg_decoder.cc | 12 +-- speechx/speechx/decoder/ctc_tlg_decoder.h | 29 ++++--- ...ecoder_main.cc => ctc_tlg_decoder_main.cc} | 10 ++- speechx/speechx/decoder/decoder_itf.h | 56 +++++++++++++ speechx/speechx/nnet/decodable.h | 8 +- speechx/speechx/nnet/ds2_nnet.h | 6 +- 16 files changed, 351 insertions(+), 78 deletions(-) create mode 100644 speechx/speechx/decoder/ctc_beam_search_opt.h delete mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search.cc create mode 100644 
speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_score.h rename speechx/speechx/decoder/{tlg_decoder_main.cc => ctc_tlg_decoder_main.cc} (99%) create mode 100644 speechx/speechx/decoder/decoder_itf.h diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index e5fccc03..794b533f 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -135,7 +135,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 88ed6287..1c3c3e01 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -133,7 +133,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 8d04a997..20e93523 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -15,7 +15,7 @@ set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main recognizer_main - tlg_decoder_main + ctc_tlg_decoder_main ) foreach(bin_name IN LISTS BINS) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc 
b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 5a12c0b5..ff3298b2 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/ctc_beam_search_decoder.h" -#include "base/basic_types.h" +#include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" +#include "decoder/ctc_beam_search_decoder.h" #include "utils/file_utils.h" namespace ppspeech { @@ -26,7 +26,7 @@ using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts), init_ext_scorer_(nullptr), - blank_id_(-1), + blank_id_(opts.blank), space_id_(-1), num_frame_decoded_(0), root_(nullptr) { @@ -43,9 +43,9 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - blank_id_ = 0; - auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); + CHECK(blank_id_==0); + auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary if ((size_t)space_id_ >= vocabulary_.size()) { diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 19dbf2f6..e36eb4a0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -14,67 +14,48 @@ // used by deepspeech2 -#include "base/common.h" +#pragma once + +#include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_decoders/path_trie.h" #include "decoder/ctc_decoders/scorer.h" -#include "kaldi/decoder/decodable-itf.h" -#include "util/parse-options.h" - -#pragma once +#include "decoder/decoder_itf.h" namespace ppspeech { -struct CTCBeamSearchOptions { - std::string dict_file; - std::string lm_path; - BaseFloat alpha; - BaseFloat beta; - 
BaseFloat cutoff_prob; - int beam_size; - int cutoff_top_n; - int num_proc_bsearch; - CTCBeamSearchOptions() - : dict_file("vocab.txt"), - lm_path(""), - alpha(1.9f), - beta(5.0), - beam_size(300), - cutoff_prob(0.99f), - cutoff_top_n(40), - num_proc_bsearch(10) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("dict", &dict_file, "dict file "); - opts->Register("lm-path", &lm_path, "language model file"); - opts->Register("alpha", &alpha, "alpha"); - opts->Register("beta", &beta, "beta"); - opts->Register( - "beam-size", &beam_size, "beam size for beam search method"); - opts->Register("cutoff-prob", &cutoff_prob, "cutoff probs"); - opts->Register("cutoff-top-n", &cutoff_top_n, "cutoff top n"); - opts->Register( - "num-proc-bsearch", &num_proc_bsearch, "num proc bsearch"); - } -}; - -class CTCBeamSearch { +class CTCBeamSearch : public DecoderInterface { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} + void InitDecoder(); + + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + std::string GetFinalBestPath(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + void Decode(std::shared_ptr decodable); + std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); + + int NumFrameDecoded(); + int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); private: void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, const std::pair& log_prob_idx, const BaseFloat& min_cutoff); @@ -93,4 +74,4 @@ class CTCBeamSearch { DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h new file mode 100644 index 00000000..dcb62258 --- /dev/null 
+++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "base/common.h" +#include "util/parse-options.h" + +#pragma once + +namespace ppspeech { + +struct CTCBeamSearchOptions { + // common + int blank; + + // ds2 + std::string dict_file; + std::string lm_path; + int beam_size; + BaseFloat alpha; + BaseFloat beta; + BaseFloat cutoff_prob; + int cutoff_top_n; + int num_proc_bsearch; + + // u2 + int first_beam_size; + int second_beam_size; + CTCBeamSearchOptions() + : blank(0), + dict_file("vocab.txt"), + lm_path(""), + alpha(1.9f), + beta(5.0), + beam_size(300), + cutoff_prob(0.99f), + cutoff_top_n(40), + num_proc_bsearch(10), + first_beam_size(10), + second_beam_size(10) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "Ds2BeamSearchConfig: "; + opts->Register("dict", &dict_file, module + "vocab file path."); + opts->Register( + "lm-path", &lm_path, module + "ngram language model path."); + opts->Register("alpha", &alpha, module + "alpha"); + opts->Register("beta", &beta, module + "beta"); + opts->Register("beam-size", + &beam_size, + module + "beam size for beam search method"); + opts->Register("cutoff-prob", &cutoff_prob, module + "cutoff probs"); + opts->Register("cutoff-top-n", &cutoff_top_n, module + "cutoff top n"); + opts->Register( + "num-proc-bsearch", &num_proc_bsearch, module + "num proc 
bsearch"); + + opts->Register("blank", &blank, "blank id, default is 0."); + + module = "U2BeamSearchConfig: "; + opts->Register( + "first-beam-size", &first_beam_size, module + "first beam size."); + opts->Register("second-beam-size", + &second_beam_size, + module + "second beam size."); + } +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search.cc b/speechx/speechx/decoder/ctc_prefix_beam_search.cc deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc new file mode 100644 index 00000000..0544a1e2 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h new file mode 100644 index 00000000..745c4a83 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/decoder_itf.h" + +#include "kaldi/decoder/decodable-itf.h" + +namespace ppspeech { + +class CTCPrefixBeamSearch : public DecoderInterface { + public: + explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); + ~CTCPrefixBeamSearch() {} + + void InitDecoder(); + + void Decode(std::shared_ptr decodable); + + std::string GetBestPath(); + + std::vector> GetNBestPath(); + + std::string GetFinalBestPath(); + + int NumFrameDecoded(); + + int DecodeLikelihoods(const std::vector>& probs, + std::vector& nbest_words); + + void AdvanceDecode( + const std::shared_ptr& decodable); + void Reset(); + + private: + void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, + const std::pair& log_prob_idx, + const BaseFloat& min_cutoff); + void CalculateApproxScore(); + void LMRescore(); + void AdvanceDecoding(const std::vector>& probs); + + CTCBeamSearchOptions opts_; + size_t blank_id_; + int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); +}; + +} // namespace basr \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h new file mode 100644 index 00000000..19423b5e --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -0,0 +1,68 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" +#include "utils/math.h" + +namespace ppspeech { + +struct PrefxiScore { + // decoding, unit in log scale + float b = -kFloatMax; // blank ending score + float nb = -kFloatMax; // none-blank ending score + + // timestamp, unit in log sclae + float v_b = -kFloatMax; // viterbi blank ending score + float v_nb = -kFloatMax; // niterbi none-blank ending score + float cur_token_prob = -kFloatMax; // prob of current token + std::vector times_b; // times of viterbi blank path + std::vector times_nb; // times of viterbi non-blank path + + // context state + bool has_context = false; + int context_state = 0; + float context_score = 0; + + // decoding score, sum + float Score() const { return LogSumExp(b, nb); } + + // decodign score with context bias + float TotalScore() const { return Score() + context_score; } + + // timestamp score, max + float ViterbiScore() const { return std::max(v_b, v_nb); } + + // get timestamp + const std::vector& Times() const { + return v_b > v_nb ? 
times_b : times_nb; + } +}; + +struct PrefixScoreHash { + // https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector + std::size_t operator()(const std::vector& prefix) const { + std::size_t seed = prefix.size(); + for (auto& i : prefix) { + seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +using PrefixWithScoreType = std::pair, PrefixScoreHash>; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 712d27dd..de97f6ad 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -22,24 +22,24 @@ TLGDecoder::TLGDecoder(TLGDecoderOptions opts) { fst::SymbolTable::ReadText(opts.word_symbol_table)); decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts)); decoder_->InitDecoding(); - frame_decoded_size_ = 0; + num_frame_decoded_ = 0; } void TLGDecoder::InitDecoder() { decoder_->InitDecoding(); - frame_decoded_size_ = 0; + num_frame_decoded_ = 0; } void TLGDecoder::AdvanceDecode( const std::shared_ptr& decodable) { - while (!decodable->IsLastFrame(frame_decoded_size_)) { + while (!decodable->IsLastFrame(num_frame_decoded_)) { AdvanceDecoding(decodable.get()); } } void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { decoder_->AdvanceDecoding(decodable, 1); - frame_decoded_size_++; + num_frame_decoded_++; } void TLGDecoder::Reset() { @@ -48,7 +48,7 @@ void TLGDecoder::Reset() { } std::string TLGDecoder::GetPartialResult() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call // BestPathEnd if no frames were decoded.") return std::string(""); @@ -68,7 +68,7 @@ std::string TLGDecoder::GetPartialResult() { } std::string TLGDecoder::GetFinalBestPath() { - if (frame_decoded_size_ == 0) { + if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You 
cannot call // BestPathEnd if no frames were decoded.") return std::string(""); diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 1ac46ac6..f2282cb8 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -14,8 +14,9 @@ #pragma once -#include "base/basic_types.h" -#include "kaldi/decoder/decodable-itf.h" +#include "base/common.h" +#include "decoder/decoder_itf.h" + #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" @@ -30,21 +31,31 @@ struct TLGDecoderOptions { TLGDecoderOptions() : word_symbol_table(""), fst_path("") {} }; -class TLGDecoder { +class TLGDecoder : public DecoderInterface { public: explicit TLGDecoder(TLGDecoderOptions opts); + ~TLGDecoder() = default; + void InitDecoder(); + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + + std::string GetFinalBestPath(); + std::string GetPartialResult(); + + void Decode(); + std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); - std::string GetPartialResult(); + int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); + private: void AdvanceDecoding(kaldi::DecodableInterface* decodable); @@ -53,7 +64,7 @@ class TLGDecoder { std::shared_ptr> fst_; std::shared_ptr word_symbol_table_; // the frame size which have decoded starts from 0. 
- int32 frame_decoded_size_; + int32 num_frame_decoded_; }; diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc similarity index 99% rename from speechx/speechx/decoder/tlg_decoder_main.cc rename to speechx/speechx/decoder/ctc_tlg_decoder_main.cc index b633022a..cd1249d8 100644 --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -14,13 +14,15 @@ // todo refactor, repalce with gtest -#include "base/flags.h" -#include "base/log.h" -#include "decoder/ctc_tlg_decoder.h" +#include "base/common.h" + #include "frontend/audio/data_cache.h" -#include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +#include "decoder/ctc_tlg_decoder.h" + +#include "kaldi/util/table-types.h" + DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h new file mode 100644 index 00000000..01061939 --- /dev/null +++ b/speechx/speechx/decoder/decoder_itf.h @@ -0,0 +1,56 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "base/common.h" +#include "kaldi/decoder/decodable-itf.h" + +namespace ppspeech { + +class DecoderInterface { + public: + virtual ~DecoderInterface() {} + + virtual void InitDecoder() = 0; + + virtual void Reset() = 0; + + virtual void AdvanceDecode( + const std::shared_ptr& decodable) = 0; + + + virtual std::string GetFinalBestPath() = 0; + + virtual std::string GetPartialResult() = 0; + + // void Decode(); + + // std::string GetBestPath(); + // std::vector> GetNBestPath(); + + // int NumFrameDecoded(); + // int DecodeLikelihoods(const std::vector>& probs, + // std::vector& nbest_words); + + + private: + // void AdvanceDecoding(kaldi::DecodableInterface* decodable); + + // current decoding frame number + int32 num_frame_decoded_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index bfb75067..70a16e2c 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -39,14 +39,14 @@ class Decodable : public kaldi::DecodableInterface { // forward nnet with feats bool AdvanceChunk(); - + // forward nnet with feats, and get nnet output bool AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim); - + void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score); + float reverse_weight, + std::vector* rescoring_score); virtual bool IsLastFrame(int32 frame); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index cd1648b4..e8a49c7d 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -56,9 +56,9 @@ class PaddleNnet : public NnetInterface { NnetOut* out) override; void AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score) override { - VLOG(2) << "deepspeech2 not has AttentionRescoring."; + float reverse_weight, + std::vector* rescoring_score) override { + VLOG(2) << "deepspeech2 not 
has AttentionRescoring."; } void Dim(); From 3c3aa6b59421f8f911247cd667426095f2298d58 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Oct 2022 12:31:20 +0000 Subject: [PATCH 12/60] simple ctc prefix beam search compile ok --- speechx/speechx/base/common.h | 2 + speechx/speechx/base/macros.h | 3 +- speechx/speechx/decoder/CMakeLists.txt | 3 +- .../decoder/ctc_beam_search_decoder.cc | 12 +- .../speechx/decoder/ctc_beam_search_decoder.h | 3 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 304 ++++++++++++++++++ .../decoder/ctc_prefix_beam_search_decoder.h | 70 +++- .../decoder/ctc_prefix_beam_search_score.h | 50 ++- speechx/speechx/decoder/ctc_tlg_decoder.h | 2 - speechx/speechx/decoder/decoder_itf.h | 3 +- speechx/speechx/utils/math.cc | 4 +- 11 files changed, 406 insertions(+), 50 deletions(-) diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 70b11b69..b470b9de 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 14332a80..faf39373 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -25,8 +25,7 @@ namespace ppspeech { void operator=(const TypeName&) = delete #endif -constexpr float kFloatMax = std::numeric_limits::max(); - +// kSpaceSymbol in UTF-8 is: ▁ const std::string kSpaceSymbol = "\xe2\x96\x81"; } // namespace ppspeech diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 20e93523..b08aaba5 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -2,10 +2,11 @@ project(decoder) include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) add_library(decoder STATIC - ctc_beam_search_decoder.cc ctc_decoders/decoder_utils.cpp 
ctc_decoders/path_trie.cpp ctc_decoders/scorer.cpp + ctc_beam_search_decoder.cc + ctc_prefix_beam_search_decoder.cc ctc_tlg_decoder.cc recognizer.cc ) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index ff3298b2..76342b87 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -26,9 +26,7 @@ using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts), init_ext_scorer_(nullptr), - blank_id_(opts.blank), space_id_(-1), - num_frame_decoded_(0), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { @@ -43,7 +41,7 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(blank_id_==0); + CHECK(opts_.blank==0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); @@ -167,7 +165,7 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { continue; } min_cutoff = prefixes_[num_prefixes_ - 1]->score + - std::log(prob[blank_id_]) - + std::log(prob[opts_.blank]) - std::max(0.0, init_ext_scorer_->beta); full_beam = (num_prefixes_ == beam_size); @@ -195,9 +193,9 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { for (size_t i = beam_size; i < prefixes_.size(); ++i) { prefixes_[i]->remove(); } - } // if + } // end if num_frame_decoded_++; - } // for probs_seq + } // end for probs_seq } int32 CTCBeamSearch::SearchOneChar( @@ -215,7 +213,7 @@ int32 CTCBeamSearch::SearchOneChar( break; } - if (c == blank_id_) { + if (c == opts_.blank) { prefix->log_prob_b_cur = log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score); continue; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index e36eb4a0..516f8b2c 100644 --- 
a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -66,11 +66,10 @@ class CTCBeamSearch : public DecoderInterface { CTCBeamSearchOptions opts_; std::shared_ptr init_ext_scorer_; // todo separate later std::vector vocabulary_; // todo remove later - size_t blank_id_; int space_id_; std::shared_ptr root_; std::vector prefixes_; - int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 0544a1e2..fd689023 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -11,3 +11,307 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + + +#include "base/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "utils/math.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif + +namespace ppspeech { + +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) + : opts_(opts) { + InitDecoder(); +} + +void CTCPrefixBeamSearch::InitDecoder() { + num_frame_decoded_ = 0; + + cur_hyps_.clear(); + + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); + + abs_time_step_ = 0; + + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.b = 0.0f; // log(1) + prefix_score.nb = -kBaseFloatMax; // log(0) + prefix_score.v_b = 0.0f; // log(1) + prefix_score.v_nb = 0.0f; // log(1) + cur_hyps_[empty] = prefix_score; + + outputs_.emplace_back(empty); + 
hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); + +} + +void CTCPrefixBeamSearch::Reset() { + InitDecoder(); +} + +void CTCPrefixBeamSearch::Decode( + std::shared_ptr decodable) { + return; +} + +int32 CTCPrefixBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } + + +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + // const std::vector& start_boundaries = prefix.second.start_boundaries; + // const std::vector& end_boundaries = prefix.second.end_boundaries; + + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + // if (s < start_boundaries.size() && i == start_boundaries[s]){ + // // + // output.emplace_back(context_graph_->start_tag_id()); + // ++s; + // } + + output.emplace_back(input[i]); + + // if (e < end_boundaries.size() && i == end_boundaries[e]){ + // // + // output.emplace_back(context_graph_->end_tag_id()); + // ++e; + // } + } + + outputs_.emplace_back(output); +} + + +void CTCPrefixBeamSearch::AdvanceDecode( + const std::shared_ptr& decodable) { + while (1) { + std::vector frame_prob; + bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); + if (flag == false) break; + std::vector> likelihood; + likelihood.push_back(frame_prob); + AdvanceDecoding(likelihood); + } +} + +static bool PrefixScoreCompare( + const std::pair, PrefixScore>& a, + const std::pair, PrefixScore>& b) { + // log domain + return a.second.TotalScore() > b.second.TotalScore(); +} + + +void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector>& logp) { +#ifdef USE_PROFILING + RecordEvent event( + "CtcPrefixBeamSearch::AdvanceDecoding", TracerEventType::UserDefined, 1); +#endif + + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { + const 
std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + + // 2. token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert + // PrefixScore(-inf, -inf) by default, since the default constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. + + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. 
+ next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = abs_time_step_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(abs_time_step_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(abs_time_step_); + } + + // Prefix changed, calculate the context Score. 
+ if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr(next_hyps.begin(), + next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + + num_frame_decoded_++; + } // end for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) +} + + +void CTCPrefixBeamSearch::UpdateHypotheses( + const std::vector, PrefixScore>>& hyps) { + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } +} + +void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } + + +void CTCPrefixBeamSearch::UpdateFinalContext() { + if (context_graph_ == nullptr) return; + assert(hypotheses_.size() == cur_hyps_.size()); + assert(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. 
+ for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + // prefix_score.UpdateContext(context_graph_, prefix_score, 0, + // prefix.size()); + } + } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 745c4a83..b67733e8 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -18,10 +18,8 @@ #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" -#include "kaldi/decoder/decodable-itf.h" - namespace ppspeech { - +class ContextGraph; class CTCPrefixBeamSearch : public DecoderInterface { public: explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); @@ -29,36 +27,74 @@ class CTCPrefixBeamSearch : public DecoderInterface { void InitDecoder(); + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + + std::string GetFinalBestPath(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + void Decode(std::shared_ptr decodable); std::string GetBestPath(); std::vector> GetNBestPath(); - std::string GetFinalBestPath(); int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } + + const std::vector>& Inputs() const { return hypotheses_; } + + const std::vector>& Outputs() const { return outputs_; } + + const std::vector& Likelihood() const { return likelihood_; } + const std::vector>& Times() const 
{ return times_; } + private: - void ResetPrefixes(); - int32 SearchOneChar(const bool& full_beam, - const std::pair& log_prob_idx, - const BaseFloat& min_cutoff); - void CalculateApproxScore(); - void LMRescore(); - void AdvanceDecoding(const std::vector>& probs); + void AdvanceDecoding(const std::vector>& logp); + + void FinalizeSearch(); + void UpdateOutputs(const std::pair, PrefixScore>& prefix); + void UpdateHypotheses( + const std::vector, PrefixScore>>& prefix); + void UpdateFinalContext(); + + + private: CTCBeamSearchOptions opts_; - size_t blank_id_; - int num_frame_decoded_; + + int abs_time_step_ = 0; + + std::unordered_map, PrefixScore, PrefixScoreHash> + cur_hyps_; + + // n-best list and corresponding likelihood, in sorted order + std::vector> hypotheses_; + std::vector likelihood_; + + std::vector> times_; + std::vector viterbi_likelihood_; + + // Outputs contain the hypotheses_ and tags lik: and + std::vector> outputs_; + + std::shared_ptr context_graph_ = nullptr; + DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 19423b5e..da2fb80a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -20,35 +20,55 @@ namespace ppspeech { -struct PrefxiScore { +class ContextGraph; + +struct PrefixScore { // decoding, unit in log scale - float b = -kFloatMax; // blank ending score - float nb = -kFloatMax; // none-blank ending score + float b = -kBaseFloatMax; // blank ending score + float nb = -kBaseFloatMax; // none-blank ending score + + // decoding score, sum + float Score() const { return LogSumExp(b, nb); } // timestamp, unit in log sclae - float v_b = -kFloatMax; // viterbi blank ending score - float v_nb = -kFloatMax; // niterbi none-blank ending score 
- float cur_token_prob = -kFloatMax; // prob of current token - std::vector times_b; // times of viterbi blank path - std::vector times_nb; // times of viterbi non-blank path + float v_b = -kBaseFloatMax; // viterbi blank ending score + float v_nb = -kBaseFloatMax; // niterbi none-blank ending score + float cur_token_prob = -kBaseFloatMax; // prob of current token + std::vector times_b; // times of viterbi blank path + std::vector times_nb; // times of viterbi non-blank path + + + // timestamp score, max + float ViterbiScore() const { return std::max(v_b, v_nb); } + + // get timestamp + const std::vector& Times() const { + return v_b > v_nb ? times_b : times_nb; + } // context state bool has_context = false; int context_state = 0; float context_score = 0; + std::vector start_boundaries; + std::vector end_boundaries; - // decoding score, sum - float Score() const { return LogSumExp(b, nb); } // decodign score with context bias float TotalScore() const { return Score() + context_score; } - // timestamp score, max - float ViterbiScore() const { return std::max(v_b, v_nb); } + void CopyContext(const PrefixScore& prefix_score) { + context_state = prefix_score.context_state; + context_score = prefix_score.context_score; + start_boundaries = prefix_score.start_boundaries; + end_boundaries = prefix_score.end_boundaries; + } - // get timestamp - const std::vector& Times() const { - return v_b > v_nb ? 
times_b : times_nb; + void UpdateContext(const std::shared_ptr& constext_graph, + const PrefixScore& prefix_score, + int word_id, + int prefix_len) { + CHECK(false); } }; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f2282cb8..f3ecde73 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -63,8 +63,6 @@ class TLGDecoder : public DecoderInterface { std::shared_ptr decoder_; std::shared_ptr> fst_; std::shared_ptr word_symbol_table_; - // the frame size which have decoded starts from 0. - int32 num_frame_decoded_; }; diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index 01061939..1bbc6b11 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -31,7 +31,6 @@ class DecoderInterface { virtual void AdvanceDecode( const std::shared_ptr& decodable) = 0; - virtual std::string GetFinalBestPath() = 0; virtual std::string GetPartialResult() = 0; @@ -46,7 +45,7 @@ class DecoderInterface { // std::vector& nbest_words); - private: + protected: // void AdvanceDecoding(kaldi::DecodableInterface* decodable); // current decoding frame number diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 5087ac60..6a13f69b 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -28,8 +28,8 @@ namespace ppspeech { // Sum in log scale float LogSumExp(float x, float y) { - if (x <= -kFloatMax) return y; - if (y <= -kFloatMax) return x; + if (x <= -kBaseFloatMax) return y; + if (y <= -kBaseFloatMax) return x; float max = std::max(x, y); return max + std::log(std::exp(x - max) + std::exp(y - max)); } From 7dc9cba3be0706cb024f1d998c69b97a5d6816f3 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 13 Oct 2022 11:51:54 +0000 Subject: [PATCH 13/60] ctc prefix beam search for u2, test can run --- speechx/examples/codelab/u2/.gitignore | 1 + 
speechx/examples/codelab/u2/README.md | 1 + speechx/examples/codelab/u2/local/decode.sh | 22 + speechx/examples/codelab/u2/local/feat.sh | 27 + speechx/examples/codelab/u2/local/nnet.sh | 23 + .../examples/codelab/{u2nnet => u2}/path.sh | 3 +- .../examples/codelab/{u2nnet => u2}/run.sh | 27 +- speechx/examples/codelab/u2nnet/.gitignore | 3 - speechx/examples/codelab/u2nnet/README.md | 3 - speechx/examples/codelab/u2nnet/valgrind.sh | 21 - speechx/speechx/decoder/CMakeLists.txt | 13 +- .../decoder/ctc_beam_search_decoder.cc | 10 +- .../speechx/decoder/ctc_beam_search_decoder.h | 13 +- speechx/speechx/decoder/ctc_beam_search_opt.h | 65 +++ .../decoder/ctc_prefix_beam_search_decoder.cc | 519 ++++++++++-------- .../decoder/ctc_prefix_beam_search_decoder.h | 71 ++- .../ctc_prefix_beam_search_decoder_main.cc | 188 +++++++ .../decoder/ctc_prefix_beam_search_result.h | 41 ++ speechx/speechx/decoder/ctc_tlg_decoder.cc | 17 +- speechx/speechx/decoder/ctc_tlg_decoder.h | 23 +- speechx/speechx/decoder/decoder_itf.h | 22 +- speechx/speechx/nnet/u2_nnet_main.cc | 11 - speechx/speechx/utils/math.cc | 7 +- 23 files changed, 763 insertions(+), 368 deletions(-) create mode 100644 speechx/examples/codelab/u2/.gitignore create mode 100644 speechx/examples/codelab/u2/README.md create mode 100755 speechx/examples/codelab/u2/local/decode.sh create mode 100755 speechx/examples/codelab/u2/local/feat.sh create mode 100755 speechx/examples/codelab/u2/local/nnet.sh rename speechx/examples/codelab/{u2nnet => u2}/path.sh (84%) rename speechx/examples/codelab/{u2nnet => u2}/run.sh (54%) delete mode 100644 speechx/examples/codelab/u2nnet/.gitignore delete mode 100644 speechx/examples/codelab/u2nnet/README.md delete mode 100755 speechx/examples/codelab/u2nnet/valgrind.sh create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_result.h diff --git a/speechx/examples/codelab/u2/.gitignore 
b/speechx/examples/codelab/u2/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/speechx/examples/codelab/u2/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/codelab/u2/README.md b/speechx/examples/codelab/u2/README.md new file mode 100644 index 00000000..3c85dc91 --- /dev/null +++ b/speechx/examples/codelab/u2/README.md @@ -0,0 +1 @@ +# u2/u2pp Streaming Test diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh new file mode 100755 index 00000000..12297661 --- /dev/null +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --result_wspecifier=ark,t:$exp/result.ark + +echo "u2 ctc prefix beam search decode." diff --git a/speechx/examples/codelab/u2/local/feat.sh b/speechx/examples/codelab/u2/local/feat.sh new file mode 100755 index 00000000..1eec3aae --- /dev/null +++ b/speechx/examples/codelab/u2/local/feat.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +compute_fbank_main \ + --num_bins 80 \ + --wav_rspecifier=scp:$data/wav.scp \ + --cmvn_file=$exp/cmvn.ark \ + --feature_wspecifier=ark,t:$exp/fbank.ark + +echo "compute fbank feature." 
diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh new file mode 100755 index 00000000..78663e9c --- /dev/null +++ b/speechx/examples/codelab/u2/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." + diff --git a/speechx/examples/codelab/u2nnet/path.sh b/speechx/examples/codelab/u2/path.sh similarity index 84% rename from speechx/examples/codelab/u2nnet/path.sh rename to speechx/examples/codelab/u2/path.sh index 564e9fed..7f32fbce 100644 --- a/speechx/examples/codelab/u2nnet/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -12,8 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -SPEECHX_BIN=$SPEECHX_BUILD/nnet -export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/codelab/u2nnet/run.sh b/speechx/examples/codelab/u2/run.sh similarity index 54% rename from speechx/examples/codelab/u2nnet/run.sh rename to speechx/examples/codelab/u2/run.sh index 704653e7..d314262b 100755 --- a/speechx/examples/codelab/u2nnet/run.sh +++ b/speechx/examples/codelab/u2/run.sh @@ -36,29 +36,8 @@ ckpt_dir=./data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ -cmvn_json2kaldi_main \ - --json_file 
$model_dir/mean_std.json \ - --cmvn_write_path $exp/cmvn.ark \ - --binary=false +./local/feat.sh -echo "convert json cmvn to kaldi ark." +./local/nnet.sh -compute_fbank_main \ - --num_bins 80 \ - --wav_rspecifier=scp:$data/wav.scp \ - --cmvn_file=$exp/cmvn.ark \ - --feature_wspecifier=ark,t:$exp/fbank.ark - -echo "compute fbank feature." - -u2_nnet_main \ - --model_path=$model_dir/export.jit \ - --feature_rspecifier=ark,t:$exp/fbank.ark \ - --nnet_decoder_chunk=16 \ - --receptive_field_length=7 \ - --downsampling_rate=4 \ - --acoustic_scale=1.0 \ - --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ - --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark - -echo "u2 nnet decode." +./local/decode.sh diff --git a/speechx/examples/codelab/u2nnet/.gitignore b/speechx/examples/codelab/u2nnet/.gitignore deleted file mode 100644 index d6fe69bc..00000000 --- a/speechx/examples/codelab/u2nnet/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -data -exp -*log diff --git a/speechx/examples/codelab/u2nnet/README.md b/speechx/examples/codelab/u2nnet/README.md deleted file mode 100644 index 772a58f0..00000000 --- a/speechx/examples/codelab/u2nnet/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Deepspeech2 Streaming NNet Test - -Using for ds2 streaming nnet inference test. diff --git a/speechx/examples/codelab/u2nnet/valgrind.sh b/speechx/examples/codelab/u2nnet/valgrind.sh deleted file mode 100755 index a5aab663..00000000 --- a/speechx/examples/codelab/u2nnet/valgrind.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# this script is for memory check, so please run ./run.sh first. - -set +x -set -e - -. ./path.sh - -if [ ! 
-d ${SPEECHX_TOOLS}/valgrind/install ]; then - echo "please install valgrind in the speechx tools dir.\n" - exit 1 -fi - -ckpt_dir=./data/model -model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ - -valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ - ds2_model_test_main \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdparams diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index b08aaba5..8cf94a10 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -10,8 +10,9 @@ add_library(decoder STATIC ctc_tlg_decoder.cc recognizer.cc ) -target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) +target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) +# test set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main @@ -24,3 +25,13 @@ foreach(bin_name IN LISTS BINS) target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) endforeach() + + +# u2 +set(bin_name ctc_prefix_beam_search_decoder_main) +add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) +target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) +target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) +target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 76342b87..3f00ee35 100644 --- 
a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -82,8 +82,6 @@ void CTCBeamSearch::Decode( return; } -int32 CTCBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } - // todo rename, refactor void CTCBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { @@ -110,15 +108,19 @@ void CTCBeamSearch::ResetPrefixes() { int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, vector& nbest_words) { kaldi::Timer timer; - timer.Reset(); AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " << static_cast(timer.Elapsed()) / 1000.0f; return 0; } +vector> CTCBeamSearch::GetNBestPath(int n) { + int beam_size = n == -1 ? opts_.beam_size: std::min(n, opts_.beam_size); + return get_beam_search_result(prefixes_, vocabulary_, beam_size); +} + vector> CTCBeamSearch::GetNBestPath() { - return get_beam_search_result(prefixes_, vocabulary_, opts_.beam_size); + return GetNBestPath(-1); } string CTCBeamSearch::GetBestPath() { diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 516f8b2c..479754c3 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -35,6 +35,11 @@ class CTCBeamSearch : public DecoderInterface { void AdvanceDecode( const std::shared_ptr& decodable); + void Decode(std::shared_ptr decodable); + + std::string GetBestPath(); + std::vector> GetNBestPath(); + std::vector> GetNBestPath(int n); std::string GetFinalBestPath(); std::string GetPartialResult() { @@ -42,14 +47,6 @@ class CTCBeamSearch : public DecoderInterface { return {}; } - void Decode(std::shared_ptr decodable); - - std::string GetBestPath(); - std::vector> GetNBestPath(); - - - int NumFrameDecoded(); - int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h 
b/speechx/speechx/decoder/ctc_beam_search_opt.h index dcb62258..af92fad0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -19,6 +19,7 @@ namespace ppspeech { + struct CTCBeamSearchOptions { // common int blank; @@ -75,4 +76,68 @@ struct CTCBeamSearchOptions { } }; + +// used by u2 model +struct CTCBeamSearchDecoderOptions { + // chunk_size is the frame number of one chunk after subsampling. + // e.g. if subsample rate is 4 and chunk_size = 16, the frames in + // one chunk are 67=16*4 + 3, stride is 64=16*4 + int chunk_size; + int num_left_chunks; + + // final_score = rescoring_weight * rescoring_score + ctc_weight * + // ctc_score; + // rescoring_score = left_to_right_score * (1 - reverse_weight) + + // right_to_left_score * reverse_weight + // Please note the concept of ctc_scores + // in the following two search methods are different. For + // CtcPrefixBeamSerch, + // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a + // max(viterbi) path score + context score So we should carefully set + // ctc_weight accroding to the search methods. + float ctc_weight; + float rescoring_weight; + float reverse_weight; + + // CtcEndpointConfig ctc_endpoint_opts; + + CTCBeamSearchOptions ctc_prefix_search_opts; + + CTCBeamSearchDecoderOptions() + : chunk_size(16), + num_left_chunks(-1), + ctc_weight(0.5), + rescoring_weight(1.0), + reverse_weight(0.0) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "DecoderConfig: "; + opts->Register( + "chunk-size", + &chunk_size, + module + "the frame number of one chunk after subsampling."); + opts->Register("num-left-chunks", + &num_left_chunks, + module + "the left history chunks number."); + opts->Register("ctc-weight", + &ctc_weight, + module + + "ctc weight for rescore. 
final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("rescoring-weight", + &rescoring_weight, + module + + "attention score weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("reverse-weight", + &reverse_weight, + module + + "reverse decoder weight. rescoring_score = " + "left_to_right_score * (1 - reverse_weight) + " + "right_to_left_score * reverse_weight."); + } +}; + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index fd689023..f22bfea2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -1,3 +1,5 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Binbin Zhang (binbzha@qq.com) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +15,12 @@ // limitations under the License. 
+#include "decoder/ctc_prefix_beam_search_decoder.h" #include "base/common.h" #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" -#include "decoder/ctc_prefix_beam_search_decoder.h" #include "utils/math.h" +#include "absl/strings/str_join.h" #ifdef USE_PROFILING #include "paddle/fluid/platform/profiler.h" @@ -29,85 +32,47 @@ namespace ppspeech { CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) : opts_(opts) { - InitDecoder(); + Reset(); } -void CTCPrefixBeamSearch::InitDecoder() { +void CTCPrefixBeamSearch::Reset() { num_frame_decoded_ = 0; cur_hyps_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); - abs_time_step_ = 0; + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.b = 0.0f; // log(1) + prefix_score.nb = -kBaseFloatMax; // log(0) + prefix_score.v_b = 0.0f; // log(1) + prefix_score.v_nb = 0.0f; // log(1) + cur_hyps_[empty] = prefix_score; - // empty hyp with Score - std::vector empty; - PrefixScore prefix_score; - prefix_score.b = 0.0f; // log(1) - prefix_score.nb = -kBaseFloatMax; // log(0) - prefix_score.v_b = 0.0f; // log(1) - prefix_score.v_nb = 0.0f; // log(1) - cur_hyps_[empty] = prefix_score; + outputs_.emplace_back(empty); + hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); + } - outputs_.emplace_back(empty); - hypotheses_.emplace_back(empty); - likelihood_.emplace_back(prefix_score.TotalScore()); - times_.emplace_back(empty); - -} +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } -void CTCPrefixBeamSearch::Reset() { - InitDecoder(); -} - -void CTCPrefixBeamSearch::Decode( - std::shared_ptr decodable) { - return; -} - -int32 CTCPrefixBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 
1; } - - -void CTCPrefixBeamSearch::UpdateOutputs( - const std::pair, PrefixScore>& prefix) { - const std::vector& input = prefix.first; - // const std::vector& start_boundaries = prefix.second.start_boundaries; - // const std::vector& end_boundaries = prefix.second.end_boundaries; - - std::vector output; - int s = 0; - int e = 0; - for (int i = 0; i < input.size(); ++i) { - // if (s < start_boundaries.size() && i == start_boundaries[s]){ - // // - // output.emplace_back(context_graph_->start_tag_id()); - // ++s; - // } - - output.emplace_back(input[i]); - - // if (e < end_boundaries.size() && i == end_boundaries[e]){ - // // - // output.emplace_back(context_graph_->end_tag_id()); - // ++e; - // } - } - - outputs_.emplace_back(output); -} void CTCPrefixBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { while (1) { + // forward frame by frame std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); if (flag == false) break; + std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); @@ -117,201 +82,279 @@ void CTCPrefixBeamSearch::AdvanceDecode( static bool PrefixScoreCompare( const std::pair, PrefixScore>& a, const std::pair, PrefixScore>& b) { - // log domain - return a.second.TotalScore() > b.second.TotalScore(); + // log domain + return a.second.TotalScore() > b.second.TotalScore(); } -void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector>& logp) { +void CTCPrefixBeamSearch::AdvanceDecoding( + const std::vector>& logp) { #ifdef USE_PROFILING - RecordEvent event( - "CtcPrefixBeamSearch::AdvanceDecoding", TracerEventType::UserDefined, 1); + RecordEvent event("CtcPrefixBeamSearch::AdvanceDecoding", + TracerEventType::UserDefined, + 1); #endif - if (logp.size() == 0) return; - - int first_beam_size = - std::min(static_cast(logp[0].size()), opts_.first_beam_size); - - for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) { - const std::vector& logp_t = logp[t]; - 
std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; - - // 1. first beam prune, only select topk candidates - std::vector topk_score; - std::vector topk_index; - TopK(logp_t, first_beam_size, &topk_score, &topk_index); - - // 2. token passing - for (int i = 0; i < topk_index.size(); ++i) { - int id = topk_index[i]; - auto prob = topk_score[i]; - - for (const auto& it : cur_hyps_) { - const std::vector& prefix = it.first; - const PrefixScore& prefix_score = it.second; - - // If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert - // PrefixScore(-inf, -inf) by default, since the default constructor - // of PrefixScore will set fields b(blank ending Score) and - // nb(none blank ending Score) to -inf, respectively. - - if (id == opts_.blank) { - // case 0: *a + => *a, *a + => *a, prefix not - // change - PrefixScore& next_score = next_hyps[prefix]; - next_score.b = LogSumExp(next_score.b, prefix_score.Score() + prob); - - // timestamp, blank is slince, not effact timestamp - next_score.v_b = prefix_score.ViterbiScore() + prob; - next_score.times_b = prefix_score.Times(); - - // Prefix not changed, copy the context from pefix - if (context_graph_ && !next_score.has_context) { - next_score.CopyContext(prefix_score); - next_score.has_context = true; - } - - } else if (!prefix.empty() && id == prefix.back()) { - // case 1: *a + a => *a, prefix not changed - PrefixScore& next_score1 = next_hyps[prefix]; - next_score1.nb = LogSumExp(next_score1.nb, prefix_score.nb + prob); - - // timestamp, non-blank symbol effact timestamp - if (next_score1.v_nb < prefix_score.v_nb + prob) { - // compute viterbi Score - next_score1.v_nb = prefix_score.v_nb + prob; - if (next_score1.cur_token_prob < prob) { - // store max token prob - next_score1.cur_token_prob = prob; - // update this timestamp as token appeared here. 
- next_score1.times_nb = prefix_score.times_nb; - assert(next_score1.times_nb.size() > 0); - next_score1.times_nb.back() = abs_time_step_; - } - } - - // Prefix not changed, copy the context from pefix - if (context_graph_ && !next_score1.has_context) { - next_score1.CopyContext(prefix_score); - next_score1.has_context = true; - } - - // case 2: *a + a => *aa, prefix changed. - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score2 = next_hyps[new_prefix]; - next_score2.nb = LogSumExp(next_score2.nb, prefix_score.b + prob); - - // timestamp, non-blank symbol effact timestamp - if (next_score2.v_nb < prefix_score.v_b + prob) { - // compute viterbi Score - next_score2.v_nb = prefix_score.v_b + prob; - // new token added - next_score2.cur_token_prob = prob; - next_score2.times_nb = prefix_score.times_b; - next_score2.times_nb.emplace_back(abs_time_step_); - } - - // Prefix changed, calculate the context Score. - if (context_graph_ && !next_score2.has_context) { - next_score2.UpdateContext( - context_graph_, prefix_score, id, prefix.size()); - next_score2.has_context = true; - } - - } else { - // id != prefix.back() - // case 3: *a + b => *ab, *a +b => *ab - std::vector new_prefix(prefix); - new_prefix.emplace_back(id); - PrefixScore& next_score = next_hyps[new_prefix]; - next_score.nb = LogSumExp(next_score.nb, prefix_score.Score() + prob); - - // timetamp, non-blank symbol effact timestamp - if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { - next_score.v_nb = prefix_score.ViterbiScore() + prob; - - next_score.cur_token_prob = prob; - next_score.times_nb = prefix_score.Times(); - next_score.times_nb.emplace_back(abs_time_step_); - } - - // Prefix changed, calculate the context Score. 
- if (context_graph_ && !next_score.has_context) { - next_score.UpdateContext( - context_graph_, prefix_score, id, prefix.size()); - next_score.has_context = true; - } - } - } // end for (const auto& it : cur_hyps_) - } // end for (int i = 0; i < topk_index.size(); ++i) - - // 3. second beam prune, only keep top n best paths - std::vector, PrefixScore>> arr(next_hyps.begin(), - next_hyps.end()); - int second_beam_size = - std::min(static_cast(arr.size()), opts_.second_beam_size); - std::nth_element(arr.begin(), - arr.begin() + second_beam_size, - arr.end(), - PrefixScoreCompare); - arr.resize(second_beam_size); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - - // 4. update cur_hyps by next_hyps, and get new result - UpdateHypotheses(arr); - - num_frame_decoded_++; - } // end for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { + const std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> + next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + + // 2. token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will + // insert + // PrefixScore(-inf, -inf) by default, since the default + // constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. 
+ + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, + // prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = + LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = + LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. + next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = num_frame_decoded_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. 
+ std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = + LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = + LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. 
second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr( + next_hyps.begin(), next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + } // end for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) } void CTCPrefixBeamSearch::UpdateHypotheses( const std::vector, PrefixScore>>& hyps) { - cur_hyps_.clear(); - - outputs_.clear(); - hypotheses_.clear(); - likelihood_.clear(); - viterbi_likelihood_.clear(); - times_.clear(); - - for (auto& item : hyps) { - cur_hyps_[item.first] = item.second; - - UpdateOutputs(item); - hypotheses_.emplace_back(std::move(item.first)); - likelihood_.emplace_back(item.second.TotalScore()); - viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); - times_.emplace_back(item.second.Times()); - } + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } } -void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + const std::vector& start_boundaries = prefix.second.start_boundaries; + const std::vector& end_boundaries = prefix.second.end_boundaries; + + // add tag + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + // if (s < 
start_boundaries.size() && i == start_boundaries[s]){ + // // + // output.emplace_back(context_graph_->start_tag_id()); + // ++s; + // } + + output.emplace_back(input[i]); + + // if (e < end_boundaries.size() && i == end_boundaries[e]){ + // // + // output.emplace_back(context_graph_->end_tag_id()); + // ++e; + // } + } + outputs_.emplace_back(output); +} + +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); +} void CTCPrefixBeamSearch::UpdateFinalContext() { - if (context_graph_ == nullptr) return; - assert(hypotheses_.size() == cur_hyps_.size()); - assert(hypotheses_.size() == likelihood_.size()); - - // We should backoff the context Score/state when the context is - // not fully matched at the last time. - for (const auto& prefix : hypotheses_) { - PrefixScore& prefix_score = cur_hyps_[prefix]; - if (prefix_score.context_score != 0) { - // prefix_score.UpdateContext(context_graph_, prefix_score, 0, - // prefix.size()); + if (context_graph_ == nullptr) return; + + CHECK(hypotheses_.size() == cur_hyps_.size()); + CHECK(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. 
+ for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + prefix_score.UpdateContext(context_graph_, prefix_score, 0, + prefix.size()); + } } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + + std::string CTCPrefixBeamSearch::GetBestPath(int index) { + int n_hyps = Outputs().size(); + CHECK(n_hyps > 0); + CHECK(index < n_hyps); + std::vector one = Outputs()[index]; + return std::string(absl::StrJoin(one, kSpaceSymbol)); + } + + std::string CTCPrefixBeamSearch::GetBestPath() { + return GetBestPath(0); + } + + std::vector> CTCPrefixBeamSearch::GetNBestPath(int n) { + int hyps_size = hypotheses_.size(); + CHECK(hyps_size > 0); + + int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); + + std::vector> n_best; + n_best.reserve(min_n); + + for (int i = 0; i < min_n; i++){ + n_best.emplace_back(Likelihood()[i], GetBestPath(i) ); + } + return n_best; + } + + std::vector> CTCPrefixBeamSearch::GetNBestPath() { + return GetNBestPath(-1); } - std::vector, PrefixScore>> arr(cur_hyps_.begin(), - cur_hyps_.end()); - std::sort(arr.begin(), arr.end(), PrefixScoreCompare); - // Update cur_hyps_ and get new result - UpdateHypotheses(arr); +std::string CTCPrefixBeamSearch::GetFinalBestPath() { + return GetBestPath(); +} + +std::string CTCPrefixBeamSearch::GetPartialResult() { + return GetBestPath(); } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index b67733e8..ba44b0a2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -15,6 +15,7 @@ #pragma once #include 
"decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_result.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" @@ -25,48 +26,37 @@ class CTCPrefixBeamSearch : public DecoderInterface { explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} - void InitDecoder(); + void InitDecoder() override; - void Reset(); + void Reset() override; void AdvanceDecode( - const std::shared_ptr& decodable); + const std::shared_ptr& decodable) override; - std::string GetFinalBestPath(); + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; - std::string GetPartialResult() { - CHECK(false) << "Not implement."; - return {}; - } - - void Decode(std::shared_ptr decodable); - - std::string GetBestPath(); - - std::vector> GetNBestPath(); - - - int NumFrameDecoded(); - - int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + void FinalizeSearch(); - const std::vector& ViterbiLikelihood() const { - return viterbi_likelihood_; - } + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; const std::vector>& Inputs() const { return hypotheses_; } - const std::vector>& Outputs() const { return outputs_; } - const std::vector& Likelihood() const { return likelihood_; } + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } const std::vector>& Times() const { return times_; } private: - void AdvanceDecoding(const std::vector>& logp); + std::string GetBestPath(int index); - void FinalizeSearch(); + void AdvanceDecoding( + const std::vector>& logp); void UpdateOutputs(const std::pair, PrefixScore>& prefix); void UpdateHypotheses( @@ -77,8 +67,6 @@ class CTCPrefixBeamSearch : public DecoderInterface { private: CTCBeamSearchOptions opts_; - int abs_time_step_ = 0; - std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -97,4 +85,29 @@ class 
CTCPrefixBeamSearch : public DecoderInterface { DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; + +class CTCPrefixBeamSearchDecoder : public CTCPrefixBeamSearch { + public: + explicit CTCPrefixBeamSearchDecoder(const CTCBeamSearchDecoderOptions& opts) + : CTCPrefixBeamSearch(opts.ctc_prefix_search_opts), opts_(opts) {} + + ~CTCPrefixBeamSearchDecoder() {} + + private: + CTCBeamSearchDecoderOptions opts_; + + // cache feature + bool start_ = false; // false, this is first frame. + // for continues decoding + int num_frames_ = 0; + int global_frame_offset_ = 0; + const int time_stamp_gap_ = + 100; // timestamp gap between words in a sentence + + // std::unique_ptr ctc_endpointer_; + + int num_frames_in_current_chunk_ = 0; + std::vector result_; +}; + } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc new file mode 100644 index 00000000..8927a5f4 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/common.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/u2_nnet.h" +#include "absl/strings/str_split.h" +#include "fst/symbol-table.h" + +DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_string(vocab_path, "", "vocab path"); + +DEFINE_string(model_path, "", "paddle nnet model"); + +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=3) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=3) module downsampling rate."); + +DEFINE_int32(nnet_decoder_chunk, 16, "paddle nnet forward chunk"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +// test ds2 online decoder by feeding speech feature +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + + CHECK(FLAGS_result_wspecifier != ""); + CHECK(FLAGS_feature_rspecifier != ""); + CHECK(FLAGS_vocab_path != ""); + CHECK(FLAGS_model_path != ""); + LOG(INFO) << "model path: " << FLAGS_model_path; + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; + fst::SymbolTable* unit_table = fst::SymbolTable::ReadText(FLAGS_vocab_path); + + // nnet + ppspeech::ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + std::shared_ptr nnet( + new ppspeech::U2Nnet(model_opts)); + + // decodeable + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data)); + + // decoder + 
ppspeech::CTCBeamSearchDecoderOptions opts; + opts.chunk_size = 16; + opts.num_left_chunks = -1; + opts.ctc_weight = 0.5; + opts.rescoring_weight = 1.0; + opts.reverse_weight = 0.3; + opts.ctc_prefix_search_opts.blank = 0; + opts.ctc_prefix_search_opts.first_beam_size = 10; + opts.ctc_prefix_search_opts.second_beam_size = 10; + ppspeech::CTCPrefixBeamSearchDecoder decoder(opts); + + + int32 chunk_size = FLAGS_receptive_field_length + + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; + int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + + decoder.InitDecoder(); + + kaldi::Timer timer; + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + + int nframes = feature.NumRows(); + int feat_dim = feature.NumCols(); + raw_data->SetDim(feat_dim); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; + + raw_data->SetDim(feat_dim); + + int32 ori_feature_len = feature.NumRows(); + int32 num_chunks = feature.NumRows() / chunk_stride + 1; + LOG(INFO) << "num_chunks: " << num_chunks; + + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + int32 this_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + this_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (this_chunk_size < receptive_field_length) { + LOG(WARNING) << "utt: " << utt << " skip last " + << this_chunk_size << " frames, expect is " + << receptive_field_length; + break; + } + + + kaldi::Vector feature_chunk(this_chunk_size * + feat_dim); + int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < this_chunk_size; ++row_id) { + 
kaldi::SubVector feat_row(feature, start); + kaldi::SubVector feature_chunk_row( + feature_chunk.Data() + row_id * feat_dim, feat_dim); + + feature_chunk_row.CopyFromVec(feat_row); + ++start; + } + + // feat to frontend pipeline cache + raw_data->Accept(feature_chunk); + + // send data finish signal + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + + // forward nnet + decoder.AdvanceDecode(decodable); + } + + decoder.FinalizeSearch(); + + // get 1-best result + std::string result_ints = decoder.GetFinalBestPath(); + std::vector tokenids = absl::StrSplit(result_ints, ppspeech::kSpaceSymbol); + std::string result; + for (int i = 0; i < tokenids.size(); i++){ + result += unit_table->Find(std::stoi(tokenids[i])); + } + + // after process one utt, then reset state. + decodable->Reset(); + decoder.Reset(); + + if (result.empty()) { + // the TokenWriter can not write empty string. + ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << " the result of " << utt << " is " << result; + result_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + LOG(INFO) << "Program cost:" << elapsed << " sec"; + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h new file mode 100644 index 00000000..caa3e37e --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} +}; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index de97f6ad..4d0a21d5 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -18,16 +18,23 @@ namespace ppspeech { TLGDecoder::TLGDecoder(TLGDecoderOptions opts) { fst_.reset(fst::Fst::Read(opts.fst_path)); CHECK(fst_ != nullptr); + word_symbol_table_.reset( fst::SymbolTable::ReadText(opts.word_symbol_table)); + decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts)); + + Reset(); +} + +void TLGDecoder::Reset() { decoder_->InitDecoding(); num_frame_decoded_ = 0; + return; } void TLGDecoder::InitDecoder() { - decoder_->InitDecoding(); - num_frame_decoded_ = 0; + Reset(); } void TLGDecoder::AdvanceDecode( @@ -42,10 +49,7 @@ void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { num_frame_decoded_++; } -void TLGDecoder::Reset() { - InitDecoder(); - return; -} + std::string TLGDecoder::GetPartialResult() { if (num_frame_decoded_ == 0) { @@ -88,4 +92,5 @@ std::string 
TLGDecoder::GetFinalBestPath() { } return words; } + } diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f3ecde73..2f1d6c10 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -42,20 +42,27 @@ class TLGDecoder : public DecoderInterface { void AdvanceDecode( const std::shared_ptr& decodable); - - std::string GetFinalBestPath(); - std::string GetPartialResult(); - - void Decode(); - std::string GetBestPath(); - std::vector> GetNBestPath(); + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; - int NumFrameDecoded(); int DecodeLikelihoods(const std::vector>& probs, std::vector& nbest_words); + protected: + std::string GetBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath() override { + CHECK(false); + return {}; + } + std::vector> GetNBestPath(int n) override { + CHECK(false); + return {}; + } private: void AdvanceDecoding(kaldi::DecodableInterface* decodable); diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index 1bbc6b11..fe4e7408 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -28,27 +28,31 @@ class DecoderInterface { virtual void Reset() = 0; + // call AdvanceDecoding virtual void AdvanceDecode( const std::shared_ptr& decodable) = 0; + // call GetBestPath virtual std::string GetFinalBestPath() = 0; virtual std::string GetPartialResult() = 0; - // void Decode(); + protected: + // virtual void AdvanceDecoding(kaldi::DecodableInterface* decodable) = 0; - // std::string GetBestPath(); - // std::vector> GetNBestPath(); + // virtual void Decode() = 0; - // int NumFrameDecoded(); - // int DecodeLikelihoods(const std::vector>& probs, - // std::vector& nbest_words); + virtual std::string GetBestPath() = 0; + virtual std::vector> GetNBestPath() = 0; - protected: - // void 
AdvanceDecoding(kaldi::DecodableInterface* decodable); + virtual std::vector> GetNBestPath(int n) = 0; - // current decoding frame number + // start from one + int NumFrameDecoded() { return num_frame_decoded_ + 1; } + + protected: + // current decoding frame number, abs_time_step_ int32 num_frame_decoded_; }; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 2dd1fa0d..4b30f6b4 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -86,17 +86,6 @@ int main(int argc, char* argv[]) { LOG(INFO) << "utt: " << utt; LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim; - // // pad feats - // int32 padding_len = 0; - // if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { - // padding_len = - // chunk_stride - (feature.NumRows() - chunk_size) % - // chunk_stride; - // feature.Resize(feature.NumRows() + padding_len, - // feature.NumCols(), - // kaldi::kCopyData); - // } - int32 frame_idx = 0; int vocab_dim = 0; std::vector> prob_vec; diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 6a13f69b..c218990a 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -68,7 +68,7 @@ void TopK(const std::vector& data, for (int i = k; i < n; i++) { if (pq.top().first < data[i]) { pq.pop(); - pq.emplace_back(data[i], i); + pq.emplace(data[i], i); } } @@ -88,4 +88,9 @@ void TopK(const std::vector& data, } } +template void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) ; + } // namespace ppspeech \ No newline at end of file From 13a7fa9808d0faaa1589e0ef0659c537bd4d5dbb Mon Sep 17 00:00:00 2001 From: "david.95" Date: Fri, 14 Oct 2022 15:37:33 +0800 Subject: [PATCH 14/60] enable chinese words' pinyin specified in text of ssml formats, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 6 +- paddlespeech/t2s/frontend/zh_frontend.py | 156 ++++++++++++++++++++++ paddlespeech/t2s/ssml/xml_processor.py 
| 163 +++++++++++++++++++++++ 3 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 paddlespeech/t2s/ssml/xml_processor.py diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb7..f9d1cd1b 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import math import os +import re from pathlib import Path from typing import Any from typing import Dict @@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import + # remove [W:onnxruntime: xxx] from ort ort.set_default_logger_severity(3) @@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = line.strip().split() + items = re.split(r"\s+", line.strip(), 1) utt_id = items[0] if lang == 'zh': sentence = "".join(items[1:]) @@ -180,7 +182,7 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids( + input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 722eed60..25558780 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os import re +from operator import itemgetter from typing import Dict from typing import List @@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -81,6 +83,7 @@ class Frontend(): g2p_model="g2pW", phone_vocab_path=None, tone_vocab_path=None): + self.mix_ssml_processor = MixTextProcessor() self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" @@ -143,6 +146,7 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) + self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() @@ -281,6 +285,65 @@ class Frontend(): phones_list.append(merge_list) return phones_list + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + phones_list = [] + initials = [] + finals = [] + + words = self._split_word_to_char(words[0]) + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + 
sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + phones = [] + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc: + phones.append(v) + phones_list.append(phones) + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + return phones_list + def _merge_erhua(self, initials: List[str], finals: List[str], @@ -396,6 +459,52 @@ class Frontend(): print("----------------------------") return phonemes + #@an added for ssml pinyin + def get_phonemes_ssml(self, + ssml_inputs: list, + merge_sentences: bool=True, + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: + all_phonemes = [] + for word_pinyin_item in ssml_inputs: + phonemes = [] + sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + phonemes = self._g2p( + sentences, + merge_sentences=merge_sentences, + with_erhua=with_erhua) + else: + # phonemes should be pinyin_spec + phonemes = self._g2p_assign( + sentences, pinyin_spec, merge_sentences=merge_sentences) + + all_phonemes = all_phonemes + phonemes + + if robot: + new_phonemes = [] + for sentence in all_phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = item[:-1] + "1" + new_sentence.append(item) + 
new_phonemes.append(new_sentence) + all_phonemes = new_phonemes + + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(all_phonemes[0]) + print("----------------------------") + return [sum(all_phonemes, [])] + def get_input_ids(self, sentence: str, merge_sentences: bool=True, @@ -405,6 +514,7 @@ class Frontend(): add_blank: bool=False, blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -437,3 +547,49 @@ class Frontend(): if temp_phone_ids: result["phone_ids"] = temp_phone_ids return result + + # @an added for ssml + def get_input_ids_ssml( + self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False, + add_blank: bool=False, + blank_token: str="", + to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + + l_inputs = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( + l_inputs, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: + tone_ids = self._t2id(tones) + if to_tensor: + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + # if use paddle.to_tensor() in onnxruntime, the first time will be too low + if to_tensor: + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids + return result diff --git 
a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py new file mode 100644 index 00000000..54f24f59 --- /dev/null +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +import re +import xml.dom.minidom +import xml.parsers.expat +from xml.dom.minidom import Node +from xml.dom.minidom import parseString +''' +Note: xml 有5种特殊字符, &<>"' +其一,采用特殊标签,将包含特殊字符的字符串封装起来。 +例如: + +其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为: +& & +< < +> > +" " +' ' +例如: +"姓名" + +''' + + +class MixTextProcessor(): + def __repr__(self): + print("@an MixTextProcessor class") + + def get_xml_content(self, mixstr): + '''返回字符串的 xml 内容''' + xmlptn = re.compile(r".*?", re.M | re.S) + ctn = re.search(xmlptn, mixstr) + if ctn: + return ctn.group(0) + else: + return None + + def get_content_split(self, mixstr): + ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为xml 中tag 属性带空格 + ''' + ctlist = [] + # print("Testing:",mixstr[:20]) + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + ctlist.append(in_xml) + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist + + @classmethod + def get_pinyin_split(self, mixstr): + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append([pre_xml, []]) + dom = DomXml(in_xml) + pinyinlist = dom.get_pinyins_for_xml() + ctlist = ctlist + pinyinlist + ctlist.append([after_xml, []]) + else: + ctlist.append([mixstr, []]) + return ctlist + + +class DomXml(): + def __init__(self, xmlstr): + print("Parse xml str:", xmlstr) + self.tdom = parseString(xmlstr) #Document + # print("tdom:",type(self.tdom)) + self.root = self.tdom.documentElement #Element + # print("root:",type(self.root)) + 
self.rnode = self.tdom.childNodes #NodeList + # print("rnode:",type(self.rnode)) + pass + + def get_text(self): + '''返回xml 内容的所有文本内容的 列表''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_xmlchild_list(self): + '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + print(res) + return res + + def get_pinyins_for_xml(self): + '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + t = re.sub(r"\s+", "", x1.value) + res.append([t, []]) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x2.data) + res.append([t, []]) + else: + # print("x2",x2,x2.tagName) + if x2.hasAttribute('pinyin'): + pinyin_value = x2.getAttribute("pinyin") + pinyins = pinyin_value.split(" ") + for x3 in x2.childNodes: + # print('x3',x3) + if isinstance(x3, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x3.data) + res.append([t, pinyins]) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_all_tags(self, tag_name): + '''获取所有的tag 及属性值''' + alltags = self.root.getElementsByTagName(tag_name) + for x in alltags: + if x.hasAttribute('pinyin'): # pinyin + print(x.tagName, 'pinyin', + x.getAttribute('pinyin'), x.firstChild.data) From 86eb718908ea34e3617b76308b1e0fb3f911f1ba Mon Sep 17 00:00:00 2001 From: Hui 
Zhang Date: Fri, 14 Oct 2022 11:31:01 +0000 Subject: [PATCH 15/60] add u2 recg --- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 21 +- speechx/speechx/decoder/common.h | 31 ++- speechx/speechx/decoder/ctc_beam_search_opt.h | 64 ------ .../decoder/ctc_prefix_beam_search_decoder.cc | 14 +- .../decoder/ctc_prefix_beam_search_decoder.h | 45 ++-- .../ctc_prefix_beam_search_decoder_main.cc | 28 +-- .../decoder/ctc_prefix_beam_search_result.h | 41 ---- speechx/speechx/decoder/decoder_itf.h | 4 + speechx/speechx/decoder/param.h | 35 +-- speechx/speechx/decoder/recognizer.cc | 6 + speechx/speechx/decoder/recognizer.h | 13 +- speechx/speechx/decoder/recognizer_main.cc | 29 ++- speechx/speechx/decoder/u2_recognizer.cc | 209 ++++++++++++++++++ speechx/speechx/decoder/u2_recognizer.h | 164 ++++++++++++++ speechx/speechx/decoder/u2_recognizer_main.cc | 137 ++++++++++++ .../frontend/audio/feature_pipeline.cc | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 17 +- speechx/speechx/nnet/ds2_nnet.cc | 1 + speechx/speechx/nnet/ds2_nnet.h | 2 + speechx/speechx/nnet/nnet_itf.h | 9 +- speechx/speechx/nnet/u2_nnet.h | 3 +- .../speechx/protocol/websocket/CMakeLists.txt | 2 - .../websocket/websocket_server_main.cc | 29 ++- 24 files changed, 693 insertions(+), 215 deletions(-) delete mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_result.h create mode 100644 speechx/speechx/decoder/u2_recognizer.cc create mode 100644 speechx/speechx/decoder/u2_recognizer.h create mode 100644 speechx/speechx/decoder/u2_recognizer_main.cc diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index 12297661..24e9fca5 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x set -e . 
path.sh diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 8cf94a10..472d9332 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -9,6 +9,7 @@ add_library(decoder STATIC ctc_prefix_beam_search_decoder.cc ctc_tlg_decoder.cc recognizer.cc + u2_recognizer.cc ) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) @@ -28,10 +29,16 @@ endforeach() # u2 -set(bin_name ctc_prefix_beam_search_decoder_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) -target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) -target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) -target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) \ No newline at end of file +set(TEST_BINS + u2_recognizer_main + ctc_prefix_beam_search_decoder_main +) + +foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endforeach() \ No newline at end of file diff --git a/speechx/speechx/decoder/common.h b/speechx/speechx/decoder/common.h index 52deffac..0ae73277 100644 --- a/speechx/speechx/decoder/common.h +++ b/speechx/speechx/decoder/common.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin 
Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,10 +13,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "base/basic_types.h" +#pragma once + +#include "base/common.h" struct DecoderResult { BaseFloat acoustic_score; std::vector words_idx; - std::vector> time_stamp; + std::vector> time_stamp; +}; + + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} }; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index af92fad0..d21b3abd 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -76,68 +76,4 @@ struct CTCBeamSearchOptions { } }; - -// used by u2 model -struct CTCBeamSearchDecoderOptions { - // chunk_size is the frame number of one chunk after subsampling. - // e.g. if subsample rate is 4 and chunk_size = 16, the frames in - // one chunk are 67=16*4 + 3, stride is 64=16*4 - int chunk_size; - int num_left_chunks; - - // final_score = rescoring_weight * rescoring_score + ctc_weight * - // ctc_score; - // rescoring_score = left_to_right_score * (1 - reverse_weight) + - // right_to_left_score * reverse_weight - // Please note the concept of ctc_scores - // in the following two search methods are different. 
For - // CtcPrefixBeamSerch, - // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a - // max(viterbi) path score + context score So we should carefully set - // ctc_weight accroding to the search methods. - float ctc_weight; - float rescoring_weight; - float reverse_weight; - - // CtcEndpointConfig ctc_endpoint_opts; - - CTCBeamSearchOptions ctc_prefix_search_opts; - - CTCBeamSearchDecoderOptions() - : chunk_size(16), - num_left_chunks(-1), - ctc_weight(0.5), - rescoring_weight(1.0), - reverse_weight(0.0) {} - - void Register(kaldi::OptionsItf* opts) { - std::string module = "DecoderConfig: "; - opts->Register( - "chunk-size", - &chunk_size, - module + "the frame number of one chunk after subsampling."); - opts->Register("num-left-chunks", - &num_left_chunks, - module + "the left history chunks number."); - opts->Register("ctc-weight", - &ctc_weight, - module + - "ctc weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("rescoring-weight", - &rescoring_weight, - module + - "attention score weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("reverse-weight", - &reverse_weight, - module + - "reverse decoder weight. 
rescoring_score = " - "left_to_right_score * (1 - reverse_weight) + " - "right_to_left_score * reverse_weight."); - } -}; - } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index f22bfea2..ce2d4dc2 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -30,8 +30,14 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts) +CTCPrefixBeamSearch::CTCPrefixBeamSearch( + const std::string vocab_path, + const CTCBeamSearchOptions& opts) : opts_(opts) { + + unit_table_ = std::shared_ptr(fst::SymbolTable::ReadText(vocab_path)); + CHECK(unit_table_ != nullptr); + Reset(); } @@ -322,7 +328,11 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { CHECK(n_hyps > 0); CHECK(index < n_hyps); std::vector one = Outputs()[index]; - return std::string(absl::StrJoin(one, kSpaceSymbol)); + std::string sentence; + for (int i = 0; i < one.size(); i++){ + sentence += unit_table_->Find(one[i]); + } + return sentence; } std::string CTCPrefixBeamSearch::GetBestPath() { diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index ba44b0a2..2c28bee1 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -15,17 +15,21 @@ #pragma once #include "decoder/ctc_beam_search_opt.h" -#include "decoder/ctc_prefix_beam_search_result.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" +#include "fst/symbol-table.h" + namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderInterface { public: - explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts); + explicit CTCPrefixBeamSearch(const 
std::string vocab_path, + const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} + SearchType Type() const { return SearchType::kPrefixBeamSearch; } + void InitDecoder() override; void Reset() override; @@ -38,10 +42,9 @@ class CTCPrefixBeamSearch : public DecoderInterface { void FinalizeSearch(); - protected: - std::string GetBestPath() override; - std::vector> GetNBestPath() override; - std::vector> GetNBestPath(int n) override; + const std::shared_ptr VocabTable() const { + return unit_table_; + } const std::vector>& Inputs() const { return hypotheses_; } const std::vector>& Outputs() const { return outputs_; } @@ -52,6 +55,11 @@ class CTCPrefixBeamSearch : public DecoderInterface { const std::vector>& Times() const { return times_; } + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; + private: std::string GetBestPath(int index); @@ -66,6 +74,7 @@ class CTCPrefixBeamSearch : public DecoderInterface { private: CTCBeamSearchOptions opts_; + std::shared_ptr unit_table_; std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -86,28 +95,4 @@ class CTCPrefixBeamSearch : public DecoderInterface { }; -class CTCPrefixBeamSearchDecoder : public CTCPrefixBeamSearch { - public: - explicit CTCPrefixBeamSearchDecoder(const CTCBeamSearchDecoderOptions& opts) - : CTCPrefixBeamSearch(opts.ctc_prefix_search_opts), opts_(opts) {} - - ~CTCPrefixBeamSearchDecoder() {} - - private: - CTCBeamSearchDecoderOptions opts_; - - // cache feature - bool start_ = false; // false, this is first frame. 
- // for continues decoding - int num_frames_ = 0; - int global_frame_offset_ = 0; - const int time_stamp_gap_ = - 100; // timestamp gap between words in a sentence - - // std::unique_ptr ctc_endpointer_; - - int num_frames_in_current_chunk_ = 0; - std::vector result_; -}; - } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 8927a5f4..dd352378 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -55,14 +55,12 @@ int main(int argc, char* argv[]) { CHECK(FLAGS_vocab_path != ""); CHECK(FLAGS_model_path != ""); LOG(INFO) << "model path: " << FLAGS_model_path; + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; - fst::SymbolTable* unit_table = fst::SymbolTable::ReadText(FLAGS_vocab_path); - // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; @@ -75,16 +73,11 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data)); // decoder - ppspeech::CTCBeamSearchDecoderOptions opts; - opts.chunk_size = 16; - opts.num_left_chunks = -1; - opts.ctc_weight = 0.5; - opts.rescoring_weight = 1.0; - opts.reverse_weight = 0.3; - opts.ctc_prefix_search_opts.blank = 0; - opts.ctc_prefix_search_opts.first_beam_size = 10; - opts.ctc_prefix_search_opts.second_beam_size = 10; - ppspeech::CTCPrefixBeamSearchDecoder decoder(opts); + ppspeech::CTCBeamSearchOptions opts; + opts.blank = 0; + opts.first_beam_size = 10; + opts.second_beam_size = 10; + ppspeech::CTCPrefixBeamSearch decoder(FLAGS_vocab_path, opts); int32 chunk_size = FLAGS_receptive_field_length + @@ -150,17 +143,14 @@ int main(int argc, char* 
argv[]) { // forward nnet decoder.AdvanceDecode(decodable); + + LOG(INFO) << "Partial result: " << decoder.GetPartialResult(); } decoder.FinalizeSearch(); // get 1-best result - std::string result_ints = decoder.GetFinalBestPath(); - std::vector tokenids = absl::StrSplit(result_ints, ppspeech::kSpaceSymbol); - std::string result; - for (int i = 0; i < tokenids.size(); i++){ - result += unit_table->Find(std::stoi(tokenids[i])); - } + std::string result = decoder.GetFinalBestPath(); // after process one utt, then reset state. decodable->Reset(); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h b/speechx/speechx/decoder/ctc_prefix_beam_search_result.h deleted file mode 100644 index caa3e37e..00000000 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_result.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "base/common.h" - -namespace ppspeech { - -struct WordPiece { - std::string word; - int start = -1; - int end = -1; - - WordPiece(std::string word, int start, int end) - : word(std::move(word)), start(start), end(end) {} -}; - -struct DecodeResult { - float score = -kBaseFloatMax; - std::string sentence; - std::vector word_pieces; - - static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { - return a.score > b.score; - } -}; - -} // namespace ppspeech diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index fe4e7408..eec9bc3d 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -20,6 +20,10 @@ namespace ppspeech { +enum SearchType { + kPrefixBeamSearch = 0, + kWfstBeamSearch = 1, +}; class DecoderInterface { public: virtual ~DecoderInterface() {} diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 8a5990dc..e0f22d8c 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,12 +19,15 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" + // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. 
True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); + + // feature sliding window DEFINE_int32(receptive_field_length, 7, @@ -33,6 +36,8 @@ DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + + // nnet DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); @@ -89,34 +94,4 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { return opts; } -ModelOptions InitModelOptions() { - ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - return model_opts; -} - -TLGDecoderOptions InitDecoderOptions() { - TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - return decoder_opts; -} - -RecognizerResource InitRecognizerResoure() { - RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = InitFeaturePipelineOptions(); - resource.model_opts = InitModelOptions(); - resource.tlg_opts = InitDecoderOptions(); - return resource; -} - } // namespace ppspeech diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc index 44c3911c..bb9ea187 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/decoder/recognizer.cc @@ -14,6 +14,7 @@ #include "decoder/recognizer.h" + namespace ppspeech { using kaldi::Vector; @@ -23,14 
+24,19 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; + Recognizer::Recognizer(const RecognizerResource& resource) { // resource_ = resource; const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + std::shared_ptr nnet(new PaddleNnet(resource.model_opts)); + BaseFloat ac_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale)); + decoder_.reset(new TLGDecoder(resource.tlg_opts)); + input_finished_ = false; } diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index e47ca433..4965e7a3 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -25,16 +25,11 @@ namespace ppspeech { struct RecognizerResource { - FeaturePipelineOptions feature_pipeline_opts; - ModelOptions model_opts; - TLGDecoderOptions tlg_opts; + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale; - RecognizerResource() - : acoustic_scale(1.0), - feature_pipeline_opts(), - model_opts(), - tlg_opts() {} + kaldi::BaseFloat acoustic_scale{1.0}; }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 05026646..2b497d6e 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -22,6 +22,33 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions model_opts; + 
model_opts.model_path = FLAGS_model_path; + model_opts.param_path = FLAGS_param_path; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; + model_opts.input_names = FLAGS_model_input_names; + model_opts.output_names = FLAGS_model_output_names; + model_opts.subsample_rate = FLAGS_downsampling_rate; + resource.model_opts = model_opts; + + ppspeech::TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + + resource.tlg_opts = decoder_opts; + + return resource; +} + int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -29,7 +56,7 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc new file mode 100644 index 00000000..0ace086c --- /dev/null +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/u2_recognizer.h" +#include "nnet/u2_nnet.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource) { + const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; + feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + + BaseFloat am_scale = resource.acoustic_scale; + decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + + decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + + unit_table_ = decoder_->VocabTable(); + symbol_table_ = unit_table_; + + input_finished_ = false; +} + +void U2Recognizer::Reset() { + global_frame_offset_ = 0; + num_frames_ = 0; + result_.clear(); + + feature_pipeline_->Reset(); + decodable_->Reset(); + decoder_->Reset(); +} + +void U2Recognizer::ResetContinuousDecoding() { + global_frame_offset_ = num_frames_; + num_frames_ = 0; + result_.clear(); + + feature_pipeline_->Reset(); + decodable_->Reset(); + decoder_->Reset(); +} + + +void U2Recognizer::Accept(const VectorBase& waves) { + feature_pipeline_->Accept(waves); +} + + +void U2Recognizer::Decode() { + decoder_->AdvanceDecode(decodable_); +} + +void U2Recognizer::Rescoring() { + // Do attention Rescoring + kaldi::Timer timer; + AttentionRescoring(); + VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; +} + +void U2Recognizer::UpdateResult(bool finish) { + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); + + CHECK_EQ(hypotheses.size(), 
likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. So here we use + // time stamp of the input(e2e model unit), which is more accurate, and it + // requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); + + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); + + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); + + int start = time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ + : 0; + if (j > 0) { + start = (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } + } + + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } + + result_.emplace_back(path); + } + + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } +} + +void U2Recognizer::AttentionRescoring() { + decoder_->FinalizeSearch(); + UpdateResult(true); + + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + kaldi::Timer timer; + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score; + result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + } + + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(1) << "result: " << result_[0].sentence + << " score: " << result_[0].score; +} + +std::string U2Recognizer::GetFinalResult() { + return result_[0].sentence; +} + +std::string U2Recognizer::GetPartialResult() { + return result_[0].sentence; +} + +void U2Recognizer::SetFinished() { + feature_pipeline_->SetFinished(); + input_finished_ = true; +} 
+ + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h new file mode 100644 index 00000000..0947e593 --- /dev/null +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -0,0 +1,164 @@ + + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "decoder/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" +#include "decoder/decoder_itf.h" +#include "frontend/audio/feature_pipeline.h" +#include "nnet/decodable.h" + +#include "fst/fstlib.h" +#include "fst/symbol-table.h" + +namespace ppspeech { + + +struct DecodeOptions { + // chunk_size is the frame number of one chunk after subsampling. + // e.g. if subsample rate is 4 and chunk_size = 16, the frames in + // one chunk are 67=16*4 + 3, stride is 64=16*4 + int chunk_size; + int num_left_chunks; + + // final_score = rescoring_weight * rescoring_score + ctc_weight * + // ctc_score; + // rescoring_score = left_to_right_score * (1 - reverse_weight) + + // right_to_left_score * reverse_weight + // Please note the concept of ctc_scores + // in the following two search methods are different. 
For + // CtcPrefixBeamSerch, + // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a + // max(viterbi) path score + context score So we should carefully set + // ctc_weight accroding to the search methods. + float ctc_weight; + float rescoring_weight; + float reverse_weight; + + // CtcEndpointConfig ctc_endpoint_opts; + CTCBeamSearchOptions ctc_prefix_search_opts; + + DecodeOptions() + : chunk_size(16), + num_left_chunks(-1), + ctc_weight(0.5), + rescoring_weight(1.0), + reverse_weight(0.0) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "DecoderConfig: "; + opts->Register( + "chunk-size", + &chunk_size, + module + "the frame number of one chunk after subsampling."); + opts->Register("num-left-chunks", + &num_left_chunks, + module + "the left history chunks number."); + opts->Register("ctc-weight", + &ctc_weight, + module + + "ctc weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("rescoring-weight", + &rescoring_weight, + module + + "attention score weight for rescore. final_score = " + "rescoring_weight * rescoring_score + ctc_weight * " + "ctc_score."); + opts->Register("reverse-weight", + &reverse_weight, + module + + "reverse decoder weight. 
rescoring_score = " + "left_to_right_score * (1 - reverse_weight) + " + "right_to_left_score * reverse_weight."); + } +}; + + +struct U2RecognizerResource { + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + DecodeOptions decoder_opts{}; + // CTCBeamSearchOptions beam_search_opts; + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; +}; + + +class U2Recognizer { + public: + explicit U2Recognizer(const U2RecognizerResource& resouce); + void Reset(); + void ResetContinuousDecoding(); + + void Accept(const kaldi::VectorBase& waves); + void Decode(); + void Rescoring(); + + + std::string GetFinalResult(); + std::string GetPartialResult(); + + void SetFinished(); + bool IsFinished() { return input_finished_; } + + bool DecodedSomething() const { + return !result_.empty() && !result_[0].sentence.empty(); + } + + + int FrameShiftInMs() const { + // one decoder frame length in ms + return decodable_->Nnet()->SubsamplingRate() * + feature_pipeline_->FrameShift(); + } + + + const std::vector& Result() const { return result_; } + + private: + void AttentionRescoring(); + void UpdateResult(bool finish = false); + + private: + U2RecognizerResource opts_; + + // std::shared_ptr resource_; + // U2RecognizerResource resource_; + std::shared_ptr feature_pipeline_; + std::shared_ptr decodable_; + std::unique_ptr decoder_; + + // e2e unit symbol table + std::shared_ptr unit_table_ = nullptr; + std::shared_ptr symbol_table_ = nullptr; + + std::vector result_; + + // global decoded frame offset + int global_frame_offset_; + // cur decoded frame num + int num_frames_; + // timestamp gap between words in a sentence + const int time_stamp_gap_ = 100; + + bool input_finished_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc new file mode 100644 index 00000000..70bc7d67 --- /dev/null +++ 
b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/u2_recognizer.h" +#include "decoder/param.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/table-types.h" + +DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); +DEFINE_int32(sample_rate, 16000, "sample rate"); + + +ppspeech::U2RecognizerResource InitOpts() { + ppspeech::U2RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions model_opts; + model_opts.model_path = FLAGS_model_path; + + resource.model_opts = model_opts; + + ppspeech::DecodeOptions decoder_opts; + decoder_opts.chunk_size=16; + decoder_opts.num_left_chunks = -1; + decoder_opts.ctc_weight = 0.5; + decoder_opts.rescoring_weight = 1.0; + decoder_opts.reverse_weight = 0.3; + decoder_opts.ctc_prefix_search_opts.blank = 0; + decoder_opts.ctc_prefix_search_opts.first_beam_size = 10; + decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; + + resource.decoder_opts = decoder_opts; + return resource; +} + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, 
false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + double tot_wav_duration = 0.0; + + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int sample_rate = FLAGS_sample_rate; + float streaming_chunk = FLAGS_streaming_chunk; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + kaldi::Timer timer; + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; + tot_wav_duration += wave_data.Duration(); + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size); + + recognizer.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + recognizer.SetFinished(); + } + recognizer.Decode(); + LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult(); + + // no overlap + sample_offset += cur_chunk_size; + } + // second pass decoding + recognizer.Rescoring(); + + std::string result = recognizer.GetFinalResult(); + + recognizer.Reset(); + + if (result.empty()) { + // the TokenWriter 
can not write empty string. + ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << " the result of " << utt << " is " << result; + + result_writer.Write(utt, result); + + ++num_done; + } + + double elapsed = timer.Elapsed(); + + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); + LOG(INFO) << "cost:" << elapsed << " sec"; + LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; + LOG(INFO) << "the RTF is: " << elapsed / tot_wav_duration; +} diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9cacff9f..9fc35c95 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,7 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 48f95e3f..613f69c6 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -26,7 +26,6 @@ #include "frontend/audio/normalizer.h" namespace ppspeech { - struct FeaturePipelineOptions { std::string cmvn_file; bool to_float32; // true, only for linear feature @@ -60,7 +59,21 @@ class FeaturePipeline : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual void Reset() { base_extractor_->Reset(); } + const FeaturePipelineOptions& Config() { return opts_; } + + const BaseFloat FrameShift() const { + return opts_.fbank_opts.frame_opts.frame_shift_ms; + } + const BaseFloat FrameLength() const { + return opts_.fbank_opts.frame_opts.frame_length_ms; + } + const 
BaseFloat SampleRate() const { + return opts_.fbank_opts.frame_opts.samp_freq; + } + private: + FeaturePipelineOptions opts_; std::unique_ptr base_extractor_; }; -} + +} // namespace ppspeech diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index c6add03c..8c83f832 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -48,6 +48,7 @@ void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { } PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { + subsampling_rate_ = opts.subsample_rate; paddle_infer::Config config; config.SetModel(opts.model_path, opts.param_path); if (opts.use_gpu) { diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index e8a49c7d..2a53e5f7 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -67,6 +67,7 @@ class PaddleNnet : public NnetInterface { bool IsLogProb() override { return false; } + std::shared_ptr> GetCacheEncoder( const std::string& name); @@ -85,6 +86,7 @@ class PaddleNnet : public NnetInterface { std::map predictor_to_thread_id; std::map cache_names_idx_; std::vector>> cache_encouts_; + ModelOptions opts_; public: diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 2e21ff9b..109f54e0 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -35,6 +35,7 @@ struct ModelOptions { std::string cache_shape; bool enable_fc_padding; bool enable_profile; + int subsample_rate; ModelOptions() : model_path(""), param_path(""), @@ -46,7 +47,8 @@ struct ModelOptions { cache_shape(""), switch_ir_optim(false), enable_fc_padding(false), - enable_profile(false) {} + enable_profile(false), + subsample_rate(0) {} void Register(kaldi::OptionsItf* opts) { opts->Register("model-path", &model_path, "model file path"); @@ -102,9 +104,14 @@ class NnetInterface { // true, nnet output is logprob; otherwise is prob, virtual bool IsLogProb() = 0; + int 
SubsamplingRate() const { return subsampling_rate_; } + // using to get encoder outs. e.g. seq2seq with Attention model. virtual void EncoderOuts( std::vector>* encoder_out) const = 0; + + protected: + int subsampling_rate_{1}; }; } // namespace ppspeech diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 1bac652e..7058ea94 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -30,7 +30,7 @@ class U2NnetBase : public NnetInterface { public: virtual int context() const { return right_context_ + 1; } virtual int right_context() const { return right_context_; } - virtual int subsampling_rate() const { return subsampling_rate_; } + virtual int eos() const { return eos_; } virtual int sos() const { return sos_; } virtual int is_bidecoder() const { return is_bidecoder_; } @@ -64,7 +64,6 @@ class U2NnetBase : public NnetInterface { protected: // model specification int right_context_{0}; - int subsampling_rate_{1}; int sos_{0}; int eos_{0}; diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index 0f73fd24..a171d84d 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -1,5 +1,3 @@ -# project(websocket) - add_library(websocket STATIC websocket_server.cc websocket_client.cc diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 109da96b..9c01a0a1 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -17,11 +17,38 @@ DEFINE_int32(port, 8082, "websocket listening port"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + + ppspeech::ModelOptions 
model_opts; + model_opts.model_path = FLAGS_model_path; + model_opts.param_path = FLAGS_param_path; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; + model_opts.input_names = FLAGS_model_input_names; + model_opts.output_names = FLAGS_model_output_names; + model_opts.subsample_rate = FLAGS_downsampling_rate; + resource.model_opts = model_opts; + + ppspeech::TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + + resource.tlg_opts = decoder_opts; + + return resource; +} + int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::WebSocketServer server(FLAGS_port, resource); LOG(INFO) << "Listening at port " << FLAGS_port; From 17ea30e7cac2367e2d7850e38d7db7fb7dd50558 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 05:56:38 +0000 Subject: [PATCH 16/60] u2 recog test main ok --- .../examples/codelab/u2/local/recognizer.sh | 22 ++++++++++++++ speechx/speechx/decoder/param.h | 8 +++-- speechx/speechx/decoder/u2_recognizer.cc | 4 +++ speechx/speechx/decoder/u2_recognizer.h | 5 ++-- speechx/speechx/decoder/u2_recognizer_main.cc | 13 ++++---- speechx/speechx/frontend/audio/cmvn.cc | 30 ++++++++++--------- .../frontend/audio/feature_pipeline.cc | 1 + 7 files changed, 59 insertions(+), 24 deletions(-) create mode 100755 speechx/examples/codelab/u2/local/recognizer.sh diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh new file mode 100755 index 00000000..a7359753 --- /dev/null +++ 
b/speechx/examples/codelab/u2/local/recognizer.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --wav_rspecifier=scp:$data/wav.scp \ + --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index e0f22d8c..1827e82d 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -52,11 +52,12 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_string(vocab_path, "", "nnet vocab path."); // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); @@ -72,13 +73,14 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.frame_opts = frame_opts; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; } else { opts.to_float32 = true; frame_opts.remove_dc_offset = false; diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 0ace086c..8fcc5d79 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -33,12 +33,15 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + CHECK(resource.vocab_path != ""); decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; input_finished_ = false; + + Reset(); } void U2Recognizer::Reset() { @@ -69,6 +72,7 @@ void U2Recognizer::Accept(const VectorBase& waves) { void U2Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); + UpdateResult(false); } void U2Recognizer::Rescoring() { diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 0947e593..a65cae3b 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -92,12 +92,13 @@ struct DecodeOptions { struct U2RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; + FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; - std::string vocab_path{}; }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index 70bc7d67..ab2c6695 100644 
--- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -25,13 +25,16 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::U2RecognizerResource InitOpts() { ppspeech::U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + LOG(INFO) << "feature!"; ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; resource.model_opts = model_opts; + LOG(INFO) << "model!"; ppspeech::DecodeOptions decoder_opts; decoder_opts.chunk_size=16; @@ -44,6 +47,7 @@ ppspeech::U2RecognizerResource InitOpts() { decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; resource.decoder_opts = decoder_opts; + LOG(INFO) << "decoder!"; return resource; } @@ -57,9 +61,6 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; double tot_wav_duration = 0.0; - ppspeech::U2RecognizerResource resource = InitOpts(); - ppspeech::U2Recognizer recognizer(resource); - kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); @@ -71,8 +72,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - kaldi::Timer timer; + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 1ea83aba..5e84a1a1 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -29,7 +29,9 @@ using std::unique_ptr; 
CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { + CHECK(cmvn_file != ""); base_extractor_ = std::move(base_extractor); + bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); @@ -55,11 +57,11 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; + feats->Dim() % dim_ != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' + << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x'; } if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR @@ -67,7 +69,7 @@ void CMVN::Compute(VectorBase* feats) const { << "are supplied."; } - double count = stats_(0, dim); + double count = stats_(0, dim_); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats_, we use a count of one. if (count < 1.0) @@ -77,14 +79,14 @@ void CMVN::Compute(VectorBase* feats) const { if (!var_norm_) { Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); + SubVector mean_stats(stats_.RowData(0), dim_); Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); + // fill the datat of mean_stats in mean_stats_appy whose dim_ is equal + // with the dim_ of feature. 
+ // the dim_ of feats = dim_ * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim_ * idx, + dim_); stats_tmp.CopyFromVec(mean_stats); } offset.AddVec(-1.0 / count, mean_stats_apply); @@ -94,7 +96,7 @@ void CMVN::Compute(VectorBase* feats) const { // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { + for (int32 d = 0; d < dim_; d++) { double mean, offset, scale; mean = stats_(0, d) / count; double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; @@ -111,7 +113,7 @@ void CMVN::Compute(VectorBase* feats) const { for (int32 d_skip = d; d_skip < feats->Dim();) { norm(0, d_skip) = offset; norm(1, d_skip) = scale; - d_skip = d_skip + dim; + d_skip = d_skip + dim_; } } // Apply the normalization. diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9fc35c95..7232efc4 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -32,6 +32,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK(opts.cmvn_file != ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); From 278c7a41a83412f02bc4b0b98832c5076f0940cf Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 14:59:23 +0800 Subject: [PATCH 17/60] add module define to fix ci, test=tts --- paddlespeech/t2s/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 7d93c026..57fe82a9 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,5 +18,6 @@ from . import exps from . import frontend from . import models from . import modules +from . import ssml from . 
import training from . import utils From 616fc4594b2484f12400fb937c4b0ff0e9de4a15 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 08:19:43 +0000 Subject: [PATCH 18/60] refactor options --- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/examples/codelab/u2/local/nnet.sh | 2 +- .../examples/codelab/u2/local/recognizer.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 83 +++++++++++------- .../decoder/ctc_beam_search_decoder_main.cc | 15 ++-- .../ctc_prefix_beam_search_decoder_main.cc | 6 +- speechx/speechx/decoder/ctc_tlg_decoder.h | 32 +++++-- .../speechx/decoder/ctc_tlg_decoder_main.cc | 53 ++---------- speechx/speechx/decoder/param.h | 60 +++++-------- speechx/speechx/decoder/recognizer.h | 14 ++- speechx/speechx/decoder/recognizer_main.cc | 24 +----- speechx/speechx/decoder/u2_recognizer.h | 84 +++++++++--------- speechx/speechx/decoder/u2_recognizer_main.cc | 31 +------ .../speechx/frontend/audio/feature_pipeline.h | 77 +++++++++++++---- speechx/speechx/nnet/ds2_nnet_main.cc | 35 ++------ speechx/speechx/nnet/nnet_itf.h | 85 ++++++++++--------- speechx/speechx/nnet/u2_nnet.h | 1 - speechx/speechx/nnet/u2_nnet_main.cc | 23 ++--- .../websocket/websocket_server_main.cc | 24 +----- 19 files changed, 293 insertions(+), 360 deletions(-) diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index 24e9fca5..c22ad7f0 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -14,7 +14,7 @@ ctc_prefix_beam_search_decoder_main \ --model_path=$model_dir/export.jit \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --vocab_path=$model_dir/unit.txt \ --feature_rspecifier=ark,t:$exp/fbank.ark \ --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh index 78663e9c..4419201c 100755 --- 
a/speechx/examples/codelab/u2/local/nnet.sh +++ b/speechx/examples/codelab/u2/local/nnet.sh @@ -15,7 +15,7 @@ u2_nnet_main \ --feature_rspecifier=ark,t:$exp/fbank.ark \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --acoustic_scale=1.0 \ --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh index a7359753..9f697b45 100755 --- a/speechx/examples/codelab/u2/local/recognizer.sh +++ b/speechx/examples/codelab/u2/local/recognizer.sh @@ -16,7 +16,7 @@ u2_recognizer_main \ --model_path=$model_dir/export.jit \ --nnet_decoder_chunk=16 \ --receptive_field_length=7 \ - --downsampling_rate=4 \ + --subsampling_rate=4 \ --vocab_path=$model_dir/unit.txt \ --wav_rspecifier=scp:$data/wav.scp \ --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 472d9332..d06c3529 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -1,44 +1,61 @@ project(decoder) include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) -add_library(decoder STATIC - ctc_decoders/decoder_utils.cpp - ctc_decoders/path_trie.cpp - ctc_decoders/scorer.cpp - ctc_beam_search_decoder.cc - ctc_prefix_beam_search_decoder.cc - ctc_tlg_decoder.cc - recognizer.cc - u2_recognizer.cc + +set(decoder_src ) + +if (USING_DS2) +list(APPEND decoder_src +ctc_decoders/decoder_utils.cpp +ctc_decoders/path_trie.cpp +ctc_decoders/scorer.cpp +ctc_beam_search_decoder.cc +ctc_tlg_decoder.cc +recognizer.cc ) +endif() + +if (USING_U2) + list(APPEND decoder_src + ctc_prefix_beam_search_decoder.cc + u2_recognizer.cc + ) +endif() + +add_library(decoder STATIC ${decoder_src}) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) # test -set(BINS - 
ctc_beam_search_decoder_main - nnet_logprob_decoder_main - recognizer_main - ctc_tlg_decoder_main -) +if (USING_DS2) + set(BINS + ctc_beam_search_decoder_main + nnet_logprob_decoder_main + recognizer_main + ctc_tlg_decoder_main + ) -foreach(bin_name IN LISTS BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) -endforeach() + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() -# u2 -set(TEST_BINS - u2_recognizer_main - ctc_prefix_beam_search_decoder_main -) +if (USING_U2) + set(TEST_BINS + ctc_prefix_beam_search_decoder_main + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() -foreach(bin_name IN LISTS TEST_BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) - 
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) - target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) -endforeach() \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index 7e245e9b..edf9215a 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -31,7 +31,7 @@ DEFINE_string(lm_path, "", "language model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_string( @@ -81,13 +81,8 @@ int main(int argc, char* argv[]) { opts.lm_path = lm_path; ppspeech::CTCBeamSearch decoder(opts); - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_path; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -95,8 +90,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk 
size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index dd352378..7a488bb0 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -30,7 +30,7 @@ DEFINE_string(model_path, "", "paddle nnet model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); @@ -81,8 +81,8 @@ int main(int argc, char* argv[]) { int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 2f1d6c10..76bbcf42 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -20,15 +20,37 @@ #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" + +DECLARE_string(graph_path); +DECLARE_string(word_symbol_table); +DECLARE_int32(max_active); +DECLARE_double(beam); +DECLARE_double(lattice_beam); + namespace ppspeech { struct TLGDecoderOptions { - kaldi::LatticeFasterDecoderConfig opts; + kaldi::LatticeFasterDecoderConfig opts{}; // todo remove later, add into decode resource - std::string word_symbol_table; - std::string fst_path; - - TLGDecoderOptions() : word_symbol_table(""), 
fst_path("") {} + std::string word_symbol_table{}; + std::string fst_path{}; + + static TLGDecoderOptions InitFromFlags(){ + TLGDecoderOptions decoder_opts; + decoder_opts.word_symbol_table = FLAGS_word_symbol_table; + decoder_opts.fst_path = FLAGS_graph_path; + LOG(INFO) << "fst path: " << decoder_opts.fst_path; + LOG(INFO) << "fst symbole table: " << decoder_opts.word_symbol_table; + + decoder_opts.opts.max_active = FLAGS_max_active; + decoder_opts.opts.beam = FLAGS_beam; + decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; + LOG(INFO) << "LatticeFasterDecoder max active: " << decoder_opts.opts.max_active ; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam ; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " << decoder_opts.opts.lattice_beam ; + + return decoder_opts; + } }; class TLGDecoder : public DecoderInterface { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc index cd1249d8..f262101a 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -19,6 +19,7 @@ #include "frontend/audio/data_cache.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +#include "decoder/param.h" #include "decoder/ctc_tlg_decoder.h" #include "kaldi/util/table-types.h" @@ -26,30 +27,7 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "decoder graph"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) 
downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); + using kaldi::BaseFloat; using kaldi::Matrix; @@ -66,32 +44,16 @@ int main(int argc, char* argv[]) { kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_graph = FLAGS_model_path; - std::string model_params = FLAGS_param_path; - std::string word_symbol_table = FLAGS_word_symbol_table; - std::string graph_path = FLAGS_graph_path; - LOG(INFO) << "model path: " << model_graph; - LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "word symbol path: " << word_symbol_table; - LOG(INFO) << "graph path: " << graph_path; int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts; - opts.word_symbol_table = word_symbol_table; - opts.fst_path = graph_path; - opts.opts.max_active = FLAGS_max_active; + ppspeech::TLGDecoderOptions opts = ppspeech::TLGDecoderOptions::InitFromFlags(); opts.opts.beam = 15.0; opts.opts.lattice_beam = 7.5; ppspeech::TLGDecoder decoder(opts); - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new 
ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -99,12 +61,13 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; + decoder.InitDecoder(); kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1827e82d..5e1120ad 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -17,8 +17,6 @@ #include "base/common.h" #include "decoder/ctc_beam_search_decoder.h" #include "decoder/ctc_tlg_decoder.h" -#include "frontend/audio/feature_pipeline.h" - // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); @@ -27,18 +25,18 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); - // feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, +DEFINE_int32(subsampling_rate, + 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); // nnet +DEFINE_string(vocab_path, "", "nnet vocab path."); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model 
param"); DEFINE_string( @@ -52,10 +50,11 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_string(vocab_path, "", "nnet vocab path."); + // decoder DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); @@ -63,37 +62,20 @@ DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); -namespace ppspeech { - -// todo refactor later -FeaturePipelineOptions InitFeaturePipelineOptions() { - FeaturePipelineOptions opts; - opts.cmvn_file = FLAGS_cmvn_file; - kaldi::FrameExtractionOptions frame_opts; - frame_opts.dither = 0.0; - frame_opts.frame_shift_ms = 10; - opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear"); - if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.frame_opts = frame_opts; - LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; - } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; - } - opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - - return opts; -} -} // namespace ppspeech +// DecodeOptions flags +// DEFINE_int32(chunk_size, -1, "decoding chunk size"); +DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); +DEFINE_double(ctc_weight, + 0.5, + "ctc weight when combining ctc 
score and rescoring score"); +DEFINE_double(rescoring_weight, + 1.0, + "rescoring weight when combining ctc score and rescoring score"); +DEFINE_double(reverse_weight, + 0.3, + "used for bitransformer rescoring. it must be 0.0 if decoder is" + "conventional transformer decoder, and only reverse_weight > 0.0" + "dose the right to left decoder will be calculated and used"); +DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); +DEFINE_int32(blank, 0, "blank id in vocab"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 4965e7a3..51b66673 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -22,14 +22,26 @@ #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" +DECLARE_double(acoustic_scale); + namespace ppspeech { struct RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; + + static RecognizerResource InitFromFlags(){ + RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ModelOptions::InitFromFlags(); + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; + + } }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 2b497d6e..662943b5 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -25,27 +25,9 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - - ppspeech::ModelOptions model_opts; - 
model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - model_opts.subsample_rate = FLAGS_downsampling_rate; - resource.model_opts = model_opts; - - ppspeech::TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - - resource.tlg_opts = decoder_opts; - + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index a65cae3b..86bd4821 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -26,15 +26,25 @@ #include "fst/fstlib.h" #include "fst/symbol-table.h" -namespace ppspeech { +DECLARE_int32(nnet_decoder_chunk); +DECLARE_int32(num_left_chunks); +DECLARE_double(ctc_weight); +DECLARE_double(rescoring_weight); +DECLARE_double(reverse_weight); +DECLARE_int32(nbest); +DECLARE_int32(blank); + +DECLARE_double(acoustic_scale); +DECLARE_string(vocab_path); +namespace ppspeech { struct DecodeOptions { // chunk_size is the frame number of one chunk after subsampling. // e.g. 
if subsample rate is 4 and chunk_size = 16, the frames in // one chunk are 67=16*4 + 3, stride is 64=16*4 - int chunk_size; - int num_left_chunks; + int chunk_size{16}; + int num_left_chunks{-1}; // final_score = rescoring_weight * rescoring_score + ctc_weight * // ctc_score; @@ -46,51 +56,27 @@ struct DecodeOptions { // it's a sum(prefix) score + context score For CtcWfstBeamSerch, it's a // max(viterbi) path score + context score So we should carefully set // ctc_weight accroding to the search methods. - float ctc_weight; - float rescoring_weight; - float reverse_weight; + float ctc_weight{0.0}; + float rescoring_weight{1.0}; + float reverse_weight{0.0}; // CtcEndpointConfig ctc_endpoint_opts; - CTCBeamSearchOptions ctc_prefix_search_opts; - - DecodeOptions() - : chunk_size(16), - num_left_chunks(-1), - ctc_weight(0.5), - rescoring_weight(1.0), - reverse_weight(0.0) {} - - void Register(kaldi::OptionsItf* opts) { - std::string module = "DecoderConfig: "; - opts->Register( - "chunk-size", - &chunk_size, - module + "the frame number of one chunk after subsampling."); - opts->Register("num-left-chunks", - &num_left_chunks, - module + "the left history chunks number."); - opts->Register("ctc-weight", - &ctc_weight, - module + - "ctc weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("rescoring-weight", - &rescoring_weight, - module + - "attention score weight for rescore. final_score = " - "rescoring_weight * rescoring_score + ctc_weight * " - "ctc_score."); - opts->Register("reverse-weight", - &reverse_weight, - module + - "reverse decoder weight. 
rescoring_score = " - "left_to_right_score * (1 - reverse_weight) + " - "right_to_left_score * reverse_weight."); + CTCBeamSearchOptions ctc_prefix_search_opts{}; + + static DecodeOptions InitFromFlags(){ + DecodeOptions decoder_opts; + decoder_opts.chunk_size=FLAGS_nnet_decoder_chunk; + decoder_opts.num_left_chunks = FLAGS_num_left_chunks; + decoder_opts.ctc_weight = FLAGS_ctc_weight; + decoder_opts.rescoring_weight = FLAGS_rescoring_weight; + decoder_opts.reverse_weight = FLAGS_reverse_weight; + decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank; + decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; + decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; + return decoder_opts; } }; - struct U2RecognizerResource { kaldi::BaseFloat acoustic_scale{1.0}; std::string vocab_path{}; @@ -98,7 +84,17 @@ struct U2RecognizerResource { FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; - // CTCBeamSearchOptions beam_search_opts; + + static U2RecognizerResource InitFromFlags() { + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; +} }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index ab2c6695..b1a7b2e8 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -22,35 +22,6 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); - -ppspeech::U2RecognizerResource InitOpts() { - ppspeech::U2RecognizerResource resource; - resource.vocab_path 
= FLAGS_vocab_path; - resource.acoustic_scale = FLAGS_acoustic_scale; - - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - LOG(INFO) << "feature!"; - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - - resource.model_opts = model_opts; - LOG(INFO) << "model!"; - - ppspeech::DecodeOptions decoder_opts; - decoder_opts.chunk_size=16; - decoder_opts.num_left_chunks = -1; - decoder_opts.ctc_weight = 0.5; - decoder_opts.rescoring_weight = 1.0; - decoder_opts.reverse_weight = 0.3; - decoder_opts.ctc_prefix_search_opts.blank = 0; - decoder_opts.ctc_prefix_search_opts.first_beam_size = 10; - decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; - - resource.decoder_opts = decoder_opts; - LOG(INFO) << "decoder!"; - return resource; -} - int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -72,7 +43,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2RecognizerResource resource = ppspeech::U2RecognizerResource::InitFromFlags(); ppspeech::U2Recognizer recognizer(resource); kaldi::Timer timer; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 613f69c6..38a47433 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -25,26 +25,71 @@ #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +// feature +DECLARE_bool(use_fbank); +DECLARE_int32(num_bins); +DECLARE_string(cmvn_file); + +// feature sliding window +DECLARE_int32(receptive_field_length); +DECLARE_int32(subsampling_rate); +DECLARE_int32(nnet_decoder_chunk); + namespace ppspeech { + struct FeaturePipelineOptions { - std::string cmvn_file; - bool to_float32; // 
true, only for linear feature - bool use_fbank; - LinearSpectrogramOptions linear_spectrogram_opts; - kaldi::FbankOptions fbank_opts; - FeatureCacheOptions feature_cache_opts; - AssemblerOptions assembler_opts; - - FeaturePipelineOptions() - : cmvn_file(""), - to_float32(false), // true, only for linear feature - use_fbank(true), - linear_spectrogram_opts(), - fbank_opts(), - feature_cache_opts(), - assembler_opts() {} + std::string cmvn_file{}; + bool to_float32{false}; // true, only for linear feature + bool use_fbank{true}; + LinearSpectrogramOptions linear_spectrogram_opts{}; + kaldi::FbankOptions fbank_opts{}; + FeatureCacheOptions feature_cache_opts{}; + AssemblerOptions assembler_opts{}; + + static FeaturePipelineOptions InitFromFlags(){ + FeaturePipelineOptions opts; + opts.cmvn_file = FLAGS_cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; + + // frame options + kaldi::FrameExtractionOptions frame_opts; + frame_opts.dither = 0.0; + LOG(INFO) << "dither: " << frame_opts.dither; + frame_opts.frame_shift_ms = 10; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); + if (opts.use_fbank) { + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; + + opts.fbank_opts.frame_opts = frame_opts; + } else { + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + + opts.linear_spectrogram_opts.frame_opts = frame_opts; + } + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + + // assembler opts + opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.assembler_opts.subsampling_rate; + opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; + LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + return opts; + } }; + class FeaturePipeline : public FrontendInterface { public: explicit FeaturePipeline(const FeaturePipelineOptions& opts); diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index 943d7e5f..d8d33e98 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -14,6 +14,7 @@ #include "nnet/ds2_nnet.h" #include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" @@ -21,27 +22,6 @@ DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_int32(nnet_decoder_chunk, 
1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -64,13 +44,8 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -78,8 +53,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git 
a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index 109f54e0..f8105b7f 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -20,53 +20,54 @@ #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" -namespace ppspeech { +DECLARE_int32(subsampling_rate); +DECLARE_string(model_path); +DECLARE_string(param_path); +DECLARE_string(model_input_names); +DECLARE_string(model_output_names); +DECLARE_string(model_cache_names); +DECLARE_string(model_cache_shapes); +namespace ppspeech { struct ModelOptions { + // common + int subsample_rate{1}; + int thread_num{1}; // predictor thread pool size for ds2; + bool use_gpu{false}; std::string model_path; + std::string param_path; - int thread_num; // predictor thread pool size for ds2; - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - int subsample_rate; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(1), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false), - subsample_rate(0) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("model-param", ¶m_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - 
"paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); + + // ds2 for inference + std::string input_names{}; + std::string output_names{}; + std::string cache_names{}; + std::string cache_shape{}; + bool switch_ir_optim{false}; + bool enable_fc_padding{false}; + bool enable_profile{false}; + + static ModelOptions InitFromFlags(){ + ModelOptions opts; + opts.subsample_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + opts.model_path = FLAGS_model_path; + LOG(INFO) << "model path: " << opts.model_path ; + + opts.param_path = FLAGS_param_path; + LOG(INFO) << "param path: " << opts.param_path ; + + LOG(INFO) << "DS2 param: "; + opts.cache_names = FLAGS_model_cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; + opts.cache_shape = FLAGS_model_cache_shapes; + LOG(INFO) << " cache shape: " << opts.cache_shape; + opts.input_names = FLAGS_model_input_names; + LOG(INFO) << " input names: " << opts.input_names; + opts.output_names = FLAGS_model_output_names; + LOG(INFO) << " output names: " << opts.output_names; + return opts; } }; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 7058ea94..697ac20c 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -17,7 +17,6 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" - #include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 4b30f6b4..adbbf0e8 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -12,28 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "nnet/u2_nnet.h" + #include "base/common.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" +#include "decoder/param.h" +#include "nnet/u2_nnet.h" + DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier"); -DEFINE_string(model_path, "", "paddle nnet model"); - -DEFINE_int32(nnet_decoder_chunk, 16, "nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); - using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; @@ -58,13 +50,12 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); int32 chunk_size = - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate + + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + FLAGS_receptive_field_length; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 9c01a0a1..827b164f 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ 
b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -20,27 +20,9 @@ DEFINE_int32(port, 8082, "websocket listening port"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); - - ppspeech::ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - model_opts.subsample_rate = FLAGS_downsampling_rate; - resource.model_opts = model_opts; - - ppspeech::TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - - resource.tlg_opts = decoder_opts; - + resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } From 56a0a02452bb17204cbd5e126200ea2e02fb0be5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 08:26:01 +0000 Subject: [PATCH 19/60] format code --- speechx/speechx/base/basic_types.h | 4 +-- speechx/speechx/base/common.h | 1 - .../decoder/ctc_prefix_beam_search_decoder.h | 1 - speechx/speechx/decoder/ctc_tlg_decoder.h | 11 ++++---- speechx/speechx/decoder/param.h | 3 +-- speechx/speechx/decoder/recognizer.h | 12 ++++----- speechx/speechx/decoder/u2_recognizer.h | 26 +++++++++---------- speechx/speechx/frontend/audio/data_cache.h | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 22 +++++++++------- 
speechx/speechx/frontend/audio/mfcc.h | 1 - speechx/speechx/nnet/ds2_nnet.h | 1 + speechx/speechx/nnet/nnet_itf.h | 16 ++++++------ speechx/speechx/nnet/u2_nnet.h | 1 - .../protocol/websocket/websocket_client.h | 3 +-- .../protocol/websocket/websocket_server.h | 2 -- speechx/speechx/utils/file_utils.h | 2 +- 16 files changed, 53 insertions(+), 55 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 206b7be6..3a648649 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -14,10 +14,10 @@ #pragma once -#include "kaldi/base/kaldi-types.h" - #include +#include "kaldi/base/kaldi-types.h" + typedef float BaseFloat; typedef double double64; diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index b470b9de..97bff966 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -47,6 +47,5 @@ #include "base/flags.h" #include "base/log.h" #include "base/macros.h" - #include "utils/file_utils.h" #include "utils/math.h" \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 2c28bee1..eef8823d 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -17,7 +17,6 @@ #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "decoder/decoder_itf.h" - #include "fst/symbol-table.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 76bbcf42..cf8a9b73 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "decoder/decoder_itf.h" - #include "kaldi/decoder/lattice-faster-online-decoder.h" #include "util/parse-options.h" @@ -35,7 +34,7 @@ struct 
TLGDecoderOptions { std::string word_symbol_table{}; std::string fst_path{}; - static TLGDecoderOptions InitFromFlags(){ + static TLGDecoderOptions InitFromFlags() { TLGDecoderOptions decoder_opts; decoder_opts.word_symbol_table = FLAGS_word_symbol_table; decoder_opts.fst_path = FLAGS_graph_path; @@ -45,9 +44,11 @@ struct TLGDecoderOptions { decoder_opts.opts.max_active = FLAGS_max_active; decoder_opts.opts.beam = FLAGS_beam; decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - LOG(INFO) << "LatticeFasterDecoder max active: " << decoder_opts.opts.max_active ; - LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam ; - LOG(INFO) << "LatticeFasterDecoder lattice_beam: " << decoder_opts.opts.lattice_beam ; + LOG(INFO) << "LatticeFasterDecoder max active: " + << decoder_opts.opts.max_active; + LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam; + LOG(INFO) << "LatticeFasterDecoder lattice_beam: " + << decoder_opts.opts.lattice_beam; return decoder_opts; } diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 5e1120ad..1f13bbc0 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -30,7 +30,7 @@ DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); DEFINE_int32(subsampling_rate, - 4, + 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); @@ -62,7 +62,6 @@ DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); - // DecodeOptions flags // DEFINE_int32(chunk_size, -1, "decoding chunk size"); DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h index 51b66673..0402bcd3 100644 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/decoder/recognizer.h @@ -32,15 +32,15 @@ struct RecognizerResource { ModelOptions model_opts{}; 
TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - - static RecognizerResource InitFromFlags(){ + + static RecognizerResource InitFromFlags() { RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts = + FeaturePipelineOptions::InitFromFlags(); resource.model_opts = ModelOptions::InitFromFlags(); - resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); - return resource; - + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; } }; diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 86bd4821..f4e91b18 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -21,10 +21,9 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" #include "decoder/decoder_itf.h" #include "frontend/audio/feature_pipeline.h" -#include "nnet/decodable.h" - #include "fst/fstlib.h" #include "fst/symbol-table.h" +#include "nnet/decodable.h" DECLARE_int32(nnet_decoder_chunk); DECLARE_int32(num_left_chunks); @@ -63,9 +62,9 @@ struct DecodeOptions { // CtcEndpointConfig ctc_endpoint_opts; CTCBeamSearchOptions ctc_prefix_search_opts{}; - static DecodeOptions InitFromFlags(){ + static DecodeOptions InitFromFlags() { DecodeOptions decoder_opts; - decoder_opts.chunk_size=FLAGS_nnet_decoder_chunk; + decoder_opts.chunk_size = FLAGS_nnet_decoder_chunk; decoder_opts.num_left_chunks = FLAGS_num_left_chunks; decoder_opts.ctc_weight = FLAGS_ctc_weight; decoder_opts.rescoring_weight = FLAGS_rescoring_weight; @@ -86,15 +85,16 @@ struct U2RecognizerResource { DecodeOptions decoder_opts{}; static U2RecognizerResource InitFromFlags() { - U2RecognizerResource resource; - resource.vocab_path = FLAGS_vocab_path; - resource.acoustic_scale = FLAGS_acoustic_scale; - - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); - 
resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); - return resource; -} + U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; + resource.acoustic_scale = FLAGS_acoustic_scale; + + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); + return resource; + } }; diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 64e9db86..5fafdeb2 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -56,4 +56,4 @@ class DataCache : public FrontendInterface { DISALLOW_COPY_AND_ASSIGN(DataCache); }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 38a47433..d91a70e3 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -46,17 +46,17 @@ struct FeaturePipelineOptions { FeatureCacheOptions feature_cache_opts{}; AssemblerOptions assembler_opts{}; - static FeaturePipelineOptions InitFromFlags(){ + static FeaturePipelineOptions InitFromFlags() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; - LOG(INFO) << "cmvn file: " << opts.cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; // frame options kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; - LOG(INFO) << "dither: " << frame_opts.dither; + LOG(INFO) << "dither: " << frame_opts.dither; frame_opts.frame_shift_ms = 10; - LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; opts.use_fbank = FLAGS_use_fbank; LOG(INFO) << "feature type: " 
<< (opts.use_fbank ? "fbank" : "linear"); if (opts.use_fbank) { @@ -76,15 +76,19 @@ struct FeaturePipelineOptions { opts.linear_spectrogram_opts.frame_opts = frame_opts; } - LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.assembler_opts.subsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; + opts.assembler_opts.receptive_filed_length = + FLAGS_receptive_field_length; + LOG(INFO) << "nnet receptive filed length: " + << opts.assembler_opts.receptive_filed_length; opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "nnet chunk size: " + << opts.assembler_opts.nnet_decoder_chunk; return opts; } }; diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h index 62b0078c..6c1c2f7d 100644 --- a/speechx/speechx/frontend/audio/mfcc.h +++ b/speechx/speechx/frontend/audio/mfcc.h @@ -14,7 +14,6 @@ #pragma once -#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 2a53e5f7..4aeec32f 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once #include + #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/nnet_itf.h" diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index f8105b7f..cc737ce0 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -48,25 +48,25 @@ struct ModelOptions { bool enable_fc_padding{false}; bool enable_profile{false}; - static ModelOptions InitFromFlags(){ + static ModelOptions InitFromFlags() { ModelOptions opts; opts.subsample_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; opts.model_path = FLAGS_model_path; - LOG(INFO) << "model path: " << opts.model_path ; + LOG(INFO) << "model path: " << opts.model_path; opts.param_path = FLAGS_param_path; - LOG(INFO) << "param path: " << opts.param_path ; + LOG(INFO) << "param path: " << opts.param_path; LOG(INFO) << "DS2 param: "; opts.cache_names = FLAGS_model_cache_names; - LOG(INFO) << " cache names: " << opts.cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; opts.cache_shape = FLAGS_model_cache_shapes; - LOG(INFO) << " cache shape: " << opts.cache_shape; + LOG(INFO) << " cache shape: " << opts.cache_shape; opts.input_names = FLAGS_model_input_names; - LOG(INFO) << " input names: " << opts.input_names; + LOG(INFO) << " input names: " << opts.input_names; opts.output_names = FLAGS_model_output_names; - LOG(INFO) << " output names: " << opts.output_names; + LOG(INFO) << " output names: " << opts.output_names; return opts; } }; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 697ac20c..3435bca8 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -16,7 +16,6 @@ #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" - #include "nnet/nnet_itf.h" #include "paddle/extension.h" #include "paddle/jit/all.h" diff --git 
a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h index 886da292..7ae6d98d 100644 --- a/speechx/speechx/protocol/websocket/websocket_client.h +++ b/speechx/speechx/protocol/websocket/websocket_client.h @@ -13,7 +13,6 @@ // limitations under the License. #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" @@ -54,4 +53,4 @@ class WebSocketClient { websocket::stream ws_{ioc_}; std::unique_ptr t_{nullptr}; }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 009fc42e..8f3360e4 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -15,12 +15,10 @@ #pragma once #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" - #include "decoder/recognizer.h" #include "frontend/audio/feature_pipeline.h" diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/utils/file_utils.h index 8c56c02e..a471e024 100644 --- a/speechx/speechx/utils/file_utils.h +++ b/speechx/speechx/utils/file_utils.h @@ -20,4 +20,4 @@ bool ReadFileToVector(const std::string& filename, std::vector* data); std::string ReadFile2String(const std::string& path); -} +} // namespace ppspeech From 29508f400b23211c9e7380800e2d02c9a16a426f Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:44:29 +0800 Subject: [PATCH 20/60] to fix CI issue, test=tts --- paddlespeech/t2s/ssml/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 paddlespeech/t2s/ssml/__init__.py diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py new file mode 100644 index 00000000..e69de29b 
From f56cc08b18f5fb6fc3254db4dd40ec3597d34f36 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:55:07 +0800 Subject: [PATCH 21/60] add license content, test=tts --- paddlespeech/t2s/ssml/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index e69de29b..abf198b9 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 1067088debd49ba308fc55a8c55d1d04f211ff51 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:18:27 +0800 Subject: [PATCH 22/60] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index abf198b9..f344250d 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .xml_processor import * From 89e9ea69ebb884d5ba13d02c66c29475a153f2ea Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:29:46 +0800 Subject: [PATCH 23/60] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index f344250d..9b4db053 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .xml_processor import * From f295d2d4450099f2cf8b7e2d417a9c9599230563 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 18:00:13 +0800 Subject: [PATCH 24/60] remove useless code --- paddlespeech/t2s/frontend/zh_frontend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 25558780..e3028698 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -146,7 +146,6 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) - self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() From 72c9e973a2bb9d6c8dab603d67a6ae80a73669f7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 10:06:06 +0000 Subject: [PATCH 25/60] add scripts --- .../examples/u2pp_ol/wenetspeech/.gitignore | 3 + .../examples/u2pp_ol/wenetspeech/README.md | 28 +++++++ .../wenetspeech/local/aishell_train_lms.sh | 71 +++++++++++++++++ .../u2pp_ol/wenetspeech/local/decode.sh | 25 ++++++ .../u2pp_ol/wenetspeech/local/feat.sh | 31 ++++++++ .../u2pp_ol/wenetspeech/local/nnet.sh | 23 ++++++ .../u2pp_ol/wenetspeech/local/recognizer.sh | 34 +++++++++ .../u2pp_ol/wenetspeech/local/split_data.sh | 
30 ++++++++ speechx/examples/u2pp_ol/wenetspeech/path.sh | 18 +++++ speechx/examples/u2pp_ol/wenetspeech/run.sh | 76 +++++++++++++++++++ 10 files changed, 339 insertions(+) create mode 100644 speechx/examples/u2pp_ol/wenetspeech/.gitignore create mode 100644 speechx/examples/u2pp_ol/wenetspeech/README.md create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/decode.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/feat.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh create mode 100644 speechx/examples/u2pp_ol/wenetspeech/path.sh create mode 100755 speechx/examples/u2pp_ol/wenetspeech/run.sh diff --git a/speechx/examples/u2pp_ol/wenetspeech/.gitignore b/speechx/examples/u2pp_ol/wenetspeech/.gitignore new file mode 100644 index 00000000..02c0cc21 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/.gitignore @@ -0,0 +1,3 @@ +data +utils +exp diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md new file mode 100644 index 00000000..a9a4578f --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/README.md @@ -0,0 +1,28 @@ +# u2/u2pp Streaming ASR + +## Testing with Aishell Test Data + +## Download wav and model + +``` +run.sh --stop_stage 0 +``` + +### compute feature + +``` +./run.sh --stage 1 --stop_stage 1 +``` + +### decoding using feature + +``` +./run.sh --stage 2 --stop_stage 2 +``` + +### decoding using wav + + +``` +./run.sh --stage 3 --stop_stage 3 +``` \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh new file mode 100755 index 00000000..544a1f59 --- /dev/null +++ 
b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# To be run from one directory above this script. +. ./path.sh + +nj=40 +text=data/local/lm/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# Check SRILM tools +if ! which ngram-count > /dev/null; then + echo "srilm tools are not found, please download it and install it from: " + echo "http://www.speech.sri.com/projects/srilm/download.html" + echo "Then add the tools to your PATH" + exit 1 +fi + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/lm/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +cleantext=$dir/text.no_oov + +# oov to +# lexicon line: word char0 ... charn +# text line: utt word0 ... wordn -> line: word0 ... wordn +text_dir=$(dirname $text) +split_name=$(basename $text) +./local/split_data.sh $text_dir $text $split_name $nj + +utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \ + cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + \> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1; +cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext + +# compute word counts, sort in descending order +# line: count word +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \ + sort --parallel=`nproc` -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). 
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1; + +# word with +cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist + +# hold out to compute ppl +heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results + +mkdir -p $dir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train + +ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa +ngram -lm $dir/lm.arpa -ppl $dir/heldout \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh new file mode 100755 index 00000000..c17cdbe6 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/decoder.fbank.wolm.log \ +ctc_prefix_beam_search_decoder_main \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --feature_rspecifier=scp:$data/split${nj}/JOB/fbank.scp \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_decode.ark + +cat $data/split${nj}/*/result_decode.ark > $exp/${label_file} +utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} +tail -n 7 $exp/${wer} \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh new file mode 100755 index 00000000..4341cec8 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp + +cmvn_json2kaldi_main \ + --json_file $model_dir/mean_std.json \ + --cmvn_write_path $exp/cmvn.ark \ + --binary=false + +echo "convert json cmvn to kaldi ark." + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ +compute_fbank_main \ + --num_bins 80 \ + --cmvn_file=$exp/cmvn.ark \ + --streaming_chunk=36 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --feature_wspecifier=ark,scp:$data/split${nj}/JOB/fbank.ark,$data/split${nj}/JOB/fbank.scp + +echo "compute fbank feature." 
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh new file mode 100755 index 00000000..4419201c --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -x +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_nnet_main \ + --model_path=$model_dir/export.jit \ + --feature_rspecifier=ark,t:$exp/fbank.ark \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --acoustic_scale=1.0 \ + --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \ + --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark +echo "u2 nnet decode." + diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh new file mode 100755 index 00000000..29b50537 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +. 
path.sh + +data=data +exp=exp +nj=20 +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ +aishell_wav_scp=aishell_test.scp + +./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + +utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \ +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --vocab_path=$model_dir/unit.txt \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --subsampling_rate=4 \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark + + +cat $data/split${nj}/*/result_recognizer.ark > $exp/${label_file}_recognizer +utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer +echo "recognizer test have finished!!!" +echo "please checkout in ${exp}/${wer}.recognizer" +tail -n 7 $exp/${wer}.recognizer \ No newline at end of file diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh new file mode 100755 index 00000000..faa5c42d --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -eo pipefail + +data=$1 +scp=$2 +split_name=$3 +numsplit=$4 + +# save in $data/split{n} +# $scp to split +# + +if [[ ! $numsplit -gt 0 ]]; then + echo "$0: Invalid num-split argument"; + exit 1; +fi + +directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! 
mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split${numsplit}/$n + done +fi + +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh new file mode 100644 index 00000000..7f32fbce --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -0,0 +1,18 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } + +export LC_AL=C + +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio + +PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh new file mode 100755 index 00000000..12e3af95 --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +nj=40 +stage=0 +stop_stage=5 + +. utils/parse_options.sh + +# input +data=data +exp=exp +mkdir -p $exp $data + + +# 1. compile +if [ ! -d ${SPEECHX_BUILD} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + + +ckpt_dir=$data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then + # download model + if [ ! 
-f $ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then + mkdir -p $ckpt_dir + pushd $ckpt_dir + + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz + + popd + fi + + # test wav scp + if [ ! -f data/wav.scp ]; then + mkdir -p $data + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd + fi + + # aishell wav scp + if [ ! -d $data/test ]; then + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip + unzip aishell_test.zip + popd + + realpath $data/test/*/*.wav > $data/wavlist + awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id + paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp + fi +fi + + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/feat.sh +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/decode.sh +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./loca/recognizer.sh +fi \ No newline at end of file From fddcd36fa013ec9bce67e1a95c257d91140faf32 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 19:03:15 +0800 Subject: [PATCH 26/60] format --- .../examples/u2pp_ol/wenetspeech/README.md | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 1 + .../decoder/ctc_beam_search_decoder.cc | 12 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 73 +++--- .../ctc_prefix_beam_search_decoder_main.cc | 17 +- speechx/speechx/decoder/ctc_tlg_decoder.cc | 7 +- .../speechx/decoder/ctc_tlg_decoder_main.cc | 15 +- speechx/speechx/decoder/recognizer.cc | 6 +- speechx/speechx/decoder/recognizer_main.cc | 7 +- speechx/speechx/decoder/u2_recognizer.cc | 231 +++++++++--------- speechx/speechx/decoder/u2_recognizer_main.cc | 7 +- speechx/speechx/frontend/audio/cmvn.cc | 9 +- 
.../frontend/audio/compute_fbank_main.cc | 21 +- .../audio/compute_linear_spectrogram_main.cc | 7 +- .../frontend/audio/feature_pipeline.cc | 5 +- speechx/speechx/nnet/decodable.cc | 18 +- speechx/speechx/nnet/u2_nnet.cc | 22 +- speechx/speechx/nnet/u2_nnet_main.cc | 35 +-- .../websocket/websocket_server_main.cc | 7 +- speechx/speechx/utils/math.cc | 10 +- 20 files changed, 259 insertions(+), 253 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md index a9a4578f..9a8f8af5 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/README.md +++ b/speechx/examples/u2pp_ol/wenetspeech/README.md @@ -25,4 +25,4 @@ run.sh --stop_stage 0 ``` ./run.sh --stage 3 --stop_stage 3 -``` \ No newline at end of file +``` diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 7d99e857..09f9e2fb 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -21,6 +21,7 @@ #include #include #include + #include "base/flags.h" #include "base/log.h" #include "paddle_inference_api.h" diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index 3f00ee35..c4b35ff0 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -13,9 +13,10 @@ // limitations under the License. 
+#include "decoder/ctc_beam_search_decoder.h" + #include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" -#include "decoder/ctc_beam_search_decoder.h" #include "utils/file_utils.h" namespace ppspeech { @@ -24,10 +25,7 @@ using std::vector; using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) - : opts_(opts), - init_ext_scorer_(nullptr), - space_id_(-1), - root_(nullptr) { + : opts_(opts), init_ext_scorer_(nullptr), space_id_(-1), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { LOG(INFO) << "load the dict failed"; @@ -41,7 +39,7 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(opts_.blank==0); + CHECK(opts_.blank == 0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); @@ -115,7 +113,7 @@ int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, } vector> CTCBeamSearch::GetNBestPath(int n) { - int beam_size = n == -1 ? opts_.beam_size: std::min(n, opts_.beam_size); + int beam_size = n == -1 ? 
opts_.beam_size : std::min(n, opts_.beam_size); return get_beam_search_result(prefixes_, vocabulary_, beam_size); } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index ce2d4dc2..a0fe5b2a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -16,11 +16,12 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" + +#include "absl/strings/str_join.h" #include "base/common.h" #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" #include "utils/math.h" -#include "absl/strings/str_join.h" #ifdef USE_PROFILING #include "paddle/fluid/platform/profiler.h" @@ -30,18 +31,17 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch( - const std::string vocab_path, - const CTCBeamSearchOptions& opts) +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string vocab_path, + const CTCBeamSearchOptions& opts) : opts_(opts) { - - unit_table_ = std::shared_ptr(fst::SymbolTable::ReadText(vocab_path)); + unit_table_ = std::shared_ptr( + fst::SymbolTable::ReadText(vocab_path)); CHECK(unit_table_ != nullptr); Reset(); } -void CTCPrefixBeamSearch::Reset() { +void CTCPrefixBeamSearch::Reset() { num_frame_decoded_ = 0; cur_hyps_.clear(); @@ -65,10 +65,9 @@ void CTCPrefixBeamSearch::Reset() { hypotheses_.emplace_back(empty); likelihood_.emplace_back(prefix_score.TotalScore()); times_.emplace_back(empty); - } - -void CTCPrefixBeamSearch::InitDecoder() { Reset(); } +} +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } void CTCPrefixBeamSearch::AdvanceDecode( @@ -296,9 +295,7 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { - UpdateFinalContext(); -} +void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } void CTCPrefixBeamSearch::UpdateFinalContext() { 
if (context_graph_ == nullptr) return; @@ -311,8 +308,8 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { for (const auto& prefix : hypotheses_) { PrefixScore& prefix_score = cur_hyps_[prefix]; if (prefix_score.context_score != 0) { - prefix_score.UpdateContext(context_graph_, prefix_score, 0, - prefix.size()); + prefix_score.UpdateContext( + context_graph_, prefix_score, 0, prefix.size()); } } std::vector, PrefixScore>> arr(cur_hyps_.begin(), @@ -323,48 +320,44 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { UpdateHypotheses(arr); } - std::string CTCPrefixBeamSearch::GetBestPath(int index) { +std::string CTCPrefixBeamSearch::GetBestPath(int index) { int n_hyps = Outputs().size(); CHECK(n_hyps > 0); CHECK(index < n_hyps); std::vector one = Outputs()[index]; std::string sentence; - for (int i = 0; i < one.size(); i++){ + for (int i = 0; i < one.size(); i++) { sentence += unit_table_->Find(one[i]); } return sentence; - } +} - std::string CTCPrefixBeamSearch::GetBestPath() { - return GetBestPath(0); - } +std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } - std::vector> CTCPrefixBeamSearch::GetNBestPath(int n) { - int hyps_size = hypotheses_.size(); - CHECK(hyps_size > 0); +std::vector> CTCPrefixBeamSearch::GetNBestPath( + int n) { + int hyps_size = hypotheses_.size(); + CHECK(hyps_size > 0); - int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); + int min_n = n == -1 ? 
hypotheses_.size() : std::min(n, hyps_size); - std::vector> n_best; - n_best.reserve(min_n); + std::vector> n_best; + n_best.reserve(min_n); - for (int i = 0; i < min_n; i++){ - n_best.emplace_back(Likelihood()[i], GetBestPath(i) ); - } - return n_best; - } + for (int i = 0; i < min_n; i++) { + n_best.emplace_back(Likelihood()[i], GetBestPath(i)); + } + return n_best; +} - std::vector> CTCPrefixBeamSearch::GetNBestPath() { +std::vector> +CTCPrefixBeamSearch::GetNBestPath() { return GetNBestPath(-1); - } - -std::string CTCPrefixBeamSearch::GetFinalBestPath() { - return GetBestPath(); } -std::string CTCPrefixBeamSearch::GetPartialResult() { - return GetBestPath(); -} +std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } + +std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 7a488bb0..d9cca147 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "absl/strings/str_split.h" #include "base/common.h" #include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" +#include "fst/symbol-table.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/u2_nnet.h" -#include "absl/strings/str_split.h" -#include "fst/symbol-table.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); @@ -64,8 +64,7 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet( - new ppspeech::U2Nnet(model_opts)); + std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); // decodeable std::shared_ptr raw_data(new ppspeech::DataCache()); @@ -114,9 +113,9 @@ int main(int argc, char* argv[]) { ori_feature_len - chunk_idx * chunk_stride, chunk_size); } if (this_chunk_size < receptive_field_length) { - LOG(WARNING) << "utt: " << utt << " skip last " - << this_chunk_size << " frames, expect is " - << receptive_field_length; + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; break; } @@ -127,7 +126,7 @@ int main(int argc, char* argv[]) { for (int row_id = 0; row_id < this_chunk_size; ++row_id) { kaldi::SubVector feat_row(feature, start); kaldi::SubVector feature_chunk_row( - feature_chunk.Data() + row_id * feat_dim, feat_dim); + feature_chunk.Data() + row_id * feat_dim, feat_dim); feature_chunk_row.CopyFromVec(feat_row); ++start; @@ -151,7 +150,7 @@ int main(int argc, char* argv[]) { // get 1-best result std::string result = decoder.GetFinalBestPath(); - + // after process one utt, then reset state. 
decodable->Reset(); decoder.Reset(); diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 4d0a21d5..2c2b6d3c 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -33,9 +33,7 @@ void TLGDecoder::Reset() { return; } -void TLGDecoder::InitDecoder() { - Reset(); -} +void TLGDecoder::InitDecoder() { Reset(); } void TLGDecoder::AdvanceDecode( const std::shared_ptr& decodable) { @@ -50,7 +48,6 @@ void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) { } - std::string TLGDecoder::GetPartialResult() { if (num_frame_decoded_ == 0) { // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call @@ -93,4 +90,4 @@ std::string TLGDecoder::GetFinalBestPath() { return words; } -} +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc index f262101a..e9bd8a3f 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc @@ -15,14 +15,12 @@ // todo refactor, repalce with gtest #include "base/common.h" - +#include "decoder/ctc_tlg_decoder.h" +#include "decoder/param.h" #include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/ds2_nnet.h" -#include "decoder/param.h" -#include "decoder/ctc_tlg_decoder.h" - -#include "kaldi/util/table-types.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); @@ -47,12 +45,13 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + ppspeech::TLGDecoderOptions opts = + ppspeech::TLGDecoderOptions::InitFromFlags(); opts.opts.beam = 15.0; opts.opts.lattice_beam = 7.5; ppspeech::TLGDecoder decoder(opts); - ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + ppspeech::ModelOptions model_opts = 
ppspeech::ModelOptions::InitFromFlags(); std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); @@ -67,7 +66,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; - + decoder.InitDecoder(); kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc index bb9ea187..870aa40a 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/decoder/recognizer.cc @@ -17,12 +17,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; Recognizer::Recognizer(const RecognizerResource& resource) { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 662943b5..8e83b188 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/recognizer.h" #include "decoder/param.h" +#include "decoder/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -25,8 +25,9 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); - resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 8fcc5d79..04712e7b 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -13,18 +13,20 @@ // limitations under the License. 
#include "decoder/u2_recognizer.h" + #include "nnet/u2_nnet.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; -U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource) { +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) + : opts_(resource) { const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); @@ -34,7 +36,8 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); CHECK(resource.vocab_path != ""); - decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + decoder_.reset(new CTCPrefixBeamSearch( + resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; @@ -70,140 +73,140 @@ void U2Recognizer::Accept(const VectorBase& waves) { } -void U2Recognizer::Decode() { - decoder_->AdvanceDecode(decodable_); +void U2Recognizer::Decode() { + decoder_->AdvanceDecode(decodable_); UpdateResult(false); } void U2Recognizer::Rescoring() { - // Do attention Rescoring - kaldi::Timer timer; - AttentionRescoring(); - VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; + // Do attention Rescoring + kaldi::Timer timer; + AttentionRescoring(); + VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; } void U2Recognizer::UpdateResult(bool finish) { - const auto& hypotheses = decoder_->Outputs(); - const auto& inputs = decoder_->Inputs(); - const auto& likelihood = decoder_->Likelihood(); - const auto& times = decoder_->Times(); - result_.clear(); - - CHECK_EQ(hypotheses.size(), likelihood.size()); - for (size_t i = 0; i < 
hypotheses.size(); i++) { - const std::vector& hypothesis = hypotheses[i]; - - DecodeResult path; - path.score = likelihood[i]; - for (size_t j = 0; j < hypothesis.size(); j++) { - std::string word = symbol_table_->Find(hypothesis[j]); - // A detailed explanation of this if-else branch can be found in - // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 - if (decoder_->Type() == kWfstBeamSearch) { - path.sentence += (" " + word); - } else { - path.sentence += (word); - } - } + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); - // TimeStamp is only supported in final result - // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to - // various FST operations when building the decoding graph. So here we use - // time stamp of the input(e2e model unit), which is more accurate, and it - // requires the symbol table of the e2e model used in training. - if (unit_table_ != nullptr && finish) { - int offset = global_frame_offset_ * FrameShiftInMs(); + CHECK_EQ(hypotheses.size(), likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. 
So here we + // use time stamp of the input(e2e model unit), which is more accurate, + // and it requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); - const std::vector& input = inputs[i]; - const std::vector time_stamp = times[i]; - CHECK_EQ(input.size(), time_stamp.size()); + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); - for (size_t j = 0; j < input.size(); j++) { - std::string word = unit_table_->Find(input[j]); + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); - int start = time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + int start = + time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ : 0; - if (j > 0) { - start = (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < - time_stamp_gap_ - ? (time_stamp[j - 1] + time_stamp[j]) / 2 * - FrameShiftInMs() - : start; + if (j > 0) { + start = + (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = + (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } } - int end = time_stamp[j] * FrameShiftInMs(); - if (j < input.size() - 1) { - end = (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < - time_stamp_gap_ - ? 
(time_stamp[j + 1] + time_stamp[j]) / 2 * - FrameShiftInMs() - : end; - } + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } - WordPiece word_piece(word, offset + start, offset + end); - path.word_pieces.emplace_back(word_piece); - } + result_.emplace_back(path); } - // if (post_processor_ != nullptr) { - // path.sentence = post_processor_->Process(path.sentence, finish); - // } - - result_.emplace_back(path); - } - - if (DecodedSomething()) { - VLOG(1) << "Partial CTC result " << result_[0].sentence; - } + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } } void U2Recognizer::AttentionRescoring() { - decoder_->FinalizeSearch(); - UpdateResult(true); - - // No need to do rescoring - if (0.0 == opts_.decoder_opts.rescoring_weight) { - LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; - return; - } - LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; - - // Inputs() returns N-best input ids, which is the basic unit for rescoring - // In CtcPrefixBeamSearch, inputs are the same to outputs - const auto& hypotheses = decoder_->Inputs(); - int num_hyps = hypotheses.size(); - if (num_hyps <= 0) { - return; - } - - kaldi::Timer timer; - std::vector rescoring_score; - decodable_->AttentionRescoring( - hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); - VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; - - // combine ctc score and rescoring score - for (size_t i = 0; i < num_hyps; i++) { - VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score; - result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + - opts_.decoder_opts.ctc_weight * result_[i].score; - } - - std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); - VLOG(1) << "result: " << result_[0].sentence - << " score: " << result_[0].score; -} + decoder_->FinalizeSearch(); + 
UpdateResult(true); -std::string U2Recognizer::GetFinalResult() { - return result_[0].sentence; -} + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + kaldi::Timer timer; + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score; + result_[i].score = + opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + } -std::string U2Recognizer::GetPartialResult() { - return result_[0].sentence; + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(1) << "result: " << result_[0].sentence + << " score: " << result_[0].score; } +std::string U2Recognizer::GetFinalResult() { return result_[0].sentence; } + +std::string U2Recognizer::GetPartialResult() { return result_[0].sentence; } + void U2Recognizer::SetFinished() { feature_pipeline_->SetFinished(); input_finished_ = true; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index b1a7b2e8..9eb0441b 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/u2_recognizer.h" #include "decoder/param.h" +#include "decoder/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -43,7 +43,8 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - ppspeech::U2RecognizerResource resource = ppspeech::U2RecognizerResource::InitFromFlags(); + ppspeech::U2RecognizerResource resource = + ppspeech::U2RecognizerResource::InitFromFlags(); ppspeech::U2Recognizer recognizer(resource); kaldi::Timer timer; @@ -103,7 +104,7 @@ int main(int argc, char* argv[]) { } double elapsed = timer.Elapsed(); - + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); LOG(INFO) << "cost:" << elapsed << " sec"; LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 5e84a1a1..7997e8a7 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -14,17 +14,18 @@ #include "frontend/audio/cmvn.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) @@ -57,7 +58,7 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. 
void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim_ != 0) { KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index 93a6d407..bb7e449f 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -16,16 +16,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); @@ -86,24 +85,27 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (sec): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done() && !wav_info_reader.Done(); wav_reader.Next(), wav_info_reader.Next()) { + for (; !wav_reader.Done() && !wav_info_reader.Done(); + wav_reader.Next(), wav_info_reader.Next()) { const std::string& utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); const std::string& utt2 = wav_info_reader.Key(); const kaldi::WaveInfo& wave_info = wav_info_reader.Value(); - CHECK(utt == utt2) << "wav reader and wav info reader using diff rspecifier!!!"; + CHECK(utt == utt2) + << "wav reader and wav info reader using diff rspecifier!!!"; LOG(INFO) << "utt: " << utt; LOG(INFO) << "samples: " << wave_info.SampleCount(); LOG(INFO) << "dur: " << 
wave_info.Duration() << " sec"; - CHECK(wave_info.SampFreq() == FLAGS_sample_rate) << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); + CHECK(wave_info.SampFreq() == FLAGS_sample_rate) + << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq(); // load first channel wav int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); - + // compute feat chunk by chunk int tot_samples = waveform.Dim(); int sample_offset = 0; @@ -157,7 +159,8 @@ int main(int argc, char* argv[]) { ++cur_idx; } } - LOG(INFO) << "feat shape: " << features.NumRows() << " , " << features.NumCols(); + LOG(INFO) << "feat shape: " << features.NumRows() << " , " + << features.NumCols(); feat_writer.Write(utt, features); // reset frontend pipeline state diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index 889f5663..42693c0c 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -14,16 +14,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 7232efc4..65493e42 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ 
b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,8 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) + : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); @@ -43,4 +44,4 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt new ppspeech::Assembler(opts.assembler_opts, std::move(cache))); } -} // ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index b76c6280..dc971e0f 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -18,8 +18,8 @@ namespace ppspeech { using kaldi::BaseFloat; using kaldi::Matrix; -using std::vector; using kaldi::Vector; +using std::vector; Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, @@ -56,7 +56,6 @@ int32 Decodable::NumIndices() const { return 0; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } - bool Decodable::EnsureFrameHaveComputed(int32 frame) { // decoding frame if (frame >= frames_ready_) { @@ -92,14 +91,15 @@ bool Decodable::AdvanceChunk() { return true; } -bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* vocab_dim) { +bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, + int* vocab_dim) { if (AdvanceChunk() == false) { return false; } int nrows = nnet_out_cache_.NumRows(); - CHECK(nrows == (frames_ready_ - frame_offset_)); - if (nrows <= 0){ + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0) { LOG(WARNING) << "No new nnet out in cache."; return false; } @@ -107,7 +107,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, int* voc logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); logprobs->CopyRowsFromMat(nnet_out_cache_); - *vocab_dim = nnet_out_cache_.NumCols(); + 
*vocab_dim = nnet_out_cache_.NumCols(); return true; } @@ -140,7 +140,7 @@ BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { BaseFloat logprob = 0.0; int32 frame_idx = frame - frame_offset_; BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); - if (nnet_->IsLogProb()){ + if (nnet_->IsLogProb()) { logprob = nnet_out; } else { logprob = std::log(nnet_out + std::numeric_limits::epsilon()); @@ -158,8 +158,8 @@ void Decodable::Reset() { } void Decodable::AttentionRescoring(const std::vector>& hyps, - float reverse_weight, - std::vector* rescoring_score){ + float reverse_weight, + std::vector* rescoring_score) { nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 71252477..4bafdf83 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -242,7 +242,6 @@ void U2Nnet::ForwardEncoderChunkImpl( const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { - #ifdef USE_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); @@ -349,8 +348,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // current offset in decoder frame // not used in nnet offset_ += chunk_out.shape()[1]; - VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] << " total: " << offset_ ; - + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] + << " total: " << offset_; + // collects encoder outs. 
encoder_outs_.push_back(chunk_out); @@ -706,12 +706,13 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, } -void U2Nnet::EncoderOuts(std::vector>* encoder_out) const { +void U2Nnet::EncoderOuts( + std::vector>* encoder_out) const { // list of (B=1,T,D) int size = encoder_outs_.size(); VLOG(1) << "encoder_outs_ size: " << size; - for (int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; const std::vector shape = item.shape(); CHECK(shape.size() == 3); @@ -719,16 +720,17 @@ void U2Nnet::EncoderOuts(std::vector>* encoder_o const int& T = shape[1]; const int& D = shape[2]; CHECK(B == 1) << "Only support batch one."; - VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; + VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," + << D << ")"; - const float *this_tensor_ptr = item.data(); - for (int j = 0; j < T; j++){ - const float* cur = this_tensor_ptr + j * D; + const float* this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++) { + const float* cur = this_tensor_ptr + j * D; kaldi::Vector out(D); std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); encoder_out->emplace_back(out); } } - } +} } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index adbbf0e8..5039a59a 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -14,11 +14,11 @@ #include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "decoder/param.h" #include "nnet/u2_nnet.h" @@ -46,15 +46,16 @@ int main(int argc, char* argv[]) { LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; - kaldi::SequentialBaseFloatMatrixReader 
feature_reader(FLAGS_feature_rspecifier); + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_rspecifier); kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier); - kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier); + kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer( + FLAGS_nnet_encoder_outs_wspecifier); ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); - int32 chunk_size = - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + - FLAGS_receptive_field_length; + int32 chunk_size = (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate + + FLAGS_receptive_field_length; int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; @@ -92,9 +93,9 @@ int main(int argc, char* argv[]) { ori_feature_len - chunk_idx * chunk_stride, chunk_size); } if (this_chunk_size < receptive_field_length) { - LOG(WARNING) << "utt: " << utt << " skip last " - << this_chunk_size << " frames, expect is " - << receptive_field_length; + LOG(WARNING) + << "utt: " << utt << " skip last " << this_chunk_size + << " frames, expect is " << receptive_field_length; break; } @@ -123,13 +124,17 @@ int main(int argc, char* argv[]) { kaldi::Vector logprobs; bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim); CHECK(isok == true); - for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; row_idx ++) { + for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim; + row_idx++) { kaldi::Vector vec_tmp(vocab_dim); - std::memcpy(vec_tmp.Data(), logprobs.Data() + row_idx*vocab_dim, sizeof(kaldi::BaseFloat) * vocab_dim); + std::memcpy(vec_tmp.Data(), + logprobs.Data() + row_idx * vocab_dim, + sizeof(kaldi::BaseFloat) * vocab_dim); prob_vec.push_back(vec_tmp); } - VLOG(2) << "frame_idx: " << frame_idx << " elapsed: " << timer.Elapsed() << " sec."; + VLOG(2) << 
"frame_idx: " << frame_idx + << " elapsed: " << timer.Elapsed() << " sec."; } // get encoder out @@ -141,7 +146,8 @@ int main(int argc, char* argv[]) { if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) { // the TokenWriter can not write empty string. ++num_err; - LOG(WARNING) << " the nnet prob/encoder_out of " << utt << " is empty"; + LOG(WARNING) << " the nnet prob/encoder_out of " << utt + << " is empty"; continue; } @@ -168,7 +174,8 @@ int main(int argc, char* argv[]) { kaldi::Matrix encoder_outs(nrow, ncol); for (int32 row_idx = 0; row_idx < nrow; ++row_idx) { for (int32 col_idx = 0; col_idx < ncol; ++col_idx) { - encoder_outs(row_idx, col_idx) = encoder_out_vec[row_idx](col_idx); + encoder_outs(row_idx, col_idx) = + encoder_out_vec[row_idx](col_idx); } } nnet_encoder_outs_writer.Write(utt, encoder_outs); diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc index 827b164f..5c32caf2 100644 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "websocket/websocket_server.h" #include "decoder/param.h" +#include "websocket/websocket_server.h" DEFINE_int32(port, 8082, "websocket listening port"); ppspeech::RecognizerResource InitRecognizerResoure() { ppspeech::RecognizerResource resource; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); return resource; } diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index c218990a..289470f6 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -16,13 +16,13 @@ #include "utils/math.h" -#include "base/common.h" - #include #include #include #include +#include "base/common.h" + namespace ppspeech { @@ -89,8 +89,8 @@ void TopK(const std::vector& data, } template void TopK(const std::vector& data, - int32_t k, - std::vector* values, - std::vector* indices) ; + int32_t k, + std::vector* values, + std::vector* indices); } // namespace ppspeech \ No newline at end of file From 99b3632d4d904e348e4cf37397538bb0a11bd2a8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:41:09 +0000 Subject: [PATCH 27/60] seprate recognizer; NnetBase as base class --- speechx/speechx/CMakeLists.txt | 6 +++ speechx/speechx/decoder/CMakeLists.txt | 14 ++---- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/nnet/decodable.h | 6 +-- speechx/speechx/nnet/ds2_nnet.h | 2 +- speechx/speechx/nnet/nnet_itf.h | 10 +++-- speechx/speechx/nnet/u2_nnet.cc | 2 +- speechx/speechx/nnet/u2_nnet.h | 6 +-- .../speechx/protocol/websocket/CMakeLists.txt | 2 +- .../protocol/websocket/websocket_server.h | 2 +- speechx/speechx/recognizer/CMakeLists.txt | 45 
+++++++++++++++++++ .../{decoder => recognizer}/recognizer.cc | 2 +- .../{decoder => recognizer}/recognizer.h | 0 .../recognizer_main.cc | 13 +----- .../{decoder => recognizer}/u2_recognizer.cc | 4 +- .../{decoder => recognizer}/u2_recognizer.h | 0 .../u2_recognizer_main.cc | 2 +- 17 files changed, 78 insertions(+), 40 deletions(-) create mode 100644 speechx/speechx/recognizer/CMakeLists.txt rename speechx/speechx/{decoder => recognizer}/recognizer.cc (97%) rename speechx/speechx/{decoder => recognizer}/recognizer.h (100%) rename speechx/speechx/{decoder => recognizer}/recognizer_main.cc (88%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer.cc (98%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer.h (100%) rename speechx/speechx/{decoder => recognizer}/u2_recognizer_main.cc (99%) diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt index c8e21d48..60c18347 100644 --- a/speechx/speechx/CMakeLists.txt +++ b/speechx/speechx/CMakeLists.txt @@ -32,6 +32,12 @@ ${CMAKE_CURRENT_SOURCE_DIR}/decoder ) add_subdirectory(decoder) +include_directories( +${CMAKE_CURRENT_SOURCE_DIR} +${CMAKE_CURRENT_SOURCE_DIR}/recognizer +) +add_subdirectory(recognizer) + include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/protocol diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index d06c3529..5bec24a6 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -1,28 +1,24 @@ -project(decoder) - include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders}) -set(decoder_src ) +set(srcs) if (USING_DS2) -list(APPEND decoder_src +list(APPEND srcs ctc_decoders/decoder_utils.cpp ctc_decoders/path_trie.cpp ctc_decoders/scorer.cpp ctc_beam_search_decoder.cc ctc_tlg_decoder.cc -recognizer.cc ) endif() if (USING_U2) - list(APPEND decoder_src + list(APPEND srcs ctc_prefix_beam_search_decoder.cc - u2_recognizer.cc ) endif() -add_library(decoder STATIC 
${decoder_src}) +add_library(decoder STATIC ${srcs}) target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings) # test @@ -30,7 +26,6 @@ if (USING_DS2) set(BINS ctc_beam_search_decoder_main nnet_logprob_decoder_main - recognizer_main ctc_tlg_decoder_main ) @@ -45,7 +40,6 @@ endif() if (USING_U2) set(TEST_BINS ctc_prefix_beam_search_decoder_main - u2_recognizer_main ) foreach(bin_name IN LISTS TEST_BINS) diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index dc971e0f..9bad8ed4 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -21,7 +21,7 @@ using kaldi::Matrix; using kaldi::Vector; using std::vector; -Decodable::Decodable(const std::shared_ptr& nnet, +Decodable::Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale) : frontend_(frontend), diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 70a16e2c..dd7b329e 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -24,7 +24,7 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable(const std::shared_ptr& nnet, + explicit Decodable(const std::shared_ptr& nnet, const std::shared_ptr& frontend, kaldi::BaseFloat acoustic_scale = 1.0); @@ -63,14 +63,14 @@ class Decodable : public kaldi::DecodableInterface { int32 TokenId2NnetId(int32 token_id); - std::shared_ptr Nnet() { return nnet_; } + std::shared_ptr Nnet() { return nnet_; } // for offline test void Acceptlikelihood(const kaldi::Matrix& likelihood); private: std::shared_ptr frontend_; - std::shared_ptr nnet_; + std::shared_ptr nnet_; // nnet outputs' cache kaldi::Matrix nnet_out_cache_; diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index 4aeec32f..d1e3ac8c 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -48,7 +48,7 @@ class 
Tensor { std::vector _data; }; -class PaddleNnet : public NnetInterface { +class PaddleNnet : public NnetBase { public: PaddleNnet(const ModelOptions& opts); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index cc737ce0..a504cce5 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -11,8 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - - #pragma once #include "base/basic_types.h" @@ -105,11 +103,15 @@ class NnetInterface { // true, nnet output is logprob; otherwise is prob, virtual bool IsLogProb() = 0; - int SubsamplingRate() const { return subsampling_rate_; } - // using to get encoder outs. e.g. seq2seq with Attention model. virtual void EncoderOuts( std::vector>* encoder_out) const = 0; +}; + + +class NnetBase : public NnetInterface { + public: + int SubsamplingRate() const { return subsampling_rate_; } protected: int subsampling_rate_{1}; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 4bafdf83..c92c96aa 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -193,7 +193,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { // ignore inner states } -std::shared_ptr U2Nnet::Copy() const { +std::shared_ptr U2Nnet::Copy() const { auto asr_model = std::make_shared(*this); // reset inner state for new decoding asr_model->Reset(); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 3435bca8..a37a88f2 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -24,7 +24,7 @@ namespace ppspeech { -class U2NnetBase : public NnetInterface { +class U2NnetBase : public NnetBase { public: virtual int context() const { return right_context_ + 1; } virtual int right_context() const { return right_context_; } @@ -41,7 +41,7 @@ class U2NnetBase : public 
NnetInterface { // start: false, it is the start chunk of one sentence, else true virtual int num_frames_for_chunk(bool start) const; - virtual std::shared_ptr Copy() const = 0; + virtual std::shared_ptr Copy() const = 0; virtual void ForwardEncoderChunk( const std::vector& chunk_feats, @@ -99,7 +99,7 @@ class U2Nnet : public U2NnetBase { std::shared_ptr model() const { return model_; } - std::shared_ptr Copy() const override; + std::shared_ptr Copy() const override; void ForwardEncoderChunkImpl( const std::vector& chunk_feats, diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt index a171d84d..cafbbec7 100644 --- a/speechx/speechx/protocol/websocket/CMakeLists.txt +++ b/speechx/speechx/protocol/websocket/CMakeLists.txt @@ -2,7 +2,7 @@ add_library(websocket STATIC websocket_server.cc websocket_client.cc ) -target_link_libraries(websocket PUBLIC frontend decoder nnet) +target_link_libraries(websocket PUBLIC frontend nnet decoder recognizer) add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc) target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 8f3360e4..9b05f868 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -19,7 +19,7 @@ #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" -#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" #include "frontend/audio/feature_pipeline.h" namespace beast = boost::beast; // from diff --git a/speechx/speechx/recognizer/CMakeLists.txt b/speechx/speechx/recognizer/CMakeLists.txt new file mode 100644 index 00000000..05078873 --- /dev/null +++ b/speechx/speechx/recognizer/CMakeLists.txt @@ -0,0 +1,45 @@ +set(srcs) + +if 
(USING_DS2) +list(APPEND srcs +recognizer.cc +) +endif() + +if (USING_U2) + list(APPEND srcs + u2_recognizer.cc + ) +endif() + +add_library(recognizer STATIC ${srcs}) +target_link_libraries(recognizer PUBLIC decoder) + +# test +if (USING_DS2) + set(BINS recognizer_main) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) + set(TEST_BINS + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() + diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/recognizer/recognizer.cc similarity index 97% rename from speechx/speechx/decoder/recognizer.cc rename to speechx/speechx/recognizer/recognizer.cc index 870aa40a..c6631813 100644 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/recognizer/recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" namespace ppspeech { diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/recognizer/recognizer.h similarity index 100% rename from speechx/speechx/decoder/recognizer.h rename to speechx/speechx/recognizer/recognizer.h diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc similarity index 88% rename from speechx/speechx/decoder/recognizer_main.cc rename to speechx/speechx/recognizer/recognizer_main.cc index 8e83b188..7c30fe6a 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "decoder/param.h" -#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" @@ -22,15 +22,6 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); -ppspeech::RecognizerResource InitRecognizerResoure() { - ppspeech::RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = - ppspeech::FeaturePipelineOptions::InitFromFlags(); - resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); - resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); - return resource; -} int main(int argc, char* argv[]) { gflags::SetUsageMessage("Usage:"); @@ -39,7 +30,7 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = InitRecognizerResoure(); + ppspeech::RecognizerResource resource = ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/decoder/u2_recognizer.cc 
b/speechx/speechx/recognizer/u2_recognizer.cc similarity index 98% rename from speechx/speechx/decoder/u2_recognizer.cc rename to speechx/speechx/recognizer/u2_recognizer.cc index 04712e7b..75834aa5 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/u2_recognizer.h" +#include "recognizer/u2_recognizer.h" #include "nnet/u2_nnet.h" @@ -30,7 +30,7 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); - std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h similarity index 100% rename from speechx/speechx/decoder/u2_recognizer.h rename to speechx/speechx/recognizer/u2_recognizer.h diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc similarity index 99% rename from speechx/speechx/decoder/u2_recognizer_main.cc rename to speechx/speechx/recognizer/u2_recognizer_main.cc index 9eb0441b..ff848f58 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "decoder/param.h" -#include "decoder/u2_recognizer.h" +#include "recognizer/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" From 043246e16807b51e6a9f29b2da5ff18428614f45 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:54:06 +0000 Subject: [PATCH 28/60] format --- speechx/speechx/protocol/websocket/websocket_server.h | 2 +- speechx/speechx/recognizer/u2_recognizer_main.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h index 9b05f868..b0dcb3e3 100644 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -19,8 +19,8 @@ #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" -#include "recognizer/recognizer.h" #include "frontend/audio/feature_pipeline.h" +#include "recognizer/recognizer.h" namespace beast = boost::beast; // from namespace http = beast::http; // from diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index ff848f58..38bd5ccc 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -13,9 +13,9 @@ // limitations under the License. 
#include "decoder/param.h" -#include "recognizer/u2_recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/u2_recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); From 005d0e17be246f3e867be4250fe890cedf58c205 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 03:57:09 +0000 Subject: [PATCH 29/60] update path.sh --- speechx/examples/codelab/u2/path.sh | 2 +- speechx/examples/u2pp_ol/wenetspeech/path.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh index 7f32fbce..d0600133 100644 --- a/speechx/examples/codelab/u2/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -12,7 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh index 7f32fbce..d0600133 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/path.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -12,7 +12,7 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C -export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH From 
138b4fe1f083d24d5bdccc461224dcf13ef0706d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 06:59:21 +0000 Subject: [PATCH 30/60] fix cmake paddle flags; more doc info --- speechx/CMakeLists.txt | 8 ++++-- speechx/README.md | 4 +++ speechx/examples/README.md | 30 +++++++++++++++++--- speechx/examples/codelab/README.md | 5 ++-- speechx/examples/codelab/u2/path.sh | 2 +- speechx/examples/u2pp_ol/README.md | 2 +- speechx/examples/u2pp_ol/wenetspeech/path.sh | 2 +- 7 files changed, 41 insertions(+), 12 deletions(-) diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 17e64c04..6255cb2e 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -100,8 +100,9 @@ message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") # paddle include and link option +# -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so execute_process( - COMMAND python -c "import paddle ; print(' '.join(paddle.sysconfig.get_link_flags()), end='')" + COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]); out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);" OUTPUT_VARIABLE PADDLE_LINK_FLAGS RESULT_VARIABLE SUCESS) @@ -109,8 +110,9 @@ message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) # paddle compile option +# -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include execute_process( - COMMAND python -c "import paddle ; print(' 
'.join(paddle.sysconfig.get_compile_flags()), end='')" + COMMAND python -c "import paddle; include_dir = paddle.sysconfig.get_include(); print(f\"-I{include_dir}\");" OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) @@ -119,7 +121,7 @@ string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) # for LD_LIBRARY_PATH # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) execute_process( - COMMAND python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')" + COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);" OUTPUT_VARIABLE PADDLE_LIB_DIRS) message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) diff --git a/speechx/README.md b/speechx/README.md index cc7b13e6..3861edf3 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -35,11 +35,15 @@ bash tools/venv.sh 2. Build `speechx` and `examples`. +For now we using feature under `develop` branch of paddle, so we need install `paddlepaddle` nightly build version. +For example: ``` source venv/bin/activate +python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html ./build.sh ``` + 3. Go to `examples` to have a fun. More details please see `README.md` under `examples`. diff --git a/speechx/examples/README.md b/speechx/examples/README.md index f7f6f9ac..de27bd94 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,20 +1,42 @@ # Examples for SpeechX +> `u2pp_ol` is recommended. + +* `u2pp_ol` - u2++ streaming asr test under `aishell-1` test dataset. 
* `ds2_ol` - ds2 streaming test under `aishell-1` test dataset. + ## How to run -`run.sh` is the entry point. +### Create env + +Using `tools/evn.sh` under `speechx` to create python env. + +``` +bash tools/env.sh +``` + +Source env before play with example. +``` +. venv/bin/activate +``` + +### Play with example + +`run.sh` is the entry point for every example. -Example to play `ds2_ol`: +Example to play `u2pp_ol`: ``` -pushd ds2_ol/aishell -bash run.sh +pushd u2pp_ol/wenetspeech +bash run.sh --stop_stage 4 ``` ## Display Model with [Netron](https://github.com/lutzroeder/netron) +If you have a model, we can using this commnd to show model graph. + +For example: ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md index f89184de..803f25fa 100644 --- a/speechx/examples/codelab/README.md +++ b/speechx/examples/codelab/README.md @@ -1,8 +1,9 @@ # Codelab -## introduction +> The below is for developing and offline testing. +> Do not run it only if you know what it is. -> The below is for developing and offline testing. Do not run it only if you know what it is. 
* nnet * feat * decoder +* u2 diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh index d0600133..ec278bd3 100644 --- a/speechx/examples/codelab/u2/path.sh +++ b/speechx/examples/codelab/u2/path.sh @@ -14,5 +14,5 @@ export LC_AL=C export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer -PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md index ce01a8fc..838db435 100644 --- a/speechx/examples/u2pp_ol/README.md +++ b/speechx/examples/u2pp_ol/README.md @@ -2,4 +2,4 @@ ## Examples -* `wenetspeech` - Streaming Decoding using wenetspeech u2/u2++ model. Using aishell test data for testing. +* `wenetspeech` - Streaming Decoding with wenetspeech u2/u2++ model. Using aishell test data for testing. 
diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh index d0600133..ec278bd3 100644 --- a/speechx/examples/u2pp_ol/wenetspeech/path.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -14,5 +14,5 @@ export LC_AL=C export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer -PADDLE_LIB_PATH=$(python -c "import paddle ; print(':'.join(paddle.sysconfig.get_lib()), end='')") +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH From 36af34b293a18c0fc3b61de3b261b04468cac1b7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 09:00:14 +0000 Subject: [PATCH 31/60] add DecoderBase and license --- speechx/speechx/decoder/ctc_beam_search_decoder.h | 2 +- speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h | 5 ++++- speechx/speechx/decoder/ctc_prefix_beam_search_score.h | 2 ++ speechx/speechx/decoder/ctc_tlg_decoder.h | 2 +- speechx/speechx/decoder/decoder_itf.h | 3 +++ speechx/speechx/nnet/u2_nnet.cc | 3 +++ speechx/speechx/nnet/u2_nnet.h | 2 ++ 7 files changed, 16 insertions(+), 3 deletions(-) diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 479754c3..6347bba8 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -23,7 +23,7 @@ namespace ppspeech { -class CTCBeamSearch : public DecoderInterface { +class CTCBeamSearch : public DecoderBase { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h 
b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index eef8823d..ef96ecd9 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc + #pragma once #include "decoder/ctc_beam_search_opt.h" @@ -21,7 +24,7 @@ namespace ppspeech { class ContextGraph; -class CTCPrefixBeamSearch : public DecoderInterface { +class CTCPrefixBeamSearch : public DecoderBase { public: explicit CTCPrefixBeamSearch(const std::string vocab_path, const CTCBeamSearchOptions& opts); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index da2fb80a..908be1d6 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h + #pragma once #include "base/common.h" diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index cf8a9b73..f250ac25 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -54,7 +54,7 @@ struct TLGDecoderOptions { } }; -class TLGDecoder : public DecoderInterface { +class TLGDecoder : public DecoderBase { public: explicit TLGDecoder(TLGDecoderOptions opts); ~TLGDecoder() = default; diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h index eec9bc3d..2289b317 100644 --- a/speechx/speechx/decoder/decoder_itf.h +++ b/speechx/speechx/decoder/decoder_itf.h @@ -51,7 +51,10 @@ class DecoderInterface { virtual std::vector> GetNBestPath() = 0; virtual std::vector> GetNBestPath(int n) = 0; +}; +class DecoderBase : public DecoderInterface { + protected: // start from one int NumFrameDecoded() { return num_frame_decoded_ + 1; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index c92c96aa..ff6a4dc3 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -1,3 +1,4 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc + #include "nnet/u2_nnet.h" #ifdef USE_PROFILING diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index a37a88f2..48dd8193 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -1,3 +1,4 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. 
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h #pragma once #include "base/common.h" From 0a8ef58af088d58ae882044640eba5fcb64ccf13 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 18 Oct 2022 09:10:49 +0000 Subject: [PATCH 32/60] remove uesless code --- speechx/speechx/nnet/u2_nnet.cc | 83 +-------------------------------- speechx/speechx/nnet/u2_nnet.h | 27 ++++------- 2 files changed, 10 insertions(+), 100 deletions(-) diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index ff6a4dc3..baae2ce8 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -25,65 +25,6 @@ using paddle::platform::TracerEventType; namespace ppspeech { -int U2NnetBase::num_frames_for_chunk(bool start) const { - int num_needed_frames = 0; // num feat frames - bool first = !start; // start == false is first - - if (chunk_size_ > 0) { - // streaming mode - if (first) { - // first chunk - // 1 decoder frame need `context` feat frames - int context = this->context(); - num_needed_frames = (chunk_size_ - 1) * subsampling_rate_ + context; - } else { - // after first chunk, we need stride this num frames. - num_needed_frames = chunk_size_ * subsampling_rate_; - } - } else { - // non-streaming mode. feed all feats once. 
- num_needed_frames = std::numeric_limits::max(); - } - - return num_needed_frames; -} - -// cache feats for next chunk -void U2NnetBase::CacheFeature(const std::vector& chunk_feats, - int32 feat_dim) { - // chunk_feats is nframes*feat_dim - const int chunk_size = chunk_feats.size() / feat_dim; - const int cached_feat_size = this->context() - subsampling_rate_; - if (chunk_size >= cached_feat_size) { - cached_feats_.resize(cached_feat_size); - for (int i = 0; i < cached_feat_size; ++i) { - auto start = - chunk_feats.begin() + chunk_size - cached_feat_size + i; - auto end = start + feat_dim; - cached_feats_[i] = std::vector(start, end); - } - } -} - -void U2NnetBase::ForwardEncoderChunk( - const std::vector& chunk_feats, - const int32& feat_dim, - std::vector* ctc_probs, - int32* vocab_dim) { - ctc_probs->clear(); - // int num_frames = cached_feats_.size() + chunk_feats.size(); - int num_frames = chunk_feats.size() / feat_dim; - VLOG(3) << "foward encoder chunk: " << num_frames << " frames"; - VLOG(3) << "context: " << this->context() << " frames"; - - if (num_frames >= this->context()) { - this->ForwardEncoderChunkImpl( - chunk_feats, feat_dim, ctc_probs, vocab_dim); - VLOG(3) << "after forward chunk"; - this->CacheFeature(chunk_feats, feat_dim); - } -} - void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { paddle::jit::utils::InitKernelSignatureMap(); @@ -188,7 +129,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { forward_attention_decoder_ = other.forward_attention_decoder_; ctc_activation_ = other.ctc_activation_; - // offset_ = other.offset_; // TODO: not used in nnets + offset_ = other.offset_; // copy model ptr model_ = other.model_; @@ -204,8 +145,7 @@ std::shared_ptr U2Nnet::Copy() const { } void U2Nnet::Reset() { - // offset_ = 0; - // cached_feats_.clear(); // TODO: not used in nnets + offset_ = 0; att_cache_ = std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); @@ -263,16 +203,6 @@ void U2Nnet::ForwardEncoderChunkImpl( 
paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); float* feats_ptr = feats.mutable_data(); - // for (size_t i = 0; i < cached_feats_.size(); ++i) { - // float* row = feats_ptr + i * feat_dim; - // std::memcpy(row, cached_feats_[i].data(), feat_dim * sizeof(float)); - // } - - // for (size_t i = 0; i < chunk_feats.size(); ++i) { - // float* row = feats_ptr + (cached_feats_.size() + i) * feat_dim; - // std::memcpy(row, chunk_feats[i].data(), feat_dim * sizeof(float)); - // } - // not cache feature in nnet CHECK(cached_feats_.size() == 0); // CHECK_EQ(std::is_same::value, true); @@ -427,15 +357,6 @@ void U2Nnet::ForwardEncoderChunkImpl( float* ctc_log_probs_ptr = ctc_log_probs.data(); - // // vector> - // out_prob->resize(T); - // for (int i = 0; i < T; i++) { - // (*out_prob)[i].resize(D); - // float* dst_ptr = (*out_prob)[i].data(); - // float* src_ptr = ctc_log_probs_ptr + (i * D); - // std::memcpy(dst_ptr, src_ptr, D * sizeof(float)); - // } - // CHECK(std::is_same::value); out_prob->resize(T * D); std::memcpy( out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 48dd8193..6cbc0570 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -28,29 +28,21 @@ namespace ppspeech { class U2NnetBase : public NnetBase { public: - virtual int context() const { return right_context_ + 1; } - virtual int right_context() const { return right_context_; } + virtual int Context() const { return right_context_ + 1; } + virtual int RightContext() const { return right_context_; } - virtual int eos() const { return eos_; } - virtual int sos() const { return sos_; } - virtual int is_bidecoder() const { return is_bidecoder_; } + virtual int EOS() const { return eos_; } + virtual int SOS() const { return sos_; } + virtual int IsBidecoder() const { return is_bidecoder_; } // current offset in decoder frame - virtual int offset() const { 
return offset_; } - virtual void set_chunk_size(int chunk_size) { chunk_size_ = chunk_size; } - virtual void set_num_left_chunks(int num_left_chunks) { + virtual int Offset() const { return offset_; } + virtual void SetChunkSize(int chunk_size) { chunk_size_ = chunk_size; } + virtual void SetNumLeftChunks(int num_left_chunks) { num_left_chunks_ = num_left_chunks; } - // start: false, it is the start chunk of one sentence, else true - virtual int num_frames_for_chunk(bool start) const; virtual std::shared_ptr Copy() const = 0; - virtual void ForwardEncoderChunk( - const std::vector& chunk_feats, - const int32& feat_dim, - std::vector* ctc_probs, - int32* vocab_dim); - protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, @@ -58,9 +50,6 @@ class U2NnetBase : public NnetBase { std::vector* ctc_probs, int32* vocab_dim) = 0; - virtual void CacheFeature(const std::vector& chunk_feats, - int32 feat_dim); - protected: // model specification int right_context_{0}; From 050d766915c01a59fd4880dfb263dbc30605944f Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Wed, 19 Oct 2022 05:31:18 +0000 Subject: [PATCH 33/60] fix u2pp model --- docs/source/released_model.md | 2 +- paddlespeech/cli/asr/infer.py | 4 ++-- paddlespeech/resource/model_alias.py | 1 - paddlespeech/resource/pretrained_models.py | 26 +++------------------- 4 files changed, 6 insertions(+), 27 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index a2456f1f..586f17c3 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | 
onnx/inference/python | [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, 
Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 437f6463..00414336 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -52,7 +52,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='conformer_u2pp_wenetspeech', + default='conformer_u2pp_online_wenetspeech', choices=[ tag[:tag.index('-')] for tag in self.task_resource.pretrained_models.keys() @@ -470,7 +470,7 @@ class ASRExecutor(BaseExecutor): @stats_wrapper def __call__(self, audio_file: os.PathLike, - model: str='conformer_u2pp_wenetspeech', + model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', sample_rate: int=16000, config: os.PathLike=None, diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py index f5ec655b..8e9ecc4b 100644 --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -25,7 +25,6 @@ model_alias = { "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"], "conformer": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"], - "conformer_u2pp": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_u2pp_online": ["paddlespeech.s2t.models.u2:U2Model"], "transformer": ["paddlespeech.s2t.models.u2:U2Model"], "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"], diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 
efd6bb3f..df50a6a9 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -68,32 +68,12 @@ asr_dynamic_pretrained_models = { '', }, }, - "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', - 'md5': - '662b347e1d2131b7a4dc5398365e2134', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10', - 'model': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'params': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'lm_url': - '', - 'lm_md5': - '', - }, - }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz', 'md5': - '3100fc1eac5779486cab859366992d0b', + '62d230c1bf27731192aa9d3b8deca300', 'cfg_path': 'model.yaml', 'ckpt_path': From f9fc32e89ebd82193feceea3bf79bb27b4ee5d80 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 Oct 2022 07:56:57 +0000 Subject: [PATCH 34/60] fix scripts --- .../examples/u2pp_ol/wenetspeech/local/recognizer.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index 29b50537..bf463545 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -6,10 +6,13 @@ set -e data=data exp=exp nj=20 + + mkdir -p $exp ckpt_dir=./data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ aishell_wav_scp=aishell_test.scp +text=$data/test/text ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp 
$nj @@ -27,8 +30,8 @@ u2_recognizer_main \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark -cat $data/split${nj}/*/result_recognizer.ark > $exp/${label_file}_recognizer -utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer +cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer +utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err echo "recognizer test have finished!!!" -echo "please checkout in ${exp}/${wer}.recognizer" -tail -n 7 $exp/${wer}.recognizer \ No newline at end of file +echo "please checkout in $exp/aishell.recognizer.err" +tail -n 7 $exp/aishell.recognizer.err \ No newline at end of file From 7e334ce890a512f067af9a0918632a1c3c45001e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 Oct 2022 12:43:47 +0000 Subject: [PATCH 35/60] fix assembler buf, which not clear cache, and fill zero default --- .../u2pp_ol/wenetspeech/local/recognizer.sh | 2 +- speechx/speechx/frontend/audio/assembler.cc | 50 ++++++++++++++----- speechx/speechx/frontend/audio/assembler.h | 31 ++++++------ speechx/speechx/frontend/audio/audio_cache.cc | 4 ++ speechx/speechx/frontend/audio/audio_cache.h | 4 +- .../speechx/frontend/audio/feature_cache.cc | 3 ++ .../speechx/frontend/audio/feature_cache.h | 10 ++-- speechx/speechx/nnet/u2_nnet.cc | 1 + .../speechx/recognizer/u2_recognizer_main.cc | 6 ++- 9 files changed, 77 insertions(+), 34 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index bf463545..f71a8003 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -5,7 +5,7 @@ set -e data=data exp=exp -nj=20 +nj=40 mkdir -p $exp diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 37eeec80..ff1b1f28 100644 --- 
a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -23,9 +23,11 @@ using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, unique_ptr base_extractor) { + fill_zero_ = opts.fill_zero; frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk; frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length; + cache_size_ = frame_chunk_size_ - frame_chunk_stride_; receptive_filed_length_ = opts.receptive_filed_length; base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -38,14 +40,13 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { - feats->Resize(dim_ * frame_chunk_size_); bool result = Compute(feats); return result; } -// read all data from base_feature_extractor_ into cache_ +// read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { - // compute and feed + // compute and feed frame by frame bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; @@ -54,33 +55,58 @@ bool Assembler::Compute(Vector* feats) { if (IsFinished() == false) return false; break; } + + CHECK(feature.Dim() == dim_); + nframes_ += 1; + VLOG(1) << "nframes: " << nframes_; + feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { + VLOG(1) << "feature_cache less than receptive_filed_lenght. 
" << feature_cache_.size() << ": " << receptive_filed_length_; return false; } - while (feature_cache_.size() < frame_chunk_size_) { - Vector feature(dim_, kaldi::kSetZero); - feature_cache_.push(feature); + + if (fill_zero_){ + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature(dim_, kaldi::kSetZero); + nframes_ += 1; + feature_cache_.push(feature); + } } + int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + feats->Resize(dim_ * this_chunk_size); + int32 counter = 0; - int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; - int32 elem_dim = base_extractor_->Dim(); - while (counter < frame_chunk_size_) { + while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); - int32 start = counter * elem_dim; - feats->Range(start, elem_dim).CopyFromVec(val); - if (frame_chunk_size_ - counter <= cache_size) { + CHECK(val.Dim() == dim_) << val.Dim(); + + int32 start = counter * dim_; + feats->Range(start, dim_).CopyFromVec(val); + + if (this_chunk_size - counter <= cache_size_) { feature_cache_.push(val); } + + // val is reference, so we should pop here feature_cache_.pop(); + counter++; } return result; } + + void Assembler::Reset() { + std::queue> empty; + std::swap(feature_cache_, empty); + nframes_ = 0; + base_extractor_->Reset(); +} + } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 258e61f2..4f165ea8 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -22,14 +22,10 @@ namespace ppspeech { struct AssemblerOptions { // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py // the nnet batch forward - int32 receptive_filed_length; - int32 subsampling_rate; - int32 nnet_decoder_chunk; - - AssemblerOptions() - : receptive_filed_length(1), - subsampling_rate(1), - nnet_decoder_chunk(1) {} + int32 
receptive_filed_length{1}; + int32 subsampling_rate{1}; + int32 nnet_decoder_chunk{1}; + bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -39,29 +35,34 @@ class Assembler : public FrontendInterface { std::unique_ptr base_extractor = NULL); // Feed feats or waves - virtual void Accept(const kaldi::VectorBase& inputs); + void Accept(const kaldi::VectorBase& inputs) override; // feats size = num_frames * feat_dim - virtual bool Read(kaldi::Vector* feats); + bool Read(kaldi::Vector* feats) override; // feat dim - virtual size_t Dim() const { return dim_; } + size_t Dim() const override { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } + void SetFinished() override { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool IsFinished() const override { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } + void Reset() override; private: bool Compute(kaldi::Vector* feats); - int32 dim_; + bool fill_zero_{false}; + + int32 dim_; // feat dim int32 frame_chunk_size_; // window int32 frame_chunk_stride_; // stride + int32 cache_size_; // window - stride int32 receptive_filed_length_; std::queue> feature_cache_; std::unique_ptr base_extractor_; + + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index b7a15acd..71e5d09e 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -83,6 +83,10 @@ bool AudioCache::Read(Vector* waves) { } size_ -= chunk_size; offset_ = (offset_ + chunk_size) % ring_buffer_.size(); + + nsamples_ += chunk_size; + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git 
a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index fc07d4ba..da422daa 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -41,10 +41,11 @@ class AudioCache : public FrontendInterface { virtual bool IsFinished() const { return finished_; } - virtual void Reset() { + void Reset() override { offset_ = 0; size_ = 0; finished_ = false; + nsamples_ = 0; } private: @@ -61,6 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram + int32 nsamples_; // number samples readed. DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 509a98c3..c712e48e 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -73,6 +73,9 @@ bool FeatureCache::Compute() { if (result == false || feature.Dim() == 0) return false; int32 num_chunk = feature.Dim() / dim_; + nframe_ += num_chunk; + VLOG(1) << "nframe computed: " << nframe_; + for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; Vector feature_chunk(dim_); diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b922de12..09d7f7eb 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -51,11 +51,12 @@ class FeatureCache : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { + void Reset() override { + std::queue> empty; + std::swap(cache_, empty); + nframe_ = 0; base_extractor_->Reset(); - while (!cache_.empty()) { - cache_.pop(); - } + VLOG(1) << "feature cache reset: cache size: " << cache_.size(); } private: @@ -74,6 +75,7 @@ class 
FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index baae2ce8..63a8a793 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -153,6 +153,7 @@ void U2Nnet::Reset() { std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); encoder_outs_.clear(); + VLOG(1) << "u2nnet reset"; } // Debug API diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 38bd5ccc..2375586e 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -82,9 +82,13 @@ int main(int argc, char* argv[]) { // no overlap sample_offset += cur_chunk_size; } + CHECK(sample_offset == tot_samples); + + // recognizer.SetFinished(); + // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); From 3ac7ac253f66c46f01aa11be3de95d6177f47107 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 09:29:11 +0800 Subject: [PATCH 36/60] fix review issue,test=tts --- paddlespeech/t2s/ssml/xml_processor.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index 54f24f59..b3912134 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -35,8 +35,8 @@ class MixTextProcessor(): return None def get_content_split(self, mixstr): - ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 - 不能去除空格,因为xml 中tag 属性带空格 + ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为 xml 中tag 属性带空格 ''' ctlist = [] # print("Testing:",mixstr[:20]) @@ -77,17 +77,12 @@ class MixTextProcessor(): class DomXml(): 
def __init__(self, xmlstr): - print("Parse xml str:", xmlstr) self.tdom = parseString(xmlstr) #Document - # print("tdom:",type(self.tdom)) self.root = self.tdom.documentElement #Element - # print("root:",type(self.root)) self.rnode = self.tdom.childNodes #NodeList - # print("rnode:",type(self.rnode)) - pass def get_text(self): - '''返回xml 内容的所有文本内容的 列表''' + '''返回 xml 内容的所有文本内容的列表''' res = [] for x1 in self.rnode: @@ -107,7 +102,7 @@ class DomXml(): return res def get_xmlchild_list(self): - '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + '''返回 xml 内容的列表,包括所有文本内容(不带 tag)''' res = [] for x1 in self.rnode: @@ -127,7 +122,7 @@ class DomXml(): return res def get_pinyins_for_xml(self): - '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + '''返回 xml 内容,字符串和拼音的 list ''' res = [] for x1 in self.rnode: @@ -155,7 +150,7 @@ class DomXml(): return res def get_all_tags(self, tag_name): - '''获取所有的tag 及属性值''' + '''获取所有的 tag 及属性值''' alltags = self.root.getElementsByTagName(tag_name) for x in alltags: if x.hasAttribute('pinyin'): # pinyin From 7d5ae651ce92d0bd953f0de54b81d00cf951b01d Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:07:21 +0800 Subject: [PATCH 37/60] add readme thanks --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49e40624..0abb3fd6 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. 
- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. diff --git a/README_cn.md b/README_cn.md index bf3ff4df..0c3af5dd 100644 --- a/README_cn.md +++ b/README_cn.md @@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。 +- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 
的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From ec1f9edd562275e2d2799c16e36a304bae172e1c Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:11:26 +0800 Subject: [PATCH 38/60] add space after punctions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0abb3fd6..d02ac4c6 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. 
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From da525d346f0a78fc1b6f11db408a5ce1a76c5610 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:17:17 +0000 Subject: [PATCH 39/60] fix uvicorn's version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e551d9fa..3353cdad 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] requirements = { "install": From 63c80121e2c5691145a2bc8c49cf1a2b277c7067 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 20 Oct 2022 06:33:07 +0000 Subject: [PATCH 40/60] fix uvicorn's bug --- paddlespeech/server/bin/paddlespeech_server.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 10a91d9b..1b1792bd 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -113,7 +113,7 @@ class ServerExecutor(BaseExecutor): """ config = get_config(config_file) if self.init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) + uvicorn.run(app, host=config.host, port=config.port) @cli_server_register( diff --git a/setup.py b/setup.py index 3353cdad..e551d9fa 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ base = [ "pybind11", ] -server = ["fastapi", "uvicorn<=0.18.3", "pattern_singleton", "websockets"] +server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] requirements = { "install": From ce153d915e512c5ab38e7791fb733540189ebfb1 Mon Sep 17 00:00:00 
2001 From: tianhao zhang <15600919271@163.com> Date: Thu, 20 Oct 2022 07:54:00 +0000 Subject: [PATCH 41/60] update u2pp result.md --- examples/wenetspeech/asr1/RESULTS.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md index f22c652e..cd480163 100644 --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -53,3 +53,22 @@ Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | + + +## U2PP Streaming Pretrained Model + +Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.057031 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.068826 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.069111 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.059213 | + +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.049256 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.052086 |
+| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.052267 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.047198 | From ed0138c6e324a87e31a23138bafe6f878ed8f4e9 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 18:09:41 +0800 Subject: [PATCH 42/60] add condition check if a ssml input and filter space line, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 36 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index f9d1cd1b..41663891 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -105,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = re.split(r"\s+", line.strip(), 1) - utt_id = items[0] - if lang == 'zh': - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + if line.strip() != "": + items = re.split(r"\s+", line.strip(), 1) + utt_id = items[0] + if lang == 'zh': + sentence = "".join(items[1:]) + elif lang == 'en': + sentence = " ".join(items[1:]) + elif lang == 'mix': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) return sentences @@ -182,11 +183,20 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids_ssml( - text, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + input_ids = {} + if text.strip() != "" and re.match(r".*?.*?.*", text, + re.DOTALL): + input_ids = frontend.get_input_ids_ssml( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = frontend.get_input_ids( + text, + 
merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] if get_tone_ids: tone_ids = input_ids["tone_ids"] From 64cb4048a85cdaaf5175bcb511ce23d261bb2f71 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 20 Oct 2022 12:37:38 +0000 Subject: [PATCH 43/60] fix topk bug which cause ctc score diff --- speechx/speechx/utils/math.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 289470f6..959740a0 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -79,10 +79,12 @@ void TopK(const std::vector& data, int cur = values->size() - 1; while (!pq.empty()) { const auto& item = pq.top(); - pq.pop(); - + (*values)[cur] = item.first; (*indices)[cur] = item.second; + + // item if reference, must pop here + pq.pop(); cur--; } @@ -93,4 +95,4 @@ template void TopK(const std::vector& data, std::vector* values, std::vector* indices); -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech From 48271260103d4e44fd6652be5fb4ce3f9695429d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 02:24:36 +0000 Subject: [PATCH 44/60] more log --- .../examples/u2pp_ol/wenetspeech/local/recognizer.sh | 4 ++-- speechx/speechx/recognizer/u2_recognizer.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index f71a8003..f4553f2a 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -5,7 +5,7 @@ set -e data=data exp=exp -nj=40 +nj=20 mkdir -p $exp @@ -34,4 +34,4 @@ cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err echo "recognizer test have 
finished!!!" echo "please checkout in $exp/aishell.recognizer.err" -tail -n 7 $exp/aishell.recognizer.err \ No newline at end of file +tail -n 7 $exp/aishell.recognizer.err diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index f4e91b18..54f4d258 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -72,6 +72,14 @@ struct DecodeOptions { decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank; decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest; decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest; + LOG(INFO) << "chunk_size: " << decoder_opts.chunk_size; + LOG(INFO) << "num_left_chunks: " << decoder_opts.num_left_chunks; + LOG(INFO) << "ctc_weight: " << decoder_opts.ctc_weight; + LOG(INFO) << "rescoring_weight: " << decoder_opts.rescoring_weight; + LOG(INFO) << "reverse_weight: " << decoder_opts.reverse_weight; + LOG(INFO) << "blank: " << FLAGS_blank; + LOG(INFO) << "first_beam_size: " << FLAGS_nbest; + LOG(INFO) << "second_beam_size: " << FLAGS_nbest; return decoder_opts; } }; @@ -88,6 +96,8 @@ struct U2RecognizerResource { U2RecognizerResource resource; resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); From fc72ab1e074de0385dc795fe4ae05ff0e4691222 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 02:31:08 +0000 Subject: [PATCH 45/60] more debug info --- speechx/build.sh | 2 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 25 ++++++++++++++++--- speechx/speechx/frontend/audio/assembler.cc | 19 +++++++++----- .../speechx/frontend/audio/feature_cache.h | 4 ++- speechx/speechx/nnet/decodable.cc | 11 +++++--- speechx/speechx/nnet/u2_nnet.cc | 5 ++-- 
speechx/speechx/recognizer/u2_recognizer.cc | 6 ++--- .../speechx/recognizer/u2_recognizer_main.cc | 5 +++- 8 files changed, 57 insertions(+), 20 deletions(-) diff --git a/speechx/build.sh b/speechx/build.sh index e0a38675..7655f963 100755 --- a/speechx/build.sh +++ b/speechx/build.sh @@ -20,4 +20,4 @@ fi mkdir -p build cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -cmake --build build +cmake --build build -j diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index a0fe5b2a..04530fb9 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode( // forward frame by frame std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); - if (flag == false) break; + if (flag == false) { + LOG(INFO) << "decoder advance decode exit." << frame_prob.size(); + break; + } std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; } } @@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - + VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++){ + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + } + // 2. 
token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + int cnt = 0; + for (int i = 0; i < hypotheses_.size(); i ++){ + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j ++){ + VLOG(2) << hypotheses_[i][j]; + } + } +} void CTCPrefixBeamSearch::UpdateFinalContext() { if (context_graph_ == nullptr) return; diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index ff1b1f28..afee3a6a 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -52,15 +52,21 @@ bool Assembler::Compute(Vector* feats) { Vector feature; result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - if (IsFinished() == false) return false; - break; + VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + if (IsFinished() == false) { + LOG(INFO) << "finished reading feature. 
cache size: " << feature_cache_.size(); + return false; + } else { + LOG(INFO) << "break"; + break; + } } CHECK(feature.Dim() == dim_); + feature_cache_.push(feature); + nframes_ += 1; VLOG(1) << "nframes: " << nframes_; - - feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { @@ -68,8 +74,7 @@ bool Assembler::Compute(Vector* feats) { return false; } - - if (fill_zero_){ + if (fill_zero_) { while (feature_cache_.size() < frame_chunk_size_) { Vector feature(dim_, kaldi::kSetZero); nframes_ += 1; @@ -79,6 +84,7 @@ bool Assembler::Compute(Vector* feats) { int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); + VLOG(1) << "read " << this_chunk_size << " feat."; int32 counter = 0; while (counter < this_chunk_size) { @@ -97,6 +103,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } + CHECK(feature_cache_.size() == cache_size_ ); return result; } diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index 09d7f7eb..b4ed58ff 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { + LOG(INFO) << "set finished"; // std::unique_lock lock(mutex_); base_extractor_->SetFinished(); - LOG(INFO) << "set finished"; + // read the last chunk data Compute(); // ready_feed_condition_.notify_one(); + LOG(INFO) << "compute last feats done."; } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 9bad8ed4..6956a2cb 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { frames_ready_ += likelihood.NumRows(); } 
-// Decodable::Init(DecodableConfig config) { -//} // return the size of frame have computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } @@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() { Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. + VLOG(1) << "decodable exit;"; return false; } - VLOG(2) << "Forward with " << features.Dim() << " frames."; + VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; // forward feats NnetOut out; @@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == false) { + LOG(INFO) << "framelikehood exit."; return false; } + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; + + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 63a8a793..07e2dde2 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, max_hyps_len = std::max(max_hyps_len, len); hyps_len_ptr[i] = static_cast(len); } + VLOG(2) << 
"max_hyps_len: " << max_hyps_len; paddle::Tensor hyps_tensor = paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); @@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 75834aa5..b4a1257b 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -52,7 +52,6 @@ void U2Recognizer::Reset() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score; + << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; + + VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc 
b/speechx/speechx/recognizer/u2_recognizer_main.cc index 2375586e..bfb37fb8 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -62,6 +62,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "wav len (sample): " << tot_samples; int sample_offset = 0; + int cnt = 0; while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); @@ -77,12 +78,14 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult(); + LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; + cnt++; } CHECK(sample_offset == tot_samples); + VLOG(1) << "num decode: " << cnt; // recognizer.SetFinished(); From 83f885c6ccad46bbc17e5c1d502c9e91417f2c3c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:19:40 +0000 Subject: [PATCH 46/60] fix delete char in wav end bug --- speechx/speechx/frontend/audio/assembler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index afee3a6a..bbd09442 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -105,7 +105,7 @@ bool Assembler::Compute(Vector* feats) { } CHECK(feature_cache_.size() == cache_size_ ); - return result; + return true; } From 4dfb3365f637b28b30f0359dd641f571800eb2a8 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:23:17 +0800 Subject: [PATCH 47/60] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d02ac4c6..4ed1a022 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the 
typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. From 7693bd1812086d2b5d5a19646e704a6155cb1103 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:24:40 +0800 Subject: [PATCH 48/60] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ed1a022..3b26ff9b 100644 --- a/README.md +++ b/README.md @@ -924,7 +924,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). 
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From 9c68c2061e1b595deac62229a2f29f9f0659ff17 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:29:13 +0800 Subject: [PATCH 49/60] Update README_cn.md --- README_cn.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README_cn.md b/README_cn.md index 0c3af5dd..9a454989 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,7 +164,8 @@ ### 近期更新 -- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对ASR任务对wav2vec2.0 的fine-tuning. 
+ - 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 +- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。 - 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech 网页应用](./demos/speech_web)。 - ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。 - ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。 @@ -928,7 +929,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 +- 非常感谢 [david-95](https://github.com/david-95) 修复 TTS 句尾多标点符号出错的问题,贡献补充多条程序和数据。为 TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From 09a735af2449a2205a6006287e6bd1e98b355c37 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 21 Oct 2022 17:32:47 +0800 Subject: [PATCH 50/60] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md 
index 3b26ff9b..26f13d00 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. +- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. - 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech. - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web). - ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder. @@ -924,8 +924,8 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for Chinese Text Frontend. -- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). +- Many thanks to [david-95](https://github.com/david-95) for fixing multi-punctuation bug、contributing to multiple program and data, and adding [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend. 
+- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving TTS Chinses Frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. 
From 28dafea0e01afa9f5acfbdad2cf93e0aaabd7a7d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:44:44 +0000 Subject: [PATCH 51/60] add fill zero opt for frontend --- speechx/speechx/decoder/param.h | 1 + speechx/speechx/frontend/audio/assembler.cc | 11 +++++------ speechx/speechx/frontend/audio/feature_pipeline.h | 10 +++++++--- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/recognizer/recognizer.h | 2 ++ speechx/speechx/recognizer/u2_recognizer.h | 2 ++ speechx/speechx/recognizer/u2_recognizer_main.cc | 3 --- 7 files changed, 18 insertions(+), 13 deletions(-) diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1f13bbc0..1a332755 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,6 +20,7 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index bbd09442..26a3905b 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -47,17 +47,16 @@ bool Assembler::Read(kaldi::Vector* feats) { // read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { // compute and feed frame by frame - bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; - result = base_extractor_->Read(&feature); + bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - LOG(INFO) << "finished reading feature. 
cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); return false; } else { - LOG(INFO) << "break"; + VLOG(1) << "break"; break; } } @@ -103,7 +102,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } - CHECK(feature_cache_.size() == cache_size_ ); + CHECK(feature_cache_.size() == cache_size_); return true; } diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index d91a70e3..e06995b1 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -27,6 +27,7 @@ // feature DECLARE_bool(use_fbank); +DECLARE_bool(fill_zero); DECLARE_int32(num_bins); DECLARE_string(cmvn_file); @@ -80,15 +81,18 @@ struct FeaturePipelineOptions { // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " - << opts.assembler_opts.subsampling_rate; opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + opts.assembler_opts.fill_zero = FLAGS_fill_zero; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "frontend fill zeros: " + << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 6956a2cb..a7de58b5 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -114,7 +114,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == 
false) { - LOG(INFO) << "framelikehood exit."; + VLOG(1) << "framelikehood exit."; return false; } diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/recognizer/recognizer.h index 0402bcd3..27f1228a 100644 --- a/speechx/speechx/recognizer/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -38,6 +38,8 @@ struct RecognizerResource { resource.acoustic_scale = FLAGS_acoustic_scale; resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = true; + LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ModelOptions::InitFromFlags(); resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 54f4d258..4746d86f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -101,6 +101,8 @@ struct U2RecognizerResource { resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = false; + LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index bfb37fb8..7e59d6cb 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -85,9 +85,6 @@ int main(int argc, char* argv[]) { cnt++; } CHECK(sample_offset == tot_samples); - VLOG(1) << "num decode: " << cnt; - - // recognizer.SetFinished(); // second pass decoding recognizer.Rescoring(); From 850096a3a0d277eabddf292a57e4eecd01d081df Mon Sep 17 
00:00:00 2001 From: Hui Zhang Date: Fri, 21 Oct 2022 09:59:23 +0000 Subject: [PATCH 52/60] format code --- .../decoder/ctc_prefix_beam_search_decoder.cc | 25 +++++++++++-------- .../decoder/ctc_prefix_beam_search_decoder.h | 3 ++- .../decoder/ctc_prefix_beam_search_score.h | 3 ++- speechx/speechx/decoder/param.h | 4 ++- speechx/speechx/frontend/audio/assembler.cc | 22 +++++++++------- speechx/speechx/frontend/audio/assembler.h | 5 ++-- speechx/speechx/frontend/audio/audio_cache.cc | 7 +++--- speechx/speechx/frontend/audio/audio_cache.h | 2 +- .../speechx/frontend/audio/feature_cache.cc | 6 ++--- .../speechx/frontend/audio/feature_cache.h | 2 +- .../speechx/frontend/audio/feature_pipeline.h | 3 +-- speechx/speechx/nnet/decodable.cc | 7 ++++-- speechx/speechx/nnet/u2_nnet.cc | 11 +++++--- speechx/speechx/nnet/u2_nnet.h | 3 ++- speechx/speechx/recognizer/u2_recognizer.cc | 7 ++++-- speechx/speechx/recognizer/u2_recognizer.h | 7 +++--- .../speechx/recognizer/u2_recognizer_main.cc | 5 ++-- speechx/speechx/utils/math.cc | 4 +-- 18 files changed, 75 insertions(+), 51 deletions(-) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 04530fb9..2986ea7e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -118,11 +118,13 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; - for (int i = 0; i < topk_score.size(); i++){ - VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + VLOG(2) << "topk: " << num_frame_decoded_ << " " + << *std::max_element(logp_t.begin(), logp_t.end()) << " " + << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++) { + VLOG(2) << "topk: " << 
num_frame_decoded_ << " " << topk_score[i]; } - + // 2. token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -303,15 +305,16 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { - UpdateFinalContext(); - +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; int cnt = 0; - for (int i = 0; i < hypotheses_.size(); i ++){ - VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; - for (int j = 0; j < hypotheses_[i].size(); j ++){ - VLOG(2) << hypotheses_[i][j]; + for (int i = 0; i < hypotheses_.size(); i++) { + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() + << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j++) { + VLOG(2) << hypotheses_[i][j]; } } } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index ef96ecd9..475b4d35 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc #pragma once diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 908be1d6..3547b2b7 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h #pragma once diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1a332755..ebdd7119 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,7 +20,9 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); +DEFINE_bool(fill_zero, + false, + "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 26a3905b..56dfc3aa 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -16,9 +16,9 @@ namespace ppspeech { +using kaldi::BaseFloat; using kaldi::Vector; using kaldi::VectorBase; -using kaldi::BaseFloat; using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, @@ -51,9 +51,11 @@ bool Assembler::Compute(Vector* feats) { Vector feature; bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result + << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. 
cache size: " + << feature_cache_.size(); return false; } else { VLOG(1) << "break"; @@ -69,7 +71,8 @@ bool Assembler::Compute(Vector* feats) { } if (feature_cache_.size() < receptive_filed_length_) { - VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_; + VLOG(1) << "feature_cache less than receptive_filed_lenght. " + << feature_cache_.size() << ": " << receptive_filed_length_; return false; } @@ -81,7 +84,8 @@ bool Assembler::Compute(Vector* feats) { } } - int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + int32 this_chunk_size = + std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); VLOG(1) << "read " << this_chunk_size << " feat."; @@ -89,7 +93,7 @@ bool Assembler::Compute(Vector* feats) { while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); CHECK(val.Dim() == dim_) << val.Dim(); - + int32 start = counter * dim_; feats->Range(start, dim_).CopyFromVec(val); @@ -99,7 +103,7 @@ bool Assembler::Compute(Vector* feats) { // val is reference, so we should pop here feature_cache_.pop(); - + counter++; } CHECK(feature_cache_.size() == cache_size_); @@ -108,11 +112,11 @@ bool Assembler::Compute(Vector* feats) { } - void Assembler::Reset() { +void Assembler::Reset() { std::queue> empty; std::swap(feature_cache_, empty); nframes_ = 0; - base_extractor_->Reset(); + base_extractor_->Reset(); } } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 4f165ea8..72e6f635 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -25,7 +25,8 @@ struct AssemblerOptions { int32 receptive_filed_length{1}; int32 subsampling_rate{1}; int32 nnet_decoder_chunk{1}; - bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_ + bool fill_zero{false}; // 
whether fill zero when last chunk is not equal to + // frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -62,7 +63,7 @@ class Assembler : public FrontendInterface { std::queue> feature_cache_; std::unique_ptr base_extractor_; - int32 nframes_; // num frame computed + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index 71e5d09e..61ef8841 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "frontend/audio/audio_cache.h" + #include "kaldi/base/timer.h" namespace ppspeech { using kaldi::BaseFloat; -using kaldi::VectorBase; using kaldi::Vector; +using kaldi::VectorBase; AudioCache::AudioCache(int buffer_size, bool to_float32) : finished_(false), @@ -85,8 +86,8 @@ bool AudioCache::Read(Vector* waves) { offset_ = (offset_ + chunk_size) % ring_buffer_.size(); nsamples_ += chunk_size; - VLOG(1) << "nsamples readed: " << nsamples_; - + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index da422daa..4708a6e0 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -62,7 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram - int32 nsamples_; // number samples readed. + int32 nsamples_; // number samples readed. 
DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index c712e48e..3f05eae6 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -16,12 +16,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b4ed58ff..bd869225 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -77,7 +77,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - int32 nframe_; // num of feature computed + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index e06995b1..e83a3f31 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -91,8 +91,7 @@ struct FeaturePipelineOptions { << opts.assembler_opts.receptive_filed_length; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; - LOG(INFO) << "frontend fill zeros: " - << opts.assembler_opts.fill_zero; + LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index a7de58b5..11d60d3e 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -79,7 +79,8 @@ bool 
Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; - VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim + << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -127,7 +128,9 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; - VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " + << nnet_out_cache_.NumRows() + << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 07e2dde2..636e2ad4 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc #include "nnet/u2_nnet.h" @@ -129,7 +130,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { forward_attention_decoder_ = other.forward_attention_decoder_; ctc_activation_ = other.ctc_activation_; - offset_ = other.offset_; + offset_ = other.offset_; // copy model ptr model_ = other.model_; @@ -626,8 +627,10 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score + << " r_score: " << r_score + << " reverse_weight: " << reverse_weight + << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 6cbc0570..e548d4c0 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h #pragma once #include "base/common.h" diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index b4a1257b..4ec64665 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -190,12 +190,15 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; + << " ctc_score: " << result_[i].score + << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight + << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; - VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; + VLOG(1) << "hyp: " << result_[0].sentence + << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 4746d86f..9b43b08f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -96,13 +96,14 @@ struct U2RecognizerResource { U2RecognizerResource resource; resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - LOG(INFO) << "vocab path: " << resource.vocab_path; - LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; 
resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); resource.feature_pipeline_opts.assembler_opts.fill_zero = false; - LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; + LOG(INFO) << "u2 need fill zero be false: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 7e59d6cb..c02e1c23 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -78,7 +78,8 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult(); + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; @@ -88,7 +89,7 @@ int main(int argc, char* argv[]) { // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 959740a0..71656cb3 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -79,10 +79,10 @@ void TopK(const std::vector& data, int cur = values->size() - 1; while (!pq.empty()) { const auto& item = pq.top(); - + (*values)[cur] = item.first; (*indices)[cur] = item.second; - + // item if reference, must pop here pq.pop(); From 606e2c237fa7283e1d39c3dbb1cb62d7855a55c4 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 03:47:23 +0000 Subject: [PATCH 53/60] fix as comment --- speechx/speechx/decoder/ctc_beam_search_opt.h | 4 ++-- .../decoder/ctc_prefix_beam_search_decoder.cc | 19 ++----------------- 
.../decoder/ctc_prefix_beam_search_decoder.h | 6 +++--- .../ctc_prefix_beam_search_decoder_main.cc | 7 +++---- .../decoder/ctc_prefix_beam_search_score.h | 7 +++++++ speechx/speechx/decoder/ctc_tlg_decoder.h | 4 ++-- 6 files changed, 19 insertions(+), 28 deletions(-) diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index d21b3abd..4a4f817d 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -37,13 +37,13 @@ struct CTCBeamSearchOptions { // u2 int first_beam_size; int second_beam_size; - CTCBeamSearchOptions() + explicit CTCBeamSearchOptions() : blank(0), dict_file("vocab.txt"), lm_path(""), + beam_size(300), alpha(1.9f), beta(5.0), - beam_size(300), cutoff_prob(0.99f), cutoff_top_n(40), num_proc_bsearch(10), diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 2986ea7e..7414d06d 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -31,7 +31,7 @@ using paddle::platform::TracerEventType; namespace ppspeech { -CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string vocab_path, +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts) : opts_(opts) { unit_table_ = std::shared_ptr( @@ -55,10 +55,7 @@ void CTCPrefixBeamSearch::Reset() { // empty hyp with Score std::vector empty; PrefixScore prefix_score; - prefix_score.b = 0.0f; // log(1) - prefix_score.nb = -kBaseFloatMax; // log(0) - prefix_score.v_b = 0.0f; // log(1) - prefix_score.v_nb = 0.0f; // log(1) + prefix_score.InitEmpty(); cur_hyps_[empty] = prefix_score; outputs_.emplace_back(empty); @@ -287,19 +284,7 @@ void CTCPrefixBeamSearch::UpdateOutputs( int s = 0; int e = 0; for (int i = 0; i < input.size(); ++i) { - // if (s < start_boundaries.size() && i == start_boundaries[s]){ - 
// // - // output.emplace_back(context_graph_->start_tag_id()); - // ++s; - // } - output.emplace_back(input[i]); - - // if (e < end_boundaries.size() && i == end_boundaries[e]){ - // // - // output.emplace_back(context_graph_->end_tag_id()); - // ++e; - // } } outputs_.emplace_back(output); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 475b4d35..a0c2a74e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -27,7 +27,7 @@ namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: - explicit CTCPrefixBeamSearch(const std::string vocab_path, + explicit CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} @@ -77,7 +77,7 @@ class CTCPrefixBeamSearch : public DecoderBase { private: CTCBeamSearchOptions opts_; - std::shared_ptr unit_table_; + std::shared_ptr unit_table_{nullptr}; std::unordered_map, PrefixScore, PrefixScoreHash> cur_hyps_; @@ -92,7 +92,7 @@ class CTCPrefixBeamSearch : public DecoderBase { // Outputs contain the hypotheses_ and tags lik: and std::vector> outputs_; - std::shared_ptr context_graph_ = nullptr; + std::shared_ptr context_graph_{nullptr}; DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); }; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index d9cca147..69f32686 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -64,12 +64,11 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet(new ppspeech::U2Nnet(model_opts)); + std::shared_ptr nnet = std::make_shared(model_opts); // decodeable - std::shared_ptr 
raw_data(new ppspeech::DataCache()); - std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data)); + std::shared_ptr raw_data = std::make_shared(); + std::shared_ptr decodable = std::make_shared(nnet, raw_data); // decoder ppspeech::CTCBeamSearchOptions opts; diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 3547b2b7..76b09e9b 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -73,6 +73,13 @@ struct PrefixScore { int prefix_len) { CHECK(false); } + + void InitEmpty() { + b = 0.0f; // log(1) + nb = -kBaseFloatMax; // log(0) + v_b = 0.0f; // log(1) + v_nb = 0.0f; // log(1) + } }; struct PrefixScoreHash { diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index f250ac25..0ff1de2a 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -31,8 +31,8 @@ namespace ppspeech { struct TLGDecoderOptions { kaldi::LatticeFasterDecoderConfig opts{}; // todo remove later, add into decode resource - std::string word_symbol_table{}; - std::string fst_path{}; + std::string word_symbol_table; + std::string fst_path; static TLGDecoderOptions InitFromFlags() { TLGDecoderOptions decoder_opts; From 08c432f70a2f328d146a876920b9179543107b3e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 07:19:28 +0000 Subject: [PATCH 54/60] add paddleslim --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e551d9fa..35668bdd 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ base = [ "braceexpand", "pyyaml", "pybind11", + "paddleslim==2.3.4", ] server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] From 8271fcfb0a7837d0cc8fdbe8764dedaa17924cc0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 08:31:42 +0000 Subject: [PATCH 55/60] fix as comment --- 
speechx/CMakeLists.txt | 27 ++++++++++++++++--- speechx/README.md | 4 +-- speechx/cmake/paddleinference.cmake | 9 ------- speechx/examples/codelab/feat/run.sh | 6 ++--- speechx/examples/codelab/u2/local/decode.sh | 2 +- speechx/speechx/decoder/CMakeLists.txt | 10 +++---- speechx/speechx/decoder/ctc_beam_search_opt.h | 5 ++-- .../decoder/ctc_prefix_beam_search_decoder.cc | 6 ++--- .../decoder/ctc_prefix_beam_search_decoder.h | 4 +-- 9 files changed, 42 insertions(+), 31 deletions(-) diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 6255cb2e..978a23d9 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -102,7 +102,16 @@ message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES # paddle include and link option # -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so execute_process( - COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]); out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);" + COMMAND python -c "\ +import os;\ +import paddle;\ +include_dir=paddle.sysconfig.get_include();\ +paddle_dir=os.path.split(include_dir)[0];\ +libs_dir=os.path.join(paddle_dir, 'libs');\ +fluid_dir=os.path.join(paddle_dir, 'fluid');\ +out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\ +out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\ + " OUTPUT_VARIABLE PADDLE_LINK_FLAGS RESULT_VARIABLE SUCESS) @@ -112,7 +121,11 @@ string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) # paddle compile option # -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include execute_process( - COMMAND python -c 
"import paddle; include_dir = paddle.sysconfig.get_include(); print(f\"-I{include_dir}\");" + COMMAND python -c "\ +import paddle; \ +include_dir = paddle.sysconfig.get_include(); \ +print(f\"-I{include_dir}\"); \ + " OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS) message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS}) string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) @@ -121,7 +134,15 @@ string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS) # for LD_LIBRARY_PATH # set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/) execute_process( - COMMAND python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);" + COMMAND python -c " \ +import os; \ +import paddle; \ +include_dir=paddle.sysconfig.get_include(); \ +paddle_dir=os.path.split(include_dir)[0]; \ +libs_dir=os.path.join(paddle_dir, 'libs'); \ +fluid_dir=os.path.join(paddle_dir, 'fluid'); \ +out=':'.join([libs_dir, fluid_dir]); print(out); \ + " OUTPUT_VARIABLE PADDLE_LIB_DIRS) message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) diff --git a/speechx/README.md b/speechx/README.md index 3861edf3..f744defa 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -9,7 +9,7 @@ We develop under: * gcc/g++/gfortran - 8.2.0 * cmake - 3.16.0 -> Please using `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. +> Please use `tools/env.sh` to create python `venv`, then `source venv/bin/activate` to build speechx. > We make sure all things work fun under docker, and recommend using it to develop and deploy. @@ -35,7 +35,7 @@ bash tools/venv.sh 2. Build `speechx` and `examples`. 
-For now we using feature under `develop` branch of paddle, so we need install `paddlepaddle` nightly build version. +For now we are using feature under `develop` branch of paddle, so we need to install `paddlepaddle` nightly build version. For example: ``` source venv/bin/activate diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake index 311804d6..d8a9c613 100644 --- a/speechx/cmake/paddleinference.cmake +++ b/speechx/cmake/paddleinference.cmake @@ -1,14 +1,5 @@ set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -# ExternalProject_Add(paddle -# URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz -# URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 -# PREFIX ${paddle_PREFIX_DIR} -# SOURCE_DIR ${paddle_SOURCE_DIR} -# CONFIGURE_COMMAND "" -# BUILD_COMMAND "" -# INSTALL_COMMAND "" -# ) include(FetchContent) FetchContent_Declare( diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh index 66bd8ae2..5d7612ae 100755 --- a/speechx/examples/codelab/feat/run.sh +++ b/speechx/examples/codelab/feat/run.sh @@ -42,8 +42,8 @@ mkdir -p $exp_dir export GLOG_logtostderr=1 cmvn_json2kaldi_main \ - --json_file $model_dir/data/mean_std.json \ - --cmvn_write_path $exp_dir/cmvn.ark \ + --json_file=$model_dir/data/mean_std.json \ + --cmvn_write_path=$exp_dir/cmvn.ark \ --binary=false echo "convert json cmvn to kaldi ark." @@ -55,7 +55,7 @@ compute_linear_spectrogram_main \ echo "compute linear spectrogram feature." 
compute_fbank_main \ - --num_bins 161 \ + --num_bins=161 \ --wav_rspecifier=scp:$data_dir/wav.scp \ --feature_wspecifier=ark,t:$exp_dir/fbank.ark \ --cmvn_file=$exp_dir/cmvn.ark diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh index c22ad7f0..11c1afe8 100755 --- a/speechx/examples/codelab/u2/local/decode.sh +++ b/speechx/examples/codelab/u2/local/decode.sh @@ -7,7 +7,7 @@ set -e data=data exp=exp mkdir -p $exp -ckpt_dir=./data/model +ckpt_dir=$data/model model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ ctc_prefix_beam_search_decoder_main \ diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt index 5bec24a6..f0fd32ba 100644 --- a/speechx/speechx/decoder/CMakeLists.txt +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -4,11 +4,11 @@ set(srcs) if (USING_DS2) list(APPEND srcs -ctc_decoders/decoder_utils.cpp -ctc_decoders/path_trie.cpp -ctc_decoders/scorer.cpp -ctc_beam_search_decoder.cc -ctc_tlg_decoder.cc + ctc_decoders/decoder_utils.cpp + ctc_decoders/path_trie.cpp + ctc_decoders/scorer.cpp + ctc_beam_search_decoder.cc + ctc_tlg_decoder.cc ) endif() diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index 4a4f817d..f9e5933c 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -11,12 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#pragma once #include "base/common.h" #include "util/parse-options.h" -#pragma once - namespace ppspeech { @@ -76,4 +75,4 @@ struct CTCBeamSearchOptions { } }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 7414d06d..56867c70 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -107,12 +107,12 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::min(static_cast(logp[0].size()), opts_.first_beam_size); for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { - const std::vector& logp_t = logp[t]; + const std::vector& logp_t = logp[t]; std::unordered_map, PrefixScore, PrefixScoreHash> next_hyps; // 1. first beam prune, only select topk candidates - std::vector topk_score; + std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); VLOG(2) << "topk: " << num_frame_decoded_ << " " @@ -367,4 +367,4 @@ std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index a0c2a74e..91977092 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -27,7 +27,7 @@ namespace ppspeech { class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: - explicit CTCPrefixBeamSearch(const std::string& vocab_path, + CTCPrefixBeamSearch(const std::string& vocab_path, const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} @@ -98,4 +98,4 @@ class CTCPrefixBeamSearch : public DecoderBase { }; -} 
// namespace ppspeech \ No newline at end of file +} // namespace ppspeech From a6b2a0a697cade73112ab66dd7fef477e44e9577 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 09:28:38 +0000 Subject: [PATCH 56/60] cpplint --- .pre-commit-config.yaml | 9 +++- speechx/speechx/base/basic_types.h | 42 +++++++++---------- speechx/speechx/base/macros.h | 2 +- speechx/speechx/base/thread_pool.h | 2 +- .../codelab/nnet/ds2_model_test_main.cc | 4 +- .../decoder/ctc_beam_search_decoder.cc | 6 +-- .../speechx/decoder/ctc_beam_search_decoder.h | 2 +- .../decoder/ctc_beam_search_decoder_main.cc | 4 +- speechx/speechx/decoder/ctc_beam_search_opt.h | 2 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 6 +-- .../decoder/ctc_prefix_beam_search_decoder.h | 2 +- .../ctc_prefix_beam_search_decoder_main.cc | 17 ++++---- speechx/speechx/decoder/ctc_tlg_decoder.h | 2 +- speechx/speechx/frontend/audio/cmvn.cc | 2 +- .../frontend/audio/compute_fbank_main.cc | 4 +- speechx/speechx/frontend/audio/data_cache.h | 2 +- speechx/speechx/frontend/audio/db_norm.cc | 7 ++-- speechx/speechx/frontend/audio/fbank.cc | 7 ++-- .../frontend/audio/feature_pipeline.cc | 2 +- .../frontend/audio/linear_spectrogram.cc | 7 ++-- speechx/speechx/frontend/audio/mfcc.cc | 7 ++-- speechx/speechx/nnet/ds2_nnet.cc | 9 ++-- speechx/speechx/nnet/ds2_nnet.h | 4 +- speechx/speechx/nnet/ds2_nnet_main.cc | 4 +- speechx/speechx/nnet/u2_nnet.cc | 40 +++++++++--------- speechx/speechx/nnet/u2_nnet.h | 4 +- speechx/speechx/nnet/u2_nnet_main.cc | 6 +-- .../websocket/websocket_client_main.cc | 2 +- speechx/speechx/recognizer/recognizer.h | 3 +- speechx/speechx/recognizer/recognizer_main.cc | 5 ++- speechx/speechx/recognizer/u2_recognizer.cc | 2 +- speechx/speechx/recognizer/u2_recognizer.h | 2 - speechx/speechx/utils/file_utils.cc | 2 +- 33 files changed, 118 insertions(+), 103 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e7ae1fb..0435cfbe 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -50,13 +50,20 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ - exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ #- id: copyright_checker # name: copyright_checker # entry: python .pre-commit-hooks/copyright-check.hook # language: system # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ # exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$ + - id: cpplint + name: cpplint + description: Static code analysis of C/C++ files + language: python + files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ + entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 3a648649..96bc0ca5 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -22,39 +22,39 @@ typedef float BaseFloat; typedef double double64; typedef signed char int8; -typedef short int16; -typedef int int32; +typedef short int16; // NOLINT +typedef int int32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef long int64; +typedef long int64; // NOLINT #else -typedef long long int64; +typedef long long int64; // NOLINT #endif -typedef unsigned char uint8; -typedef unsigned short uint16; -typedef unsigned int uint32; +typedef unsigned char uint8; // NOLINT +typedef unsigned short uint16; // NOLINT +typedef unsigned int 
uint32; // NOLINT #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD) -typedef unsigned long uint64; +typedef unsigned long uint64; // NOLINT #else -typedef unsigned long long uint64; +typedef unsigned long long uint64; // NOLINT #endif typedef signed int char32; -const uint8 kuint8max = ((uint8)0xFF); -const uint16 kuint16max = ((uint16)0xFFFF); -const uint32 kuint32max = ((uint32)0xFFFFFFFF); -const uint64 kuint64max = ((uint64)(0xFFFFFFFFFFFFFFFFLL)); -const int8 kint8min = ((int8)0x80); -const int8 kint8max = ((int8)0x7F); -const int16 kint16min = ((int16)0x8000); -const int16 kint16max = ((int16)0x7FFF); -const int32 kint32min = ((int32)0x80000000); -const int32 kint32max = ((int32)0x7FFFFFFF); -const int64 kint64min = ((int64)(0x8000000000000000LL)); -const int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFFLL)); +const uint8 kuint8max = (static_cast 0xFF); +const uint16 kuint16max = (static_cast 0xFFFF); +const uint32 kuint32max = (static_cast 0xFFFFFFFF); +const uint64 kuint64max = (static_cast(0xFFFFFFFFFFFFFFFFLL)); +const int8 kint8min = (static_cast 0x80); +const int8 kint8max = (static_cast 0x7F); +const int16 kint16min = (static_cast 0x8000); +const int16 kint16max = (static_cast 0x7FFF); +const int32 kint32min = (static_cast 0x80000000); +const int32 kint32max = (static_cast 0x7FFFFFFF); +const int64 kint64min = (static_cast(0x8000000000000000LL)); +const int64 kint64max = (static_cast(0x7FFFFFFFFFFFFFFFLL)); const BaseFloat kBaseFloatMax = std::numeric_limits::max(); const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index faf39373..95608f40 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -26,6 +26,6 @@ namespace ppspeech { #endif // kSpaceSymbol in UTF-8 is: ▁ -const std::string kSpaceSymbol = "\xe2\x96\x81"; +const char[] kSpaceSymbol = "\xe2\x96\x81"; } // namespace ppspeech diff --git 
a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h index ba895f71..6d59dac5 100644 --- a/speechx/speechx/base/thread_pool.h +++ b/speechx/speechx/base/thread_pool.h @@ -35,7 +35,7 @@ class ThreadPool { public: - ThreadPool(size_t); + explicit ThreadPool(size_t); template auto enqueue(F&& f, Args&&... args) -> std::future::type>; diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc index 09f9e2fb..ab7b2cb5 100644 --- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc +++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc @@ -64,8 +64,8 @@ void model_forward_test() { ; std::string model_graph = FLAGS_model_path; std::string model_params = FLAGS_param_path; - CHECK(model_graph != ""); - CHECK(model_params != ""); + CHECK_NE(model_graph, ""); + CHECK_NE(model_params, ""); cout << "model path: " << model_graph << endl; cout << "model param path : " << model_params << endl; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc index c4b35ff0..6e3a0d13 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -39,12 +39,12 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - CHECK(opts_.blank == 0); + CHECK_EQ(opts_.blank, 0); auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary - if ((size_t)space_id_ >= vocabulary_.size()) { + if (static_cast(space_id_) >= vocabulary_.size()) { space_id_ = -2; } } @@ -104,7 +104,7 @@ void CTCBeamSearch::ResetPrefixes() { } int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, - vector& nbest_words) { + const vector& nbest_words) { kaldi::Timer timer; AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " diff --git 
a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h index 6347bba8..f06d88e3 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -48,7 +48,7 @@ class CTCBeamSearch : public DecoderBase { } int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + const std::vector& nbest_words); private: void ResetPrefixes(); diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index edf9215a..ab0376b6 100644 --- a/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -59,8 +59,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h index f9e5933c..f4a81b3a 100644 --- a/speechx/speechx/decoder/ctc_beam_search_opt.h +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -36,7 +36,7 @@ struct CTCBeamSearchOptions { // u2 int first_beam_size; int second_beam_size; - explicit CTCBeamSearchOptions() + CTCBeamSearchOptions() : blank(0), dict_file("vocab.txt"), lm_path(""), diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 56867c70..0a0afcd7 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -329,8 +329,8 @@ void CTCPrefixBeamSearch::UpdateFinalContext() { std::string CTCPrefixBeamSearch::GetBestPath(int index) { int n_hyps = Outputs().size(); - 
CHECK(n_hyps > 0); - CHECK(index < n_hyps); + CHECK_GT(n_hyps, 0); + CHECK_LT(index, n_hyps); std::vector one = Outputs()[index]; std::string sentence; for (int i = 0; i < one.size(); i++) { @@ -344,7 +344,7 @@ std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } std::vector> CTCPrefixBeamSearch::GetNBestPath( int n) { int hyps_size = hypotheses_.size(); - CHECK(hyps_size > 0); + CHECK_GT(hyps_size, 0); int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size); diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index 91977092..5013246a 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -28,7 +28,7 @@ class ContextGraph; class CTCPrefixBeamSearch : public DecoderBase { public: CTCPrefixBeamSearch(const std::string& vocab_path, - const CTCBeamSearchOptions& opts); + const CTCBeamSearchOptions& opts); ~CTCPrefixBeamSearch() {} SearchType Type() const { return SearchType::kPrefixBeamSearch; } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc index 69f32686..c59b1f2e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -50,10 +50,10 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); - CHECK(FLAGS_vocab_path != ""); - CHECK(FLAGS_model_path != ""); + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); + CHECK_NE(FLAGS_vocab_path, ""); + CHECK_NE(FLAGS_model_path, ""); LOG(INFO) << "model path: " << FLAGS_model_path; LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; @@ -64,11 +64,14 @@ int main(int argc, char* argv[]) { // nnet ppspeech::ModelOptions model_opts; 
model_opts.model_path = FLAGS_model_path; - std::shared_ptr nnet = std::make_shared(model_opts); + std::shared_ptr nnet = + std::make_shared(model_opts); // decodeable - std::shared_ptr raw_data = std::make_shared(); - std::shared_ptr decodable = std::make_shared(nnet, raw_data); + std::shared_ptr raw_data = + std::make_shared(); + std::shared_ptr decodable = + std::make_shared(nnet, raw_data); // decoder ppspeech::CTCBeamSearchOptions opts; diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h index 0ff1de2a..8be69dad 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.h +++ b/speechx/speechx/decoder/ctc_tlg_decoder.h @@ -71,7 +71,7 @@ class TLGDecoder : public DecoderBase { std::string GetPartialResult() override; int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); + const std::vector& nbest_words); protected: std::string GetBestPath() override { diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 7997e8a7..3d80e001 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -30,7 +30,7 @@ using std::vector; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { - CHECK(cmvn_file != ""); + CHECK_NE(cmvn_file, ""); base_extractor_ = std::move(base_extractor); bool binary; diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index bb7e449f..e2b54a8a 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -40,8 +40,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - CHECK(FLAGS_wav_rspecifier.size() > 0); - CHECK(FLAGS_feature_wspecifier.size() > 0); + CHECK_GT(FLAGS_wav_rspecifier.size(), 0); + CHECK_GT(FLAGS_feature_wspecifier.size(), 0); kaldi::SequentialTableReader wav_reader( 
FLAGS_wav_rspecifier); kaldi::SequentialTableReader wav_info_reader( diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 5fafdeb2..5f5cd51b 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -27,7 +27,7 @@ namespace ppspeech { // pre-recorded audio/feature class DataCache : public FrontendInterface { public: - explicit DataCache() { finished_ = false; } + DataCache() { finished_ = false; } // accept waves/feats virtual void Accept(const kaldi::VectorBase& inputs) { diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc index 931e932d..ad79fcc3 100644 --- a/speechx/speechx/frontend/audio/db_norm.cc +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -14,17 +14,18 @@ #include "frontend/audio/db_norm.h" + #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index 059abbbd..deabe876 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/fbank.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; FbankComputer::FbankComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 65493e42..2931b96b 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -33,7 +33,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) opts.linear_spectrogram_opts, std::move(data_source))); } - CHECK(opts.cmvn_file != ""); + CHECK_NE(opts.cmvn_file, ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc index 55c03978..d4a2fcc6 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/linear_spectrogram.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/frontend/audio/mfcc.cc index bda1f96d..15f8cb0f 100644 --- a/speechx/speechx/frontend/audio/mfcc.cc +++ b/speechx/speechx/frontend/audio/mfcc.cc @@ -14,6 +14,7 @@ #include "frontend/audio/mfcc.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -21,12 +22,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; Mfcc::Mfcc(const MfccOptions& opts, diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index 8c83f832..22c7f61b 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -13,15 +13,16 @@ // limitations under the License. 
#include "nnet/ds2_nnet.h" + #include "absl/strings/str_split.h" namespace ppspeech { -using std::vector; -using std::string; -using std::shared_ptr; using kaldi::Matrix; using kaldi::Vector; +using std::shared_ptr; +using std::string; +using std::vector; void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { std::vector cache_names; @@ -207,7 +208,7 @@ void PaddleNnet::FeedForward(const Vector& features, // inferences->Resize(row * col); // *inference_dim = col; - out->logprobs.Resize(row*col); + out->logprobs.Resize(row * col); out->vocab_dim = col; output_tensor->CopyToCpu(out->logprobs.Data()); diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index d1e3ac8c..420fa177 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -26,7 +26,7 @@ template class Tensor { public: Tensor() {} - Tensor(const std::vector& shape) : _shape(shape) { + explicit Tensor(const std::vector& shape) : _shape(shape) { int neml = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies()); LOG(INFO) << "Tensor neml: " << neml; @@ -50,7 +50,7 @@ class Tensor { class PaddleNnet : public NnetBase { public: - PaddleNnet(const ModelOptions& opts); + explicit PaddleNnet(const ModelOptions& opts); void FeedForward(const kaldi::Vector& features, const int32& feature_dim, diff --git a/speechx/speechx/nnet/ds2_nnet_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc index d8d33e98..6092b8a4 100644 --- a/speechx/speechx/nnet/ds2_nnet_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "nnet/ds2_nnet.h" #include "base/common.h" #include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet porb wspecifier"); @@ -44,7 +44,7 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 636e2ad4..19cb85fd 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -158,7 +158,7 @@ void U2Nnet::Reset() { } // Debug API -void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { +void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { // encoder_out (T,D) encoder_outs_.clear(); encoder_outs_.push_back(encoder_out); @@ -206,7 +206,7 @@ void U2Nnet::ForwardEncoderChunkImpl( float* feats_ptr = feats.mutable_data(); // not cache feature in nnet - CHECK(cached_feats_.size() == 0); + CHECK_EQ(cached_feats_.size(), 0); // CHECK_EQ(std::is_same::value, true); std::memcpy(feats_ptr, chunk_feats.data(), @@ -247,9 +247,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // call. 
std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; - CHECK(inputs.size() == 4); + CHECK_EQ(inputs.size(), 4); std::vector outputs = forward_encoder_chunk_(inputs); - CHECK(outputs.size() == 3); + CHECK_EQ(outputs.size(), 3); #ifdef USE_GPU paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); @@ -319,9 +319,9 @@ void U2Nnet::ForwardEncoderChunkImpl( inputs.clear(); outputs.clear(); inputs.push_back(chunk_out); - CHECK(inputs.size() == 1); + CHECK_EQ(inputs.size(), 1); outputs = ctc_activation_(inputs); - CHECK(outputs.size() == 1); + CHECK_EQ(outputs.size(), 1); paddle::Tensor ctc_log_probs = outputs[0]; #ifdef TEST_DEBUG @@ -350,9 +350,9 @@ void U2Nnet::ForwardEncoderChunkImpl( // Copy to output, (B=1,T,D) std::vector ctc_log_probs_shape = ctc_log_probs.shape(); - CHECK(ctc_log_probs_shape.size() == 3); + CHECK_EQ(ctc_log_probs_shape.size(), 3); int B = ctc_log_probs_shape[0]; - CHECK(B == 1); + CHECK_EQ(B, 1); int T = ctc_log_probs_shape[1]; int D = ctc_log_probs_shape[2]; *vocab_dim = D; @@ -393,9 +393,9 @@ float U2Nnet::ComputePathScore(const paddle::Tensor& prob, // hyp (U,) float score = 0.0f; std::vector dims = prob.shape(); - CHECK(dims.size() == 3); + CHECK_EQ(dims.size(), 3); VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; - CHECK(dims[0] == 1); + CHECK_EQ(dims[0], 1); int vocab_dim = static_cast(dims[2]); const float* prob_ptr = prob.data(); @@ -520,14 +520,14 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, std::vector inputs{ hyps_tensor, hyps_lens, encoder_out}; std::vector outputs = forward_attention_decoder_(inputs); - CHECK(outputs.size() == 2); + CHECK_EQ(outputs.size(), 2); // (B, Umax, V) paddle::Tensor probs = outputs[0]; std::vector probs_shape = probs.shape(); - CHECK(probs_shape.size() == 3); - CHECK(probs_shape[0] == num_hyps); - CHECK(probs_shape[1] == max_hyps_len); + CHECK_EQ(probs_shape.size(), 3); + CHECK_EQ(probs_shape[0], num_hyps); + 
CHECK_EQ(probs_shape[1], max_hyps_len); #ifdef TEST_DEBUG { @@ -582,13 +582,13 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, paddle::Tensor r_probs = outputs[1]; std::vector r_probs_shape = r_probs.shape(); if (is_bidecoder_ && reverse_weight > 0) { - CHECK(r_probs_shape.size() == 3); - CHECK(r_probs_shape[0] == num_hyps); - CHECK(r_probs_shape[1] == max_hyps_len); + CHECK_EQ(r_probs_shape.size(), 3); + CHECK_EQ(r_probs_shape[0], num_hyps); + CHECK_EQ(r_probs_shape[1], max_hyps_len); } else { // dump r_probs - CHECK(r_probs_shape.size() == 1); - CHECK(r_probs_shape[0] == 1) << r_probs_shape[0]; + CHECK_EQ(r_probs_shape.size(), 1); + CHECK_EQ(r_probs_shape[0], 1) << r_probs_shape[0]; } // compute rescoring score @@ -644,7 +644,7 @@ void U2Nnet::EncoderOuts( for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; const std::vector shape = item.shape(); - CHECK(shape.size() == 3); + CHECK_EQ(shape.size(), 3); const int& B = shape[0]; const int& T = shape[1]; const int& D = shape[2]; diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index e548d4c0..23cc0ea3 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -73,7 +73,7 @@ class U2NnetBase : public NnetBase { class U2Nnet : public U2NnetBase { public: - U2Nnet(const ModelOptions& opts); + explicit U2Nnet(const ModelOptions& opts); U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, @@ -108,7 +108,7 @@ class U2Nnet : public U2NnetBase { std::vector* rescoring_score) override; // debug - void FeedEncoderOuts(paddle::Tensor& encoder_out); + void FeedEncoderOuts(const paddle::Tensor& encoder_out); void EncoderOuts( std::vector>* encoder_out) const; diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc index 5039a59a..53fc5554 100644 --- a/speechx/speechx/nnet/u2_nnet_main.cc +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -39,9 +39,9 @@ int main(int argc, char* argv[]) 
{ int32 num_done = 0, num_err = 0; - CHECK(FLAGS_feature_rspecifier.size() > 0); - CHECK(FLAGS_nnet_prob_wspecifier.size() > 0); - CHECK(FLAGS_model_path.size() > 0); + CHECK_GT(FLAGS_feature_rspecifier.size(), 0); + CHECK_GT(FLAGS_nnet_prob_wspecifier.size(), 0); + CHECK_GT(FLAGS_model_path.size(), 0); LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier; LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier; LOG(INFO) << "model path: " << FLAGS_model_path; diff --git a/speechx/speechx/protocol/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc index 7ad36e3a..7c5a4f2f 100644 --- a/speechx/speechx/protocol/websocket/websocket_client_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_client_main.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "websocket/websocket_client.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" +#include "websocket/websocket_client.h" DEFINE_string(host, "127.0.0.1", "host of websocket server"); DEFINE_int32(port, 8082, "port of websocket server"); diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/recognizer/recognizer.h index 27f1228a..57d5bb36 100644 --- a/speechx/speechx/recognizer/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -39,7 +39,8 @@ struct RecognizerResource { resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); resource.feature_pipeline_opts.assembler_opts.fill_zero = true; - LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; + LOG(INFO) << "ds2 need fill zero be true: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ModelOptions::InitFromFlags(); resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); return resource; diff --git 
a/speechx/speechx/recognizer/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc index 7c30fe6a..cb0de2d6 100644 --- a/speechx/speechx/recognizer/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "decoder/param.h" -#include "recognizer/recognizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); @@ -30,7 +30,8 @@ int main(int argc, char* argv[]) { google::InstallFailureSignalHandler(); FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::RecognizerResource::InitFromFlags(); + ppspeech::RecognizerResource resource = + ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 4ec64665..382f622f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -35,7 +35,7 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); - CHECK(resource.vocab_path != ""); + CHECK_NE(resource.vocab_path, ""); decoder_.reset(new CTCPrefixBeamSearch( resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 9b43b08f..25850863 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -1,5 +1,3 @@ - - // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/speechx/speechx/utils/file_utils.cc b/speechx/speechx/utils/file_utils.cc index e5943e31..c42a642c 100644 --- a/speechx/speechx/utils/file_utils.cc +++ b/speechx/speechx/utils/file_utils.cc @@ -40,4 +40,4 @@ std::string ReadFile2String(const std::string& path) { return std::string((std::istreambuf_iterator(input_file)), std::istreambuf_iterator()); } -} +} // namespace ppspeech From 8ef3b339ea829661e3f0a4da24e77f4f096ada6f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 09:35:27 +0000 Subject: [PATCH 57/60] fix cpplint --- speechx/speechx/base/basic_types.h | 24 ++++++++++++------------ speechx/speechx/base/macros.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 96bc0ca5..3e298b1b 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -43,18 +43,18 @@ typedef unsigned long long uint64; // NOLINT typedef signed int char32; -const uint8 kuint8max = (static_cast 0xFF); -const uint16 kuint16max = (static_cast 0xFFFF); -const uint32 kuint32max = (static_cast 0xFFFFFFFF); -const uint64 kuint64max = (static_cast(0xFFFFFFFFFFFFFFFFLL)); -const int8 kint8min = (static_cast 0x80); -const int8 kint8max = (static_cast 0x7F); -const int16 kint16min = (static_cast 0x8000); -const int16 kint16max = (static_cast 0x7FFF); -const int32 kint32min = (static_cast 0x80000000); -const int32 kint32max = (static_cast 0x7FFFFFFF); -const int64 kint64min = (static_cast(0x8000000000000000LL)); -const int64 kint64max = (static_cast(0x7FFFFFFFFFFFFFFFLL)); +const uint8 kuint8max = static_cast(0xFF); +const uint16 kuint16max = static_cast(0xFFFF); +const uint32 kuint32max = static_cast(0xFFFFFFFF); +const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFLL); +const int8 kint8min = static_cast (0x80); +const int8 kint8max = static_cast (0x7F); +const int16 
kint16min = static_cast (0x8000); +const int16 kint16max = static_cast (0x7FFF); +const int32 kint32min = static_cast(0x80000000); +const int32 kint32max = static_cast(0x7FFFFFFF); +const int64 kint64min = static_cast(0x8000000000000000LL); +const int64 kint64max = static_cast(0x7FFFFFFFFFFFFFFFLL); const BaseFloat kBaseFloatMax = std::numeric_limits::max(); const BaseFloat kBaseFloatMin = std::numeric_limits::min(); diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h index 95608f40..db989812 100644 --- a/speechx/speechx/base/macros.h +++ b/speechx/speechx/base/macros.h @@ -26,6 +26,6 @@ namespace ppspeech { #endif // kSpaceSymbol in UTF-8 is: ▁ -const char[] kSpaceSymbol = "\xe2\x96\x81"; +const char kSpaceSymbo[] = "\xe2\x96\x81"; } // namespace ppspeech From 4dbff16f7062aee8a13bb5fad78b1b5d820f4563 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 24 Oct 2022 11:01:27 +0000 Subject: [PATCH 58/60] fix format --- speechx/speechx/base/basic_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h index 3e298b1b..2b15a61f 100644 --- a/speechx/speechx/base/basic_types.h +++ b/speechx/speechx/base/basic_types.h @@ -47,10 +47,10 @@ const uint8 kuint8max = static_cast(0xFF); const uint16 kuint16max = static_cast(0xFFFF); const uint32 kuint32max = static_cast(0xFFFFFFFF); const uint64 kuint64max = static_cast(0xFFFFFFFFFFFFFFFFLL); -const int8 kint8min = static_cast (0x80); -const int8 kint8max = static_cast (0x7F); -const int16 kint16min = static_cast (0x8000); -const int16 kint16max = static_cast (0x7FFF); +const int8 kint8min = static_cast(0x80); +const int8 kint8max = static_cast(0x7F); +const int16 kint16min = static_cast(0x8000); +const int16 kint16max = static_cast(0x7FFF); const int32 kint32min = static_cast(0x80000000); const int32 kint32max = static_cast(0x7FFFFFFF); const int64 kint64min = static_cast(0x8000000000000000LL); From 
aaf39863e03a52ec4c1cfc9e580c4c73d277f3bc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 25 Oct 2022 06:35:20 +0000 Subject: [PATCH 59/60] more info --- .../decoder/ctc_prefix_beam_search_decoder.cc | 2 +- .../speechx/recognizer/u2_recognizer_main.cc | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 0a0afcd7..03a7c133 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -74,7 +74,7 @@ void CTCPrefixBeamSearch::AdvanceDecode( std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); if (flag == false) { - LOG(INFO) << "decoder advance decode exit." << frame_prob.size(); + VLOG(1) << "decoder advance decode exit." << frame_prob.size(); break; } diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index c02e1c23..61330259 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -49,11 +49,13 @@ int main(int argc, char* argv[]) { kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { + kaldi::Timer local_timer; std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); LOG(INFO) << "utt: " << utt; LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; - tot_wav_duration += wave_data.Duration(); + double dur = wave_data.Duration(); + tot_wav_duration += dur; int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), @@ -63,6 +65,7 @@ int main(int argc, char* argv[]) { int sample_offset = 0; int cnt = 0; + while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); @@ -78,8 +81,10 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } 
recognizer.Decode(); - LOG(INFO) << "Pratial result: " << cnt << " " - << recognizer.GetPartialResult(); + if (recognizer.DecodedSomething()) { + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); + } // no overlap sample_offset += cur_chunk_size; @@ -101,7 +106,9 @@ int main(int argc, char* argv[]) { continue; } - LOG(INFO) << " the result of " << utt << " is " << result; + LOG(INFO) << utt << " " << result; + LOG(INFO) << " RTF: " << dur / local_timer.Elapsed() << " dur: " << dur + << " cost: " << local_timer.Elapsed(); result_writer.Write(utt, result); @@ -111,7 +118,7 @@ int main(int argc, char* argv[]) { double elapsed = timer.Elapsed(); LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); - LOG(INFO) << "cost:" << elapsed << " sec"; + LOG(INFO) << "total cost:" << elapsed << " sec"; LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; - LOG(INFO) << "the RTF is: " << elapsed / tot_wav_duration; + LOG(INFO) << "RTF is: " << elapsed / tot_wav_duration; } From b4d1dc1d6526b45b417327a1f4d1a35228c385ca Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 25 Oct 2022 06:53:35 +0000 Subject: [PATCH 60/60] fix rtf compute --- speechx/speechx/recognizer/u2_recognizer_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 61330259..5cb8dbb1 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -107,7 +107,7 @@ int main(int argc, char* argv[]) { } LOG(INFO) << utt << " " << result; - LOG(INFO) << " RTF: " << dur / local_timer.Elapsed() << " dur: " << dur + LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur << " cost: " << local_timer.Elapsed(); result_writer.Write(utt, result);