[runtime] optimize compilation and add vad interface ()

* vad recipe ok

* refactor vad, add vad conf, vad interface, vad recipe

* format

* install vad lib/bin/inc

* using cpack

* add vad doc, fix vad state name

* add comment

* refactor fastdeploy download

* add vad jni; format code

* add timer; compute vad rtf; vad add beam param

* android find_library

* fix log; add vad rtf

* fix glog

* fix BUILD_TYPE bug

* update doc

* rm jni
pull/3050/head
Hui Zhang 2 years ago committed by GitHub
parent 2beb7ffce0
commit bf914a9c8b

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
# >=3.17 support -DCMAKE_FIND_DEBUG_MODE=ON
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
set(CMAKE_PROJECT_INCLUDE_BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/EnableCMP0048.cmake")
@ -6,20 +7,12 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(system)
# Ninja Generator will set CMAKE_BUILD_TYPE to Debug
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE
"Release"
CACHE
STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
project(paddlespeech VERSION 0.1)
include(FetchContent)
include(ExternalProject)
set(PPS_VERSION_MAJOR 1)
set(PPS_VERSION_MINOR 0)
set(PPS_VERSION_PATCH 0)
set(PPS_VERSION "${PPS_VERSION_MAJOR}.${PPS_VERSION_MINOR}.${PPS_VERSION_PATCH}")
# fc_patch dir
set(FETCHCONTENT_QUIET off)
@ -27,21 +20,36 @@ get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR
set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_FIND_DEBUG_MODE OFF)
set(PPS_CXX_STANDARD 14)
# set std-14
set(CMAKE_CXX_STANDARD ${PPS_CXX_STANDARD})
add_compile_options(-fPIC)
# compiler option
# Keep the same with openfst, -fPIC or -fpic
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ldl")
SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ggdb")
SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall")
# Ninja Generator will set CMAKE_BUILD_TYPE to Debug
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE)
endif()
# find_* e.g. find_library work when Cross-Compiling
if(ANDROID)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH)
endif()
# install dir into `build/install`
set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/install)
include(FetchContent)
include(ExternalProject)
###############################################################################
# Option Configurations
###############################################################################
# https://github.com/google/brotli/pull/655
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
option(WITH_ASR "build asr" ON)
option(WITH_CLS "build cls" ON)
option(WITH_VAD "build vad" ON)
@ -77,6 +85,7 @@ endif()
###############################################################################
# Find Package
###############################################################################
# https://github.com/Kitware/CMake/blob/v3.1.0/Modules/FindThreads.cmake#L207
find_package(Threads REQUIRED)
if(WITH_ASR)
@ -157,6 +166,22 @@ include(summary)
###############################################################################
# Add local library
###############################################################################
set(ENGINE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/engine)
set(ENGINE_ROOT ${CMAKE_SOURCE_DIR}/engine)
add_subdirectory(engine)
add_subdirectory(engine)
###############################################################################
# CPack library
###############################################################################
# build a CPack driven installer package
include (InstallRequiredSystemLibraries)
set(CPACK_PACKAGE_NAME "paddlespeech_library")
set(CPACK_PACKAGE_VENDOR "paddlespeech")
set(CPACK_PACKAGE_VERSION_MAJOR 1)
set(CPACK_PACKAGE_VERSION_MINOR 0)
set(CPACK_PACKAGE_VERSION_PATCH 0)
set(CPACK_PACKAGE_DESCRIPTION "paddlespeech library")
set(CPACK_PACKAGE_CONTACT "paddlespeech@baidu.com")
set(CPACK_SOURCE_GENERATOR "TGZ")
include (CPack)

@ -1,8 +1,20 @@
#!/usr/bin/env bash
set -xe
BUILD_ROOT=build/Linux
BUILD_DIR=${BUILD_ROOT}/x86_64
mkdir -p ${BUILD_DIR}
# The build script has been verified in the PaddlePaddle docker image.
# Please follow the instructions below to install the PaddlePaddle docker image.
# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
cmake -B build -DWITH_ASR=ON -DWITH_CLS=OFF -DWITH_VAD=OFF
cmake --build build -j
#cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON -DFASTDEPLOY_INSTALL_DIR=/workspace/zhanghui/paddle/FastDeploy/build/Android/arm64-v8a-api-21/install
cmake -B ${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_ASR=OFF \
-DWITH_CLS=OFF \
-DWITH_VAD=ON \
-DFASTDEPLOY_INSTALL_DIR=/workspace/zhanghui/paddle/FastDeploy/build/Linux/x86_64/install
cmake --build ${BUILD_DIR} -j

@ -14,8 +14,8 @@ TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake
# Create build directory
BUILD_ROOT=build/Android
BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-21
#FASDEPLOY_INSTALL_DIR="${BUILD_DIR}/install"
#mkdir build && mkdir ${BUILD_ROOT} && mkdir ${BUILD_DIR}
FASTDEPLOY_INSTALL_DIR="/workspace/zhanghui/paddle/FastDeploy/build/Android/arm64-v8a-api-21/install"
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
@ -27,10 +27,13 @@ cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \
-DANDROID_PLATFORM=${ANDROID_PLATFORM} \
-DANDROID_STL=${ANDROID_STL} \
-DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_ASR=OFF \
-DWITH_CLS=OFF \
-DWITH_VAD=ON \
-DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \
-DCMAKE_FIND_DEBUG_MODE=OFF \
-Wno-dev ../../..
#-DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \
# Build FastDeploy Android C++ SDK
make

@ -1,42 +1,119 @@
set(ARCH "mserver_x86_64" CACHE STRING "Target Architecture:
android_arm, android_armv7, android_armv8, android_x86, android_x86_64,
mserver_x86_64, ubuntu_x86_64, ios_armv7, ios_armv7s, ios_armv8, ios_x86_64, ios_x86,
windows_x86")
set(FASTDEPLOY_DIR ${CMAKE_SOURCE_DIR}/fc_patch/fastdeploy)
if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz)
exec_program("mkdir -p ${FASTDEPLOY_DIR} &&
wget -c https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz -P ${FASTDEPLOY_DIR} &&
tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4.tgz -C ${FASTDEPLOY_DIR} &&
mv ${FASTDEPLOY_DIR}/fastdeploy-linux-x64-1.0.4 ${FASTDEPLOY_DIR}/linux-x64")
endif()
include(FetchContent)
if(NOT EXISTS ${FASTDEPLOY_DIR}/fastdeploy-android-1.0.4-shared.tgz)
exec_program("mkdir -p ${FASTDEPLOY_DIR} &&
wget -c https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.4-shared.tgz -P ${FASTDEPLOY_DIR} &&
tar xzvf ${FASTDEPLOY_DIR}/fastdeploy-android-1.0.4-shared.tgz -C ${FASTDEPLOY_DIR} &&
mv ${FASTDEPLOY_DIR}/fastdeploy-android-1.0.4-shared ${FASTDEPLOY_DIR}/android-armv7v8")
endif()
set(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 1 # Wrap download in script to log output
LOG_UPDATE 1 # Wrap update in script to log output
LOG_PATCH 1
LOG_CONFIGURE 1# Wrap configure in script to log output
LOG_BUILD 1 # Wrap build in script to log output
LOG_INSTALL 1
LOG_TEST 1 # Wrap test in script to log output
LOG_MERGED_STDOUTERR 1
LOG_OUTPUT_ON_FAILURE 1
)
if(NOT FASTDEPLOY_INSTALL_DIR)
if(ANDROID)
FetchContent_Declare(
fastdeploy
URL https://bj.bcebos.com/fastdeploy/release/android/fastdeploy-android-1.0.4-shared.tgz
URL_HASH MD5=2a15301158e9eb157a4f11283689e7ba
${EXTERNAL_PROJECT_LOG_ARGS}
)
add_definitions("-DUSE_PADDLE_LITE_BAKEND")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g0 -O3 -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")
else() # Linux
FetchContent_Declare(
fastdeploy
URL https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-1.0.4.tgz
URL_HASH MD5=125df3bfce603521960cc5c8b47faab0
${EXTERNAL_PROJECT_LOG_ARGS}
)
add_definitions("-DUSE_PADDLE_INFERENCE_BACKEND")
# add_definitions("-DUSE_ORT_BACKEND")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse -msse2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse -msse2 -mavx -O3")
endif()
if(ANDROID)
set(FASTDEPLOY_INSTALL_DIR ${FASTDEPLOY_DIR}/android-armv7v8)
add_definitions("-DUSE_PADDLE_LITE_BAKEND")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g0 -O3 -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")
elseif(UNIX)
set(FASTDEPLOY_INSTALL_DIR ${FASTDEPLOY_DIR}/linux-x64)
add_definitions("-DUSE_PADDLE_INFERENCE_BACKEND")
# add_definitions("-DUSE_ORT_BACKEND")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -msse -msse2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -msse -msse2 -mavx -O3")
FetchContent_MakeAvailable(fastdeploy)
set(FASTDEPLOY_INSTALL_DIR ${fc_patch}/fastdeploy-src)
endif()
message(STATUS "FASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} ${UNIX}")
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
# fix compiler flags conflict, since fastdeploy using c++11 for project
# this line must after `include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)`
set(CMAKE_CXX_STANDARD ${PPS_CXX_STANDARD})
include_directories(${FASTDEPLOY_INCS})
message(STATUS "FASTDEPLOY_INCS=${FASTDEPLOY_INCS}")
# install fastdeploy and dependents lib
# install_fastdeploy_libraries(${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR})
# No dynamic libs need to install while using
# FastDeploy static lib.
if(ANDROID AND WITH_ANDROID_STATIC_LIB)
return()
endif()
set(DYN_LIB_SUFFIX "*.so*")
if(WIN32)
set(DYN_LIB_SUFFIX "*.dll")
elseif(APPLE)
set(DYN_LIB_SUFFIX "*.dylib*")
endif()
if(FastDeploy_DIR)
set(DYN_SEARCH_DIR ${FastDeploy_DIR})
elseif(FASTDEPLOY_INSTALL_DIR)
set(DYN_SEARCH_DIR ${FASTDEPLOY_INSTALL_DIR})
else()
message(FATAL_ERROR "Please set FastDeploy_DIR/FASTDEPLOY_INSTALL_DIR before call install_fastdeploy_libraries.")
endif()
file(GLOB_RECURSE ALL_NEED_DYN_LIBS ${DYN_SEARCH_DIR}/lib/${DYN_LIB_SUFFIX})
file(GLOB_RECURSE ALL_DEPS_DYN_LIBS ${DYN_SEARCH_DIR}/third_libs/${DYN_LIB_SUFFIX})
if(ENABLE_VISION)
# OpenCV
if(ANDROID)
file(GLOB_RECURSE ALL_OPENCV_DYN_LIBS ${OpenCV_NATIVE_DIR}/libs/${DYN_LIB_SUFFIX})
else()
file(GLOB_RECURSE ALL_OPENCV_DYN_LIBS ${OpenCV_DIR}/../../${DYN_LIB_SUFFIX})
endif()
list(REMOVE_ITEM ALL_DEPS_DYN_LIBS ${ALL_OPENCV_DYN_LIBS})
if(WIN32)
file(GLOB OPENCV_DYN_LIBS ${OpenCV_DIR}/x64/vc15/bin/${DYN_LIB_SUFFIX})
install(FILES ${OPENCV_DYN_LIBS} DESTINATION lib)
elseif(ANDROID AND (NOT WITH_ANDROID_OPENCV_STATIC))
file(GLOB OPENCV_DYN_LIBS ${OpenCV_NATIVE_DIR}/libs/${ANDROID_ABI}/${DYN_LIB_SUFFIX})
install(FILES ${OPENCV_DYN_LIBS} DESTINATION lib)
else() # linux/mac
file(GLOB OPENCV_DYN_LIBS ${OpenCV_DIR}/lib/${DYN_LIB_SUFFIX})
install(FILES ${OPENCV_DYN_LIBS} DESTINATION lib)
endif()
# FlyCV
if(ENABLE_FLYCV)
file(GLOB_RECURSE ALL_FLYCV_DYN_LIBS ${FLYCV_LIB_DIR}/${DYN_LIB_SUFFIX})
list(REMOVE_ITEM ALL_DEPS_DYN_LIBS ${ALL_FLYCV_DYN_LIBS})
if(ANDROID AND (NOT WITH_ANDROID_FLYCV_STATIC))
install(FILES ${ALL_FLYCV_DYN_LIBS} DESTINATION lib)
endif()
endif()
endif()
if(ENABLE_OPENVINO_BACKEND)
# need plugins.xml for openvino backend
set(OPENVINO_RUNTIME_BIN_DIR ${OPENVINO_DIR}/bin)
file(GLOB OPENVINO_PLUGIN_XML ${OPENVINO_RUNTIME_BIN_DIR}/*.xml)
install(FILES ${OPENVINO_PLUGIN_XML} DESTINATION lib)
endif()
# Install other libraries
install(FILES ${ALL_NEED_DYN_LIBS} DESTINATION lib)
install(FILES ${ALL_DEPS_DYN_LIBS} DESTINATION lib)

@ -9,3 +9,5 @@ FetchContent_MakeAvailable(gflags)
# openfst need
include_directories(${gflags_BINARY_DIR}/include)
install(FILES ${gflags_BINARY_DIR}/libgflags_nothreads.a DESTINATION lib)

@ -7,6 +7,19 @@ else() # UNIX
glog
URL https://paddleaudio.bj.bcebos.com/build/glog-0.4.0.zip
URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DWITH_GFLAGS=OFF
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
)
FetchContent_MakeAvailable(glog)
include_directories(${glog_BINARY_DIR} ${glog_SOURCE_DIR}/src)
@ -15,7 +28,8 @@ endif()
if(ANDROID)
add_library(extern_glog INTERFACE)
add_dependencies(extern_glog gflags)
else() # UNIX
add_dependencies(glog gflags)
add_library(extern_glog ALIAS glog)
add_dependencies(extern_glog gflags)
endif()

@ -10,9 +10,19 @@ include(FetchContent)
#Application of Automata, (CIAA 2007), volume 4783 of Lecture Notes in
#Computer Science, pages 11-23. Springer, 2007. http://www.openfst.org.
set(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 1 # Wrap download in script to log output
LOG_UPDATE 1 # Wrap update in script to log output
LOG_CONFIGURE 1# Wrap configure in script to log output
LOG_BUILD 1 # Wrap build in script to log output
LOG_TEST 1 # Wrap test in script to log output
LOG_INSTALL 1 # Wrap install in script to log output
)
ExternalProject_Add(openfst
URL https://paddleaudio.bj.bcebos.com/build/openfst_1.7.2.zip
URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${openfst_PREFIX_DIR}
SOURCE_DIR ${openfst_SOURCE_DIR}
BINARY_DIR ${openfst_BINARY_DIR}

@ -15,6 +15,7 @@
function(pps_summary)
message(STATUS "")
message(STATUS "*************PaddleSpeech Building Summary**********")
message(STATUS " PPS_VERSION : ${PPS_VERSION}")
message(STATUS " CMake version : ${CMAKE_VERSION}")
message(STATUS " CMake command : ${CMAKE_COMMAND}")
message(STATUS " UNIX : ${UNIX}")
@ -24,10 +25,13 @@ function(pps_summary)
message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
message(STATUS " Compile definitions : ${tmp}")
message(STATUS " CMAKE_PREFIX_PATH : ${CMAKE_PREFIX_PATH}")
message(STATUS " CMAKE_CURRENT_BINARY_DIR : ${CMAKE_CURRENT_BINARY_DIR}")
message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " CMAKE_INSTALL_LIBDIR : ${CMAKE_INSTALL_LIBDIR}")
message(STATUS " CMAKE_MODULE_PATH : ${CMAKE_MODULE_PATH}")
message(STATUS " CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}")
message(STATUS "")
@ -39,6 +43,8 @@ function(pps_summary)
message(STATUS " WITH_TESTING : ${WITH_TESTING}")
message(STATUS " WITH_PROFILING : ${WITH_PROFILING}")
message(STATUS " FASTDEPLOY_INSTALL_DIR : ${FASTDEPLOY_INSTALL_DIR}")
message(STATUS " FASTDEPLOY_INCS : ${FASTDEPLOY_INCS}")
message(STATUS " FASTDEPLOY_LIBS : ${FASTDEPLOY_LIBS}")
if(WITH_GPU)
message(STATUS " CUDA_DIRECTORY : ${CUDA_DIRECTORY}")
endif()

@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "base/common.h"
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "frontend/data_cache.h"
#include "fst/symbol-table.h"
#include "kaldi/util/table-types.h"
@ -117,9 +117,9 @@ int main(int argc, char* argv[]) {
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (this_chunk_size < receptive_field_length) {
LOG(WARNING) << "utt: " << utt << " skip last "
<< this_chunk_size << " frames, expect is "
<< receptive_field_length;
LOG(WARNING)
<< "utt: " << utt << " skip last " << this_chunk_size
<< " frames, expect is " << receptive_field_length;
break;
}

@ -14,8 +14,8 @@
// TODO: refactor, replace with gtest
#include "decoder/ctc_tlg_decoder.h"
#include "base/common.h"
#include "decoder/ctc_tlg_decoder.h"
#include "decoder/param.h"
#include "frontend/data_cache.h"
#include "kaldi/util/table-types.h"

@ -13,12 +13,13 @@
// limitations under the License.
#include "nnet/nnet_producer.h"
#include "matrix/kaldi-matrix.h"
namespace ppspeech {
using std::vector;
using kaldi::BaseFloat;
using std::vector;
NnetProducer::NnetProducer(std::shared_ptr<NnetBase> nnet,
std::shared_ptr<FrontendInterface> frontend)

@ -13,13 +13,13 @@
// limitations under the License.
#include "nnet/u2_nnet.h"
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/assembler.h"
#include "frontend/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/u2_nnet.h"
DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
@ -93,9 +93,9 @@ int main(int argc, char* argv[]) {
ori_feature_len - chunk_idx * chunk_stride, chunk_size);
}
if (this_chunk_size < receptive_field_length) {
LOG(WARNING) << "utt: " << utt << " skip last "
<< this_chunk_size << " frames, expect is "
<< receptive_field_length;
LOG(WARNING)
<< "utt: " << utt << " skip last " << this_chunk_size
<< " frames, expect is " << receptive_field_length;
break;
}

@ -13,7 +13,6 @@
// limitations under the License.
#include "nnet/u2_nnet.h"
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/feature_pipeline.h"
@ -21,6 +20,7 @@
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/nnet_producer.h"
#include "nnet/u2_nnet.h"
DEFINE_string(wav_rspecifier, "", "test wav rspecifier");
DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier");

@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "recognizer/u2_recognizer.h"
#include "common/base/thread_pool.h"
#include "common/utils/file_utils.h"
#include "common/utils/strings.h"
@ -20,6 +19,7 @@
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "nnet/u2_nnet.h"
#include "recognizer/u2_recognizer.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "recognizer/u2_recognizer.h"
#include "decoder/param.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/u2_recognizer.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "recognizer/u2_recognizer.h"
#include "decoder/param.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/u2_recognizer.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");

@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "websocket/websocket_client.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
#include "websocket/websocket_client.h"
DEFINE_string(host, "127.0.0.1", "host of websocket server");
DEFINE_int32(port, 8082, "port of websocket server");

@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "websocket/websocket_server.h"
#include "decoder/param.h"
#include "websocket/websocket_server.h"
DEFINE_int32(port, 8082, "websocket listening port");

@ -3,7 +3,7 @@ set(srcs
panns_interface.cc
)
add_library(cls SHARED ${srcs})
add_library(cls ${srcs})
target_link_libraries(cls INTERFACE -static-libstdc++;-Wl,-Bsymbolic ${FASTDEPLOY_LIBS} kaldi-matrix kaldi-base frontend utils )
set(bin_name panns_nnet_main)

@ -13,6 +13,7 @@
// limitations under the License.
#include "cls/nnet/panns_interface.h"
#include "cls/nnet/panns_nnet.h"
#include "common/base/config.h"

@ -14,6 +14,7 @@
#include <fstream>
#include <string>
#include "base/flags.h"
#include "cls/nnet/panns_interface.h"

@ -12,4 +12,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/frontend
add_subdirectory(frontend)
add_library(common INTERFACE)
add_definitions(common base utils kaldi-matrix frontend)
target_link_libraries(common INTERFACE base utils kaldi-matrix frontend)
install(TARGETS base DESTINATION lib)
install(TARGETS utils DESTINATION lib)
install(TARGETS kaldi-matrix DESTINATION lib)
install(TARGETS frontend DESTINATION lib)

@ -36,6 +36,7 @@ if(ANDROID)
glog_utils.cc
)
add_library(base ${csrc})
target_link_libraries(base gflags)
else() # UNIX
set(csrc)
add_library(base INTERFACE)

@ -28,7 +28,7 @@ typedef int int32; // NOLINT
#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
typedef long int64; // NOLINT
#else
typedef long long int64; // NOLINT
typedef long long int64; // NOLINT
#endif
typedef unsigned char uint8; // NOLINT

@ -50,4 +50,5 @@
#include "base/log.h"
#include "base/macros.h"
#include "utils/file_utils.h"
#include "utils/math.h"
#include "utils/math.h"
#include "utils/timer.h"

@ -10,11 +10,14 @@ using namespace std;
#pragma once
#ifdef _MSC_VER
#pragma region ParseIniFile
#endif
/*
* \brief Generic configuration Class
*
*/
* \brief Generic configuration Class
*
*/
class Config {
// Data
protected:
@ -32,7 +35,7 @@ class Config {
std::string comment = "#");
Config();
template <class T>
T Read(const std::string& in_key) const; //!<Search for key and read value
T Read(const std::string& in_key) const; //!< Search for key and read value
//! or optional default value, call
//! as read<T>
template <class T>
@ -335,4 +338,6 @@ void Config::ReadFile(string filename, string delimiter, string comment) {
in >> (*this);
}
#ifdef _MSC_VER
#pragma endregion ParseIniFIle
#endif
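For reference, a minimal sketch of how the Config class above is consumed elsewhere in this change (see PPSVadCreateInstance), assuming an INI-style file of `key = value` lines with `#` comments; the path is a placeholder:
```c++
#include <string>
#include "common/base/config.h"

void LoadVadConf(const char* path /* e.g. "./vad.conf" */) {
    Config conf(path);
    // Read(key, default) falls back to the default when the key is absent.
    int sr = conf.Read("sr", 16000);
    float threshold = conf.Read("threshold", 0.45f);
    std::string model = conf.Read("model_path", std::string(""));
    (void)sr; (void)threshold; (void)model;
}
```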

@ -29,9 +29,9 @@ LogMessage::LogMessage(const char* file,
bool out_to_file /* = false */)
: level_(level), verbose_(verbose), out_to_file_(out_to_file) {
if (FLAGS_logtostderr == 0) {
stream_ = std::shared_ptr<std::ostream>(&std::cout);
stream_ = static_cast<std::ostream*>(&std::cout);
} else if (FLAGS_logtostderr == 1) {
stream_ = std::shared_ptr<std::ostream>(&std::cerr);
stream_ = static_cast<std::ostream*>(&std::cerr);
} else if (out_to_file_) {
// logfile
lock_.lock();
@ -46,11 +46,21 @@ LogMessage::~LogMessage() {
lock_.unlock();
}
if (level_ == FATAL) {
if (verbose_ && level_ == FATAL) {
std::abort();
}
}
std::ostream* LogMessage::nullstream() {
thread_local static std::ofstream os;
thread_local static bool flag_set = false;
if (!flag_set) {
os.setstate(std::ios_base::badbit);
flag_set = true;
}
return &os;
}
void LogMessage::init(const char* file, int line) {
time_t t = time(0);
char tmp[100];
@ -73,30 +83,20 @@ void LogMessage::init(const char* file, int line) {
std::string("log." + proc_name + ".log.FATAL." + tmp + "." + pid);
}
std::ofstream ofs;
thread_local static std::ofstream ofs;
if (level_ == DEBUG) {
stream_ = std::make_shared<std::ofstream>(
s_debug_logfile_.c_str(), std::ios::out | std::ios::app);
// ofs.open(s_debug_logfile_.c_str(), std::ios::out | std::ios::app);
ofs.open(s_debug_logfile_.c_str(), std::ios::out | std::ios::app);
} else if (level_ == INFO) {
// ofs.open(s_warning_logfile_.c_str(), std::ios::out | std::ios::app);
stream_ = std::make_shared<std::ofstream>(
s_warning_logfile_.c_str(), std::ios::out | std::ios::app);
ofs.open(s_info_logfile_.c_str(), std::ios::out | std::ios::app);
} else if (level_ == WARNING) {
// ofs.open(s_warning_logfile_.c_str(), std::ios::out | std::ios::app);
stream_ = std::make_shared<std::ofstream>(
s_warning_logfile_.c_str(), std::ios::out | std::ios::app);
ofs.open(s_warning_logfile_.c_str(), std::ios::out | std::ios::app);
} else if (level_ == ERROR) {
// ofs.open(s_error_logfile_.c_str(), std::ios::out | std::ios::app);
stream_ = std::make_shared<std::ofstream>(
s_error_logfile_.c_str(), std::ios::out | std::ios::app);
ofs.open(s_error_logfile_.c_str(), std::ios::out | std::ios::app);
} else {
// ofs.open(s_fatal_logfile_.c_str(), std::ios::out | std::ios::app);
stream_ = std::make_shared<std::ofstream>(
s_fatal_logfile_.c_str(), std::ios::out | std::ios::app);
ofs.open(s_fatal_logfile_.c_str(), std::ios::out | std::ios::app);
}
// stream_ = &ofs;
stream_ = &ofs;
stream() << tmp << " " << file << " line " << line << "; ";
stream() << std::flush;

@ -18,6 +18,9 @@
#pragma once
#include <stdlib.h>
#include <unistd.h>
#include <fstream>
#include <iostream>
#include <mutex>
@ -25,9 +28,6 @@
#include <string>
#include <thread>
#include <stdlib.h>
#include <unistd.h>
#include "base/common.h"
#include "base/macros.h"
#ifndef WITH_GLOG
@ -61,13 +61,15 @@ class LogMessage {
~LogMessage();
std::ostream& stream() { return *stream_; }
std::ostream& stream() { return verbose_ ? *stream_ : *nullstream(); }
private:
void init(const char* file, int line);
std::ostream* nullstream();
private:
std::shared_ptr<std::ostream> stream_;
std::ostream* stream_;
std::ostream* null_stream_;
Severity level_;
bool verbose_;
bool out_to_file_;
@ -88,14 +90,16 @@ class LogMessage {
} // namespace ppspeech
#ifndef NDEBUG
#define DLOG_DEBUG \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::DEBUG, false)
#ifdef NDEBUG
#define DLOG_INFO \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::INFO, false)
#define DLOG_WARNING \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::WARNING, false)
#define DLOG_ERROR \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::ERROR, false)
#define DLOG_FATAL \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::FATAL, false)
#else
#define DLOG_DEBUG \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::DEBUG, true)
#endif
#define DLOG_INFO \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::INFO, true)
#define DLOG_WARNING \
@ -104,17 +108,30 @@ class LogMessage {
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::ERROR, true)
#define DLOG_FATAL \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::FATAL, true)
#endif
#define DLOG_0 DLOG_DEBUG
#define DLOG_1 DLOG_INFO
#define DLOG_2 DLOG_WARNING
#define DLOG_3 DLOG_ERROR
#define DLOG_4 DLOG_FATAL
#define LOG(level) DLOG_##level.stream()
#define LOG_INFO \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::INFO, true)
#define LOG_WARNING \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::WARNING, true)
#define LOG_ERROR \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::ERROR, true)
#define LOG_FATAL \
ppspeech::log::LogMessage(__FILE__, __LINE__, ppspeech::log::FATAL, true)
#define VLOG(verboselevel) LOG(verboselevel)
#define LOG_0 LOG_DEBUG
#define LOG_1 LOG_INFO
#define LOG_2 LOG_WARNING
#define LOG_3 LOG_ERROR
#define LOG_4 LOG_FATAL
#define LOG(level) LOG_##level.stream()
#define DLOG(level) DLOG_##level.stream()
#define VLOG(verboselevel) LOG(verboselevel)
#define CHECK(exp) \
ppspeech::log::LogMessage( \
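A minimal usage sketch of the glog-free logging macros above (the non-`WITH_GLOG` build path); the stream-chaining on `CHECK` mirrors how it is used in vad_interface_main.cc below:
```c++
#include "base/log.h"

void LogDemo(int num_frames) {
    LOG(INFO) << "processed " << num_frames << " frames";
    DLOG(INFO) << "debug-only detail";  // compiled as a silent sink in NDEBUG builds
    CHECK(num_frames >= 0) << "frame count must be non-negative";
}
```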

@ -6,6 +6,7 @@ add_library(kaldi-native-fbank-core
mel-computations.cc
rfft.cc
)
target_link_libraries(kaldi-native-fbank-core PUBLIC utils base)
add_library(frontend STATIC
cmvn.cc
@ -15,7 +16,7 @@ add_library(frontend STATIC
assembler.cc
wave-reader.cc
)
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils base)
set(BINS
compute_fbank_main
@ -24,5 +25,6 @@ set(BINS
foreach(bin_name IN LISTS BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} PUBLIC frontend base utils kaldi-util gflags extern_glog)
# https://github.com/Kitware/CMake/blob/v3.1.0/Modules/FindThreads.cmake#L207
target_link_libraries(${bin_name} PUBLIC frontend base utils kaldi-util gflags Threads::Threads extern_glog)
endforeach()

@ -17,9 +17,8 @@
namespace ppspeech {
using kaldi::BaseFloat;
using std::vector;
using std::vector;
using std::unique_ptr;
using std::vector;
Assembler::Assembler(AssemblerOptions opts,
unique_ptr<FrontendInterface> base_extractor) {

@ -821,12 +821,12 @@ void cftfsub(int n, double *a, int *ip, int nw, double *w) {
} else
#endif /* USE_CDFT_THREADS */
if (n > 512) {
cftrec4(n, a, nw, w);
} else if (n > 128) {
cftleaf(n, 1, a, nw, w);
} else {
cftfx41(n, a, nw, w);
}
cftrec4(n, a, nw, w);
} else if (n > 128) {
cftleaf(n, 1, a, nw, w);
} else {
cftfx41(n, a, nw, w);
}
bitrv2(n, ip, a);
} else if (n == 32) {
cftf161(a, &w[nw - 8]);
@ -868,12 +868,12 @@ void cftbsub(int n, double *a, int *ip, int nw, double *w) {
} else
#endif /* USE_CDFT_THREADS */
if (n > 512) {
cftrec4(n, a, nw, w);
} else if (n > 128) {
cftleaf(n, 1, a, nw, w);
} else {
cftfx41(n, a, nw, w);
}
cftrec4(n, a, nw, w);
} else if (n > 128) {
cftleaf(n, 1, a, nw, w);
} else {
cftfx41(n, a, nw, w);
}
bitrv2conj(n, ip, a);
} else if (n == 32) {
cftf161(a, &w[nw - 8]);

@ -17,12 +17,13 @@
*/
#include "frontend/rfft.h"
#include "base/log.h"
#include <cmath>
#include <memory>
#include <vector>
#include "base/log.h"
// see fftsg.c
#ifdef __cplusplus
extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);

@ -19,6 +19,8 @@
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "frontend/wave-reader.h"
#include <algorithm>
#include <cstdio>
#include <limits>
@ -27,7 +29,6 @@
#include "base/kaldi-error.h"
#include "base/kaldi-utils.h"
#include "frontend/wave-reader.h"
namespace kaldi {
@ -243,10 +244,9 @@ void WaveInfo::Read(std::istream &is) {
<< ", data chunk size: " << data_chunk_size
<< ". Assume 'stream mode' (reading data to EOF).";
if (!is_stream_mode &&
std::abs(static_cast<int64>(riff_chunk_read) +
static_cast<int64>(data_chunk_size) -
static_cast<int64>(riff_chunk_size)) > 1) {
if (!is_stream_mode && std::abs(static_cast<int64>(riff_chunk_read) +
static_cast<int64>(data_chunk_size) -
static_cast<int64>(riff_chunk_size)) > 1) {
// We allow the size to be off by one without warning, because there is
// a
// weirdness in the format of RIFF files that means that the input may

@ -590,7 +590,7 @@ class MatrixBase {
* SpMatrix and use Eig() function there, which uses eigenvalue
* decomposition
* directly rather than SVD.
*/
*/
/// stream read.
/// Use instead of stream<<*this, if you want to add to existing contents.

@ -24,8 +24,10 @@
// limitations under the License.
#include "matrix/kaldi-vector.h"
#include <algorithm>
#include <string>
#include "matrix/kaldi-matrix.h"
namespace kaldi {

@ -90,7 +90,7 @@ typedef uint32 UnsignedMatrixIndexT;
// typedef size_t MatrixIndexT;
// typedef ssize_t SignedMatrixIndexT;
// typedef size_t UnsignedMatrixIndexT;
}
} // namespace kaldi
#endif // KALDI_MATRIX_MATRIX_COMMON_H_

@ -5,6 +5,7 @@ set(csrc
math.cc
strings.cc
audio_process.cc
timer.cc
)
add_library(utils ${csrc})

@ -0,0 +1,63 @@
// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu)
// Mobvoi Inc. (authors: Fangjun Kuang)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include "common/utils/timer.h"
namespace ppspeech{
struct TimerImpl{
TimerImpl() = default;
virtual ~TimerImpl() = default;
virtual void Reset() = 0;
// time in seconds
virtual double Elapsed() = 0;
};
class CpuTimerImpl : public TimerImpl {
public:
CpuTimerImpl() { Reset(); }
using high_resolution_clock = std::chrono::high_resolution_clock;
void Reset() override { begin_ = high_resolution_clock::now(); }
// time in seconds
double Elapsed() override {
auto end = high_resolution_clock::now();
auto dur =
std::chrono::duration_cast<std::chrono::microseconds>(end - begin_);
return dur.count() / 1000000.0;
}
private:
high_resolution_clock::time_point begin_;
};
Timer::Timer() {
impl_ = std::make_unique<CpuTimerImpl>();
}
Timer::~Timer() = default;
void Timer::Reset() const { impl_->Reset(); }
double Timer::Elapsed() const { return impl_->Elapsed(); }
} //namespace ppspeech

@ -0,0 +1,39 @@
// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu)
// Mobvoi Inc. (authors: Fangjun Kuang)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
namespace ppspeech {
struct TimerImpl;
class Timer {
public:
Timer();
~Timer();
void Reset() const;
// time in seconds
double Elapsed() const;
private:
std::unique_ptr<TimerImpl> impl_;
};
} //namespace ppspeech
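A short sketch of the Timer usage pattern the demo binaries rely on for RTF measurement (include path assumed to follow timer.cc):
```c++
#include "common/utils/timer.h"

double TimeWork() {
    ppspeech::Timer timer;   // starts counting on construction
    // ... run the workload to be measured ...
    return timer.Elapsed();  // wall-clock seconds since construction / last Reset()
}
```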

@ -1,5 +1,7 @@
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/../
)
add_subdirectory(nnet)
set(bin_name silero_vad_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc vad.cc)
target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} gflags extern_glog)
add_subdirectory(interface)

@ -17,6 +17,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
namespace wav {

@ -0,0 +1,25 @@
set(srcs
vad_interface.cc
)
add_library(pps_vad_interface ${srcs})
target_link_libraries(pps_vad_interface PUBLIC pps_vad extern_glog)
set(bin_name vad_interface_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} pps_vad_interface)
# set_target_properties(${bin_name} PROPERTIES PUBLIC_HEADER "vad_interface.h;../frontend/wav.h")
file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR})
install(TARGETS pps_vad_interface DESTINATION lib)
install(FILES vad_interface.h DESTINATION include/${DEST_DIR})
install(TARGETS vad_interface_main
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
PUBLIC_HEADER DESTINATION include/${DEST_DIR}
)
install(FILES vad_interface_main.cc DESTINATION demo/${DEST_DIR})

@ -0,0 +1,94 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad/interface/vad_interface.h"
#include "common/base/config.h"
#include "vad/nnet/vad.h"
PPSHandle_t PPSVadCreateInstance(const char* conf_path) {
Config conf(conf_path);
ppspeech::VadNnetConf nnet_conf;
nnet_conf.sr = conf.Read("sr", 16000);
nnet_conf.frame_ms = conf.Read("frame_ms", 32);
nnet_conf.threshold = conf.Read("threshold", 0.45f);
nnet_conf.beam = conf.Read("beam", 0.15f);
nnet_conf.min_silence_duration_ms =
conf.Read("min_silence_duration_ms", 200);
nnet_conf.speech_pad_left_ms = conf.Read("speech_pad_left_ms", 0);
nnet_conf.speech_pad_right_ms = conf.Read("speech_pad_right_ms", 0);
nnet_conf.model_file_path = conf.Read("model_path", std::string(""));
nnet_conf.param_file_path = conf.Read("param_path", std::string(""));
nnet_conf.num_cpu_thread = conf.Read("num_cpu_thread", 1);
ppspeech::Vad* model = new ppspeech::Vad(nnet_conf.model_file_path);
// custom config, but must be set before init
model->SetConfig(nnet_conf);
model->Init();
return static_cast<PPSHandle_t>(model);
}
int PPSVadDestroyInstance(PPSHandle_t instance) {
ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
if (model != nullptr) {
delete model;
model = nullptr;
}
return 0;
}
int PPSVadChunkSizeSamples(PPSHandle_t instance) {
ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
if (model == nullptr) {
printf("instance is null\n");
return -1;
}
return model->WindowSizeSamples();
}
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float* chunk,
int num_element) {
ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
if (model == nullptr) {
printf("instance is null\n");
return PPS_VAD_ILLEGAL;
}
std::vector<float> chunk_in(chunk, chunk + num_element);
if (!model->ForwardChunk(chunk_in)) {
printf("forward chunk failed\n");
return PPS_VAD_ILLEGAL;
}
ppspeech::Vad::State s = model->Postprocess();
PPSVadState_t ret = (PPSVadState_t)s;
return ret;
}
int PPSVadReset(PPSHandle_t instance) {
ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
if (model == nullptr) {
printf("instance is null\n");
return -1;
}
model->Reset();
return 0;
}
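PPSVadFeedForward above converts `ppspeech::Vad::State` to `PPSVadState_t` with a plain cast, so the two enums must keep the same ordering; a compile-time guard such as the following (not part of the original sources) would make that coupling explicit:
```c++
#include "vad/interface/vad_interface.h"
#include "vad/nnet/vad.h"

static_assert(static_cast<int>(ppspeech::Vad::State::ILLEGAL) == PPS_VAD_ILLEGAL,
              "vad_interface enum must mirror Vad::State");
static_assert(static_cast<int>(ppspeech::Vad::State::END) == PPS_VAD_END,
              "vad_interface enum must mirror Vad::State");
```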

@ -0,0 +1,46 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
typedef void* PPSHandle_t;
typedef enum {
PPS_VAD_ILLEGAL = 0, // error
PPS_VAD_SIL, // silence
PPS_VAD_START, // start speech
PPS_VAD_SPEECH, // in speech
PPS_VAD_END, // end speech
PPS_VAD_NUMSTATES, // number of states
} PPSVadState_t;
PPSHandle_t PPSVadCreateInstance(const char* conf_path);
int PPSVadDestroyInstance(PPSHandle_t instance);
int PPSVadReset(PPSHandle_t instance);
int PPSVadChunkSizeSamples(PPSHandle_t instance);
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float* chunk,
int num_element);
#ifdef __cplusplus
}
#endif // __cplusplus
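A minimal round trip through this C interface; the `key = value` config syntax is an assumption about the Config parser, and `vad.conf` / `silero_vad.onnx` are placeholder paths (a fuller example over a wav file follows in vad_interface_main.cc):
```c++
#include <fstream>
#include <vector>
#include "vad/interface/vad_interface.h"

int main() {
    // Keys mirror those read in PPSVadCreateInstance.
    std::ofstream conf("vad.conf");
    conf << "sr = 16000\n"
            "frame_ms = 32\n"
            "threshold = 0.45\n"
            "beam = 0.15\n"
            "min_silence_duration_ms = 200\n"
            "speech_pad_left_ms = 0\n"
            "speech_pad_right_ms = 0\n"
            "model_path = ./silero_vad.onnx\n"
            "num_cpu_thread = 1\n";
    conf.close();

    PPSHandle_t handle = PPSVadCreateInstance("vad.conf");
    int chunk_samples = PPSVadChunkSizeSamples(handle);  // e.g. 512 for 16 kHz, 32 ms
    std::vector<float> chunk(chunk_samples, 0.0f);       // feed real audio here
    PPSVadState_t state =
        PPSVadFeedForward(handle, chunk.data(), static_cast<int>(chunk.size()));
    PPSVadReset(handle);
    PPSVadDestroyInstance(handle);
    return state == PPS_VAD_ILLEGAL ? 1 : 0;
}
```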

@ -0,0 +1,71 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "common/base/common.h"
#include "vad/frontend/wav.h"
#include "vad/interface/vad_interface.h"
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cout << "Usage: vad_interface_main path/to/config path/to/audio "
"run_option, "
"e.g ./vad_interface_main config sample.wav"
<< std::endl;
return -1;
}
std::string config_path = argv[1];
std::string audio_file = argv[2];
PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str());
std::vector<float> inputWav; // [0, 1]
wav::WavReader wav_reader = wav::WavReader(audio_file);
auto sr = wav_reader.sample_rate();
CHECK(sr == 16000) << " sr is " << sr << " expect 16000";
auto num_samples = wav_reader.num_samples();
inputWav.resize(num_samples);
for (int i = 0; i < num_samples; i++) {
inputWav[i] = wav_reader.data()[i] / 32768;
}
ppspeech::Timer timer;
int window_size_samples = PPSVadChunkSizeSamples(handle);
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
auto start = j;
auto end = start + window_size_samples >= num_samples
? num_samples
: start + window_size_samples;
auto current_chunk_size = end - start;
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == static_cast<size_t>(current_chunk_size));
PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size());
std::cout << s << " ";
}
std::cout << std::endl;
std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
<< std::endl;
PPSVadReset(handle);
return 0;
}
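The `RTF` printed above is the ratio of processing time to audio duration, i.e. `RTF = timer.Elapsed() / (num_samples / sr)`; values below 1 mean faster-than-real-time processing. Note that if `num_samples` and `sr` are both integer types here, `num_samples / sr` truncates the audio duration to whole seconds before the division.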

@ -0,0 +1,16 @@
set(srcs
vad.cc
)
add_library(pps_vad ${srcs})
target_link_libraries(pps_vad PUBLIC ${FASTDEPLOY_LIBS} common extern_glog)
set(bin_name vad_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} pps_vad)
file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR})
install(TARGETS pps_vad DESTINATION lib)
install(TARGETS extern_glog DESTINATION lib)

@ -1,4 +1,5 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -11,20 +12,15 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad.h"
#include "vad/nnet/vad.h"
#include <cstring>
#include <iomanip>
#include "common/base/common.h"
#ifdef NDEBUG
#define LOG_DEBUG \
::fastdeploy::FDLogger(true, "[DEBUG]") << __REL_FILE__ << "(" << __LINE__ \
<< ")::" << __FUNCTION__ << "\t"
#else
#define LOG_DEBUG \
::fastdeploy::FDLogger(false, "[DEBUG]") \
<< __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
#endif
namespace ppspeech {
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption&
@ -48,18 +44,30 @@ Vad::Vad(const std::string& model_file,
}
void Vad::Init() {
std::call_once(init_, [&]() { initialized = Initialize(); });
std::lock_guard<std::mutex> lock(init_lock_);
Initialize();
}
std::string Vad::ModelName() const { return "VAD"; }
void Vad::SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms) {
if (initialized) {
void Vad::SetConfig(const VadNnetConf conf) {
SetConfig(conf.sr,
conf.frame_ms,
conf.threshold,
conf.beam,
conf.min_silence_duration_ms,
conf.speech_pad_left_ms,
conf.speech_pad_right_ms);
}
void Vad::SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const float& beam,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms) {
if (initialized_) {
fastdeploy::FDERROR << "SetConfig must be called before init"
<< std::endl;
throw std::runtime_error("SetConfig must be called before init");
@ -67,6 +75,7 @@ void Vad::SetConfig(int sr,
sample_rate_ = sr;
sr_per_ms_ = sr / 1000;
threshold_ = threshold;
beam_ = beam;
frame_ms_ = frame_ms;
min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
@ -76,8 +85,8 @@ void Vad::SetConfig(int sr,
window_size_samples_ = frame_ms * sr_per_ms_;
current_chunk_size_ = window_size_samples_;
fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold
<< " frame_ms=" << frame_ms
fastdeploy::FDINFO << "sr=" << sr_per_ms_ << " threshold=" << threshold_
<< " beam=" << beam_ << " frame_ms=" << frame_ms_
<< " min_silence_duration_ms=" << min_silence_duration_ms
<< " speech_pad_left_ms=" << speech_pad_left_ms
<< " speech_pad_right_ms=" << speech_pad_right_ms;
@ -114,12 +123,17 @@ bool Vad::Initialize() {
Reset();
// InitRuntime
if (!InitRuntime()) {
fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
<< std::endl;
return false;
}
initialized_ = true;
fastdeploy::FDINFO << "init done.";
return true;
}
@ -162,8 +176,8 @@ const Vad::State& Vad::Postprocess() {
if (outputProb_ < threshold_ && !triggerd_) {
// 1. Silence
LOG_DEBUG << "{ silence: " << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO) << "{ silence: " << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::SIL);
} else if (outputProb_ >= threshold_ && !triggerd_) {
// 2. Start
@ -172,27 +186,28 @@ const Vad::State& Vad::Postprocess() {
current_sample_ - current_chunk_size_ - speech_pad_left_samples_;
float start_sec = 1.0 * speech_start_ / sample_rate_;
speakStart_.emplace_back(start_sec);
LOG_DEBUG << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO) << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::START);
} else if (outputProb_ >= threshold_ - 0.15 && triggerd_) {
} else if (outputProb_ >= threshold_ - beam_ && triggerd_) {
// 3. Continue
if (temp_end_ != 0) {
// speech prob relaxation, speech continues again
LOG_DEBUG << "{ speech fake end(sil < min_silence_ms) to continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO)
<< "{ speech fake end(sil < min_silence_ms) to continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
temp_end_ = 0;
} else {
// speech prob relaxation, keep tracking speech
LOG_DEBUG << "{ speech continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO) << "{ speech continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
}
states_.emplace_back(Vad::State::SPEECH);
} else if (outputProb_ < threshold_ - 0.15 && triggerd_) {
} else if (outputProb_ < threshold_ - beam_ && triggerd_) {
// 4. End
if (temp_end_ == 0) {
temp_end_ = current_sample_;
@ -201,9 +216,9 @@ const Vad::State& Vad::Postprocess() {
// check possible speech end
if (current_sample_ - temp_end_ < min_silence_samples_) {
// a. silence < min_slience_samples, continue speaking
LOG_DEBUG << "{ speech fake end(sil < min_silence_ms): "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO) << "{ speech fake end(sil < min_silence_ms): "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::SIL);
} else {
// b. silence >= min_slience_samples, end speaking
@ -212,8 +227,8 @@ const Vad::State& Vad::Postprocess() {
triggerd_ = false;
auto end_sec = 1.0 * speech_end_ / sample_rate_;
speakEnd_.emplace_back(end_sec);
LOG_DEBUG << "{ speech end: " << end_sec
<< " s; prob: " << outputProb_ << " }";
DLOG(INFO) << "{ speech end: " << end_sec
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::END);
}
}
@ -303,4 +318,6 @@ std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
break;
}
return os;
}
}
} // namespace ppspeech
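For clarity, a standalone sketch of the threshold/beam hysteresis implemented in Vad::Postprocess() above: speech is entered at `threshold` and only released once the probability drops below `threshold - beam` (the real code additionally waits `min_silence_duration_ms` before emitting END); names below are illustrative, not the class's members:
```c++
enum class SimpleState { SIL, START, SPEECH, END };

SimpleState Step(float prob, bool* in_speech,
                 float threshold = 0.45f, float beam = 0.15f) {
    if (!*in_speech) {
        if (prob < threshold) return SimpleState::SIL;  // e.g. prob = 0.30 stays silent
        *in_speech = true;
        return SimpleState::START;                      // e.g. prob = 0.50 starts speech
    }
    if (prob >= threshold - beam) {
        return SimpleState::SPEECH;                     // e.g. prob = 0.35 keeps speech alive
    }
    *in_speech = false;
    return SimpleState::END;                            // e.g. prob = 0.20 releases the trigger
}
```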

@ -1,4 +1,5 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -11,33 +12,59 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <mutex>
#include <vector>
#include "./wav.h"
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/runtime.h"
#include "vad/frontend/wav.h"
namespace ppspeech {
struct VadNnetConf {
// wav
int sr;
int frame_ms;
float threshold;
float beam;
int min_silence_duration_ms;
int speech_pad_left_ms;
int speech_pad_right_ms;
// model
std::string model_file_path;
std::string param_file_path;
std::string dict_file_path;
int num_cpu_thread; // default: 1 thread
std::string backend; // ort,lite, etc.
};
class Vad : public fastdeploy::FastDeployModel {
public:
enum class State { SIL = 0, START, SPEECH, END };
enum class State { ILLEGAL = 0, SIL, START, SPEECH, END };
friend std::ostream& operator<<(std::ostream& os, const Vad::State& s);
Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option =
fastdeploy::RuntimeOption());
virtual ~Vad() {}
void Init();
void Reset();
void SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms);
void SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const float& beam,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms);
void SetConfig(const VadNnetConf conf);
bool ForwardChunk(std::vector<float>& chunk);
@ -78,7 +105,9 @@ class Vad : public fastdeploy::FastDeployModel {
bool Initialize();
private:
std::once_flag init_;
std::mutex init_lock_;
bool initialized_{false};
// input and output
std::vector<fastdeploy::FDTensor> inputTensors_;
std::vector<fastdeploy::FDTensor> outputTensors_;
@ -103,6 +132,7 @@ class Vad : public fastdeploy::FastDeployModel {
int sample_rate_ = 16000;
int frame_ms_ = 32; // 32, 64, 96 for 16k
float threshold_ = 0.5f;
float beam_ = 0.15f;
int64_t window_size_samples_; // support 256 512 768 for 8k; 512 1024 1536
// for 16k.
@ -122,3 +152,5 @@ class Vad : public fastdeploy::FastDeployModel {
const std::vector<int64_t> sr_node_dims_ = {1};
const std::vector<int64_t> hc_node_dims_ = {2, 1, 64};
};
} // namespace ppspeech

@ -1,11 +1,26 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad.h"
#include "common/base/common.h"
#include "vad/nnet/vad.h"
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio "
std::cout << "Usage: vad_nnet_main path/to/model path/to/audio "
"run_option, "
"e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav"
"e.g ./vad_nnet_main silero_vad.onnx sample.wav"
<< std::endl;
return -1;
}
@ -14,9 +29,9 @@ int main(int argc, char* argv[]) {
std::string audio_file = argv[2];
int sr = 16000;
Vad vad(model_file);
ppspeech::Vad vad(model_file);
// custom config, but must be set before init
vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
vad.SetConfig(sr, 32, 0.5f, 0.15, 200, 0, 0);
vad.Init();
std::vector<float> inputWav; // [0, 1]
@ -30,6 +45,7 @@ int main(int argc, char* argv[]) {
inputWav[i] = wav_reader.data()[i] / 32768;
}
ppspeech::Timer timer;
int window_size_samples = vad.WindowSizeSamples();
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
auto start = j;
@ -39,7 +55,7 @@ int main(int argc, char* argv[]) {
auto current_chunk_size = end - start;
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == current_chunk_size);
assert(r.size() == static_cast<size_t>(current_chunk_size));
if (!vad.ForwardChunk(r)) {
std::cerr << "Failed to inference while using model:"
@ -47,11 +63,14 @@ int main(int argc, char* argv[]) {
return false;
}
Vad::State s = vad.Postprocess();
ppspeech::Vad::State s = vad.Postprocess();
std::cout << s << " ";
}
std::cout << std::endl;
std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
<< std::endl;
std::vector<std::map<std::string, float>> result = vad.GetResult();
for (auto& res : result) {
std::cout << "speak start: " << res["start"]

@ -1,121 +0,0 @@
English | [简体中文](README_CN.md)
# Silero VAD Deployment Example
This directory provides an example in which `infer_onnx_silero_vad` quickly completes the deployment of VAD models on CPU/GPU.
Before deployment, two steps require confirmation.
- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following command in this directory.
```bash
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the VAD model file and test audio. After decompression, place the model and test audio in the infer_onnx_silero_vad.cc peer directory
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# inference
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
```
- The above command works for Linux or MacOS. Refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md) for SDK use-pattern in Windows
## VAD C++ Interface
### Vad Class
```c++
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```
**Parameter**
> * **model_file**(str): Model file path
> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. (use the default configuration)
### setAudioCofig function
**Must be called before the `init` function**
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
**Parameter**
> * **sr**(int): sampling rate
> * **frame_ms**(int): The length of each detection frame, and it is used to calculate the detection window size
> * **threshold**(float): Result probability judgment threshold
> * **min_silence_duration_ms**(int): The threshold used to calculate whether it is silence
> * **speech_pad_ms**(int): Used to calculate the end time of the speech
### init function
Used to initialize audio-related parameters.
```c++
void Vad::init();
```
### loadAudio function
Load audio.
```c++
void Vad::loadAudio(const std::string& wavPath)
```
**Parameter**
> * **wavPath**(str): Audio file path
### Predict function
Used to start model inference.
```c++
bool Vad::Predict();
```
### getResult function
**Used to obtain inference results**
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
float mergeThreshold = 0.3);
```
**Parameter**
> * **removeThreshold**(float): Discard result fragment threshold; If some recognition results are too short, they will be discarded according to this threshold
> * **expandHeadThreshold**(float): Offset at the beginning of the segment; The recognized start time may be too close to the voice part, so move forward the start time accordingly
> * **expandTailThreshold**(float): Offset at the end of the segment; The recognized end time may be too close to the voice part, so the end time is moved back accordingly
> * **mergeThreshold**(float): Some result segments are very close and can be combined into one, and the vocal segments can be combined accordingly
**The output result format is**`std::vector<std::map<std::string, float>>`
> Output a list, each element is a speech fragment
>
> Each clip can use 'start' to get the start time and 'end' to get the end time
### Tips
1. The `setAudioCofig` function must be called before the `init` function
2. The sampling rate of the input audio file must be consistent with that set in the code
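Putting the calls above together, a minimal usage sketch follows; the header name and parameter values are illustrative assumptions, not prescribed by this example.
```c++
#include <iostream>

#include "vad.h"  // assumed header that declares the Vad class

int main() {
    // The default RuntimeOption runs the ONNX model on CPU.
    Vad vad("./silero_vad.onnx");

    // setAudioCofig must be called before init(); values here are illustrative.
    vad.setAudioCofig(/*sr=*/16000, /*frame_ms=*/32, /*threshold=*/0.5f,
                      /*min_silence_duration_ms=*/200, /*speech_pad_ms=*/0);
    vad.init();

    // The sample rate of the wav must match the configured sr.
    vad.loadAudio("./silero_vad_sample.wav");
    vad.Predict();

    // Each element maps "start"/"end" to one speech segment's boundaries.
    for (const auto& segment : vad.getResult()) {
        std::cout << "speak start: " << segment.at("start")
                  << ", end: " << segment.at("end") << std::endl;
    }
    return 0;
}
```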
- [Model Description](../)
- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md)

@ -1,119 +0,0 @@
[English](README.md) | Simplified Chinese
# Silero VAD Deployment Example
This directory provides `infer_onnx_silero_vad` to quickly run Silero VAD model inference on CPU/GPU.
Before deployment, confirm the following two steps:
- 1. The software and hardware environment meets the requirements. Refer to [FastDeploy Environment Requirements](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. Download the precompiled deployment library and sample code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following commands in this directory.
```bash
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose an appropriate version from the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the VAD model file and test audio. After decompression, place the model and test audio in the same directory as infer_onnx_silero_vad.cc
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# Inference
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
```
The above commands only work for Linux or MacOS. For how to use the SDK on Windows, refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/cn/faq/use_sdk_on_windows.md)
## VAD C++ Interface
### Vad Class
```c++
Vad::Vad(const std::string& model_file,
         const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```
**Parameter**
> * **model_file**(str): Model file path
> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default (the default configuration is used)
### setAudioCofig function
**Must be called before the `init` function**
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
**Parameter**
> * **sr**(int): Sampling rate
> * **frame_ms**(int): The length of each detection frame, used to calculate the detection window size
> * **threshold**(float): Probability threshold for judging the result
> * **min_silence_duration_ms**(int): Threshold used to decide whether a region is silence
> * **speech_pad_ms**(int): Used to calculate the end time of the speech
### init function
Used to initialize audio-related parameters.
```c++
void Vad::init();
```
### loadAudio function
Load audio.
```c++
void Vad::loadAudio(const std::string& wavPath)
```
**Parameter**
> * **wavPath**(str): Audio file path
### Predict function
Used to run model inference.
```c++
bool Vad::Predict();
```
### getResult function
**Used to obtain the inference results**
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
    float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
    float mergeThreshold = 0.3);
```
**Parameter**
> * **removeThreshold**(float): Threshold for discarding result segments; segments that are too short are discarded according to this threshold
> * **expandHeadThreshold**(float): Offset applied to the start of a segment; the detected start time may be too close to the speech, so the start time is moved forward accordingly
> * **expandTailThreshold**(float): Offset applied to the end of a segment; the detected end time may be too close to the speech, so the end time is moved back accordingly
> * **mergeThreshold**(float): Segments that are very close together are merged into one speech segment according to this threshold
**The output format is** `std::vector<std::map<std::string, float>>`
> The output is a list where each element is a speech segment
>
> For each segment, 'start' gives the start time and 'end' gives the end time
### Tips
1. The `setAudioCofig` function must be called before the `init` function
2. The sampling rate of the input audio file must match the one set in the code
- [Model Description](../)
- [How to switch the model inference backend engine](../../../../docs/cn/faq/how_to_change_backend.md)

@ -1,18 +0,0 @@
# This contains the locations of binaries built and required for running the examples.
unset GREP_OPTIONS
ENGINE_ROOT=$PWD/../../../
ENGINE_BUILD=$ENGINE_ROOT/build/engine/asr
ENGINE_TOOLS=$ENGINE_ROOT/tools
TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
[ -d $ENGINE_BUILD ] || { echo "Error: 'build/runtime' directory not found. Please ensure that the project builds successfully"; }
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer
#PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH

@ -3,7 +3,7 @@
unset GREP_OPTIONS
ENGINE_ROOT=$PWD/../../../
ENGINE_BUILD=$ENGINE_ROOT/build/engine/asr
ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/asr
ENGINE_TOOLS=$ENGINE_ROOT/tools
TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin

@ -0,0 +1,261 @@
# Silero VAD - pre-trained enterprise-grade Voice Activity Detector
This directory provides VAD model deployment on CPU/GPU.
![](https://user-images.githubusercontent.com/36505480/198026365-8da383e0-5398-4a12-b7f8-22c2c0059512.png)
## VAD Interface
For the VAD interface, see [the interface directory](../../engine/vad/interface/).
### Create Handle
```c++
PPSHandle_t PPSVadCreateInstance(const char* conf_path);
```
### Destroy Handle
```c++
int PPSVadDestroyInstance(PPSHandle_t instance);
```
### Reset Vad State
```c++
int PPSVadReset(PPSHandle_t instance);
```
Reset the VAD state before processing the next `wav`.
### Get Chunk Size
```c++
int PPSVadChunkSizeSamples(PPSHandle_t instance);
```
This API returns the chunk size in samples.
Each call to forward must be fed `chunk size` samples, except for the last chunk.
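For example, assuming the default configuration shown later in this document (`sr = 16000`, `frame_ms = 32`), the chunk size would work out to 16000 × 32 / 1000 = 512 samples per call.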
### Vad Forward
```c++
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float* chunk,
int num_element);
```
The VAD has the following states:
```c++
typedef enum {
PPS_VAD_ILLEGAL = 0, // error
PPS_VAD_SIL, // silence
PPS_VAD_START, // start speech
PPS_VAD_SPEECH, // in speech
PPS_VAD_END, // end speech
PPS_VAD_NUMSTATES, // number of states
} PPSVadState_t;
```
If `PPSVadFeedForward` encounters an error, it returns the `PPS_VAD_ILLEGAL` state.
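A minimal end-to-end sketch built only from the calls above; the header name, the wav-reading helper, and the file paths are placeholder assumptions, not part of the documented API.
```c++
#include <algorithm>
#include <cstddef>
#include <vector>

#include "vad_interface.h"  // assumed header that declares the PPSVad* C API

// Placeholder: a real program would read 16 kHz float samples from the wav file.
static std::vector<float> ReadWav(const char* /*path*/) {
    return std::vector<float>(16000, 0.0f);  // one second of silence as a stand-in
}

int main() {
    PPSHandle_t handle = PPSVadCreateInstance("./conf/vad.ini");
    const int chunk_size = PPSVadChunkSizeSamples(handle);

    std::vector<float> samples = ReadWav("./data/silero_vad_sample.wav");

    // Feed chunk_size samples per call; only the last chunk may be shorter.
    for (std::size_t i = 0; i < samples.size(); i += chunk_size) {
        const int n = static_cast<int>(
            std::min<std::size_t>(chunk_size, samples.size() - i));
        PPSVadState_t state = PPSVadFeedForward(handle, samples.data() + i, n);
        if (state == PPS_VAD_ILLEGAL) break;  // error reported by the engine
    }

    PPSVadReset(handle);            // reset the state before the next wav
    PPSVadDestroyInstance(handle);
    return 0;
}
```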
## Linux
### Build Runtime
```bash
# cd /path/to/paddlespeech/runtime
cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON
cmake --build build
```
Since VAD uses the FastDeploy runtime, if you already have another FastDeploy library, you can build with:
```bash
# cd /path/to/paddlespeech/runtime
cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON -DFASTDEPLOY_INSTALL_DIR=/workspace//paddle/FastDeploy/build/Linux/x86_64/install
cmake --build build
```
`FASTDEPLOY_INSTALL_DIR` is the install directory of the FastDeploy library.
### Run Demo
After a successful build, run the demo from this example directory:
```bash
bash run.sh
```
The output looks like this:
```bash
/workspace//PaddleSpeech/runtime/engine/vad/nnet/vad.cc(88)::SetConfig sr=16 threshold=0.5 beam=0.15 frame_ms=32 min_silence_duration_ms=200 speech_pad_left_ms=0 speech_pad_right_ms=0[INFO] fastdeploy/runtime/runtime.cc(293)::CreateOrtBackend Runtime initialized with Backend::ORT in Device::CPU./workspace//PaddleSpeech/runtime/engine/vad/nnet/vad.cc(137)::Initialize init done.[SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL]
RTF=0.00774591
speak start: 0.32 s, end: 2.464 s | speak start: 3.296 s, end: 4.64 s | speak start: 5.408 s, end: 7.872 s | speak start: 8.192 s, end: 10.72 s
vad_nnet_main done!
sr = 16000
frame_ms = 32
threshold = 0.5
beam = 0.15
min_silence_duration_ms = 200
speech_pad_left_ms = 0
speech_pad_right_ms = 0
model_path = ./data/silero_vad/silero_vad.onnx
param_path = (default)
num_cpu_thread = 1 (default)
/workspace//PaddleSpeech/runtime/engine/vad/nnet/vad.cc(88)::SetConfig sr=16 threshold=0.5 beam=0.15 frame_ms=32 min_silence_duration_ms=200 speech_pad_left_ms=0 speech_pad_right_ms=0
[INFO] fastdeploy/runtime/runtime.cc(293)::CreateOrtBackend Runtime initialized with Backend::ORT in Device::CPU.
/workspace//PaddleSpeech/runtime/engine/vad/nnet/vad.cc(137)::Initialize init done.
1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1
RTF=0.00778218
vad_interface_main done!
```
## Android
To use it on Android, set up your `NDK` environment first, then run:
```bash
# cd /path/to/paddlespeech/runtime
bash build_android.sh
```
## Result
| Arch | RTF | Runtime Size |
|--|--|--|
| x86_64 | 0.00778218 | |
| arm64-v8a | 0.00744745 | ~10.532MB |
## Machine Information
#### x86_64
The environment is as follows:
```text
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 80
On-line CPU(s) list: 0-79
Thread(s) per core: 2
Core(s) per socket: 20
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 85
Model name: Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz
Stepping: 7
CPU MHz: 2599.998
BogoMIPS: 5199.99
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32K
L1i cache: 32K
L2 cache: 1024K
L3 cache: 33792K
NUMA node0 CPU(s): 0-39
NUMA node1 CPU(s): 40-79
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc eagerfpu pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 arat umip pku ospke avx512_vnni spec_ctrl arch_capabilities
```
#### arm64-v8a
```text
Processor : AArch64 Processor rev 14 (aarch64)
processor : 0
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x805
CPU revision : 14
processor : 1
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x805
CPU revision : 14
processor : 2
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x805
CPU revision : 14
processor : 3
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x805
CPU revision : 14
processor : 4
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x804
CPU revision : 14
processor : 5
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x804
CPU revision : 14
processor : 6
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x804
CPU revision : 14
processor : 7
BogoMIPS : 38.40
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0xd
CPU part : 0x804
CPU revision : 14
Hardware : Qualcomm Technologies, Inc SM8150
```
## Download Pre-trained ONNX Model
For developers' testing, the model exported for VAD is provided below and can be downloaded directly.
| Model | Size | Notes |
| :----------------------------------------------------------- | :---- | :----------------------------------------------------------- |
| [silero-vad](https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz) | 1.8MB | This model file is sourced from [snakers4/silero-vad](https://github.com/snakers4/silero-vad), MIT License |
## FastDeploy Runtime
For FastDeploy software and hardware requirements, and the prebuilt libraries, please see [FastDeploy](https://github.com/PaddlePaddle/FastDeploy):
- 1. [FastDeploy Environment Requirements](https://github.com/PaddlePaddle/FastDeploy/docs/en/build_and_install/download_prebuilt_libraries.md).
- 2. [FastDeploy Precompiled Library](https://github.com/PaddlePaddle/FastDeploy/docs/en/build_and_install/download_prebuilt_libraries.md).
## Reference
* https://github.com/snakers4/silero-vad
* https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/audio/silero-vad/README.md

@ -0,0 +1,11 @@
[model]
model_path=./data/silero_vad/silero_vad.onnx
[vad]
sr = 16000 # 16k
frame_ms = 32 # 32, 64, 96 for 16k
threshold = 0.5
beam = 0.15
min_silence_duration_ms = 200
speech_pad_left_ms = 0
speech_pad_right_ms = 0

@ -0,0 +1,23 @@
#!/bin/bash
set -e
conf=conf
data=data
exp=exp
. utils/parse_options.sh
mkdir -p $exp
ckpt_dir=$data/silero_vad
model=$ckpt_dir/silero_vad.onnx
test_wav=$data/silero_vad_sample.wav
conf_file=$conf/vad.ini
vad_nnet_main $model $test_wav
echo "vad_nnet_main done!"
vad_interface_main $conf_file $test_wav
echo "vad_interface_main done!"

@ -0,0 +1,17 @@
# This contains the locations of binaries built and required for running the examples.
unset GREP_OPTIONS
ENGINE_ROOT=$PWD/../../
ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/vad
ENGINE_TOOLS=$ENGINE_ROOT/tools
TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
[ -d $ENGINE_BUILD ] || { echo "Error: 'build/runtime' directory not found. Please ensure that the project builds successfully"; }
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/interface
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH

@ -15,8 +15,8 @@ exp=exp
mkdir -p $exp $data
# 1. compile
if [ ! -d ${SPEECHX_BUILD} ]; then
pushd ${SPEECHX_ROOT}
if [ ! -d ${ENGINE_BUILD} ]; then
pushd ${ENGINE_ROOT}
bash build.sh
# build for android armv8/armv7
@ -24,8 +24,6 @@ if [ ! -d ${SPEECHX_BUILD} ]; then
popd
fi
ckpt_dir=$data/silero_vad
wav=$data/silero_vad_sample.wav
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
./local/download.sh