From 040271ede7b338a29faa337d659c89f7d532291c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 9 Mar 2023 04:45:05 +0000 Subject: [PATCH] add timer; compute vad rtf; vad add beam param --- runtime/build_android.sh | 4 +- runtime/engine/common/CMakeLists.txt | 2 +- runtime/engine/common/base/common.h | 3 +- runtime/engine/common/utils/CMakeLists.txt | 1 + runtime/engine/common/utils/timer.cc | 63 +++++++++++++++++++ runtime/engine/common/utils/timer.h | 39 ++++++++++++ runtime/engine/vad/interface/CMakeLists.txt | 4 +- runtime/engine/vad/interface/vad_interface.cc | 1 + runtime/engine/vad/jni/CMakeLists.txt | 2 +- runtime/engine/vad/jni/vad_jni_interface.cc | 10 +-- runtime/engine/vad/jni/vad_jni_interface.h | 16 ++--- runtime/engine/vad/nnet/CMakeLists.txt | 2 +- runtime/engine/vad/nnet/vad.cc | 11 ++-- runtime/engine/vad/nnet/vad.h | 3 + runtime/engine/vad/nnet/vad_nnet_main.cc | 7 ++- runtime/examples/u2pp_ol/wenetspeech/path.sh | 2 +- runtime/examples/vad/conf/vad.ini | 5 +- runtime/examples/vad/path.sh | 2 +- 18 files changed, 147 insertions(+), 30 deletions(-) create mode 100644 runtime/engine/common/utils/timer.cc create mode 100644 runtime/engine/common/utils/timer.h diff --git a/runtime/build_android.sh b/runtime/build_android.sh index 41ae7c611..29ea2df0c 100755 --- a/runtime/build_android.sh +++ b/runtime/build_android.sh @@ -14,7 +14,7 @@ TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake # Create build directory BUILD_ROOT=build/Android BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-21 -#FASDEPLOY_INSTALL_DIR="${BUILD_DIR}/install" +FASTDEPLOY_INSTALL_DIR="${FASTDEPLOY_INSTALL_DIR:-/workspace/zhanghui/paddle/FastDeploy/build/Android/arm64-v8a-api-21/install}" mkdir -p ${BUILD_DIR} cd ${BUILD_DIR} @@ -29,8 +29,8 @@ cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ -DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \ -DWITH_ASR=OFF \ -DWITH_CLS=OFF \ + -DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \ -Wno-dev ../../..
- #-DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \ # Build FastDeploy Android C++ SDK make diff --git a/runtime/engine/common/CMakeLists.txt b/runtime/engine/common/CMakeLists.txt index 4c2235b14..495921b2b 100644 --- a/runtime/engine/common/CMakeLists.txt +++ b/runtime/engine/common/CMakeLists.txt @@ -12,4 +12,4 @@ ${CMAKE_CURRENT_SOURCE_DIR}/frontend add_subdirectory(frontend) add_library(common INTERFACE) -add_definitions(common base utils kaldi-matrix frontend) \ No newline at end of file +target_link_libraries(common INTERFACE base utils kaldi-matrix frontend) \ No newline at end of file diff --git a/runtime/engine/common/base/common.h b/runtime/engine/common/base/common.h index 17560102e..b31fc53e0 100644 --- a/runtime/engine/common/base/common.h +++ b/runtime/engine/common/base/common.h @@ -50,4 +50,5 @@ #include "base/log.h" #include "base/macros.h" #include "utils/file_utils.h" -#include "utils/math.h" \ No newline at end of file +#include "utils/math.h" +#include "utils/timer.h" \ No newline at end of file diff --git a/runtime/engine/common/utils/CMakeLists.txt b/runtime/engine/common/utils/CMakeLists.txt index eb3c71979..14733648c 100644 --- a/runtime/engine/common/utils/CMakeLists.txt +++ b/runtime/engine/common/utils/CMakeLists.txt @@ -5,6 +5,7 @@ set(csrc math.cc strings.cc audio_process.cc + timer.cc ) add_library(utils ${csrc}) diff --git a/runtime/engine/common/utils/timer.cc b/runtime/engine/common/utils/timer.cc new file mode 100644 index 000000000..ff43cd04c --- /dev/null +++ b/runtime/engine/common/utils/timer.cc @@ -0,0 +1,63 @@ +// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu) +// Mobvoi Inc. (authors: Fangjun Kuang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include <chrono> + +#include "common/utils/timer.h" + +namespace ppspeech{ + +struct TimerImpl{ + TimerImpl() = default; + virtual ~TimerImpl() = default; + virtual void Reset() = 0; + // time in seconds + virtual double Elapsed() = 0; +}; + +class CpuTimerImpl : public TimerImpl { + public: + CpuTimerImpl() { Reset(); } + + using high_resolution_clock = std::chrono::high_resolution_clock; + + void Reset() override { begin_ = high_resolution_clock::now(); } + + // time in seconds + double Elapsed() override { + auto end = high_resolution_clock::now(); + auto dur = + std::chrono::duration_cast<std::chrono::microseconds>(end - begin_); + return dur.count() / 1000000.0; // microseconds -> seconds + } + + private: + high_resolution_clock::time_point begin_; +}; + +Timer::Timer() { + impl_ = std::make_unique<CpuTimerImpl>(); +} + +Timer::~Timer() = default; + +void Timer::Reset() const { impl_->Reset(); } + +double Timer::Elapsed() const { return impl_->Elapsed(); } + + +} //namespace ppspeech \ No newline at end of file diff --git a/runtime/engine/common/utils/timer.h b/runtime/engine/common/utils/timer.h new file mode 100644 index 000000000..6f4ae1f8d --- /dev/null +++ b/runtime/engine/common/utils/timer.h @@ -0,0 +1,39 @@ +// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu) +// Mobvoi Inc. (authors: Fangjun Kuang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <memory> + +namespace ppspeech { + +struct TimerImpl; + +class Timer { + public: + Timer(); + ~Timer(); + + void Reset() const; + + // time in seconds + double Elapsed() const; + + private: + std::unique_ptr<TimerImpl> impl_; +}; + +} //namespace ppspeech \ No newline at end of file diff --git a/runtime/engine/vad/interface/CMakeLists.txt b/runtime/engine/vad/interface/CMakeLists.txt index 838296df5..4c3f56740 100644 --- a/runtime/engine/vad/interface/CMakeLists.txt +++ b/runtime/engine/vad/interface/CMakeLists.txt @@ -3,12 +3,12 @@ set(srcs ) add_library(pps_vad_interface ${srcs}) -target_link_libraries(pps_vad_interface PUBLIC ${FASTDEPLOY_LIBS} pps_vad) +target_link_libraries(pps_vad_interface PUBLIC pps_vad) set(bin_name vad_interface_main) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} pps_vad_interface) +target_link_libraries(${bin_name} pps_vad_interface) # set_target_properties(${bin_name} PROPERTIES PUBLIC_HEADER "vad_interface.h;../frontend/wav.h") diff --git a/runtime/engine/vad/interface/vad_interface.cc b/runtime/engine/vad/interface/vad_interface.cc index 89e25f1bf..4c3877ff0 100644 --- a/runtime/engine/vad/interface/vad_interface.cc +++ b/runtime/engine/vad/interface/vad_interface.cc @@ -25,6 +25,7 @@ PPSHandle_t PPSVadCreateInstance(const char* conf_path) { nnet_conf.sr = conf.Read("sr", 16000); nnet_conf.frame_ms = conf.Read("frame_ms", 32); nnet_conf.threshold = conf.Read("threshold", 0.45f); + nnet_conf.beam = conf.Read("beam", 0.15f);
nnet_conf.min_silence_duration_ms = conf.Read("min_silence_duration_ms", 200); nnet_conf.speech_pad_left_ms = conf.Read("speech_pad_left_ms", 0); diff --git a/runtime/engine/vad/jni/CMakeLists.txt b/runtime/engine/vad/jni/CMakeLists.txt index 6fdfb1e1f..dc88451fa 100644 --- a/runtime/engine/vad/jni/CMakeLists.txt +++ b/runtime/engine/vad/jni/CMakeLists.txt @@ -3,7 +3,7 @@ set(srcs ) add_library(pps_vad_jni_interface ${srcs}) -target_link_libraries(pps_vad_jni_interface PUBLIC ${FASTDEPLOY_LIBS} pps_vad_interface) +target_link_libraries(pps_vad_jni_interface PUBLIC pps_vad_interface) file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/runtime/engine/vad/jni/vad_jni_interface.cc b/runtime/engine/vad/jni/vad_jni_interface.cc index 6ba63e35f..805c38f77 100644 --- a/runtime/engine/vad/jni/vad_jni_interface.cc +++ b/runtime/engine/vad/jni/vad_jni_interface.cc @@ -14,7 +14,7 @@ #include "vad/jni/vad_jni_interface.h" -JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_PPSVadJni_createInstance( +JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_vadjni_createInstance( JNIEnv* env, jobject thiz, jstring conf_path) { const char* path = env->GetStringUTFChars(conf_path, JNI_FALSE); PPSHandle_t handle = PPSVadCreateInstance(path); @@ -22,26 +22,26 @@ JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_PPSVadJni_createInstance( return (jlong)(handle); } -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_destoryInstance( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_destoryInstance( JNIEnv* env, jobject thiz, PPSJniHandle_t instance) { PPSHandle_t handle = (PPSHandle_t)(instance); return (jint)PPSVadDestroyInstance(handle); } -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_reset( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_reset( JNIEnv* env, jobject thiz, PPSJniHandle_t instance) { PPSHandle_t handle = (PPSHandle_t)(instance); return (jint)PPSVadReset(handle); } -JNIEXPORT jint JNICALL 
Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_chunkSizeSamples( JNIEnv* env, jobject thiz, PPSJniHandle_t instance) { PPSHandle_t handle = (PPSHandle_t)(instance); return (jint)PPSVadChunkSizeSamples(handle); } -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_feedForward( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_feedForward( JNIEnv* env, jobject thiz, PPSJniHandle_t instance, jfloatArray chunk) { PPSHandle_t handle = (PPSHandle_t)(instance); jsize num_elms = env->GetArrayLength(chunk); diff --git a/runtime/engine/vad/jni/vad_jni_interface.h b/runtime/engine/vad/jni/vad_jni_interface.h index 2bf976570..5564f05e8 100644 --- a/runtime/engine/vad/jni/vad_jni_interface.h +++ b/runtime/engine/vad/jni/vad_jni_interface.h @@ -14,7 +14,7 @@ // PackageName: paddlespeech.baidu.com -// ClassName: PPSVadJni +// ClassName: vadjni #include #include "vad/interface/vad_interface.h" @@ -26,18 +26,18 @@ extern "C" { typedef jlong PPSJniHandle_t; JNIEXPORT PPSJniHandle_t JNICALL -Java_com_baidu_paddlespeech_PPSVadJni_createInstance(JNIEnv* env, - jobject thiz, - jstring conf_path); +Java_com_baidu_paddlespeech_vadjni_createInstance(JNIEnv* env, + jobject thiz, + jstring conf_path); -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_destoryInstance( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_destoryInstance( JNIEnv* env, jobject thiz, PPSJniHandle_t instance); -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_reset( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_reset( JNIEnv* env, jobject thiz, PPSJniHandle_t instance); -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_chunkSizeSamples( JNIEnv* env, jobject thiz, PPSJniHandle_t instance); // typedef enum { @@ -49,7 +49,7 @@ JNIEXPORT jint JNICALL 
Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples( // PPS_VAD_NUMSTATES, // number of states // } PPSVadState_t; -JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_feedForward( +JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_feedForward( JNIEnv* env, jobject thiz, PPSJniHandle_t instance, jfloatArray chunk); #ifdef __cplusplus diff --git a/runtime/engine/vad/nnet/CMakeLists.txt b/runtime/engine/vad/nnet/CMakeLists.txt index ffcb536a7..7195c21bd 100644 --- a/runtime/engine/vad/nnet/CMakeLists.txt +++ b/runtime/engine/vad/nnet/CMakeLists.txt @@ -9,7 +9,7 @@ target_link_libraries(pps_vad PUBLIC ${FASTDEPLOY_LIBS} common) set(bin_name vad_nnet_main) add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) # target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} vad gflags extern_glog) -target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} pps_vad) +target_link_libraries(${bin_name} pps_vad) file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/runtime/engine/vad/nnet/vad.cc b/runtime/engine/vad/nnet/vad.cc index fed538402..efb2a2bf4 100644 --- a/runtime/engine/vad/nnet/vad.cc +++ b/runtime/engine/vad/nnet/vad.cc @@ -62,6 +62,7 @@ void Vad::SetConfig(const VadNnetConf conf) { SetConfig(conf.sr, conf.frame_ms, conf.threshold, + conf.beam, conf.min_silence_duration_ms, conf.speech_pad_left_ms, conf.speech_pad_right_ms); @@ -70,6 +71,7 @@ void Vad::SetConfig(const VadNnetConf conf) { void Vad::SetConfig(const int& sr, const int& frame_ms, const float& threshold, + const float& beam, const int& min_silence_duration_ms, const int& speech_pad_left_ms, const int& speech_pad_right_ms) { @@ -81,6 +83,7 @@ void Vad::SetConfig(const int& sr, sample_rate_ = sr; sr_per_ms_ = sr / 1000; threshold_ = threshold; + beam_ = beam; frame_ms_ = frame_ms; min_silence_samples_ = min_silence_duration_ms * sr_per_ms_; speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_; @@ -90,8 +93,8 @@ void Vad::SetConfig(const int& 
sr, window_size_samples_ = frame_ms * sr_per_ms_; current_chunk_size_ = window_size_samples_; - fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold - << " frame_ms=" << frame_ms + fastdeploy::FDINFO << "sr=" << sample_rate_ << " threshold=" << threshold_ + << " beam=" << beam_ << " frame_ms=" << frame_ms_ << " min_silence_duration_ms=" << min_silence_duration_ms << " speech_pad_left_ms=" << speech_pad_left_ms << " speech_pad_right_ms=" << speech_pad_right_ms; @@ -194,7 +197,7 @@ const Vad::State& Vad::Postprocess() { LOG_DEBUG << "{ speech start: " << start_sec << " s; prob: " << outputProb_ << " }"; states_.emplace_back(Vad::State::START); - } else if (outputProb_ >= threshold_ - 0.15 && triggerd_) { + } else if (outputProb_ >= threshold_ - beam_ && triggerd_) { // 3. Continue if (temp_end_ != 0) { @@ -211,7 +214,7 @@ const Vad::State& Vad::Postprocess() { } states_.emplace_back(Vad::State::SPEECH); - } else if (outputProb_ < threshold_ - 0.15 && triggerd_) { + } else if (outputProb_ < threshold_ - beam_ && triggerd_) { // 4. End if (temp_end_ == 0) { temp_end_ = current_sample_; diff --git a/runtime/engine/vad/nnet/vad.h b/runtime/engine/vad/nnet/vad.h index 7c322388a..de557ec67 100644 --- a/runtime/engine/vad/nnet/vad.h +++ b/runtime/engine/vad/nnet/vad.h @@ -29,6 +29,7 @@ struct VadNnetConf { int sr; int frame_ms; float threshold; + float beam; int min_silence_duration_ms; int speech_pad_left_ms; int speech_pad_right_ms; @@ -59,6 +60,7 @@ class Vad : public fastdeploy::FastDeployModel { void SetConfig(const int& sr, const int& frame_ms, const float& threshold, + const float& beam, const int& min_silence_duration_ms, const int& speech_pad_left_ms, const int& speech_pad_right_ms); @@ -130,6 +132,7 @@ class Vad : public fastdeploy::FastDeployModel { int sample_rate_ = 16000; int frame_ms_ = 32; // 32, 64, 96 for 16k float threshold_ = 0.5f; + float beam_ = 0.15f; int64_t window_size_samples_; // support 256 512 768 for 8k; 512 1024 1536 // for 16k.
diff --git a/runtime/engine/vad/nnet/vad_nnet_main.cc b/runtime/engine/vad/nnet/vad_nnet_main.cc index 4615aa956..7b89d1af3 100644 --- a/runtime/engine/vad/nnet/vad_nnet_main.cc +++ b/runtime/engine/vad/nnet/vad_nnet_main.cc @@ -13,6 +13,7 @@ // limitations under the License. +#include "common/base/common.h" #include "vad/nnet/vad.h" int main(int argc, char* argv[]) { @@ -30,7 +31,7 @@ int main(int argc, char* argv[]) { int sr = 16000; ppspeech::Vad vad(model_file); // custom config, but must be set before init - vad.SetConfig(sr, 32, 0.45f, 200, 0, 0); + vad.SetConfig(sr, 32, 0.5f, 0.15f, 200, 0, 0); vad.Init(); std::vector inputWav; // [0, 1] @@ -44,6 +45,7 @@ int main(int argc, char* argv[]) { inputWav[i] = wav_reader.data()[i] / 32768; } + ppspeech::Timer timer; int window_size_samples = vad.WindowSizeSamples(); for (int64_t j = 0; j < num_samples; j += window_size_samples) { auto start = j; @@ -66,6 +68,9 @@ int main(int argc, char* argv[]) { } std::cout << std::endl; + std::cout << "RTF=" << timer.Elapsed() / (static_cast<double>(num_samples) / sr) + << std::endl; + std::vector> result = vad.GetResult(); for (auto& res : result) { std::cout << "speak start: " << res["start"] diff --git a/runtime/examples/u2pp_ol/wenetspeech/path.sh b/runtime/examples/u2pp_ol/wenetspeech/path.sh index ad3a73584..544e2048b 100644 --- a/runtime/examples/u2pp_ol/wenetspeech/path.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/path.sh @@ -3,7 +3,7 @@ unset GREP_OPTIONS ENGINE_ROOT=$PWD/../../../ -ENGINE_BUILD=$ENGINE_ROOT/build/engine/asr +ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/asr ENGINE_TOOLS=$ENGINE_ROOT/tools TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin diff --git a/runtime/examples/vad/conf/vad.ini b/runtime/examples/vad/conf/vad.ini index 94742ff5b..c168c73b5 100644 --- a/runtime/examples/vad/conf/vad.ini +++ b/runtime/examples/vad/conf/vad.ini @@ -4,7 +4,8 @@ model_path=./data/silero_vad/silero_vad.onnx [vad] sr = 16000 # 16k frame_ms = 32 # 32, 64, 96 for 16k -threshold = 0.45
+threshold = 0.5 +beam = 0.15 min_silence_duration_ms = 200 -speech_pad_left_ms = 200 +speech_pad_left_ms = 0 speech_pad_right_ms = 0 diff --git a/runtime/examples/vad/path.sh b/runtime/examples/vad/path.sh index 3ed85dc1e..b49911113 100644 --- a/runtime/examples/vad/path.sh +++ b/runtime/examples/vad/path.sh @@ -3,7 +3,7 @@ unset GREP_OPTIONS ENGINE_ROOT=$PWD/../../ -ENGINE_BUILD=$ENGINE_ROOT/build/engine/vad +ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/vad ENGINE_TOOLS=$ENGINE_ROOT/tools TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin