add timer; compute vad rtf; vad add beam param

pull/3026/head
Hui Zhang 3 years ago
parent 27ae3482d4
commit 040271ede7

@ -14,7 +14,7 @@ TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake
# Create build directory
BUILD_ROOT=build/Android
BUILD_DIR=${BUILD_ROOT}/${ANDROID_ABI}-api-21
#FASDEPLOY_INSTALL_DIR="${BUILD_DIR}/install"
FASDEPLOY_INSTALL_DIR="/workspace/zhanghui/paddle/FastDeploy/build/Android/arm64-v8a-api-21/install"
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
@ -29,8 +29,8 @@ cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \
-DANDROID_TOOLCHAIN=${ANDROID_TOOLCHAIN} \
-DWITH_ASR=OFF \
-DWITH_CLS=OFF \
-DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \
-Wno-dev ../../..
#-DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR} \
# Build FastDeploy Android C++ SDK
make

@ -12,4 +12,4 @@ ${CMAKE_CURRENT_SOURCE_DIR}/frontend
add_subdirectory(frontend)
add_library(common INTERFACE)
add_definitions(common base utils kaldi-matrix frontend)
target_link_libraries(common INTERFACE base utils kaldi-matrix frontend)

@ -51,3 +51,4 @@
#include "base/macros.h"
#include "utils/file_utils.h"
#include "utils/math.h"
#include "utils/timer.h"

@ -5,6 +5,7 @@ set(csrc
math.cc
strings.cc
audio_process.cc
timer.cc
)
add_library(utils ${csrc})

@ -0,0 +1,63 @@
// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu)
// Mobvoi Inc. (authors: Fangjun Kuang)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include "common/utils/timer.h"
namespace ppspeech{
// Abstract stopwatch backend used by Timer. It is owned through a
// std::unique_ptr<TimerImpl>, hence the virtual destructor.
struct TimerImpl {
    TimerImpl() = default;
    virtual ~TimerImpl() = default;

    // Restarts the measurement from the current instant.
    virtual void Reset() = 0;

    // Returns the time in seconds since construction or the last Reset().
    virtual double Elapsed() = 0;
};

// Stopwatch backend built on std::chrono.
class CpuTimerImpl : public TimerImpl {
  public:
    // steady_clock is monotonic, so an interval can never come out negative
    // or be distorted by a system clock adjustment. high_resolution_clock
    // gives no such guarantee (it may be an alias of system_clock).
    using Clock = std::chrono::steady_clock;

    // Measurement starts as soon as the object is created.
    CpuTimerImpl() { Reset(); }

    // Records "now" as the new starting point.
    void Reset() override { begin_ = Clock::now(); }

    // Returns the seconds elapsed since the starting point, at the clock's
    // native resolution (no rounding to whole microseconds).
    double Elapsed() override {
        const std::chrono::duration<double> seconds = Clock::now() - begin_;
        return seconds.count();
    }

  private:
    Clock::time_point begin_;  // starting point of the current measurement
};
// Creates a timer backed by CpuTimerImpl, whose constructor calls Reset(),
// so the measurement is already running when this returns.
Timer::Timer() : impl_(std::make_unique<CpuTimerImpl>()) {}

// Defaulted in this translation unit, where TimerImpl is a complete type.
Timer::~Timer() = default;

// Restarts the measurement; forwards to the backend.
void Timer::Reset() const {
    impl_->Reset();
}

// Returns the elapsed time in seconds; forwards to the backend.
double Timer::Elapsed() const {
    return impl_->Elapsed();
}
} //namespace ppspeech

@ -0,0 +1,39 @@
// Copyright 2020 Xiaomi Corporation (authors: Haowen Qiu)
// Mobvoi Inc. (authors: Fangjun Kuang)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
namespace ppspeech {

// Backend interface; only forward-declared in this header.
struct TimerImpl;

// Stopwatch that reports elapsed time in seconds. The implementation is
// hidden behind a pointer to TimerImpl.
class Timer {
  public:
    Timer();
    ~Timer();

    // The std::unique_ptr member already makes Timer non-copyable; the
    // deleted operations below only spell that out.
    Timer(const Timer&) = delete;
    Timer& operator=(const Timer&) = delete;

    // Restarts the measurement. Declared const: it modifies the object
    // impl_ points to, not the Timer itself.
    void Reset() const;

    // Returns the elapsed time in seconds.
    double Elapsed() const;

  private:
    std::unique_ptr<TimerImpl> impl_;
};

}  // namespace ppspeech

@ -3,12 +3,12 @@ set(srcs
)
add_library(pps_vad_interface ${srcs})
target_link_libraries(pps_vad_interface PUBLIC ${FASTDEPLOY_LIBS} pps_vad)
target_link_libraries(pps_vad_interface PUBLIC pps_vad)
set(bin_name vad_interface_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} pps_vad_interface)
target_link_libraries(${bin_name} pps_vad_interface)
# set_target_properties(${bin_name} PROPERTIES PUBLIC_HEADER "vad_interface.h;../frontend/wav.h")

@ -25,6 +25,7 @@ PPSHandle_t PPSVadCreateInstance(const char* conf_path) {
nnet_conf.sr = conf.Read("sr", 16000);
nnet_conf.frame_ms = conf.Read("frame_ms", 32);
nnet_conf.threshold = conf.Read("threshold", 0.45f);
nnet_conf.beam = conf.Read("beam", 0.15f);
nnet_conf.min_silence_duration_ms =
conf.Read("min_silence_duration_ms", 200);
nnet_conf.speech_pad_left_ms = conf.Read("speech_pad_left_ms", 0);

@ -3,7 +3,7 @@ set(srcs
)
add_library(pps_vad_jni_interface ${srcs})
target_link_libraries(pps_vad_jni_interface PUBLIC ${FASTDEPLOY_LIBS} pps_vad_interface)
target_link_libraries(pps_vad_jni_interface PUBLIC pps_vad_interface)
file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR})

@ -14,7 +14,7 @@
#include "vad/jni/vad_jni_interface.h"
JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_PPSVadJni_createInstance(
JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_vadjni_createInstance(
JNIEnv* env, jobject thiz, jstring conf_path) {
const char* path = env->GetStringUTFChars(conf_path, JNI_FALSE);
PPSHandle_t handle = PPSVadCreateInstance(path);
@ -22,26 +22,26 @@ JNIEXPORT jlong JNICALL Java_com_baidu_paddlespeech_PPSVadJni_createInstance(
return (jlong)(handle);
}
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_destoryInstance(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_destoryInstance(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance) {
PPSHandle_t handle = (PPSHandle_t)(instance);
return (jint)PPSVadDestroyInstance(handle);
}
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_reset(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_reset(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance) {
PPSHandle_t handle = (PPSHandle_t)(instance);
return (jint)PPSVadReset(handle);
}
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_chunkSizeSamples(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance) {
PPSHandle_t handle = (PPSHandle_t)(instance);
return (jint)PPSVadChunkSizeSamples(handle);
}
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_feedForward(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_feedForward(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance, jfloatArray chunk) {
PPSHandle_t handle = (PPSHandle_t)(instance);
jsize num_elms = env->GetArrayLength(chunk);

@ -14,7 +14,7 @@
// PackageName: com.baidu.paddlespeech
// ClassName: PPSVadJni
// ClassName: vadjni
#include <jni.h>
#include "vad/interface/vad_interface.h"
@ -26,18 +26,18 @@ extern "C" {
typedef jlong PPSJniHandle_t;
JNIEXPORT PPSJniHandle_t JNICALL
Java_com_baidu_paddlespeech_PPSVadJni_createInstance(JNIEnv* env,
jobject thiz,
jstring conf_path);
Java_com_baidu_paddlespeech_vadjni_createInstance(JNIEnv* env,
jobject thiz,
jstring conf_path);
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_destoryInstance(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_destoryInstance(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance);
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_reset(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_reset(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance);
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_chunkSizeSamples(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance);
// typedef enum {
@ -49,7 +49,7 @@ JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_chunkSizeSamples(
// PPS_VAD_NUMSTATES, // number of states
// } PPSVadState_t;
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_PPSVadJni_feedForward(
JNIEXPORT jint JNICALL Java_com_baidu_paddlespeech_vadjni_feedForward(
JNIEnv* env, jobject thiz, PPSJniHandle_t instance, jfloatArray chunk);
#ifdef __cplusplus

@ -9,7 +9,7 @@ target_link_libraries(pps_vad PUBLIC ${FASTDEPLOY_LIBS} common)
set(bin_name vad_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
# target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} vad gflags extern_glog)
target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} pps_vad)
target_link_libraries(${bin_name} pps_vad)
file(RELATIVE_PATH DEST_DIR ${ENGINE_ROOT} ${CMAKE_CURRENT_SOURCE_DIR})

@ -62,6 +62,7 @@ void Vad::SetConfig(const VadNnetConf conf) {
SetConfig(conf.sr,
conf.frame_ms,
conf.threshold,
conf.beam,
conf.min_silence_duration_ms,
conf.speech_pad_left_ms,
conf.speech_pad_right_ms);
@ -70,6 +71,7 @@ void Vad::SetConfig(const VadNnetConf conf) {
void Vad::SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const float& beam,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms) {
@ -81,6 +83,7 @@ void Vad::SetConfig(const int& sr,
sample_rate_ = sr;
sr_per_ms_ = sr / 1000;
threshold_ = threshold;
beam_ = beam;
frame_ms_ = frame_ms;
min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
@ -90,8 +93,8 @@ void Vad::SetConfig(const int& sr,
window_size_samples_ = frame_ms * sr_per_ms_;
current_chunk_size_ = window_size_samples_;
fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold
<< " frame_ms=" << frame_ms
fastdeploy::FDINFO << "sr=" << sr_per_ms_ << " threshold=" << threshold_
<< " beam=" << beam_ << " frame_ms=" << frame_ms_
<< " min_silence_duration_ms=" << min_silence_duration_ms
<< " speech_pad_left_ms=" << speech_pad_left_ms
<< " speech_pad_right_ms=" << speech_pad_right_ms;
@ -194,7 +197,7 @@ const Vad::State& Vad::Postprocess() {
LOG_DEBUG << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }";
states_.emplace_back(Vad::State::START);
} else if (outputProb_ >= threshold_ - 0.15 && triggerd_) {
} else if (outputProb_ >= threshold_ - beam_ && triggerd_) {
// 3. Continue
if (temp_end_ != 0) {
@ -211,7 +214,7 @@ const Vad::State& Vad::Postprocess() {
}
states_.emplace_back(Vad::State::SPEECH);
} else if (outputProb_ < threshold_ - 0.15 && triggerd_) {
} else if (outputProb_ < threshold_ - beam_ && triggerd_) {
// 4. End
if (temp_end_ == 0) {
temp_end_ = current_sample_;

@ -29,6 +29,7 @@ struct VadNnetConf {
int sr;
int frame_ms;
float threshold;
float beam;
int min_silence_duration_ms;
int speech_pad_left_ms;
int speech_pad_right_ms;
@ -59,6 +60,7 @@ class Vad : public fastdeploy::FastDeployModel {
void SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const float& beam,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms);
@ -130,6 +132,7 @@ class Vad : public fastdeploy::FastDeployModel {
int sample_rate_ = 16000;
int frame_ms_ = 32; // 32, 64, 96 for 16k
float threshold_ = 0.5f;
float beam_ = 0.15f;
int64_t window_size_samples_; // support 256 512 768 for 8k; 512 1024 1536
// for 16k.

@ -13,6 +13,7 @@
// limitations under the License.
#include "common/base/common.h"
#include "vad/nnet/vad.h"
int main(int argc, char* argv[]) {
@ -30,7 +31,7 @@ int main(int argc, char* argv[]) {
int sr = 16000;
ppspeech::Vad vad(model_file);
// custom config, but must be set before init
vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
vad.SetConfig(sr, 32, 0.5f, 0.15, 200, 0, 0);
vad.Init();
std::vector<float> inputWav; // [0, 1]
@ -44,6 +45,7 @@ int main(int argc, char* argv[]) {
inputWav[i] = wav_reader.data()[i] / 32768;
}
ppspeech::Timer timer;
int window_size_samples = vad.WindowSizeSamples();
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
auto start = j;
@ -66,6 +68,9 @@ int main(int argc, char* argv[]) {
}
std::cout << std::endl;
std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
<< std::endl;
std::vector<std::map<std::string, float>> result = vad.GetResult();
for (auto& res : result) {
std::cout << "speak start: " << res["start"]

@ -3,7 +3,7 @@
unset GREP_OPTIONS
ENGINE_ROOT=$PWD/../../../
ENGINE_BUILD=$ENGINE_ROOT/build/engine/asr
ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/asr
ENGINE_TOOLS=$ENGINE_ROOT/tools
TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin

@ -4,7 +4,8 @@ model_path=./data/silero_vad/silero_vad.onnx
[vad]
sr = 16000 # 16k
frame_ms = 32 # 32, 64, 96 for 16k
threshold = 0.45
threshold = 0.5
beam = 0.15
min_silence_duration_ms = 200
speech_pad_left_ms = 200
speech_pad_left_ms = 0
speech_pad_right_ms = 0

@ -3,7 +3,7 @@
unset GREP_OPTIONS
ENGINE_ROOT=$PWD/../../
ENGINE_BUILD=$ENGINE_ROOT/build/engine/vad
ENGINE_BUILD=$ENGINE_ROOT/build/Linux/x86_64/engine/vad
ENGINE_TOOLS=$ENGINE_ROOT/tools
TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin

Loading…
Cancel
Save