refactor vad, add vad conf, vad interface, vad recipe

pull/3026/head
Hui Zhang 3 years ago
parent dd081cd6b1
commit 77a3ceaa08

@ -42,6 +42,9 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall
###############################################################################
# Option Configurations
###############################################################################
# https://github.com/google/brotli/pull/655
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
option(WITH_ASR "build asr" ON)
option(WITH_CLS "build cls" ON)
option(WITH_VAD "build vad" ON)

@ -4,5 +4,5 @@ set -xe
# the build script had verified in the paddlepaddle docker image.
# please follow the instruction below to install PaddlePaddle image.
# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
cmake -B build -DWITH_ASR=ON -DWITH_CLS=OFF -DWITH_VAD=OFF
cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON -DFASTDEPLOY_INSTALL_DIR=/workspace/zhanghui/paddle/FastDeploy/build/Android/arm64-v8a-api-21/install
cmake --build build -j

@ -20,7 +20,9 @@ endif()
if(ANDROID)
set(FASTDEPLOY_INSTALL_DIR ${FASTDEPLOY_DIR}/android-armv7v8)
if(NOT DEFINED FASTDEPLOY_INSTALL_DIR)
set(FASTDEPLOY_INSTALL_DIR ${FASTDEPLOY_DIR}/android-armv7v8)
endif()
add_definitions("-DUSE_PADDLE_LITE_BAKEND")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g0 -O3 -mfloat-abi=softfp -mfpu=vfpv3 -mfpu=neon -fPIC -pie -fPIE")

@ -10,9 +10,19 @@ include(FetchContent)
#Application of Automata, (CIAA 2007), volume 4783 of Lecture Notes in
#Computer Science, pages 11-23. Springer, 2007. http://www.openfst.org.
# Shared ExternalProject logging switches. Each enabled LOG_<STEP> option wraps
# that step (download, update, configure, build, test, install) in a script so
# its output is logged instead of being printed inline.
set(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD 1
    LOG_UPDATE 1
    LOG_CONFIGURE 1
    LOG_BUILD 1
    LOG_TEST 1
    LOG_INSTALL 1
)
ExternalProject_Add(openfst
URL https://paddleaudio.bj.bcebos.com/build/openfst_1.7.2.zip
URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${openfst_PREFIX_DIR}
SOURCE_DIR ${openfst_SOURCE_DIR}
BINARY_DIR ${openfst_BINARY_DIR}

@ -3,7 +3,7 @@ set(srcs
panns_interface.cc
)
add_library(cls SHARED ${srcs})
add_library(cls ${srcs})
target_link_libraries(cls INTERFACE -static-libstdc++;-Wl,-Bsymbolic ${FASTDEPLOY_LIBS} kaldi-matrix kaldi-base frontend utils )
set(bin_name panns_nnet_main)

@ -12,4 +12,4 @@ ${CMAKE_CURRENT_SOURCE_DIR}/frontend
add_subdirectory(frontend)
# `common` is a header-less umbrella target: linking against it pulls in the
# base/utils/kaldi-matrix/frontend libraries.
add_library(common INTERFACE)
# The previous `add_definitions(common base ...)` call passed the library names
# as compiler definitions/options instead of attaching them to the target, so
# consumers of `common` received no link dependencies at all.
target_link_libraries(common INTERFACE base utils kaldi-matrix frontend)

@ -10,7 +10,10 @@ using namespace std;
#pragma once
#ifdef _MSC_VER
#pragma region ParseIniFile
#endif
/*
* \brief Generic configuration Class
*
@ -335,4 +338,6 @@ void Config::ReadFile(string filename, string delimiter, string comment) {
in >> (*this);
}
#ifdef _MSC_VER
#pragma endregion ParseIniFIle
#endif

@ -1,5 +1,7 @@
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/../
)
add_subdirectory(nnet)
set(bin_name silero_vad_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc vad.cc)
target_link_libraries(${bin_name} ${FASTDEPLOY_LIBS} gflags extern_glog)
add_subdirectory(interface)

@ -18,6 +18,7 @@
#include <stdlib.h>
#include <string.h>
#include <string>
#include <iostream>
namespace wav {

@ -0,0 +1,11 @@
# C API wrapper library around the VAD model.
set(srcs
    vad_interface.cc
)

add_library(vad_interface ${srcs})
# vad_interface.cc calls into `vad` (and therefore FastDeploy) directly, so the
# dependency must apply to the library itself as well as to its consumers.
# With INTERFACE only, a shared build (BUILD_SHARED_LIBS=ON) would leave the
# library with unresolved symbols.
target_link_libraries(vad_interface PUBLIC ${FASTDEPLOY_LIBS} vad)

# Command-line demo that drives the C API.
set(bin_name vad_interface_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} PRIVATE ${FASTDEPLOY_LIBS} vad_interface)

@ -0,0 +1,92 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad/interface/vad_interface.h"

#include <exception>
#include <memory>
#include <vector>

#include "common/base/log.h"
#include "common/base/config.h"
#include "vad/nnet/vad.h"
// Creates a VAD model instance configured from the file at `conf_path`.
//
// Keys missing from the config fall back to the defaults passed to
// `conf.Read` below. Only `model_path` is handed to the model constructor;
// `param_path` and `num_cpu_thread` are read into the conf struct but are not
// consumed by the six-argument Vad::SetConfig overload.
//
// Returns an opaque handle owned by the caller (release it with
// PPSVadDestroyInstance), or nullptr when `conf_path` is null or when
// configuration/model construction throws. Exceptions are never allowed to
// cross this C boundary.
PPSHandle_t PPSVadCreateInstance(const char* conf_path) {
    if (conf_path == nullptr) {
        LOG(ERROR) << "conf_path is null";
        return nullptr;
    }

    try {
        Config conf(conf_path);

        ppspeech::VadNnetConf nnet_conf;
        nnet_conf.sr = conf.Read("sr", 16000);
        nnet_conf.frame_ms = conf.Read("frame_ms", 32);
        nnet_conf.threshold = conf.Read("threshold", 0.45f);
        nnet_conf.min_silence_duration_ms =
            conf.Read("min_silence_duration_ms", 200);
        nnet_conf.speech_pad_left_ms = conf.Read("speech_pad_left_ms", 0);
        nnet_conf.speech_pad_right_ms = conf.Read("speech_pad_right_ms", 0);

        nnet_conf.model_file_path = conf.Read("model_path", std::string(""));
        nnet_conf.param_file_path = conf.Read("param_path", std::string(""));
        nnet_conf.num_cpu_thread = conf.Read("num_cpu_thread", 1);

        // Hold the model in a unique_ptr so it is freed if SetConfig/Init
        // throws (SetConfig raises std::runtime_error once initialized).
        std::unique_ptr<ppspeech::Vad> model(
            new ppspeech::Vad(nnet_conf.model_file_path));

        // custom config, but must be set before init
        model->SetConfig(nnet_conf);
        model->Init();

        return static_cast<PPSHandle_t>(model.release());
    } catch (const std::exception& e) {
        LOG(ERROR) << "failed to create vad instance: " << e.what();
    } catch (...) {
        // Config may throw types that do not derive from std::exception.
        LOG(ERROR) << "failed to create vad instance: unknown error";
    }
    return nullptr;
}
// Releases a VAD instance previously returned by PPSVadCreateInstance.
// A null handle is accepted and ignored. Always returns 0.
int PPSVadDestroyInstance(PPSHandle_t instance) {
    // `delete` on a null pointer is a no-op, so no explicit guard is needed.
    delete static_cast<ppspeech::Vad*>(instance);
    return 0;
}
// Returns the model's window size in samples (Vad::WindowSizeSamples), or -1
// when `instance` is null.
int PPSVadChunkSizeSamples(PPSHandle_t instance) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        // Report through the logging facility, as PPSVadFeedForward does,
        // rather than printf (this file never included <cstdio>).
        LOG(ERROR) << "instance is null";
        return -1;
    }

    return model->WindowSizeSamples();
}
// Runs one chunk of audio through the model and returns the resulting state.
//
// `chunk` must point to `num_element` floats; they are copied before the
// forward pass, so the caller keeps ownership of the buffer.
//
// Returns PPS_ILLEGAL when `instance` is null, when the chunk is null or
// empty, or when the forward pass fails; otherwise the state reported by
// Vad::Postprocess.
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
                                float* chunk,
                                int num_element) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        LOG(ERROR) << "instance is null";
        return PPS_ILLEGAL;
    }

    // A null pointer or non-positive count would form an invalid iterator
    // range in the vector constructor below (undefined behaviour).
    if (chunk == nullptr || num_element <= 0) {
        LOG(ERROR) << "chunk is null or empty";
        return PPS_ILLEGAL;
    }

    std::vector<float> chunk_in(chunk, chunk + num_element);
    if (!model->ForwardChunk(chunk_in)) {
        LOG(ERROR) << "forward chunk failed";
        return PPS_ILLEGAL;
    }

    // The numeric values of ppspeech::Vad::State and PPSVadState_t are
    // declared in the same order (ILLEGAL = 0, SIL, START, SPEECH, END), so
    // the state can be converted by value.
    ppspeech::Vad::State s = model->Postprocess();
    return static_cast<PPSVadState_t>(s);
}
// Resets the model state via Vad::Reset.
// Returns 0 on success, or -1 when `instance` is null.
int PPSVadReset(PPSHandle_t instance) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        // Report through the logging facility, as PPSVadFeedForward does,
        // rather than printf (this file never included <cstdio>).
        LOG(ERROR) << "instance is null";
        return -1;
    }

    model->Reset();

    return 0;
}

@ -0,0 +1,43 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// C-linkage interface to the VAD model, usable from C and C++ callers.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
// Opaque handle to a VAD instance created by PPSVadCreateInstance.
typedef void* PPSHandle_t;
// State returned for each chunk fed to the model. The enumerators are
// declared in the same order as ppspeech::Vad::State, which the
// implementation converts to this type by value.
typedef enum {
PPS_ILLEGAL = 0, // error
PPS_SIL, // silence
PPS_START, // start speech
PPS_SPEECH, // in speech
PPS_END, // end speech
} PPSVadState_t;
// Creates a VAD instance configured from the config file at `conf_path`.
// The returned handle is passed to the other PPSVad* functions and must be
// released with PPSVadDestroyInstance.
PPSHandle_t PPSVadCreateInstance(const char* conf_path);
// Destroys an instance created by PPSVadCreateInstance. A null handle is
// ignored. Returns 0.
int PPSVadDestroyInstance(PPSHandle_t instance);
// Resets the instance's state. Returns 0 on success, -1 if `instance` is null.
int PPSVadReset(PPSHandle_t instance);
// Returns the model's window size in samples, or -1 if `instance` is null.
// The demo program feeds audio to PPSVadFeedForward in chunks of this size.
int PPSVadChunkSizeSamples(PPSHandle_t instance);
// Feeds `num_element` float samples starting at `chunk` to the model and
// returns the resulting state. Returns PPS_ILLEGAL if `instance` is null or
// the forward pass fails. The samples are copied; the caller keeps ownership
// of `chunk`.
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance, float* chunk,int num_element);
#ifdef __cplusplus
}
#endif // __cplusplus

@ -0,0 +1,63 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad/interface/vad_interface.h"
#include "vad/frontend/wav.h"
#include <vector>
#include <iostream>
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cout << "Usage: vad_interface_main path/to/config path/to/audio "
"run_option, "
"e.g ./vad_interface_main config sample.wav"
<< std::endl;
return -1;
}
std::string config_path = argv[1];
std::string audio_file = argv[2];
PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str());
std::vector<float> inputWav; // [0, 1]
wav::WavReader wav_reader = wav::WavReader(audio_file);
auto num_samples = wav_reader.num_samples();
inputWav.resize(num_samples);
for (int i = 0; i < num_samples; i++) {
inputWav[i] = wav_reader.data()[i] / 32768;
}
int window_size_samples = PPSVadChunkSizeSamples(handle);
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
auto start = j;
auto end = start + window_size_samples >= num_samples
? num_samples
: start + window_size_samples;
auto current_chunk_size = end - start;
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == static_cast<size_t>(current_chunk_size));
PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size());
std::cout << s << " ";
}
std::cout << std::endl;
PPSVadReset(handle);
return 0;
}

@ -0,0 +1,13 @@
# VAD model library.
set(srcs
    vad.cc
)

add_library(vad ${srcs})
# vad.cc uses FastDeploy itself, so the dependency must be PUBLIC rather than
# INTERFACE: it then applies to the library's own link step (needed for
# shared builds) as well as to consumers. `common` was previously listed under
# both INTERFACE and PRIVATE, which is equivalent to PUBLIC.
target_link_libraries(vad PUBLIC ${FASTDEPLOY_LIBS} common)

# Command-line demo that drives the model directly.
set(bin_name vad_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} PRIVATE ${FASTDEPLOY_LIBS} vad)

@ -1,4 +1,5 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -11,7 +12,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad.h"
#include "vad/nnet/vad.h"
#include <cstring>
#include <iomanip>
@ -26,6 +27,8 @@
<< __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
#endif
namespace ppspeech {
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption&
custom_option /* = fastdeploy::RuntimeOption() */) {
@ -48,18 +51,29 @@ Vad::Vad(const std::string& model_file,
}
void Vad::Init() {
std::call_once(init_, [&]() { initialized = Initialize(); });
std::lock_guard<std::mutex> lock(init_lock_);
Initialize();
}
std::string Vad::ModelName() const { return "VAD"; }
void Vad::SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms) {
if (initialized) {
void Vad::SetConfig(const VadNnetConf conf){
SetConfig(
conf.sr,
conf.frame_ms,
conf.threshold,
conf.min_silence_duration_ms,
conf.speech_pad_left_ms,
conf.speech_pad_right_ms);
}
void Vad::SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms) {
if (initialized_) {
fastdeploy::FDERROR << "SetConfig must be called before init"
<< std::endl;
throw std::runtime_error("SetConfig must be called before init");
@ -114,12 +128,18 @@ bool Vad::Initialize() {
Reset();
// InitRuntime
if (!InitRuntime()) {
fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
<< std::endl;
return false;
}
initialized_=true;
fastdeploy::FDINFO << "init done.";
return true;
}
@ -303,4 +323,6 @@ std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
break;
}
return os;
}
}
} // namespace ppspeech

@ -1,4 +1,5 @@
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -11,17 +12,37 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <mutex>
#include <vector>
#include "./wav.h"
#include "vad/frontend/wav.h"
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/runtime.h"
namespace ppspeech {
// Configuration bundle for the VAD model.
//
// Every scalar member has a default so a default-constructed conf never holds
// indeterminate values. The defaults match the fallbacks PPSVadCreateInstance
// uses for keys missing from the config file. The struct stays an aggregate
// under C++14, which this project builds with.
struct VadNnetConf {
    // wav
    int sr{16000};      // "16k" in the sample vad.ini
    int frame_ms{32};   // sample vad.ini lists 32, 64, 96 for 16k
    float threshold{0.45f};
    int min_silence_duration_ms{200};
    int speech_pad_left_ms{0};
    int speech_pad_right_ms{0};

    // model
    std::string model_file_path;
    std::string param_file_path;
    std::string dict_file_path;
    int num_cpu_thread{1};  // 1 thread
    std::string backend;    // ort,lite, etc.
};
class Vad : public fastdeploy::FastDeployModel {
public:
enum class State { SIL = 0, START, SPEECH, END };
enum class State { ILLEGAL = 0, SIL, START, SPEECH, END };
friend std::ostream& operator<<(std::ostream& os, const Vad::State& s);
Vad(const std::string& model_file,
@ -32,12 +53,13 @@ class Vad : public fastdeploy::FastDeployModel {
void Reset();
void SetConfig(int sr,
int frame_ms,
float threshold,
int min_silence_duration_ms,
int speech_pad_left_ms,
int speech_pad_right_ms);
void SetConfig(const int& sr,
const int& frame_ms,
const float& threshold,
const int& min_silence_duration_ms,
const int& speech_pad_left_ms,
const int& speech_pad_right_ms);
void SetConfig(const VadNnetConf conf);
bool ForwardChunk(std::vector<float>& chunk);
@ -78,7 +100,9 @@ class Vad : public fastdeploy::FastDeployModel {
bool Initialize();
private:
std::once_flag init_;
std::mutex init_lock_;
bool initialized_{false};
// input and output
std::vector<fastdeploy::FDTensor> inputTensors_;
std::vector<fastdeploy::FDTensor> outputTensors_;
@ -122,3 +146,5 @@ class Vad : public fastdeploy::FastDeployModel {
const std::vector<int64_t> sr_node_dims_ = {1};
const std::vector<int64_t> hc_node_dims_ = {2, 1, 64};
};
} // namespace ppspeech

@ -1,11 +1,25 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad.h"
#include "vad/nnet/vad.h"
int main(int argc, char* argv[]) {
if (argc < 3) {
std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio "
std::cout << "Usage: vad_nnet_main path/to/model path/to/audio "
"run_option, "
"e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav"
"e.g ./vad_nnet_main silero_vad.onnx sample.wav"
<< std::endl;
return -1;
}
@ -14,7 +28,7 @@ int main(int argc, char* argv[]) {
std::string audio_file = argv[2];
int sr = 16000;
Vad vad(model_file);
ppspeech::Vad vad(model_file);
// custom config, but must be set before init
vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
vad.Init();
@ -39,7 +53,7 @@ int main(int argc, char* argv[]) {
auto current_chunk_size = end - start;
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == current_chunk_size);
assert(r.size() == static_cast<size_t>(current_chunk_size));
if (!vad.ForwardChunk(r)) {
std::cerr << "Failed to inference while using model:"
@ -47,7 +61,7 @@ int main(int argc, char* argv[]) {
return false;
}
Vad::State s = vad.Postprocess();
ppspeech::Vad::State s = vad.Postprocess();
std::cout << s << " ";
}
std::cout << std::endl;

@ -0,0 +1,10 @@
[model]
model_path=./data/silero_vad/silero_vad.onnx
[vad]
sr = 16000 # 16k
frame_ms = 32 # 32, 64, 96 for 16k
threshold = 0.45
min_silence_duration_ms = 200
speech_pad_left_ms = 200
speech_pad_right_ms = 0

@ -1,6 +1,7 @@
#!/bin/bash
set -e
conf=conf
data=data
exp=exp
@ -10,8 +11,13 @@ mkdir -p $exp
ckpt_dir=$data/silero_vad
model=$ckpt_dir/silero_vad.onnx
test_wav=$data/silero_vad_sample.wav
conf_file=$conf/vad.ini
silero_vad_main $model $test_wav
vad_nnet_main $model $test_wav
echo "vad_nnet_main done!"
vad_interface_main $conf_file $test_wav
echo "vad_interface_main done!"
echo "silero vad done!"

@ -12,6 +12,6 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
# Force the C locale for every category. The variable was previously
# misspelled as LC_AL, which the locale system ignores.
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/interface
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
Loading…
Cancel
Save