parent
dd081cd6b1
commit
77a3ceaa08
@ -1,5 +1,7 @@
|
||||
# Build script for the VAD example directory.
#
# The parent directory is put on the include path at directory scope on
# purpose: the sources in this directory and in the nnet/ and interface/
# subdirectories include headers by paths such as "vad/nnet/vad.h", and
# directory-scoped include paths are inherited by add_subdirectory().
include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}/../
)

# Core VAD library (target `vad`).
add_subdirectory(nnet)

# Stand-alone silero VAD demo executable.
# NOTE(review): nnet/CMakeLists.txt also builds a vad.cc into the `vad`
# library; confirm that a vad.cc still exists in this directory.
set(bin_name silero_vad_main)
add_executable(${bin_name}
  ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc
  vad.cc
)
# Use the keyword signature; the keyword-less form has legacy semantics.
target_link_libraries(${bin_name} PRIVATE ${FASTDEPLOY_LIBS} gflags extern_glog)

# C API wrapper (target `vad_interface`) and its demo executable.
add_subdirectory(interface)
|
@ -0,0 +1,11 @@
|
||||
# C API wrapper library around the `vad` target.
add_library(vad_interface
  vad_interface.cc
)
# vad_interface.cc calls into `vad` itself, so the dependency has to be on the
# library's own link line as well as on its consumers': PUBLIC, not INTERFACE.
# INTERFACE only works by accident for static archives and leaves a shared
# build of vad_interface with unresolved symbols.
target_link_libraries(vad_interface PUBLIC ${FASTDEPLOY_LIBS} vad)

# Demo executable that drives the C API.
set(bin_name vad_interface_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} PRIVATE ${FASTDEPLOY_LIBS} vad_interface)
|
@ -0,0 +1,92 @@
|
||||
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "vad/interface/vad_interface.h"
|
||||
#include "common/base/log.h"
|
||||
#include "common/base/config.h"
|
||||
#include "vad/nnet/vad.h"
|
||||
|
||||
|
||||
// Creates a VAD model configured from the file at `conf_path`.
//
// Each setting is read with Config::Read(key, fallback); the second argument
// is presumably the value used when the key is absent (confirm against
// common/base/config.h).
//
// Returns an opaque handle to a heap-allocated ppspeech::Vad, or nullptr when
// `conf_path` is null. The caller owns the handle and must release it with
// PPSVadDestroyInstance().
PPSHandle_t PPSVadCreateInstance(const char* conf_path) {
    // A null path cannot be turned into a Config; fail instead of crashing.
    if (conf_path == nullptr) {
        LOG(ERROR) << "conf_path is null";
        return nullptr;
    }

    Config conf(conf_path);
    ppspeech::VadNnetConf nnet_conf;

    // Detection parameters.
    nnet_conf.sr = conf.Read("sr", 16000);
    nnet_conf.frame_ms = conf.Read("frame_ms", 32);
    nnet_conf.threshold = conf.Read("threshold", 0.45f);
    nnet_conf.min_silence_duration_ms =
        conf.Read("min_silence_duration_ms", 200);
    nnet_conf.speech_pad_left_ms = conf.Read("speech_pad_left_ms", 0);
    nnet_conf.speech_pad_right_ms = conf.Read("speech_pad_right_ms", 0);

    // Model location and runtime parameters.
    nnet_conf.model_file_path = conf.Read("model_path", std::string(""));
    nnet_conf.param_file_path = conf.Read("param_path", std::string(""));
    nnet_conf.num_cpu_thread = conf.Read("num_cpu_thread", 1);

    ppspeech::Vad* model = new ppspeech::Vad(nnet_conf.model_file_path);

    // custom config, but must be set before init
    model->SetConfig(nnet_conf);
    model->Init();

    return static_cast<PPSHandle_t>(model);
}
|
||||
|
||||
|
||||
// Releases a VAD instance obtained from PPSVadCreateInstance().
//
// A null handle is accepted and ignored. Always returns 0.
int PPSVadDestroyInstance(PPSHandle_t instance) {
    // `delete` on a null pointer is a no-op, so no explicit check is needed.
    delete static_cast<ppspeech::Vad*>(instance);
    return 0;
}
|
||||
|
||||
// Returns the model's window size in samples, as reported by
// ppspeech::Vad::WindowSizeSamples(), or -1 when `instance` is null.
int PPSVadChunkSizeSamples(PPSHandle_t instance) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        // Report through the logging facility like PPSVadFeedForward() does,
        // instead of printf(), for which this file includes no header.
        LOG(ERROR) << "instance is null";
        return -1;
    }

    return model->WindowSizeSamples();
}
|
||||
|
||||
// Runs the VAD model on one chunk of audio and returns the resulting state.
//
// instance:    handle returned by PPSVadCreateInstance().
// chunk:       pointer to `num_element` float samples; copied before use.
// num_element: number of samples in `chunk`.
//
// Returns PPS_ILLEGAL when `instance` or `chunk` is null, when `num_element`
// is negative, or when ForwardChunk() fails; otherwise the state returned by
// Postprocess() converted to PPSVadState_t.
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
                                float* chunk,
                                int num_element) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        LOG(ERROR) << "instance is null";
        return PPS_ILLEGAL;
    }

    // Building a vector from a null pointer or a negative length is undefined
    // behaviour, so reject such input up front.
    if (chunk == nullptr || num_element < 0) {
        LOG(ERROR) << "invalid chunk: null data or negative size";
        return PPS_ILLEGAL;
    }

    std::vector<float> chunk_in(chunk, chunk + num_element);
    if (!model->ForwardChunk(chunk_in)) {
        LOG(ERROR) << "forward chunk failed";
        return PPS_ILLEGAL;
    }

    // NOTE(review): this conversion assumes ppspeech::Vad::State enumerators
    // have the same numeric values as PPSVadState_t -- confirm in vad.h.
    ppspeech::Vad::State s = model->Postprocess();
    return static_cast<PPSVadState_t>(s);
}
|
||||
|
||||
// Resets the VAD instance by calling ppspeech::Vad::Reset().
//
// Returns 0 on success, or -1 when `instance` is null.
int PPSVadReset(PPSHandle_t instance) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        // Report through the logging facility like PPSVadFeedForward() does,
        // instead of printf(), for which this file includes no header.
        LOG(ERROR) << "instance is null";
        return -1;
    }

    model->Reset();
    return 0;
}
|
@ -0,0 +1,43 @@
|
||||
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle to a VAD instance.
typedef void* PPSHandle_t;

// State reported for a chunk of audio.
typedef enum {
    PPS_ILLEGAL = 0,  // error
    PPS_SIL,          // silence
    PPS_START,        // start speech
    PPS_SPEECH,       // in speech
    PPS_END,          // end speech
} PPSVadState_t;

// Creates a VAD instance from the configuration file at `conf_path`.
// Release the returned handle with PPSVadDestroyInstance().
PPSHandle_t PPSVadCreateInstance(const char* conf_path);

// Destroys an instance created by PPSVadCreateInstance().
int PPSVadDestroyInstance(PPSHandle_t instance);

// Resets the instance. Returns -1 when `instance` is null.
int PPSVadReset(PPSHandle_t instance);

// Returns the number of samples per chunk, or -1 when `instance` is null.
int PPSVadChunkSizeSamples(PPSHandle_t instance);

// Feeds `num_element` float samples starting at `chunk` to the instance and
// returns the resulting state; PPS_ILLEGAL signals an error.
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
                                float* chunk,
                                int num_element);

#ifdef __cplusplus
}
#endif  // __cplusplus
|
@ -0,0 +1,63 @@
|
||||
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "vad/interface/vad_interface.h"
|
||||
#include "vad/frontend/wav.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc < 3) {
|
||||
std::cout << "Usage: vad_interface_main path/to/config path/to/audio "
|
||||
"run_option, "
|
||||
"e.g ./vad_interface_main config sample.wav"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::string config_path = argv[1];
|
||||
std::string audio_file = argv[2];
|
||||
|
||||
PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str());
|
||||
|
||||
std::vector<float> inputWav; // [0, 1]
|
||||
wav::WavReader wav_reader = wav::WavReader(audio_file);
|
||||
|
||||
auto num_samples = wav_reader.num_samples();
|
||||
inputWav.resize(num_samples);
|
||||
for (int i = 0; i < num_samples; i++) {
|
||||
inputWav[i] = wav_reader.data()[i] / 32768;
|
||||
}
|
||||
|
||||
int window_size_samples = PPSVadChunkSizeSamples(handle);
|
||||
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
|
||||
auto start = j;
|
||||
auto end = start + window_size_samples >= num_samples
|
||||
? num_samples
|
||||
: start + window_size_samples;
|
||||
auto current_chunk_size = end - start;
|
||||
|
||||
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
|
||||
assert(r.size() == static_cast<size_t>(current_chunk_size));
|
||||
|
||||
PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size());
|
||||
std::cout << s << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
PPSVadReset(handle);
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
# Core VAD library.
add_library(vad
  vad.cc
)
# The original linked `common` twice, as INTERFACE and as PRIVATE, which
# together mean PUBLIC. PUBLIC also puts ${FASTDEPLOY_LIBS} on the library's
# own link line, so a shared build of `vad` resolves its symbols.
target_link_libraries(vad PUBLIC ${FASTDEPLOY_LIBS} common)

# Demo executable for the library.
set(bin_name vad_nnet_main)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_link_libraries(${bin_name} PRIVATE ${FASTDEPLOY_LIBS} vad)
|
@ -0,0 +1,10 @@
|
||||
[model]
|
||||
model_path=./data/silero_vad/silero_vad.onnx
|
||||
|
||||
[vad]
|
||||
sr = 16000 # 16k
|
||||
frame_ms = 32 # 32, 64, 96 for 16k
|
||||
threshold = 0.45
|
||||
min_silence_duration_ms = 200
|
||||
speech_pad_left_ms = 200
|
||||
speech_pad_right_ms = 0
|
Loading…
Reference in new issue