opt to compile asr,cls,vad; add vad; format code (#2968)
parent
78e29c8ec4
commit
b35fc01a3a
@ -1,3 +1,6 @@
|
||||
engine/common/base/flags.h
|
||||
engine/common/base/log.h
|
||||
|
||||
tools/valgrind*
|
||||
*log
|
||||
fc_patch/*
|
||||
|
@ -0,0 +1,20 @@
|
||||
# Choose which flags/logging headers the generated wrappers refer to.
# With WITH_ASR the OpenFst headers are selected, otherwise gflags/glog.
# NOTE(review): PPS_FLAGS_LIB / PPS_GLOB_LIB are presumably substituted as
# @PPS_FLAGS_LIB@ / @PPS_GLOB_LIB@ inside flags.h.in / log.h.in - confirm.
if(WITH_ASR)
    add_compile_options(-DWITH_ASR)
    set(PPS_FLAGS_LIB "fst/flags.h")
    set(PPS_GLOB_LIB "fst/log.h")
else()
    set(PPS_FLAGS_LIB "gflags/gflags.h")
    set(PPS_GLOB_LIB "glog/logging.h")
endif()

# Generate flags.h and then log.h next to their .in templates (in the source
# tree), substituting only @VAR@ references.
foreach(pps_generated_header flags log)
    configure_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/${pps_generated_header}.h.in
        ${CMAKE_CURRENT_SOURCE_DIR}/${pps_generated_header}.h @ONLY
    )
    message(STATUS "Generated ${CMAKE_CURRENT_SOURCE_DIR}/${pps_generated_header}.h")
endforeach()
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,15 @@
|
||||
project(kaldi)
|
||||
include_directories(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
)
|
||||
|
||||
add_subdirectory(base)
|
||||
add_subdirectory(util)
|
||||
add_subdirectory(lat)
|
||||
add_subdirectory(fstext)
|
||||
add_subdirectory(decoder)
|
||||
add_subdirectory(lm)
|
||||
if(WITH_ASR)
|
||||
add_subdirectory(lat)
|
||||
add_subdirectory(fstext)
|
||||
add_subdirectory(decoder)
|
||||
add_subdirectory(lm)
|
||||
|
||||
add_subdirectory(fstbin)
|
||||
add_subdirectory(lmbin)
|
||||
add_subdirectory(fstbin)
|
||||
add_subdirectory(lmbin)
|
||||
endif()
|
||||
|
@ -0,0 +1,18 @@
|
||||
# set(CMAKE_CXX_STANDARD 11)

# # Path to the downloaded and extracted FastDeploy library
# set(FASTDEPLOY_INSTALL_DIR "fdlib/fastdeploy-linux-x64-1.0.4" CACHE STRING force)

# if(NOT EXISTS ${FASTDEPLOY_INSTALL_DIR})
#   message(FATAL_ERROR "Please using cmake -B build -DFASTDEPLOY_INSTALL_DIR=${FASTDEPLOY_INSTALL_DIR}")
# endif()

# include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)

# # Add the FastDeploy header include directories
# include_directories(${FASTDEPLOY_INCS})

# Demo executable: the driver plus the VAD implementation and its headers.
add_executable(infer_onnx_silero_vad
    ${CMAKE_CURRENT_SOURCE_DIR}/infer_onnx_silero_vad.cc
    wav.h
    vad.cc
    vad.h
)

# Link against the FastDeploy libraries.
# NOTE(review): the include() that would define FASTDEPLOY_LIBS is commented
# out above, so it is presumably provided by a parent CMakeLists - confirm.
target_link_libraries(infer_onnx_silero_vad ${FASTDEPLOY_LIBS})
|
@ -0,0 +1,121 @@
|
||||
English | [简体中文](README_CN.md)
|
||||
|
||||
# Silero VAD Deployment Example
|
||||
|
||||
This directory provides an example, `infer_onnx_silero_vad`, that quickly deploys the Silero VAD model on CPU/GPU.
|
||||
|
||||
Before deployment, two steps require confirmation.
|
||||
|
||||
- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
|
||||
- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
|
||||
|
||||
Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following command in this directory.
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
# Download the FastDeploy precompiled library. Choose the version that suits your environment from the `FastDeploy Precompiled Library` page mentioned above
|
||||
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
|
||||
tar xvf fastdeploy-linux-x64-x.x.x.tgz
|
||||
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
|
||||
make -j
|
||||
|
||||
# Download the VAD model file and test audio. After decompression, place the model and test audio in the same directory as infer_onnx_silero_vad.cc
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
|
||||
|
||||
# inference
|
||||
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
|
||||
```
|
||||
|
||||
- The above command works for Linux or MacOS. Refer to:
|
||||
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md) for SDK use-pattern in Windows
|
||||
|
||||
## VAD C++ Interface
|
||||
|
||||
### Vad Class
|
||||
|
||||
```c++
|
||||
Vad::Vad(const std::string& model_file,
|
||||
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
|
||||
```
|
||||
|
||||
**Parameter**
|
||||
|
||||
> * **model_file**(str): Model file path
|
||||
> * **custom_option**(RuntimeOption): Backend inference configuration. Defaults to `fastdeploy::RuntimeOption()`, i.e. the default configuration
|
||||
|
||||
### setAudioCofig function
|
||||
|
||||
**Must be called before the `init` function**
|
||||
|
||||
```c++
|
||||
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
|
||||
```
|
||||
|
||||
**Parameter**
|
||||
|
||||
> * **sr**(int): sampling rate
|
||||
> * **frame_ms**(int): The length of each detection frame, and it is used to calculate the detection window size
|
||||
> * **threshold**(float): Result probability judgment threshold
|
||||
> * **min_silence_duration_ms**(int): The threshold used to calculate whether it is silence
|
||||
> * **speech_pad_ms**(int): Used to calculate the end time of the speech
|
||||
|
||||
### init function
|
||||
|
||||
Used to initialize audio-related parameters.
|
||||
|
||||
```c++
|
||||
void Vad::init();
|
||||
```
|
||||
|
||||
### loadAudio function
|
||||
|
||||
Load audio.
|
||||
|
||||
```c++
|
||||
void Vad::loadAudio(const std::string& wavPath)
|
||||
```
|
||||
|
||||
**Parameter**
|
||||
|
||||
> * **wavPath**(str): Audio file path
|
||||
|
||||
### Predict function
|
||||
|
||||
Used to run model inference.
|
||||
|
||||
```c++
|
||||
bool Vad::Predict();
|
||||
```
|
||||
|
||||
### getResult function
|
||||
|
||||
**Used to obtain inference results**
|
||||
|
||||
```c++
|
||||
std::vector<std::map<std::string, float>> Vad::getResult(
|
||||
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
|
||||
float mergeThreshold = 0.3);
|
||||
```
|
||||
|
||||
**Parameter**
|
||||
|
||||
> * **removeThreshold**(float): Discard result fragment threshold; If some recognition results are too short, they will be discarded according to this threshold
|
||||
> * **expandHeadThreshold**(float): Offset at the beginning of the segment; The recognized start time may be too close to the voice part, so move forward the start time accordingly
|
||||
> * **expandTailThreshold**(float): Offset at the end of the segment; The recognized end time may be too close to the voice part, so the end time is moved back accordingly
|
||||
> * **mergeThreshold**(float): Some result segments are very close and can be combined into one, and the vocal segments can be combined accordingly
|
||||
|
||||
**The output result format is**`std::vector<std::map<std::string, float>>`
|
||||
|
||||
> Output a list, each element is a speech fragment
|
||||
>
|
||||
> Each clip can use 'start' to get the start time and 'end' to get the end time
|
||||
|
||||
### Tips
|
||||
|
||||
1. The `setAudioCofig` function must be called before the `init` function
|
||||
2. The sampling rate of the input audio file must be consistent with that set in the code
|
||||
|
||||
- [Model Description](../)
|
||||
- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md)
|
@ -0,0 +1,65 @@
|
||||
|
||||
#include "vad.h"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc < 3) {
|
||||
std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio "
|
||||
"run_option, "
|
||||
"e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::string model_file = argv[1];
|
||||
std::string audio_file = argv[2];
|
||||
|
||||
int sr = 16000;
|
||||
Vad vad(model_file);
|
||||
// custom config, but must be set before init
|
||||
vad.SetConfig(sr, 32, 0.45f, 200, 0, 0);
|
||||
vad.Init();
|
||||
|
||||
std::vector<float> inputWav; // [0, 1]
|
||||
wav::WavReader wav_reader = wav::WavReader(audio_file);
|
||||
assert(wav_reader.sample_rate() == sr);
|
||||
|
||||
|
||||
auto num_samples = wav_reader.num_samples();
|
||||
inputWav.resize(num_samples);
|
||||
for (int i = 0; i < num_samples; i++) {
|
||||
inputWav[i] = wav_reader.data()[i] / 32768;
|
||||
}
|
||||
|
||||
int window_size_samples = vad.WindowSizeSamples();
|
||||
for (int64_t j = 0; j < num_samples; j += window_size_samples) {
|
||||
auto start = j;
|
||||
auto end = start + window_size_samples >= num_samples
|
||||
? num_samples
|
||||
: start + window_size_samples;
|
||||
auto current_chunk_size = end - start;
|
||||
|
||||
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
|
||||
assert(r.size() == current_chunk_size);
|
||||
|
||||
if (!vad.ForwardChunk(r)) {
|
||||
std::cerr << "Failed to inference while using model:"
|
||||
<< vad.ModelName() << "." << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
Vad::State s = vad.Postprocess();
|
||||
std::cout << s << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
std::vector<std::map<std::string, float>> result = vad.GetResult();
|
||||
for (auto& res : result) {
|
||||
std::cout << "speak start: " << res["start"]
|
||||
<< " s, end: " << res["end"] << " s | ";
|
||||
}
|
||||
std::cout << "\b\b " << std::endl;
|
||||
|
||||
vad.Reset();
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,306 @@
|
||||
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "vad.h"
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
|
||||
|
||||
// LOG_DEBUG streams a "[DEBUG]"-prefixed message tagged with the relative
// file name, line number and function name.
//
// The first FDLogger argument is `true` when NDEBUG is defined and `false`
// otherwise.
// NOTE(review): if that argument means "verbose/enabled", debug output is on
// in release builds and off in debug builds, which looks inverted - confirm
// against the FDLogger constructor.
#ifdef NDEBUG
#define VAD_LOG_DEBUG_FLAG true
#else
#define VAD_LOG_DEBUG_FLAG false
#endif

#define LOG_DEBUG                                         \
    ::fastdeploy::FDLogger(VAD_LOG_DEBUG_FLAG, "[DEBUG]") \
        << __REL_FILE__ << "(" << __LINE__ << ")::" << __FUNCTION__ << "\t"
|
||||
|
||||
// Creates a VAD wrapper for the ONNX model stored at `model_file`.
//
// The runtime option starts as a copy of `custom_option`; the device,
// backend, model format, ORT tuning knobs and model path are then overwritten
// below, so those parts of `custom_option` do not take effect.
Vad::Vad(const std::string& model_file,
         const fastdeploy::RuntimeOption&
             custom_option /* = fastdeploy::RuntimeOption() */) {
    // Backends declared valid for each device.
    valid_cpu_backends = {fastdeploy::Backend::ORT,
                          fastdeploy::Backend::OPENVINO};
    valid_gpu_backends = {fastdeploy::Backend::ORT, fastdeploy::Backend::TRT};

    runtime_option = custom_option;

    // Run on CPU through the ONNX Runtime backend.
    runtime_option.UseCpu();
    runtime_option.UseOrtBackend();
    runtime_option.model_format = fastdeploy::ModelFormat::ONNX;

    // Where the model is loaded from.
    runtime_option.model_file = model_file;

    auto& ort = runtime_option.ort_option;
    // Graph optimization level.
    ort.graph_optimization_level = 99;
    // Single-threaded execution.
    ort.intra_op_num_threads = 1;
    ort.inter_op_num_threads = 1;
}
|
||||
|
||||
void Vad::Init() {
|
||||
std::call_once(init_, [&]() { initialized = Initialize(); });
|
||||
}
|
||||
|
||||
// Returns the fixed display name of this model.
std::string Vad::ModelName() const {
    return std::string("VAD");
}
|
||||
|
||||
void Vad::SetConfig(int sr,
|
||||
int frame_ms,
|
||||
float threshold,
|
||||
int min_silence_duration_ms,
|
||||
int speech_pad_left_ms,
|
||||
int speech_pad_right_ms) {
|
||||
if (initialized) {
|
||||
fastdeploy::FDERROR << "SetConfig must be called before init"
|
||||
<< std::endl;
|
||||
throw std::runtime_error("SetConfig must be called before init");
|
||||
}
|
||||
sample_rate_ = sr;
|
||||
sr_per_ms_ = sr / 1000;
|
||||
threshold_ = threshold;
|
||||
frame_ms_ = frame_ms;
|
||||
min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
|
||||
speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
|
||||
speech_pad_right_samples_ = speech_pad_right_ms * sr_per_ms_;
|
||||
|
||||
// init chunk size
|
||||
window_size_samples_ = frame_ms * sr_per_ms_;
|
||||
current_chunk_size_ = window_size_samples_;
|
||||
|
||||
fastdeploy::FDINFO << "sr=" << sr << " threshold=" << threshold
|
||||
<< " frame_ms=" << frame_ms
|
||||
<< " min_silence_duration_ms=" << min_silence_duration_ms
|
||||
<< " speech_pad_left_ms=" << speech_pad_left_ms
|
||||
<< " speech_pad_right_ms=" << speech_pad_right_ms;
|
||||
}
|
||||
|
||||
// Clears the recurrent state and all per-stream bookkeeping so that a new
// audio stream can be processed. The configuration is left untouched.
void Vad::Reset() {
    // Zero the hidden/cell buffers while keeping their size. assign() is
    // type-correct for float and well-defined on empty vectors, unlike
    // memset() on data() with a float fill value.
    h_.assign(h_.size(), 0.0f);
    c_.assign(c_.size(), 0.0f);

    triggerd_ = false;
    temp_end_ = 0;
    current_sample_ = 0;

    speakStart_.clear();
    speakEnd_.clear();

    states_.clear();
}
|
||||
|
||||
bool Vad::Initialize() {
|
||||
// input & output holder
|
||||
inputTensors_.resize(4);
|
||||
outputTensors_.resize(3);
|
||||
|
||||
// input shape
|
||||
input_node_dims_.emplace_back(1);
|
||||
input_node_dims_.emplace_back(window_size_samples_);
|
||||
// sr buffer
|
||||
sr_.resize(1);
|
||||
sr_[0] = sample_rate_;
|
||||
// hidden state buffer
|
||||
h_.resize(size_hc_);
|
||||
c_.resize(size_hc_);
|
||||
|
||||
Reset();
|
||||
|
||||
// InitRuntime
|
||||
if (!InitRuntime()) {
|
||||
fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
fastdeploy::FDINFO << "init done.";
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Vad::ForwardChunk(std::vector<float>& chunk) {
|
||||
// last chunk may not be window_size_samples_
|
||||
input_node_dims_.back() = chunk.size();
|
||||
assert(window_size_samples_ >= chunk.size());
|
||||
current_chunk_size_ = chunk.size();
|
||||
|
||||
inputTensors_[0].name = "input";
|
||||
inputTensors_[0].SetExternalData(
|
||||
input_node_dims_, fastdeploy::FDDataType::FP32, chunk.data());
|
||||
inputTensors_[1].name = "sr";
|
||||
inputTensors_[1].SetExternalData(
|
||||
sr_node_dims_, fastdeploy::FDDataType::INT64, sr_.data());
|
||||
inputTensors_[2].name = "h";
|
||||
inputTensors_[2].SetExternalData(
|
||||
hc_node_dims_, fastdeploy::FDDataType::FP32, h_.data());
|
||||
inputTensors_[3].name = "c";
|
||||
inputTensors_[3].SetExternalData(
|
||||
hc_node_dims_, fastdeploy::FDDataType::FP32, c_.data());
|
||||
|
||||
if (!Infer(inputTensors_, &outputTensors_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Push forward sample index
|
||||
current_sample_ += current_chunk_size_;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Consumes the outputs of the last ForwardChunk() call: stores the speech
// probability, carries the h/c state over to the next step, and advances the
// speech/silence state machine.
//
// Returns a reference to the state appended for this chunk. Exactly one state
// is appended per call.
const Vad::State& Vad::Postprocess() {
    // update prob, h, c
    outputProb_ = *static_cast<const float*>(outputTensors_[0].Data());
    auto* hn = static_cast<float*>(outputTensors_[1].MutableData());
    std::memcpy(h_.data(), hn, h_.size() * sizeof(float));
    auto* cn = static_cast<float*>(outputTensors_[2].MutableData());
    std::memcpy(c_.data(), cn, c_.size() * sizeof(float));

    // Position (in seconds) of the end of the chunk just processed.
    const double now_sec = 1.0 * current_sample_ / sample_rate_;

    // The branches are written as if/else pairs so that every probability,
    // including NaN, lands in exactly one of them and states_ is never left
    // empty before states_.back() below.
    if (!triggerd_) {
        if (outputProb_ >= threshold_) {
            // 2. Start
            triggerd_ = true;
            // Start of this chunk minus the left padding, computed in a
            // signed type and clamped at 0: the unsigned subtraction would
            // wrap around when the padding exceeds the current position.
            const int64_t padded_start =
                static_cast<int64_t>(current_sample_) -
                static_cast<int64_t>(current_chunk_size_) -
                static_cast<int64_t>(speech_pad_left_samples_);
            speech_start_ =
                padded_start > 0 ? static_cast<unsigned int>(padded_start) : 0u;
            float start_sec = 1.0 * speech_start_ / sample_rate_;
            speakStart_.emplace_back(start_sec);
            LOG_DEBUG << "{ speech start: " << start_sec
                      << " s; prob: " << outputProb_ << " }";
            states_.emplace_back(Vad::State::START);
        } else {
            // 1. Silence
            LOG_DEBUG << "{ silence: " << now_sec
                      << " s; prob: " << outputProb_ << " }";
            states_.emplace_back(Vad::State::SIL);
        }
    } else if (outputProb_ >= threshold_ - 0.15) {
        // 3. Continue (the threshold is relaxed by 0.15 while in speech)
        if (temp_end_ != 0) {
            // speech prob relaxation, speech continues again
            LOG_DEBUG << "{ speech fake end(sil < min_silence_ms) to continue: "
                      << now_sec << " s; prob: " << outputProb_ << " }";
            temp_end_ = 0;
        } else {
            // speech prob relaxation, keep tracking speech
            LOG_DEBUG << "{ speech continue: " << now_sec
                      << " s; prob: " << outputProb_ << " }";
        }

        states_.emplace_back(Vad::State::SPEECH);
    } else {
        // 4. End candidate: remember where the silence began.
        if (temp_end_ == 0) {
            temp_end_ = current_sample_;
        }

        // check possible speech end
        if (current_sample_ - temp_end_ < min_silence_samples_) {
            // a. silence < min_silence_samples, continue speaking
            LOG_DEBUG << "{ speech fake end(sil < min_silence_ms): " << now_sec
                      << " s; prob: " << outputProb_ << " }";
            states_.emplace_back(Vad::State::SIL);
        } else {
            // b. silence >= min_silence_samples, end speaking
            speech_end_ = current_sample_ + speech_pad_right_samples_;
            temp_end_ = 0;
            triggerd_ = false;
            auto end_sec = 1.0 * speech_end_ / sample_rate_;
            speakEnd_.emplace_back(end_sec);
            LOG_DEBUG << "{ speech end: " << end_sec
                      << " s; prob: " << outputProb_ << " }";
            states_.emplace_back(Vad::State::END);
        }
    }

    return states_.back();
}
|
||||
|
||||
const std::vector<std::map<std::string, float>> Vad::GetResult(
|
||||
float removeThreshold,
|
||||
float expandHeadThreshold,
|
||||
float expandTailThreshold,
|
||||
float mergeThreshold) const {
|
||||
float audioLength = 1.0 * current_sample_ / sample_rate_;
|
||||
if (speakStart_.empty() && speakEnd_.empty()) {
|
||||
return {};
|
||||
}
|
||||
if (speakEnd_.size() != speakStart_.size()) {
|
||||
// set the audio length as the last end
|
||||
speakEnd_.emplace_back(audioLength);
|
||||
}
|
||||
// Remove too short segments
|
||||
// auto startIter = speakStart_.begin();
|
||||
// auto endIter = speakEnd_.begin();
|
||||
// while (startIter != speakStart_.end()) {
|
||||
// if (removeThreshold < audioLength &&
|
||||
// *endIter - *startIter < removeThreshold) {
|
||||
// startIter = speakStart_.erase(startIter);
|
||||
// endIter = speakEnd_.erase(endIter);
|
||||
// } else {
|
||||
// startIter++;
|
||||
// endIter++;
|
||||
// }
|
||||
// }
|
||||
// // Expand to avoid to tight cut.
|
||||
// startIter = speakStart_.begin();
|
||||
// endIter = speakEnd_.begin();
|
||||
// *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
|
||||
// *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
|
||||
// endIter = speakEnd_.end() - 1;
|
||||
// startIter = speakStart_.end() - 1;
|
||||
// *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
|
||||
// *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
|
||||
// for (int i = 1; i < speakStart_.size() - 1; ++i) {
|
||||
// speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
|
||||
// speakEnd_[i - 1]);
|
||||
// speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
|
||||
// speakStart_[i + 1]);
|
||||
// }
|
||||
// // Merge very closed segments
|
||||
// startIter = speakStart_.begin() + 1;
|
||||
// endIter = speakEnd_.begin();
|
||||
// while (startIter != speakStart_.end()) {
|
||||
// if (*startIter - *endIter < mergeThreshold) {
|
||||
// startIter = speakStart_.erase(startIter);
|
||||
// endIter = speakEnd_.erase(endIter);
|
||||
// } else {
|
||||
// startIter++;
|
||||
// endIter++;
|
||||
// }
|
||||
// }
|
||||
|
||||
std::vector<std::map<std::string, float>> result;
|
||||
for (int i = 0; i < speakStart_.size(); ++i) {
|
||||
result.emplace_back(std::map<std::string, float>(
|
||||
{{"start", speakStart_[i]}, {"end", speakEnd_[i]}}));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Writes a short bracketed tag for a VAD state: [SIL], [STA], [SPE] or [END];
// any other value is printed as [ILL].
std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
    const char* tag = "[ILL]";  // illegal state unless matched below
    switch (s) {
        case Vad::State::SIL:
            tag = "[SIL]";
            break;
        case Vad::State::START:
            tag = "[STA]";
            break;
        case Vad::State::SPEECH:
            tag = "[SPE]";
            break;
        case Vad::State::END:
            tag = "[END]";
            break;
        default:
            break;
    }
    os << tag;
    return os;
}
|
@ -0,0 +1,124 @@
|
||||
// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
#include "./wav.h"
|
||||
#include "fastdeploy/fastdeploy_model.h"
|
||||
#include "fastdeploy/runtime.h"
|
||||
|
||||
class Vad : public fastdeploy::FastDeployModel {
|
||||
public:
|
||||
enum class State { SIL = 0, START, SPEECH, END };
|
||||
friend std::ostream& operator<<(std::ostream& os, const Vad::State& s);
|
||||
|
||||
Vad(const std::string& model_file,
|
||||
const fastdeploy::RuntimeOption& custom_option =
|
||||
fastdeploy::RuntimeOption());
|
||||
|
||||
void Init();
|
||||
|
||||
void Reset();
|
||||
|
||||
void SetConfig(int sr,
|
||||
int frame_ms,
|
||||
float threshold,
|
||||
int min_silence_duration_ms,
|
||||
int speech_pad_left_ms,
|
||||
int speech_pad_right_ms);
|
||||
|
||||
bool ForwardChunk(std::vector<float>& chunk);
|
||||
|
||||
const State& Postprocess();
|
||||
|
||||
const std::vector<std::map<std::string, float>> GetResult(
|
||||
float removeThreshold = 0.0,
|
||||
float expandHeadThreshold = 0.0,
|
||||
float expandTailThreshold = 0,
|
||||
float mergeThreshold = 0.0) const;
|
||||
|
||||
const std::vector<State> GetStates() const { return states_; }
|
||||
|
||||
int SampleRate() const { return sample_rate_; }
|
||||
|
||||
int FrameMs() const { return frame_ms_; }
|
||||
int64_t WindowSizeSamples() const { return window_size_samples_; }
|
||||
|
||||
float Threshold() const { return threshold_; }
|
||||
|
||||
int MinSilenceDurationMs() const {
|
||||
return min_silence_samples_ / sample_rate_;
|
||||
}
|
||||
int SpeechPadLeftMs() const {
|
||||
return speech_pad_left_samples_ / sample_rate_;
|
||||
}
|
||||
int SpeechPadRightMs() const {
|
||||
return speech_pad_right_samples_ / sample_rate_;
|
||||
}
|
||||
|
||||
int MinSilenceSamples() const { return min_silence_samples_; }
|
||||
int SpeechPadLeftSamples() const { return speech_pad_left_samples_; }
|
||||
int SpeechPadRightSamples() const { return speech_pad_right_samples_; }
|
||||
|
||||
std::string ModelName() const override;
|
||||
|
||||
private:
|
||||
bool Initialize();
|
||||
|
||||
private:
|
||||
std::once_flag init_;
|
||||
// input and output
|
||||
std::vector<fastdeploy::FDTensor> inputTensors_;
|
||||
std::vector<fastdeploy::FDTensor> outputTensors_;
|
||||
|
||||
// model states
|
||||
bool triggerd_ = false;
|
||||
unsigned int speech_start_ = 0;
|
||||
unsigned int speech_end_ = 0;
|
||||
unsigned int temp_end_ = 0;
|
||||
unsigned int current_sample_ = 0;
|
||||
unsigned int current_chunk_size_ = 0;
|
||||
// MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
|
||||
float outputProb_;
|
||||
|
||||
std::vector<float> speakStart_;
|
||||
mutable std::vector<float> speakEnd_;
|
||||
|
||||
std::vector<State> states_;
|
||||
|
||||
/* ========================================================================
|
||||
*/
|
||||
int sample_rate_ = 16000;
|
||||
int frame_ms_ = 32; // 32, 64, 96 for 16k
|
||||
float threshold_ = 0.5f;
|
||||
|
||||
int64_t window_size_samples_; // support 256 512 768 for 8k; 512 1024 1536
|
||||
// for 16k.
|
||||
int sr_per_ms_; // support 8 or 16
|
||||
int min_silence_samples_; // sr_per_ms_ * frame_ms_
|
||||
int speech_pad_left_samples_{0}; // usually 250ms
|
||||
int speech_pad_right_samples_{0}; // usually 0
|
||||
|
||||
/* ========================================================================
|
||||
*/
|
||||
std::vector<int64_t> sr_;
|
||||
const size_t size_hc_ = 2 * 1 * 64; // It's FIXED.
|
||||
std::vector<float> h_;
|
||||
std::vector<float> c_;
|
||||
|
||||
std::vector<int64_t> input_node_dims_;
|
||||
const std::vector<int64_t> sr_node_dims_ = {1};
|
||||
const std::vector<int64_t> hc_node_dims_ = {2, 1, 64};
|
||||
};
|
@ -0,0 +1,197 @@
|
||||
// Copyright (c) 2016 Personal (Binbin Zhang)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>
|
||||
|
||||
namespace wav {

// Canonical 44-byte RIFF/WAVE header. The struct is read from and written to
// disk as raw bytes, so it uses fixed-width fields and its size is checked at
// compile time.
struct WavHeader {
    char riff[4];  // "RIFF"
    uint32_t size;
    char wav[4];  // "WAVE"
    char fmt[4];  // "fmt "
    uint32_t fmt_size;
    uint16_t format;
    uint16_t channels;
    uint32_t sample_rate;
    uint32_t bytes_per_second;
    uint16_t block_size;
    uint16_t bit;
    char data[4];  // "data"
    uint32_t data_size;
};
static_assert(sizeof(WavHeader) == 44,
              "WavHeader must match the 44-byte on-disk WAV header");

// Reads a PCM WAV file (8/16/32 bits per sample) into a float buffer holding
// the raw integer sample values, interleaved by channel.
class WavReader {
  public:
    WavReader() {}
    // Tries to load `filename`. On failure the reader stays empty: all
    // getters return 0 and data() returns nullptr.
    explicit WavReader(const std::string& filename) { Open(filename); }

    // Loads `filename`, replacing any previously loaded audio.
    // Returns false (leaving the reader empty) when the file cannot be
    // opened, is not a parsable WAV file, or uses an unsupported format.
    bool Open(const std::string& filename) {
        Clear();

        FILE* fp = fopen(filename.c_str(), "rb");
        if (NULL == fp) {
            fprintf(stderr, "Error in read %s\n", filename.c_str());
            return false;
        }

        // Parse() never closes the file, so every exit path is covered here.
        const bool ok = Parse(fp);
        fclose(fp);
        return ok;
    }

    int num_channel() const { return num_channel_; }
    int sample_rate() const { return sample_rate_; }
    int bits_per_sample() const { return bits_per_sample_; }
    int num_samples() const { return num_samples_; }
    const float* data() const {
        return data_.empty() ? nullptr : data_.data();
    }

  private:
    // Puts the reader back into the empty state.
    void Clear() {
        num_channel_ = 0;
        sample_rate_ = 0;
        bits_per_sample_ = 0;
        num_samples_ = 0;
        data_.clear();
    }

    // Reads an 8-byte sub-chunk header (4-byte id + 4-byte size) into
    // header->data / header->data_size. Returns false on a short read.
    static bool ReadChunkHeader(FILE* fp, WavHeader* header) {
        unsigned char raw[8];
        if (fread(raw, 1, sizeof(raw), fp) != sizeof(raw)) {
            return false;
        }
        memcpy(header->data, raw, 4);
        memcpy(&header->data_size, raw + 4, 4);
        return true;
    }

    // Reads one sample of width `bits` and converts it to float.
    // Returns false on a short read.
    static bool ReadSample(FILE* fp, int bits, float* out) {
        switch (bits) {
            case 8: {
                char sample;
                if (fread(&sample, sizeof(sample), 1, fp) != 1) return false;
                *out = static_cast<float>(sample);
                return true;
            }
            case 16: {
                int16_t sample;
                if (fread(&sample, sizeof(sample), 1, fp) != 1) return false;
                *out = static_cast<float>(sample);
                return true;
            }
            case 32: {
                int32_t sample;
                if (fread(&sample, sizeof(sample), 1, fp) != 1) return false;
                *out = static_cast<float>(sample);
                return true;
            }
            default:
                return false;
        }
    }

    // Parses the header and sample data from an open file. Members are only
    // updated once the whole file has been read successfully.
    bool Parse(FILE* fp) {
        WavHeader header;
        if (fread(&header, 1, sizeof(header), fp) != sizeof(header)) {
            fprintf(stderr, "WaveData: file is too short for a WAV header.\n");
            return false;
        }

        if (header.fmt_size < 16) {
            fprintf(stderr,
                    "WaveData: expect PCM format data "
                    "to have fmt chunk of at least size 16.\n");
            return false;
        }
        if (header.fmt_size > 16) {
            // The fmt chunk is longer than the canonical 16 bytes: re-read
            // the next sub-chunk header from just behind it.
            const long offset = 44 - 8 + static_cast<long>(header.fmt_size) - 16;
            if (fseek(fp, offset, SEEK_SET) != 0 ||
                !ReadChunkHeader(fp, &header)) {
                fprintf(stderr, "WaveData: truncated fmt chunk.\n");
                return false;
            }
        }

        // Skip any sub-chunks between "fmt" and "data". Usually there will
        // be a single "fact" sub chunk, but on Windows there can also be a
        // "list" sub chunk. Stop at end of file instead of looping forever
        // when no "data" chunk exists.
        while (0 != strncmp(header.data, "data", 4)) {
            if (fseek(fp, static_cast<long>(header.data_size), SEEK_CUR) != 0 ||
                !ReadChunkHeader(fp, &header)) {
                fprintf(stderr, "WaveData: no data chunk found.\n");
                return false;
            }
        }

        const int bits = header.bit;
        if (bits != 8 && bits != 16 && bits != 32) {
            fprintf(stderr, "unsupported quantization bits");
            return false;
        }
        if (header.channels == 0) {
            fprintf(stderr, "WaveData: channel count is zero.\n");
            return false;
        }

        // The buffer grows as samples arrive, so a bogus data_size cannot
        // trigger a huge up-front allocation.
        const size_t expected = header.data_size / static_cast<size_t>(bits / 8);
        std::vector<float> samples;
        for (size_t i = 0; i < expected; ++i) {
            float value = 0.0f;
            if (!ReadSample(fp, bits, &value)) {
                fprintf(stderr,
                        "WaveData: data chunk is shorter than declared.\n");
                break;  // keep the samples that are actually present
            }
            samples.push_back(value);
        }

        num_channel_ = header.channels;
        sample_rate_ = static_cast<int>(header.sample_rate);
        bits_per_sample_ = bits;
        num_samples_ = static_cast<int>(samples.size() / header.channels);
        data_.swap(samples);
        return true;
    }

    int num_channel_ = 0;
    int sample_rate_ = 0;
    int bits_per_sample_ = 0;
    int num_samples_ = 0;  // sample points per channel
    std::vector<float> data_;  // interleaved samples of all channels
};

// Writes float samples (holding integer-valued PCM data) as a PCM WAV file.
// The caller keeps ownership of `data`, which must stay valid until Write()
// returns.
class WavWriter {
  public:
    WavWriter(const float* data,
              int num_samples,
              int num_channel,
              int sample_rate,
              int bits_per_sample)
        : data_(data),
          num_samples_(num_samples),
          num_channel_(num_channel),
          sample_rate_(sample_rate),
          bits_per_sample_(bits_per_sample) {}

    // Writes the audio to `filename`. Prints an error and writes nothing when
    // the sample width is unsupported or the file cannot be opened.
    void Write(const std::string& filename) {
        if (bits_per_sample_ != 8 && bits_per_sample_ != 16 &&
            bits_per_sample_ != 32) {
            fprintf(stderr, "unsupported quantization bits");
            return;
        }

        // Binary mode: text mode would translate bytes on some platforms.
        FILE* fp = fopen(filename.c_str(), "wb");
        if (NULL == fp) {
            fprintf(stderr, "Error in write %s\n", filename.c_str());
            return;
        }

        const int bytes_per_value = bits_per_sample_ / 8;

        WavHeader header;
        memcpy(header.riff, "RIFF", 4);
        memcpy(header.wav, "WAVE", 4);
        memcpy(header.fmt, "fmt ", 4);
        memcpy(header.data, "data", 4);
        header.fmt_size = 16;
        header.format = 1;  // PCM
        header.channels = num_channel_;
        header.bit = bits_per_sample_;
        header.sample_rate = sample_rate_;
        header.data_size = num_samples_ * num_channel_ * bytes_per_value;
        header.size = sizeof(header) - 8 + header.data_size;
        header.bytes_per_second = sample_rate_ * num_channel_ * bytes_per_value;
        header.block_size = num_channel_ * bytes_per_value;

        fwrite(&header, 1, sizeof(header), fp);

        // data_ is interleaved, so the values are written in storage order.
        const long total = static_cast<long>(num_samples_) * num_channel_;
        for (long k = 0; k < total; ++k) {
            switch (bits_per_sample_) {
                case 8: {
                    char sample = static_cast<char>(data_[k]);
                    fwrite(&sample, 1, sizeof(sample), fp);
                    break;
                }
                case 16: {
                    int16_t sample = static_cast<int16_t>(data_[k]);
                    fwrite(&sample, 1, sizeof(sample), fp);
                    break;
                }
                case 32: {
                    int32_t sample = static_cast<int32_t>(data_[k]);
                    fwrite(&sample, 1, sizeof(sample), fp);
                    break;
                }
                default:
                    break;  // unreachable: validated above
            }
        }
        fclose(fp);
    }

  private:
    const float* data_;
    int num_samples_;  // sample points per channel
    int num_channel_;
    int sample_rate_;
    int bits_per_sample_;
};

}  // namespace wav
|
Loading…
Reference in new issue