add vad doc, fix vad state name

pull/3026/head
Hui Zhang 3 years ago
parent f0951165d6
commit 4302c170fd

@ -15,7 +15,6 @@
#include "vad/interface/vad_interface.h"
#include "common/base/config.h"
#include "common/base/log.h"
#include "vad/nnet/vad.h"
@ -68,14 +67,14 @@ PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
int num_element) {
ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
if (model == nullptr) {
LOG(ERROR) << "instance is null";
return PPS_ILLEGAL;
printf("instance is null\n");
return PPS_VAD_ILLEGAL;
}
std::vector<float> chunk_in(chunk, chunk + num_element);
if (!model->ForwardChunk(chunk_in)) {
LOG(ERROR) << "forward chunk failed";
return PPS_ILLEGAL;
printf("forward chunk failed\n");
return PPS_VAD_ILLEGAL;
}
ppspeech::Vad::State s = model->Postprocess();
PPSVadState_t ret = (PPSVadState_t)s;

@ -21,11 +21,12 @@ extern "C" {
typedef void* PPSHandle_t;
typedef enum {
PPS_ILLEGAL = 0, // error
PPS_SIL, // silence
PPS_START, // start speech
PPS_SPEECH, // in speech
PPS_END, // end speech
PPS_VAD_ILLEGAL = 0, // error
PPS_VAD_SIL, // silence
PPS_VAD_START, // start speech
PPS_VAD_SPEECH, // in speech
PPS_VAD_END, // end speech
PPS_VAD_NUMSTATES, // number of states
} PPSVadState_t;
PPSHandle_t PPSVadCreateInstance(const char* conf_path);

@ -49,6 +49,8 @@ class Vad : public fastdeploy::FastDeployModel {
const fastdeploy::RuntimeOption& custom_option =
fastdeploy::RuntimeOption());
virtual ~Vad() {}
void Init();
void Reset();

@ -1,121 +1,166 @@
English | [简体中文](README_CN.md)
# Silero VAD - pre-trained enterprise-grade Voice Activity Detector
# Silero VAD Deployment Example
This directory provides VAD models on CPU/GPU.
This directory provides examples that `infer_onnx_silero_vad` fast finishes the deployment of VAD models on CPU/GPU.
![](https://user-images.githubusercontent.com/36505480/198026365-8da383e0-5398-4a12-b7f8-22c2c0059512.png)
Before deployment, two steps require confirmation.
## Linux
- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md).
### Build Runtime
```bash
# cd /path/to/paddlespeech/runtime
cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON
cmake --build build
```
Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following command in this directory.
Since VAD using FastDeploy runtime, if you have another FastDeploy Library, you can using this command to build:
```bash
mkdir build
cd build
# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# Download the VAD model file and test audio. After decompression, place the model and test audio in the infer_onnx_silero_vad.cc peer directory
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# inference
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
# cd /path/to/paddlespeech/runtime
cmake -B build -DBUILD_SHARED_LIBS=OFF -DWITH_ASR=OFF -DWITH_CLS=OFF -DWITH_VAD=ON -DFASTDEPLOY_INSTALL_DIR=/workspace/zhanghui/paddle/FastDeploy/build/Linux/x86_64/install
cmake --build build
```
- The above command works for Linux or MacOS. Refer to:
- [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md) for SDK use-pattern in Windows
`DFASTDEPLOY_INSTALL_DIR` is the directory of FastDeploy Library.
## VAD C++ Interface
### Run Demo
### Vad Class
After building success, we can do this to run demo under this example dir:
```c++
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```bash
bash run.sh
```
**Parameter**
### Result
> * **model_file**(str): Model file path
> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. (use the default configuration)
```bash
/workspace/zhanghui/PaddleSpeech/runtime/engine/vad/nnet/vad.cc(92)::SetConfig sr=16000 threshold=0.45 frame_ms=32 min_silence_duration_ms=200 speech_pad_left_ms=0 speech_pad_right_ms=0[INFO] fastdeploy/runtime/runtime.cc(293)::CreateOrtBackend Runtime initialized with Backend::ORT in Device::CPU./workspace/zhanghui/PaddleSpeech/runtime/engine/vad/nnet/vad.cc(141)::Initialize init done.
[SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [STA] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SPE] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [END] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] [SIL] speak start: 0.32 s, end: 2.496 s | speak start: 3.296 s, end: 4.672 s | speak start: 5.408 s, end: 7.936 s | speak start: 8.192 s, end: 10.72 s
vad_nnet_main done!
sr = 16000
frame_ms = 32
threshold = 0.45
min_silence_duration_ms = 200
speech_pad_left_ms = 200
speech_pad_right_ms = 0
model_path = ./data/silero_vad/silero_vad.onnx
param_path = (default)
num_cpu_thread = 1(default)
/workspace/zhanghui/PaddleSpeech/runtime/engine/vad/nnet/vad.cc(92)::SetConfig sr=16000 threshold=0.45 frame_ms=32 min_silence_duration_ms=200 speech_pad_left_ms=200 speech_pad_right_ms=0
[INFO] fastdeploy/runtime/runtime.cc(293)::CreateOrtBackend Runtime initialized with Backend::ORT in Device::CPU.
/workspace/zhanghui/PaddleSpeech/runtime/engine/vad/nnet/vad.cc(141)::Initialize init done.
1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 vad_interface_main done!
```
### setAudioCofig function
The environment as below:
```text
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 80
On-line CPU(s) list: 0-79
Thread(s) per core: 2
Core(s) per socket: 20
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6Model: 85Model name: Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz
Stepping: 7
CPU MHz: 2599.998
BogoMIPS: 5199.99
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32KL1i cache: 32KL2 cache: 1024K
L3 cache: 33792K
NUMA node0 CPU(s): 0-39
NUMA node1 CPU(s): 40-79
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc eagerfpu pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 arat umip pku ospke avx512_vnni spec_ctrl arch_capabilities
```
**Must be called before the `init` function**
## Android
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
When to using on Android, please setup your `NDK` enverment before, then do as below:
**Parameter**
```bash
# cd /path/to/paddlespeech/runtime
bash build_android.sh
```
> * **sr**(int): sampling rate
> * **frame_ms**(int): The length of each detection frame, and it is used to calculate the detection window size
> * **threshold**(float): Result probability judgment threshold
> * **min_silence_duration_ms**(int): The threshold used to calculate whether it is silence
> * **speech_pad_ms**(int): Used to calculate the end time of the speech
## VAD Interface
### init function
For vad interface please see [](../../engine/vad/interface/).
Used to initialize audio-related parameters.
### Create Handdle
```c++
void Vad::init();
PPSHandle_t PPSVadCreateInstance(const char* conf_path);
```
### loadAudio function
Load audio.
### Destroy Handdle
```c++
void Vad::loadAudio(const std::string& wavPath)
int PPSVadDestroyInstance(PPSHandle_t instance);
```
**Parameter**
### Reset Vad State
> * **wavPath**(str): Audio file path
```c++
int PPSVadReset(PPSHandle_t instance);
```
### Predict function
Reset Vad state before processing next `wav`.
Used to start model reasoning.
### Get Chunk Size
```c++
bool Vad::Predict();
int PPSVadChunkSizeSamples(PPSHandle_t instance);
```
### getResult function
This API will return chunk size in `sample` unit.
When do forward, we need feed `chunk size` samples, except last chunk.
**Used to obtain reasoning results**
### Vad Forward
```c++
PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float* chunk,
int num_element);
```
Vad has below states:
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
float mergeThreshold = 0.3);
typedef enum {
PPS_VAD_ILLEGAL = 0, // error
PPS_VAD_SIL, // silence
PPS_VAD_START, // start speech
PPS_VAD_SPEECH, // in speech
PPS_VAD_END, // end speech
PPS_VAD_NUMSTATES, // number of states
} PPSVadState_t;
```
**Parameter**
If `PPSVadFeedForward` occur an error will return `PPS_VAD_ILLEGAL` state.
## FastDeploy Runtime
For FastDeploy software and hardware requements, and pre-released library please to see [FastDeploy](https://github.com/PaddlePaddle/FastDeploy):
- 1. [FastDeploy Environment Requirements](https://github.com/PaddlePaddle/FastDeploy/docs/en/build_and_install/download_prebuilt_libraries.md).
- 2. [FastDeploy Precompiled Library](https://github.com/PaddlePaddle/FastDeploy/docs/en/build_and_install/download_prebuilt_libraries.md).
> * **removeThreshold**(float): Discard result fragment threshold; If some recognition results are too short, they will be discarded according to this threshold
> * **expandHeadThreshold**(float): Offset at the beginning of the segment; The recognized start time may be too close to the voice part, so move forward the start time accordingly
> * **expandTailThreshold**(float): Offset at the end of the segment; The recognized end time may be too close to the voice part, so the end time is moved back accordingly
> * **mergeThreshold**(float): Some result segments are very close and can be combined into one, and the vocal segments can be combined accordingly
**The output result format is**`std::vector<std::map<std::string, float>>`
## Download Pre-trained ONNX Model
> Output a list, each element is a speech fragment
>
> Each clip can use 'start' to get the start time and 'end' to get the end time
For developers' testing, model exported by VAD are provided below. Developers can download them directly.
### Tips
| 模型 | 大小 | 备注 |
| :----------------------------------------------------------- | :---- | :----------------------------------------------------------- |
| [silero-vad](https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz) | 1.8MB | This model file is sourced from [snakers4/silero-vad](https://github.com/snakers4/silero-vad)MIT License |
1. `The setAudioCofig`function must be called before the `init` function
2. The sampling rate of the input audio file must be consistent with that set in the code
- [Model Description](../)
- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md)
## Reference
* https://github.com/snakers4/silero-vad
* https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/audio/silero-vad/README.md

@ -1,119 +0,0 @@
[English](README.md) | 简体中文
# Silero VAD 部署示例
本目录下提供`infer_onnx_silero_vad`快速完成 Silero VAD 模型在CPU/GPU。
在部署前,需确认以下两个步骤
- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. 根据开发环境下载预编译部署库和samples代码参考[FastDeploy预编译库](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
以Linux上 VAD 推理为例,在本目录执行如下命令即可完成编译测试。
```bash
mkdir build
cd build
# 下载FastDeploy预编译库用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
tar xvf fastdeploy-linux-x64-x.x.x.tgz
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
make -j
# 下载 VAD 模型文件和测试音频,解压后将模型和测试音频放置在与 infer_onnx_silero_vad.cc 同级目录下
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz
wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav
# 推理
./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav
```
以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考:
- [如何在Windows中使用FastDeploy C++ SDK](../../../../docs/cn/faq/use_sdk_on_windows.md)
## VAD C++ 接口
### Vad 类
```c++
Vad::Vad(const std::string& model_file,
const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption())
```
**参数**
> * **model_file**(str): 模型文件路径
> * **runtime_option**(RuntimeOption): 后端推理配置默认为None即采用默认配置
### setAudioCofig 函数
**必须在`init`函数前调用**
```c++
void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms);
```
**参数**
> * **sr**(int): 采样率
> * **frame_ms**(int): 每次检测帧长,用于计算检测窗口大小
> * **threshold**(float): 结果概率判断阈值
> * **min_silence_duration_ms**(int): 用于计算判断是否是 silence 的阈值
> * **speech_pad_ms**(int): 用于计算 speach 结束时刻
### init 函数
用于初始化音频相关参数
```c++
void Vad::init();
```
### loadAudio 函数
加载音频
```c++
void Vad::loadAudio(const std::string& wavPath)
```
**参数**
> * **wavPath**(str): 音频文件路径
### Predict 函数
用于开始模型推理
```c++
bool Vad::Predict();
```
### getResult 函数
**用于获取推理结果**
```c++
std::vector<std::map<std::string, float>> Vad::getResult(
float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0,
float mergeThreshold = 0.3);
```
**参数**
> * **removeThreshold**(float): 丢弃结果片段阈值;部分识别结果太短则根据此阈值丢弃
> * **expandHeadThreshold**(float): 结果片段开始时刻偏移;识别到的开始时刻可能过于贴近发声部分,因此据此前移开始时刻
> * **expandTailThreshold**(float): 结果片段结束时刻偏移;识别到的结束时刻可能过于贴近发声部分,因此据此后移结束时刻
> * **mergeThreshold**(float): 有的结果片段十分靠近,可以合并成一个,据此合并发声片段
**输出结果格式为**`std::vector<std::map<std::string, float>>`
> 输出一个列表,每个元素是一个讲话片段
>
> 每个片段可以用 'start' 获取到开始时刻,用 'end' 获取到结束时刻
### 提示
1. `setAudioCofig`函数必须在`init`函数前调用
2. 输入的音频文件的采样率必须与代码中设置的保持一致
- [模型介绍](../)
- [如何切换模型推理后端引擎](../../../../docs/cn/faq/how_to_change_backend.md)

@ -15,8 +15,8 @@ exp=exp
mkdir -p $exp $data
# 1. compile
if [ ! -d ${SPEECHX_BUILD} ]; then
pushd ${SPEECHX_ROOT}
if [ ! -d ${ENGINE_BUILD} ]; then
pushd ${ENGINE_ROOT}
bash build.sh
# build for android armv8/armv7
@ -24,8 +24,6 @@ if [ ! -d ${SPEECHX_BUILD} ]; then
popd
fi
ckpt_dir=$data/silero_vad
wav=$data/silero_vad_sample.wav
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
./local/download.sh

Loading…
Cancel
Save