add vad interface GetVadResult (#3140)

* add vad interface GetVadResult

* fix comment
pull/3156/head
masimeng1994 2 years ago committed by GitHub
parent f35a87ab89
commit d03ebe872a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -92,3 +92,12 @@ int PPSVadReset(PPSHandle_t instance) {
model->Reset(); model->Reset();
return 0; return 0;
} }
// Copies the VAD result held by `instance` into a caller-owned text buffer.
//
// instance: opaque handle; it is cast to ppspeech::Vad*.
// result:   destination buffer supplied by the caller.
// max_len:  capacity of `result` in bytes.
//
// Returns -1 (after printing a diagnostic) when the handle is null, when
// `result` is null, or when `max_len` is not positive. Otherwise returns the
// value produced by Vad::GetResult.
int PPSVadGetResult(PPSHandle_t instance, char* result, int max_len) {
    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
    if (model == nullptr) {
        printf("instance is null\n");
        return -1;
    }
    // The length is eventually used as a size_t buffer bound; a negative
    // value would turn into a huge bound, so reject bad buffers up front.
    if (result == nullptr || max_len <= 0) {
        printf("result buffer is invalid\n");
        return -1;
    }
    return model->GetResult(result, max_len);
}

@ -41,6 +41,7 @@ PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
float* chunk, float* chunk,
int num_element); int num_element);
// Fetches the VAD result of `instance` as text written into the
// caller-provided buffer `result`, whose capacity is `max_len` bytes.
// Returns -1 on failure (e.g. a null `instance`); otherwise the status
// reported by the underlying Vad::GetResult call.
int PPSVadGetResult(PPSHandle_t instance, char* result, int max_len);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif // __cplusplus #endif // __cplusplus

@ -16,26 +16,30 @@
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <fstream>
#include "common/base/common.h" #include "common/base/common.h"
#include "vad/frontend/wav.h" #include "vad/frontend/wav.h"
#include "vad/interface/vad_interface.h" #include "vad/interface/vad_interface.h"
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
if (argc < 3) { if (argc < 3) {
std::cout << "Usage: vad_interface_main path/to/config path/to/audio " std::cout << "Usage: vad_interface_main path/to/config wav.scp"
"run_option, " "run_option, "
"e.g ./vad_interface_main config sample.wav" "e.g ./vad_interface_main config wav.scp"
<< std::endl; << std::endl;
return -1; return -1;
} }
std::string config_path = argv[1]; std::string config_path = argv[1];
std::string audio_file = argv[2]; std::string wav_scp = argv[2];
PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str()); PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str());
std::ifstream fp_wav(wav_scp);
std::string line = "";
while(getline(fp_wav, line)){
std::vector<float> inputWav; // [0, 1] std::vector<float> inputWav; // [0, 1]
wav::WavReader wav_reader = wav::WavReader(audio_file); wav::WavReader wav_reader = wav::WavReader(line);
auto sr = wav_reader.sample_rate(); auto sr = wav_reader.sample_rate();
CHECK(sr == 16000) << " sr is " << sr << " expect 16000"; CHECK(sr == 16000) << " sr is " << sr << " expect 16000";
@ -52,20 +56,23 @@ int main(int argc, char* argv[]) {
auto end = start + window_size_samples >= num_samples auto end = start + window_size_samples >= num_samples
? num_samples ? num_samples
: start + window_size_samples; : start + window_size_samples;
std::vector<float> r(window_size_samples, 0);
auto current_chunk_size = end - start; auto current_chunk_size = end - start;
memcpy(r.data(), inputWav.data() + start, current_chunk_size * sizeof(float));
std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
assert(r.size() == static_cast<size_t>(current_chunk_size));
PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size()); PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size());
std::cout << s << " ";
} }
std::cout << std::endl;
std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr) std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
<< std::endl; << std::endl;
PPSVadReset(handle); char result[10240] = {0};
PPSVadGetResult(handle, result, 10240);
std::cout << line << " " << result << std::endl;
PPSVadReset(handle);
// getchar();
}
PPSVadDestroyInstance(handle);
return 0; return 0;
} }

@ -100,8 +100,8 @@ void Vad::Reset() {
temp_end_ = 0; temp_end_ = 0;
current_sample_ = 0; current_sample_ = 0;
speakStart_.clear(); speechStart_.clear();
speakEnd_.clear(); speechEnd_.clear();
states_.clear(); states_.clear();
} }
@ -176,34 +176,43 @@ const Vad::State& Vad::Postprocess() {
if (outputProb_ < threshold_ && !triggerd_) { if (outputProb_ < threshold_ && !triggerd_) {
// 1. Silence // 1. Silence
#ifdef PPS_DEBUG
DLOG(INFO) << "{ silence: " << 1.0 * current_sample_ / sample_rate_ DLOG(INFO) << "{ silence: " << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::SIL); states_.emplace_back(Vad::State::SIL);
} else if (outputProb_ >= threshold_ && !triggerd_) { } else if (outputProb_ >= threshold_ && !triggerd_) {
// 2. Start // 2. Start
triggerd_ = true; triggerd_ = true;
speech_start_ = speech_start_ =
current_sample_ - current_chunk_size_ - speech_pad_left_samples_; current_sample_ - current_chunk_size_ - speech_pad_left_samples_;
speech_start_ = std::max(int(speech_start_), 0);
float start_sec = 1.0 * speech_start_ / sample_rate_; float start_sec = 1.0 * speech_start_ / sample_rate_;
speakStart_.emplace_back(start_sec); speechStart_.emplace_back(start_sec);
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech start: " << start_sec DLOG(INFO) << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::START); states_.emplace_back(Vad::State::START);
} else if (outputProb_ >= threshold_ - beam_ && triggerd_) { } else if (outputProb_ >= threshold_ - beam_ && triggerd_) {
// 3. Continue // 3. Continue
if (temp_end_ != 0) { if (temp_end_ != 0) {
// speech prob relaxation, speech continues again // speech prob relaxation, speech continues again
#ifdef PPS_DEBUG
DLOG(INFO) DLOG(INFO)
<< "{ speech fake end(sil < min_silence_ms) to continue: " << "{ speech fake end(sil < min_silence_ms) to continue: "
<< 1.0 * current_sample_ / sample_rate_ << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
temp_end_ = 0; temp_end_ = 0;
} else { } else {
// speech prob relaxation, keep tracking speech // speech prob relaxation, keep tracking speech
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech continue: " DLOG(INFO) << "{ speech continue: "
<< 1.0 * current_sample_ / sample_rate_ << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
} }
states_.emplace_back(Vad::State::SPEECH); states_.emplace_back(Vad::State::SPEECH);
@ -216,9 +225,11 @@ const Vad::State& Vad::Postprocess() {
// check possible speech end // check possible speech end
if (current_sample_ - temp_end_ < min_silence_samples_) { if (current_sample_ - temp_end_ < min_silence_samples_) {
// a. silence < min_slience_samples, continue speaking // a. silence < min_slience_samples, continue speaking
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech fake end(sil < min_silence_ms): " DLOG(INFO) << "{ speech fake end(sil < min_silence_ms): "
<< 1.0 * current_sample_ / sample_rate_ << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::SIL); states_.emplace_back(Vad::State::SIL);
} else { } else {
// b. silence >= min_slience_samples, end speaking // b. silence >= min_slience_samples, end speaking
@ -226,9 +237,11 @@ const Vad::State& Vad::Postprocess() {
temp_end_ = 0; temp_end_ = 0;
triggerd_ = false; triggerd_ = false;
auto end_sec = 1.0 * speech_end_ / sample_rate_; auto end_sec = 1.0 * speech_end_ / sample_rate_;
speakEnd_.emplace_back(end_sec); speechEnd_.emplace_back(end_sec);
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech end: " << end_sec DLOG(INFO) << "{ speech end: " << end_sec
<< " s; prob: " << outputProb_ << " }"; << " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::END); states_.emplace_back(Vad::State::END);
} }
} }
@ -236,66 +249,63 @@ const Vad::State& Vad::Postprocess() {
return states_.back(); return states_.back();
} }
const std::vector<std::map<std::string, float>> Vad::GetResult( std::string Vad::ConvertTime(float time_s) const{
float seconds_tmp, minutes_tmp, hours_tmp;
float seconds;
int minutes, hours;
// 计算小时
hours_tmp = time_s / 60 / 60; // 1
hours = (int)hours_tmp;
// 计算分钟
minutes_tmp = time_s / 60;
if (minutes_tmp >= 60) {
minutes = minutes_tmp - 60 * (double)hours;
}
else {
minutes = minutes_tmp;
}
// 计算秒数
seconds_tmp = (60 * 60 * hours) + (60 * minutes);
seconds = time_s - seconds_tmp;
// 输出格式
std::stringstream ss;
ss << hours << ":" << minutes << ":" << seconds;
return ss.str();
}
int Vad::GetResult(char* result, int max_len,
float removeThreshold, float removeThreshold,
float expandHeadThreshold, float expandHeadThreshold,
float expandTailThreshold, float expandTailThreshold,
float mergeThreshold) const { float mergeThreshold) const {
float audioLength = 1.0 * current_sample_ / sample_rate_; float audioLength = 1.0 * current_sample_ / sample_rate_;
if (speakStart_.empty() && speakEnd_.empty()) { if (speechStart_.empty() && speechEnd_.empty()) {
return {}; return {};
} }
if (speakEnd_.size() != speakStart_.size()) { if (speechEnd_.size() != speechStart_.size()) {
// set the audio length as the last end // set the audio length as the last end
speakEnd_.emplace_back(audioLength); speechEnd_.emplace_back(audioLength);
} }
// Remove too short segments
// auto startIter = speakStart_.begin(); std::string json = "[";
// auto endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) { for (int i = 0; i < speechStart_.size(); ++i) {
// if (removeThreshold < audioLength && json += "{\"s\":\"" + ConvertTime(speechStart_[i]) + "\",\"e\":\"" + ConvertTime(speechEnd_[i]) + "\"},";
// *endIter - *startIter < removeThreshold) { }
// startIter = speakStart_.erase(startIter); json.pop_back();
// endIter = speakEnd_.erase(endIter); json += "]";
// } else {
// startIter++; if(result != NULL){
// endIter++; snprintf(result, max_len, "%s", json.c_str());
// } } else {
// } DLOG(INFO) << "result is NULL";
// // Expand to avoid to tight cut.
// startIter = speakStart_.begin();
// endIter = speakEnd_.begin();
// *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
// *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
// endIter = speakEnd_.end() - 1;
// startIter = speakStart_.end() - 1;
// *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
// *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
// for (int i = 1; i < speakStart_.size() - 1; ++i) {
// speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
// speakEnd_[i - 1]);
// speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
// speakStart_[i + 1]);
// }
// // Merge very closed segments
// startIter = speakStart_.begin() + 1;
// endIter = speakEnd_.begin();
// while (startIter != speakStart_.end()) {
// if (*startIter - *endIter < mergeThreshold) {
// startIter = speakStart_.erase(startIter);
// endIter = speakEnd_.erase(endIter);
// } else {
// startIter++;
// endIter++;
// }
// }
std::vector<std::map<std::string, float>> result;
for (int i = 0; i < speakStart_.size(); ++i) {
result.emplace_back(std::map<std::string, float>(
{{"start", speakStart_[i]}, {"end", speakEnd_[i]}}));
} }
return result; return 0;
} }
std::ostream& operator<<(std::ostream& os, const Vad::State& s) { std::ostream& operator<<(std::ostream& os, const Vad::State& s) {

@ -70,7 +70,7 @@ class Vad : public fastdeploy::FastDeployModel {
const State& Postprocess(); const State& Postprocess();
const std::vector<std::map<std::string, float>> GetResult( int GetResult(char* result, int max_len,
float removeThreshold = 0.0, float removeThreshold = 0.0,
float expandHeadThreshold = 0.0, float expandHeadThreshold = 0.0,
float expandTailThreshold = 0, float expandTailThreshold = 0,
@ -103,6 +103,7 @@ class Vad : public fastdeploy::FastDeployModel {
private: private:
bool Initialize(); bool Initialize();
std::string ConvertTime(float time_s) const;
private: private:
std::mutex init_lock_; std::mutex init_lock_;
@ -122,8 +123,8 @@ class Vad : public fastdeploy::FastDeployModel {
// MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
float outputProb_; float outputProb_;
std::vector<float> speakStart_; std::vector<float> speechStart_;
mutable std::vector<float> speakEnd_; mutable std::vector<float> speechEnd_;
std::vector<State> states_; std::vector<State> states_;

@ -70,12 +70,6 @@ int main(int argc, char* argv[]) {
std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr) std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
<< std::endl; << std::endl;
std::vector<std::map<std::string, float>> result = vad.GetResult();
for (auto& res : result) {
std::cout << "speak start: " << res["start"]
<< " s, end: " << res["end"] << " s | ";
}
std::cout << "\b\b " << std::endl; std::cout << "\b\b " << std::endl;
vad.Reset(); vad.Reset();

Loading…
Cancel
Save