diff --git a/runtime/engine/vad/interface/vad_interface.cc b/runtime/engine/vad/interface/vad_interface.cc
index 4c3877ff0..2e5c91752 100644
--- a/runtime/engine/vad/interface/vad_interface.cc
+++ b/runtime/engine/vad/interface/vad_interface.cc
@@ -91,4 +91,16 @@ int PPSVadReset(PPSHandle_t instance) {
     }
     model->Reset();
     return 0;
-}
\ No newline at end of file
+}
+
+// Writes the speech segments detected so far by `instance` into `result`
+// (capacity `max_len` bytes) by forwarding to ppspeech::Vad::GetResult().
+// Returns -1 when `instance` is null, otherwise whatever GetResult returns.
+int PPSVadGetResult(PPSHandle_t instance, char* result, int max_len) {
+    ppspeech::Vad* model = static_cast<ppspeech::Vad*>(instance);
+    if (model == nullptr) {
+        printf("instance is null\n");
+        return -1;
+    }
+    return model->GetResult(result, max_len);
+}
diff --git a/runtime/engine/vad/interface/vad_interface.h b/runtime/engine/vad/interface/vad_interface.h
index 5d7ca7091..15d0b811c 100644
--- a/runtime/engine/vad/interface/vad_interface.h
+++ b/runtime/engine/vad/interface/vad_interface.h
@@ -41,6 +41,10 @@ PPSVadState_t PPSVadFeedForward(PPSHandle_t instance,
                                 float* chunk,
                                 int num_element);
 
+// Copies the detected speech segments, formatted as a JSON array string,
+// into `result` (capacity `max_len` bytes). Returns 0 on success.
+int PPSVadGetResult(PPSHandle_t instance, char* result, int max_len);
+
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
\ No newline at end of file
diff --git a/runtime/engine/vad/interface/vad_interface_main.cc b/runtime/engine/vad/interface/vad_interface_main.cc
index 16059c41f..6dba794d2 100644
--- a/runtime/engine/vad/interface/vad_interface_main.cc
+++ b/runtime/engine/vad/interface/vad_interface_main.cc
@@ -16,56 +16,84 @@
 
 #include <iostream>
 #include <vector>
+#include <cstring>
+#include <fstream>
 #include "common/base/common.h"
 #include "vad/frontend/wav.h"
 #include "vad/interface/vad_interface.h"
 
 int main(int argc, char* argv[]) {
     if (argc < 3) {
-        std::cout << "Usage: vad_interface_main path/to/config path/to/audio "
+        std::cout << "Usage: vad_interface_main path/to/config wav.scp "
                      "run_option, "
-                     "e.g ./vad_interface_main config sample.wav"
+                     "e.g ./vad_interface_main config wav.scp"
                   << std::endl;
         return -1;
     }
 
     std::string config_path = argv[1];
-    std::string audio_file = argv[2];
+    std::string wav_scp = argv[2];
 
     PPSHandle_t handle = PPSVadCreateInstance(config_path.c_str());
 
-    std::vector<float> inputWav;  // [0, 1]
-    wav::WavReader wav_reader = wav::WavReader(audio_file);
-    auto sr = wav_reader.sample_rate();
-    CHECK(sr == 16000) << " sr is " << sr << " expect 16000";
-
-    auto num_samples = wav_reader.num_samples();
-    inputWav.resize(num_samples);
-    for (int i = 0; i < num_samples; i++) {
-        inputWav[i] = wav_reader.data()[i] / 32768;
-    }
-
-    ppspeech::Timer timer;
-    int window_size_samples = PPSVadChunkSizeSamples(handle);
-    for (int64_t j = 0; j < num_samples; j += window_size_samples) {
-        auto start = j;
-        auto end = start + window_size_samples >= num_samples
-                       ? num_samples
-                       : start + window_size_samples;
-        auto current_chunk_size = end - start;
-
-        std::vector<float> r{&inputWav[0] + start, &inputWav[0] + end};
-        assert(r.size() == static_cast<size_t>(current_chunk_size));
-
-        PPSVadState_t s = PPSVadFeedForward(handle, r.data(), r.size());
-        std::cout << s << " ";
-    }
-    std::cout << std::endl;
-
-    std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
-              << std::endl;
-
-    PPSVadReset(handle);
+    std::ifstream fp_wav(wav_scp);
+    if (!fp_wav.is_open()) {
+        std::cout << "failed to open " << wav_scp << std::endl;
+        PPSVadDestroyInstance(handle);
+        return -1;
+    }
+
+    // Each non-empty line of wav.scp is used as the path of one wav file.
+    std::string line;
+    while (std::getline(fp_wav, line)) {
+        if (line.empty()) {
+            continue;
+        }
+
+        std::vector<float> inputWav;  // [0, 1]
+        wav::WavReader wav_reader = wav::WavReader(line);
+        auto sr = wav_reader.sample_rate();
+        CHECK(sr == 16000) << " sr is " << sr << " expect 16000";
+
+        auto num_samples = wav_reader.num_samples();
+        inputWav.resize(num_samples);
+        for (int i = 0; i < num_samples; i++) {
+            inputWav[i] = wav_reader.data()[i] / 32768;
+        }
+
+        ppspeech::Timer timer;
+        int window_size_samples = PPSVadChunkSizeSamples(handle);
+        for (int64_t j = 0; j < num_samples; j += window_size_samples) {
+            auto start = j;
+            auto end = start + window_size_samples >= num_samples
+                           ? num_samples
+                           : start + window_size_samples;
+            auto current_chunk_size = end - start;
+
+            // Always feed a full window; a short final chunk is zero padded.
+            std::vector<float> r(window_size_samples, 0);
+            memcpy(r.data(),
+                   inputWav.data() + start,
+                   current_chunk_size * sizeof(float));
+
+            PPSVadFeedForward(handle, r.data(), r.size());
+        }
+
+        // Divide in floating point: the integer quotient num_samples / sr
+        // is 0 for clips shorter than one second.
+        std::cout << "RTF="
+                  << timer.Elapsed() / (static_cast<double>(num_samples) / sr)
+                  << std::endl;
+
+        char result[10240] = {0};
+        if (PPSVadGetResult(handle, result, sizeof(result)) != 0) {
+            std::cout << "failed to get vad result of " << line << std::endl;
+        }
+        std::cout << line << " " << result << std::endl;
+
+        PPSVadReset(handle);
+    }
+    PPSVadDestroyInstance(handle);
 
     return 0;
 }
diff --git a/runtime/engine/vad/nnet/vad.cc b/runtime/engine/vad/nnet/vad.cc
index 0b77e632a..101f2370a 100644
--- a/runtime/engine/vad/nnet/vad.cc
+++ b/runtime/engine/vad/nnet/vad.cc
@@ -100,8 +100,8 @@ void Vad::Reset() {
     temp_end_ = 0;
     current_sample_ = 0;
 
-    speakStart_.clear();
-    speakEnd_.clear();
+    speechStart_.clear();
+    speechEnd_.clear();
     states_.clear();
 }
 
@@ -176,34 +176,43 @@ const Vad::State& Vad::Postprocess() {
 
     if (outputProb_ < threshold_ && !triggerd_) {
         // 1. Silence
+#ifdef PPS_DEBUG
         DLOG(INFO) << "{ silence: " << 1.0 * current_sample_ / sample_rate_
                    << " s; prob: " << outputProb_ << " }";
+#endif
         states_.emplace_back(Vad::State::SIL);
     } else if (outputProb_ >= threshold_ && !triggerd_) {
         // 2. Start
         triggerd_ = true;
         speech_start_ = current_sample_ - current_chunk_size_ -
                         speech_pad_left_samples_;
+        speech_start_ = std::max(int(speech_start_), 0);
         float start_sec = 1.0 * speech_start_ / sample_rate_;
-        speakStart_.emplace_back(start_sec);
+        speechStart_.emplace_back(start_sec);
+#ifdef PPS_DEBUG
         DLOG(INFO) << "{ speech start: " << start_sec
                    << " s; prob: " << outputProb_ << " }";
+#endif
         states_.emplace_back(Vad::State::START);
     } else if (outputProb_ >= threshold_ - beam_ && triggerd_) {
         // 3. Continue
 
         if (temp_end_ != 0) {
             // speech prob relaxation, speech continues again
+#ifdef PPS_DEBUG
             DLOG(INFO)
                 << "{ speech fake end(sil < min_silence_ms) to continue: "
                 << 1.0 * current_sample_ / sample_rate_
                 << " s; prob: " << outputProb_ << " }";
+#endif
             temp_end_ = 0;
         } else {
             // speech prob relaxation, keep tracking speech
+#ifdef PPS_DEBUG
             DLOG(INFO) << "{ speech continue: "
                        << 1.0 * current_sample_ / sample_rate_
                        << " s; prob: " << outputProb_ << " }";
+#endif
         }
 
         states_.emplace_back(Vad::State::SPEECH);
@@ -216,9 +225,11 @@ const Vad::State& Vad::Postprocess() {
         // check possible speech end
         if (current_sample_ - temp_end_ < min_silence_samples_) {
             // a. silence < min_slience_samples, continue speaking
+#ifdef PPS_DEBUG
             DLOG(INFO) << "{ speech fake end(sil < min_silence_ms): "
                        << 1.0 * current_sample_ / sample_rate_
                        << " s; prob: " << outputProb_ << " }";
+#endif
             states_.emplace_back(Vad::State::SIL);
         } else {
             // b. silence >= min_slience_samples, end speaking
@@ -226,9 +237,11 @@ const Vad::State& Vad::Postprocess() {
             temp_end_ = 0;
             triggerd_ = false;
             auto end_sec = 1.0 * speech_end_ / sample_rate_;
-            speakEnd_.emplace_back(end_sec);
+            speechEnd_.emplace_back(end_sec);
+#ifdef PPS_DEBUG
             DLOG(INFO) << "{ speech end: " << end_sec
                        << " s; prob: " << outputProb_ << " }";
+#endif
             states_.emplace_back(Vad::State::END);
         }
     }
@@ -236,66 +249,59 @@ const Vad::State& Vad::Postprocess() {
     return states_.back();
 }
 
-const std::vector<std::map<std::string, float>> Vad::GetResult(
-    float removeThreshold,
-    float expandHeadThreshold,
-    float expandTailThreshold,
-    float mergeThreshold) const {
-    float audioLength = 1.0 * current_sample_ / sample_rate_;
-    if (speakStart_.empty() && speakEnd_.empty()) {
-        return {};
-    }
-    if (speakEnd_.size() != speakStart_.size()) {
-        // set the audio length as the last end
-        speakEnd_.emplace_back(audioLength);
-    }
-    // Remove too short segments
-    // auto startIter = speakStart_.begin();
-    // auto endIter = speakEnd_.begin();
-    // while (startIter != speakStart_.end()) {
-    //     if (removeThreshold < audioLength &&
-    //         *endIter - *startIter < removeThreshold) {
-    //         startIter = speakStart_.erase(startIter);
-    //         endIter = speakEnd_.erase(endIter);
-    //     } else {
-    //         startIter++;
-    //         endIter++;
-    //     }
-    // }
-    // // Expand to avoid to tight cut.
-    // startIter = speakStart_.begin();
-    // endIter = speakEnd_.begin();
-    // *startIter = std::fmax(0.f, *startIter - expandHeadThreshold);
-    // *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1));
-    // endIter = speakEnd_.end() - 1;
-    // startIter = speakStart_.end() - 1;
-    // *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1));
-    // *endIter = std::fmin(*endIter + expandTailThreshold, audioLength);
-    // for (int i = 1; i < speakStart_.size() - 1; ++i) {
-    //     speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold,
-    //     speakEnd_[i - 1]);
-    //     speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold,
-    //     speakStart_[i + 1]);
-    // }
-    // // Merge very closed segments
-    // startIter = speakStart_.begin() + 1;
-    // endIter = speakEnd_.begin();
-    // while (startIter != speakStart_.end()) {
-    //     if (*startIter - *endIter < mergeThreshold) {
-    //         startIter = speakStart_.erase(startIter);
-    //         endIter = speakEnd_.erase(endIter);
-    //     } else {
-    //         startIter++;
-    //         endIter++;
-    //     }
-    // }
-
-    std::vector<std::map<std::string, float>> result;
-    for (int i = 0; i < speakStart_.size(); ++i) {
-        result.emplace_back(std::map<std::string, float>(
-            {{"start", speakStart_[i]}, {"end", speakEnd_[i]}}));
-    }
-    return result;
+// Formats a time offset in seconds as "H:M:S": whole hours, whole minutes and
+// the remaining (possibly fractional) seconds, without zero padding.
+std::string Vad::ConvertTime(float time_s) const {
+    // Split on whole seconds so that hours and minutes are derived from the
+    // same integer and cannot disagree because of floating point rounding.
+    const int whole_s = static_cast<int>(time_s);
+    const int hours = whole_s / 3600;
+    const int minutes = (whole_s % 3600) / 60;
+    const float seconds =
+        time_s - static_cast<float>(hours * 3600 + minutes * 60);
+
+    std::stringstream ss;
+    ss << hours << ":" << minutes << ":" << seconds;
+    return ss.str();
+}
+
+// Writes the detected speech segments into `result` as a JSON array, e.g.
+// [{"s":"0:0:1.2","e":"0:0:3.4"}], or "[]" when no speech was detected.
+// `max_len` is the capacity of `result` in bytes, including the terminator.
+// The four threshold parameters are accepted but not applied.
+// Returns 0 on success, -1 when `result` is null, `max_len` is not positive
+// or the JSON had to be truncated to fit.
+int Vad::GetResult(char* result,
+                   int max_len,
+                   float removeThreshold,
+                   float expandHeadThreshold,
+                   float expandTailThreshold,
+                   float mergeThreshold) const {
+    if (result == nullptr || max_len <= 0) {
+        DLOG(INFO) << "result is NULL or max_len <= 0";
+        return -1;
+    }
+
+    const float audioLength = 1.0 * current_sample_ / sample_rate_;
+
+    std::string json = "[";
+    for (size_t i = 0; i < speechStart_.size(); ++i) {
+        // A segment that is still open has no recorded end yet: report the
+        // current audio length for it, without modifying speechEnd_.
+        const float end_sec =
+            i < speechEnd_.size() ? speechEnd_[i] : audioLength;
+        if (i > 0) {
+            json += ",";
+        }
+        json += "{\"s\":\"" + ConvertTime(speechStart_[i]) + "\",\"e\":\"" +
+                ConvertTime(end_sec) + "\"}";
+    }
+    json += "]";
+
+    // snprintf returns the length it needed, so a value >= max_len means the
+    // JSON did not fit and `result` holds a truncated string.
+    const int written = snprintf(result, max_len, "%s", json.c_str());
+    return (written < 0 || written >= max_len) ? -1 : 0;
 }
 
 std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
diff --git a/runtime/engine/vad/nnet/vad.h b/runtime/engine/vad/nnet/vad.h
index de557ec67..31db78d21 100644
--- a/runtime/engine/vad/nnet/vad.h
+++ b/runtime/engine/vad/nnet/vad.h
@@ -70,7 +70,11 @@ class Vad : public fastdeploy::FastDeployModel {
 
     const State& Postprocess();
 
-    const std::vector<std::map<std::string, float>> GetResult(
+    // Writes the detected speech segments into `result` (capacity `max_len`
+    // bytes) as a JSON array of {"s": start, "e": end} time strings.
+    // Returns 0 on success and -1 on failure.
+    // The threshold parameters are accepted but not applied.
+    int GetResult(char* result, int max_len,
         float removeThreshold = 0.0,
         float expandHeadThreshold = 0.0,
         float expandTailThreshold = 0,
@@ -103,6 +107,8 @@ class Vad : public fastdeploy::FastDeployModel {
 
   private:
     bool Initialize();
+    // Formats a time offset in seconds as an "H:M:S" string.
+    std::string ConvertTime(float time_s) const;
 
   private:
     std::mutex init_lock_;
@@ -122,8 +128,8 @@ class Vad : public fastdeploy::FastDeployModel {
     // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
     float outputProb_;
 
-    std::vector<float> speakStart_;
-    mutable std::vector<float> speakEnd_;
+    std::vector<float> speechStart_;
+    mutable std::vector<float> speechEnd_;
 
     std::vector<State> states_;
 
diff --git a/runtime/engine/vad/nnet/vad_nnet_main.cc b/runtime/engine/vad/nnet/vad_nnet_main.cc
index 7b89d1af3..f3079b42e 100644
--- a/runtime/engine/vad/nnet/vad_nnet_main.cc
+++ b/runtime/engine/vad/nnet/vad_nnet_main.cc
@@ -70,12 +70,6 @@ int main(int argc, char* argv[]) {
 
     std::cout << "RTF=" << timer.Elapsed() / double(num_samples / sr)
               << std::endl;
-
-    std::vector<std::map<std::string, float>> result = vad.GetResult();
-    for (auto& res : result) {
-        std::cout << "speak start: " << res["start"]
-                  << " s, end: " << res["end"] << " s | ";
-    }
     std::cout << "\b\b " << std::endl;
 
     vad.Reset();