You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/runtime/engine/vad/nnet/vad.cc

333 lines
10 KiB

// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved.
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "vad/nnet/vad.h"
#include <algorithm>
#include <cstring>
#include <iomanip>
#include "common/base/common.h"
namespace ppspeech {
// Builds a VAD model wrapper for the ONNX model at `model_file`.
//
// `custom_option` is copied into `runtime_option` first; the device, backend,
// model format, ORT graph-optimization level and ORT thread counts are then
// overwritten below, so only the remaining fields of `custom_option` survive.
Vad::Vad(const std::string& model_file,
         const fastdeploy::RuntimeOption&
             custom_option /* = fastdeploy::RuntimeOption() */) {
    // Start from the caller's options, then force CPU + ONNX Runtime.
    runtime_option = custom_option;
    runtime_option.UseCpu();
    runtime_option.UseOrtBackend();
    runtime_option.model_format = fastdeploy::ModelFormat::ONNX;
    runtime_option.model_file = model_file;

    // ORT tuning: graph optimization level 99, one thread for both
    // intra-op and inter-op execution.
    auto& ort = runtime_option.ort_option;
    ort.graph_optimization_level = 99;
    ort.intra_op_num_threads = 1;
    ort.inter_op_num_threads = 1;

    // Backends this model declares as valid for each device kind.
    valid_cpu_backends = {fastdeploy::Backend::ORT,
                          fastdeploy::Backend::OPENVINO};
    valid_gpu_backends = {fastdeploy::Backend::ORT,
                          fastdeploy::Backend::TRT};
}
// Runs Initialize() while holding init_lock_.
// The boolean result of Initialize() is discarded; failures are only visible
// through the error log emitted by Initialize() itself.
void Vad::Init() {
    const std::lock_guard<std::mutex> guard(init_lock_);
    static_cast<void>(Initialize());
}
// Returns the fixed model name "VAD".
std::string Vad::ModelName() const {
    return std::string("VAD");
}
// Convenience overload: unpacks every field of `conf` and forwards it to the
// seven-argument SetConfig overload.
void Vad::SetConfig(const VadNnetConf conf) {
    const auto& sample_rate = conf.sr;
    const auto& frame_len_ms = conf.frame_ms;
    const auto& speech_threshold = conf.threshold;
    const auto& speech_beam = conf.beam;
    const auto& min_silence_ms = conf.min_silence_duration_ms;
    const auto& pad_left_ms = conf.speech_pad_left_ms;
    const auto& pad_right_ms = conf.speech_pad_right_ms;

    SetConfig(sample_rate, frame_len_ms, speech_threshold, speech_beam,
              min_silence_ms, pad_left_ms, pad_right_ms);
}
// Stores the VAD configuration and derives the sample-domain quantities.
//
// Params:
//   sr                       sample rate in Hz
//   frame_ms                 length of one inference window in milliseconds
//   threshold                probability at or above which speech starts
//   beam                     relaxation subtracted from `threshold` while
//                            speech is already active
//   min_silence_duration_ms  silence needed before a speech segment is closed
//   speech_pad_left_ms       padding subtracted from a detected speech start
//   speech_pad_right_ms      padding added to a detected speech end
//
// Throws std::runtime_error if called after the model has been initialized.
void Vad::SetConfig(const int& sr,
                    const int& frame_ms,
                    const float& threshold,
                    const float& beam,
                    const int& min_silence_duration_ms,
                    const int& speech_pad_left_ms,
                    const int& speech_pad_right_ms) {
    if (initialized_) {
        fastdeploy::FDERROR << "SetConfig must be called before init"
                            << std::endl;
        throw std::runtime_error("SetConfig must be called before init");
    }

    sample_rate_ = sr;
    // Integer division: sample rates that are not a multiple of 1000 are
    // truncated here, and every *_samples_ value below inherits that.
    sr_per_ms_ = sr / 1000;
    threshold_ = threshold;
    beam_ = beam;
    frame_ms_ = frame_ms;

    // Millisecond settings converted to sample counts.
    min_silence_samples_ = min_silence_duration_ms * sr_per_ms_;
    speech_pad_left_samples_ = speech_pad_left_ms * sr_per_ms_;
    speech_pad_right_samples_ = speech_pad_right_ms * sr_per_ms_;

    // init chunk size
    window_size_samples_ = frame_ms * sr_per_ms_;
    current_chunk_size_ = window_size_samples_;

    // Log the real sample rate under "sr=" (previously the samples-per-ms
    // value was printed with this label, e.g. 16 instead of 16000).
    fastdeploy::FDINFO << "sr=" << sample_rate_ << " threshold=" << threshold_
                       << " beam=" << beam_ << " frame_ms=" << frame_ms_
                       << " min_silence_duration_ms=" << min_silence_duration_ms
                       << " speech_pad_left_ms=" << speech_pad_left_ms
                       << " speech_pad_right_ms=" << speech_pad_right_ms;
}
// Returns the detector to its start-of-stream state: zeroes the recurrent
// h/c buffers, clears the trigger flag and sample counters, and drops all
// recorded speech boundaries and per-chunk states.
void Vad::Reset() {
    // std::fill instead of memset: memset takes an int fill byte (the old
    // code passed 0.0f) and must not be handed the null data() pointer of an
    // empty buffer, which can happen if Reset() runs before Initialize().
    std::fill(h_.begin(), h_.end(), 0.0f);
    std::fill(c_.begin(), c_.end(), 0.0f);

    triggerd_ = false;
    temp_end_ = 0;
    current_sample_ = 0;

    speechStart_.clear();
    speechEnd_.clear();
    states_.clear();
}
// Allocates the tensor holders and state buffers, resets the detector and
// starts the inference runtime.
//
// Returns true on success; returns false (after logging an error) when
// InitRuntime() fails, in which case initialized_ is left unchanged.
bool Vad::Initialize() {
    // input & output holder
    inputTensors_.resize(4);
    outputTensors_.resize(3);

    // input shape: {1, window_size_samples_}.
    // Cleared first so that calling Initialize() more than once does not
    // keep appending dimensions to the shape.
    input_node_dims_.clear();
    input_node_dims_.emplace_back(1);
    input_node_dims_.emplace_back(window_size_samples_);

    // sr buffer
    sr_.resize(1);
    sr_[0] = sample_rate_;

    // hidden state buffer
    h_.resize(size_hc_);
    c_.resize(size_hc_);
    Reset();

    // InitRuntime
    if (!InitRuntime()) {
        fastdeploy::FDERROR << "Failed to initialize fastdeploy backend."
                            << std::endl;
        return false;
    }

    initialized_ = true;
    fastdeploy::FDINFO << "init done.";
    return true;
}
// Runs one inference step on `chunk`.
//
// The four input tensors ("input", "sr", "h", "c") are pointed at the
// caller's chunk and at the member buffers without copying. On success the
// running sample index is advanced by the chunk length and true is returned;
// if Infer() fails, false is returned and the index is left untouched.
bool Vad::ForwardChunk(std::vector<float>& chunk) {
    // The final chunk of a stream may be shorter than window_size_samples_,
    // so the last input dimension follows the actual chunk length.
    const auto num_samples = chunk.size();
    input_node_dims_.back() = num_samples;
    assert(window_size_samples_ >= num_samples);
    current_chunk_size_ = num_samples;

    auto& audio_tensor = inputTensors_[0];
    audio_tensor.name = "input";
    audio_tensor.SetExternalData(
        input_node_dims_, fastdeploy::FDDataType::FP32, chunk.data());

    auto& rate_tensor = inputTensors_[1];
    rate_tensor.name = "sr";
    rate_tensor.SetExternalData(
        sr_node_dims_, fastdeploy::FDDataType::INT64, sr_.data());

    auto& h_tensor = inputTensors_[2];
    h_tensor.name = "h";
    h_tensor.SetExternalData(
        hc_node_dims_, fastdeploy::FDDataType::FP32, h_.data());

    auto& c_tensor = inputTensors_[3];
    c_tensor.name = "c";
    c_tensor.SetExternalData(
        hc_node_dims_, fastdeploy::FDDataType::FP32, c_.data());

    const bool inferred = Infer(inputTensors_, &outputTensors_);
    if (inferred) {
        // Push forward sample index
        current_sample_ += current_chunk_size_;
    }
    return inferred;
}
// Consumes the outputs of the most recent inference and advances the
// speech/silence state machine by one chunk.
//
// Steps:
//   1. Read the speech probability from outputTensors_[0] and copy
//      outputTensors_[1] / outputTensors_[2] into h_ / c_ so the next
//      ForwardChunk() feeds them back to the model.
//   2. Classify the chunk using threshold_ (to enter speech) and
//      threshold_ - beam_ (to stay in speech), append one State to states_,
//      and record segment boundaries in speechStart_ / speechEnd_ (seconds).
//
// Returns a reference to the last element of states_.
//
// NOTE(review): every branch condition is a comparison against outputProb_;
// if the probability were NaN none of them would match, nothing would be
// appended, and states_.back() would return the previous state (or be
// invalid on an empty states_). Confirm the model cannot emit NaN.
const Vad::State& Vad::Postprocess() {
// update prob, h, c
outputProb_ = *(float*)outputTensors_[0].Data();
auto* hn = static_cast<float*>(outputTensors_[1].MutableData());
std::memcpy(h_.data(), hn, h_.size() * sizeof(float));
auto* cn = static_cast<float*>(outputTensors_[2].MutableData());
std::memcpy(c_.data(), cn, c_.size() * sizeof(float));
if (outputProb_ < threshold_ && !triggerd_) {
// 1. Silence: below threshold and not inside a speech segment.
#ifdef PPS_DEBUG
DLOG(INFO) << "{ silence: " << 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::SIL);
} else if (outputProb_ >= threshold_ && !triggerd_) {
// 2. Start: first chunk at or above threshold opens a segment.
triggerd_ = true;
// current_sample_ was already advanced past this chunk by ForwardChunk(),
// so subtract the chunk length to get the chunk's first sample, then
// apply the left padding.
speech_start_ =
current_sample_ - current_chunk_size_ - speech_pad_left_samples_;
// Clamp to the beginning of the stream.
// NOTE(review): the member types are declared outside this file; if they
// are unsigned, the subtraction above wraps and this clamp relies on the
// conversion to int producing a negative value — confirm.
speech_start_ = std::max(int(speech_start_), 0);
float start_sec = 1.0 * speech_start_ / sample_rate_;
speechStart_.emplace_back(start_sec);
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech start: " << start_sec
<< " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::START);
} else if (outputProb_ >= threshold_ - beam_ && triggerd_) {
// 3. Continue: inside a segment and above the relaxed threshold.
if (temp_end_ != 0) {
// speech prob relaxation, speech continues again:
// a tentative end had been recorded but speech resumed before
// min_silence_samples_ elapsed, so the tentative end is discarded.
#ifdef PPS_DEBUG
DLOG(INFO)
<< "{ speech fake end(sil < min_silence_ms) to continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
#endif
temp_end_ = 0;
} else {
// speech prob relaxation, keep tracking speech
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech continue: "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
#endif
}
states_.emplace_back(Vad::State::SPEECH);
} else if (outputProb_ < threshold_ - beam_ && triggerd_) {
// 4. End: inside a segment and below the relaxed threshold.
// Remember where the silence began; 0 doubles as "no tentative end".
if (temp_end_ == 0) {
temp_end_ = current_sample_;
}
// check possible speech end
if (current_sample_ - temp_end_ < min_silence_samples_) {
// a. silence < min_silence_samples_, continue speaking
//    (the segment stays open; this chunk is reported as SIL)
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech fake end(sil < min_silence_ms): "
<< 1.0 * current_sample_ / sample_rate_
<< " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::SIL);
} else {
// b. silence >= min_silence_samples_, end speaking:
//    close the segment at the current sample plus the right padding.
speech_end_ = current_sample_ + speech_pad_right_samples_;
temp_end_ = 0;
triggerd_ = false;
auto end_sec = 1.0 * speech_end_ / sample_rate_;
speechEnd_.emplace_back(end_sec);
#ifdef PPS_DEBUG
DLOG(INFO) << "{ speech end: " << end_sec
<< " s; prob: " << outputProb_ << " }";
#endif
states_.emplace_back(Vad::State::END);
}
}
return states_.back();
}
// Formats a duration given in seconds as "hours:minutes:seconds".
// Hours and minutes are whole numbers (truncated); seconds keeps its
// fractional part and is printed with the stream's default float formatting.
// No zero padding is applied.
std::string Vad::ConvertTime(float time_s) const {
    // Whole hours.
    const float hours_exact = time_s / 60 / 60;
    const int hours = static_cast<int>(hours_exact);

    // Whole minutes left over once the hours are removed.
    const float total_minutes = time_s / 60;
    const int minutes =
        (total_minutes >= 60)
            ? static_cast<int>(total_minutes -
                               60 * static_cast<double>(hours))
            : static_cast<int>(total_minutes);

    // Seconds are whatever remains after whole hours and minutes.
    const float accounted_s =
        static_cast<float>((60 * 60 * hours) + (60 * minutes));
    const float seconds = time_s - accounted_s;

    // Output format: H:M:S
    std::stringstream formatted;
    formatted << hours << ":" << minutes << ":" << seconds;
    return formatted.str();
}
// Serializes the detected speech segments into `result` as a JSON array of
// the form [{"s":"H:M:S","e":"H:M:S"},...], using ConvertTime() for each
// boundary.
//
// Params:
//   result   destination buffer; when null nothing is written (a debug
//            message is logged instead)
//   max_len  capacity of `result` in bytes, including the terminating NUL;
//            the JSON is truncated to fit, and nothing is written when
//            max_len <= 0
//   removeThreshold, expandHeadThreshold, expandTailThreshold, mergeThreshold
//            accepted for interface compatibility; not used by this function
//
// Returns 0 in all cases. When no speech boundary has been recorded the
// function returns immediately and leaves `result` untouched.
//
// A segment that is still open (a start without a matching end) is closed at
// the current audio length for the purpose of this report only; the recorded
// boundaries are not modified, so calling this mid-stream does not disturb
// later detection results.
int Vad::GetResult(char* result, int max_len,
                   float removeThreshold,
                   float expandHeadThreshold,
                   float expandTailThreshold,
                   float mergeThreshold) const {
    (void)removeThreshold;
    (void)expandHeadThreshold;
    (void)expandTailThreshold;
    (void)mergeThreshold;

    if (speechStart_.empty() && speechEnd_.empty()) {
        return 0;
    }

    // Length of the audio processed so far, in seconds.
    const float audioLength = 1.0 * current_sample_ / sample_rate_;

    std::string json = "[";
    const std::size_t num_segments = speechStart_.size();
    for (std::size_t i = 0; i < num_segments; ++i) {
        // Use the audio length as the end of any segment without one.
        const float end_sec = (i < speechEnd_.size())
                                  ? static_cast<float>(speechEnd_[i])
                                  : audioLength;
        json += "{\"s\":\"" + ConvertTime(speechStart_[i]) + "\",\"e\":\"" +
                ConvertTime(end_sec) + "\"},";
    }
    // Drop the trailing comma, but never the opening bracket.
    if (json.back() == ',') {
        json.pop_back();
    }
    json += "]";

    if (result == nullptr) {
        DLOG(INFO) << "result is NULL";
        return 0;
    }
    // A non-positive capacity would turn into a huge size_t inside snprintf.
    if (max_len > 0) {
        snprintf(result, static_cast<std::size_t>(max_len), "%s",
                 json.c_str());
    }
    return 0;
}
// Streams a short bracketed tag for a Vad::State:
// SIL -> "[SIL]", START -> "[STA]", SPEECH -> "[SPE]", END -> "[END]";
// any other value is printed as "[ILL]".
std::ostream& operator<<(std::ostream& os, const Vad::State& s) {
    // Start from the "illegal state" tag and overwrite it for known states.
    const char* tag = "[ILL]";
    switch (s) {
        case Vad::State::SIL:
            tag = "[SIL]";
            break;
        case Vad::State::START:
            tag = "[STA]";
            break;
        case Vad::State::SPEECH:
            tag = "[SPE]";
            break;
        case Vad::State::END:
            tag = "[END]";
            break;
        default:
            break;
    }
    os << tag;
    return os;
}
} // namespace ppspeech