You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/runtime/engine/common/frontend/feature_common_inl.h

103 lines
3.7 KiB

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace ppspeech {
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(
const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
base_extractor_ = std::move(base_extractor);
}
template <class F>
void StreamingFeatureTpl<F>::Accept(
const std::vector<kaldi::BaseFloat>& waves) {
base_extractor_->Accept(waves);
}
template <class F>
bool StreamingFeatureTpl<F>::Read(std::vector<kaldi::BaseFloat>* feats) {
std::vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
bool flag = base_extractor_->Read(&wav);
if (flag == false || wav.size() == 0) return false;
// append remaned waves
int32 wav_len = wav.size();
int32 left_len = remained_wav_.size();
std::vector<kaldi::BaseFloat> waves(left_len + wav_len);
std::memcpy(waves.data(),
remained_wav_.data(),
left_len * sizeof(kaldi::BaseFloat));
std::memcpy(waves.data() + left_len,
wav.data(),
wav_len * sizeof(kaldi::BaseFloat));
// compute speech feature
Compute(waves, feats);
// cache remaned waves
knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
int32 num_frames = knf::NumFrames(waves.size(), frame_opts);
int32 frame_shift = frame_opts.WindowShift();
int32 left_samples = waves.size() - frame_shift * num_frames;
remained_wav_.resize(left_samples);
std::memcpy(remained_wav_.data(),
waves.data() + frame_shift * num_frames,
left_samples * sizeof(BaseFloat));
return true;
}
// Compute feat
template <class F>
bool StreamingFeatureTpl<F>::Compute(const std::vector<kaldi::BaseFloat>& waves,
std::vector<kaldi::BaseFloat>* feats) {
const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
int32 num_samples = waves.size();
int32 frame_length = frame_opts.WindowSize();
int32 sample_rate = frame_opts.samp_freq;
if (num_samples < frame_length) {
return true;
}
int32 num_frames = knf::NumFrames(num_samples, frame_opts);
feats->resize(num_frames * Dim());
std::vector<kaldi::BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 frame = 0; frame < num_frames; frame++) {
std::fill(window.begin(), window.end(), 0);
kaldi::BaseFloat raw_log_energy = 0.0;
kaldi::BaseFloat vtln_warp = 1.0;
knf::ExtractWindow(0,
waves,
frame,
frame_opts,
window_function_,
&window,
need_raw_log_energy ? &raw_log_energy : NULL);
std::vector<kaldi::BaseFloat> this_feature(computer_.Dim());
computer_.Compute(
raw_log_energy, vtln_warp, &window, this_feature.data());
std::memcpy(feats->data() + frame * Dim(),
this_feature.data(),
sizeof(BaseFloat) * Dim());
}
return true;
}
} // namespace ppspeech