You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
155 lines
5.2 KiB
155 lines
5.2 KiB
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "frontend/linear_spectrogram.h"
|
|
#include "kaldi/base/kaldi-math.h"
|
|
#include "kaldi/matrix/matrix-functions.h"
|
|
|
|
namespace ppspeech {
|
|
|
|
using kaldi::int32;
|
|
using kaldi::BaseFloat;
|
|
using kaldi::Vector;
|
|
using kaldi::VectorBase;
|
|
using kaldi::Matrix;
|
|
using std::vector;
|
|
|
|
// Builds a linear-spectrogram extractor that sits on top of `base_extractor`.
//
// opts:           extractor options; a copy is kept in opts_.
// base_extractor: upstream extractor; ownership is transferred to this object.
//
// Besides storing its arguments, the constructor precomputes everything that
// depends only on the window size: the FFT length, the number of output bins,
// the Hann window coefficients and the window energy (sum of squared
// coefficients) that Compute() uses for scaling.
LinearSpectrogram::LinearSpectrogram(
    const LinearSpectrogramOptions& opts,
    std::unique_ptr<FeatureExtractorInterface> base_extractor) {
    opts_ = opts;
    base_extractor_ = std::move(base_extractor);

    // The FFT is taken over exactly one analysis window (no zero padding).
    const int32 window_size = opts.frame_opts.WindowSize();
    fft_points_ = window_size;
    dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz

    // Number of values requested from the upstream extractor per Read():
    // streaming_chunk * samp_freq, truncated to an integer.
    // NOTE(review): presumably streaming_chunk is in seconds - confirm.
    chunk_sample_size_ =
        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);

    // Hann window: w[n] = 0.5 - 0.5 * cos(2 * pi * n / (N - 1)).
    hanning_window_.resize(window_size);
    hanning_window_energy_ = 0;
    const double angle_step = M_2PI / (window_size - 1);
    for (int32 n = 0; n < window_size; ++n) {
        hanning_window_[n] = 0.5 - 0.5 * std::cos(angle_step * n);
        // Accumulate from the stored coefficient so the energy matches the
        // values that Hanning() actually multiplies with.
        hanning_window_energy_ += hanning_window_[n] * hanning_window_[n];
    }
}
|
|
|
|
// Passes `inputs` straight through to the wrapped upstream extractor.
// Nothing is computed here; the spectrogram is produced in Read().
void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
    auto& upstream = *base_extractor_;
    upstream.Accept(inputs);
}
|
|
|
|
// Reads one chunk from the upstream extractor and converts it into
// spectrogram features.
//
// feats: output; resized to (number of frames * bins per frame) and filled
//        frame after frame. It is resized to 0 when Compute() produced no
//        frame for this chunk.
//
// Returns false when the upstream Read() fails or leaves the buffer empty
// (in that case `feats` is not touched); returns true otherwise.
bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
    // Hand the upstream extractor a buffer of chunk_sample_size_ elements.
    Vector<BaseFloat> chunk(chunk_sample_size_);
    if (!base_extractor_->Read(&chunk) || chunk.Dim() == 0) return false;

    // Compute() works on std::vector, so copy the chunk out of the Kaldi
    // vector first.
    const vector<BaseFloat> samples(chunk.Data(), chunk.Data() + chunk.Dim());
    vector<vector<BaseFloat>> frames;
    Compute(samples, frames);

    const size_t bins_per_frame = frames.empty() ? 0 : frames[0].size();
    feats->Resize(static_cast<int32>(frames.size() * bins_per_frame));

    // TODO(SmileGoat): refactor.
    // Flatten frame by frame; Compute() gives every frame the same number of
    // bins, so the rows exactly fill the resized output.
    int32 out_pos = 0;
    for (const vector<BaseFloat>& frame : frames) {
        for (const BaseFloat value : frame) {
            (*feats)(out_pos) = value;
            ++out_pos;
        }
    }
    return true;
}
|
|
|
|
// Applies the precomputed Hann window to `data` in place.
//
// data: samples to be windowed; must hold at least as many elements as the
//       window (enforced by CHECK_GE). Only the first hanning_window_.size()
//       elements are modified; any elements beyond that are left as they are.
void LinearSpectrogram::Hanning(vector<float>* data) const {
    CHECK_GE(data->size(), hanning_window_.size());

    // Walk both sequences in lock-step; the check above guarantees `sample`
    // never runs past the end of `data`.
    auto sample = data->begin();
    for (const auto& coeff : hanning_window_) {
        *sample *= coeff;
        ++sample;
    }
}
|
|
|
|
// Runs a forward real FFT over `v` and appends the spectrum to `real`/`img`.
//
// v:    in: time-domain samples; out: the raw buffer produced by RealFft().
// real: real parts of the bins are appended here (not cleared first).
// img:  imaginary parts of the bins are appended here (not cleared first).
//
// For an input of N values, N / 2 + 1 bins are appended. The first and the
// last appended bin get an imaginary part of 0.
//
// Always returns true. v->at() throws std::out_of_range if `v` holds fewer
// than two values.
bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                 vector<BaseFloat>* real,
                                 vector<BaseFloat>* img) const {
    // RealFft() operates in place on a Kaldi vector, so round-trip the data
    // through a temporary one.
    Vector<BaseFloat> packed(v->size());
    std::memcpy(packed.Data(), v->data(), sizeof(BaseFloat) * (v->size()));
    RealFft(&packed, true);
    v->resize(packed.Dim());
    std::memcpy(v->data(), packed.Data(), sizeof(BaseFloat) * (v->size()));

    // The buffer is read as: [0] = real part of the first bin, [1] = real
    // part of the last bin, then (real, imaginary) pairs for the bins in
    // between.
    const size_t half = v->size() / 2;

    real->push_back(v->at(0));
    img->push_back(0);
    for (size_t bin = 1; bin < half; ++bin) {
        const size_t re_pos = 2 * bin;
        real->push_back(v->at(re_pos));
        img->push_back(v->at(re_pos + 1));
    }
    real->push_back(v->at(1));
    img->push_back(0);

    return true;
}
|
|
|
|
// Compute spectrogram feat
|
|
// todo: refactor later (SmileGoat)
|
|
bool LinearSpectrogram::Compute(const vector<float>& waves,
|
|
vector<vector<float>>& feats) {
|
|
int num_samples = waves.size();
|
|
const int& frame_length = opts_.frame_opts.WindowSize();
|
|
const int& sample_rate = opts_.frame_opts.samp_freq;
|
|
const int& frame_shift = opts_.frame_opts.WindowShift();
|
|
const int& fft_points = fft_points_;
|
|
const float scale = hanning_window_energy_ * sample_rate;
|
|
|
|
if (num_samples < frame_length) {
|
|
return true;
|
|
}
|
|
|
|
int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
|
|
feats.resize(num_frames);
|
|
vector<float> fft_real((fft_points_ / 2 + 1), 0);
|
|
vector<float> fft_img((fft_points_ / 2 + 1), 0);
|
|
vector<float> v(frame_length, 0);
|
|
vector<float> power((fft_points / 2 + 1));
|
|
|
|
for (int i = 0; i < num_frames; ++i) {
|
|
vector<float> data(waves.data() + i * frame_shift,
|
|
waves.data() + i * frame_shift + frame_length);
|
|
Hanning(&data);
|
|
fft_img.clear();
|
|
fft_real.clear();
|
|
v.assign(data.begin(), data.end());
|
|
NumpyFft(&v, &fft_real, &fft_img);
|
|
|
|
feats[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz
|
|
for (int j = 0; j < (fft_points / 2 + 1); ++j) {
|
|
power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
|
|
feats[i][j] = power[j];
|
|
|
|
if (j == 0 || j == feats[0].size() - 1) {
|
|
feats[i][j] /= scale;
|
|
} else {
|
|
feats[i][j] *= (2.0 / scale);
|
|
}
|
|
|
|
// log added eps=1e-14
|
|
feats[i][j] = std::log(feats[i][j] + 1e-14);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace ppspeech
|