[speechx]add kaldi-native-fbank && refactor frontend (#2794)
* replace kaldi-fbank with kaldi-native-fbank * make kaldi-native-fbank workpull/2854/head
parent
acf1d27230
commit
c1b1ae0515
@ -1,16 +1,10 @@
|
||||
include_directories(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/base
|
||||
)
|
||||
|
||||
include_directories(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/utils
|
||||
)
|
||||
add_subdirectory(utils)
|
||||
|
||||
include_directories(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/frontend
|
||||
)
|
||||
add_subdirectory(frontend)
|
||||
|
@ -1,29 +1,27 @@
|
||||
# Library target holding the kaldi-native-fbank sources listed below
# (fbank computation, feature helpers, windowing, FFT and mel computations).
# No STATIC/SHARED keyword is given here.
add_library(kaldi-native-fbank-core
|
||||
feature-fbank.cc
|
||||
feature-functions.cc
|
||||
feature-window.cc
|
||||
fftsg.c
|
||||
mel-computations.cc
|
||||
rfft.cc
|
||||
)
|
||||
|
||||
# Static library target `frontend`, built from the source files listed below.
add_library(frontend STATIC
|
||||
cmvn.cc
|
||||
db_norm.cc
|
||||
linear_spectrogram.cc
|
||||
audio_cache.cc
|
||||
feature_cache.cc
|
||||
feature_pipeline.cc
|
||||
fbank.cc
|
||||
assembler.cc
|
||||
)
|
||||
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
|
||||
|
||||
|
||||
|
||||
set(bin_name cmvn_json2kaldi_main)
|
||||
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
|
||||
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
|
||||
target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
|
||||
target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)
|
||||
|
||||
set(BINS
|
||||
compute_linear_spectrogram_main
|
||||
compute_fbank_main
|
||||
)
|
||||
|
||||
foreach(bin_name IN LISTS BINS)
|
||||
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
|
||||
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
|
||||
target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog)
|
||||
target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog kaldi-feat-common)
|
||||
endforeach()
|
||||
|
@ -1,98 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Note: Do not print/log ondemand object.
|
||||
|
||||
#include "base/common.h"
|
||||
#include "base/flags.h"
|
||||
#include "base/log.h"
|
||||
#include "kaldi/matrix/kaldi-matrix.h"
|
||||
#include "kaldi/util/kaldi-io.h"
|
||||
#include "utils/file_utils.h"
|
||||
#include "utils/picojson.h"
|
||||
|
||||
DEFINE_string(json_file, "", "cmvn json file");
|
||||
DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
|
||||
DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
gflags::SetUsageMessage("Usage:");
|
||||
gflags::ParseCommandLineFlags(&argc, &argv, false);
|
||||
google::InitGoogleLogging(argv[0]);
|
||||
google::InstallFailureSignalHandler();
|
||||
FLAGS_logtostderr = 1;
|
||||
|
||||
LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
|
||||
|
||||
auto ifs = std::ifstream(FLAGS_json_file);
|
||||
std::string json_str = ppspeech::ReadFile2String(FLAGS_json_file);
|
||||
picojson::value value;
|
||||
std::string err;
|
||||
const char* json_end = picojson::parse(
|
||||
value, json_str.c_str(), json_str.c_str() + json_str.size(), &err);
|
||||
if (!value.is<picojson::object>()) {
|
||||
LOG(ERROR) << "Input json file format error.";
|
||||
}
|
||||
|
||||
const picojson::value::object& obj = value.get<picojson::object>();
|
||||
for (picojson::value::object::const_iterator elem = obj.begin();
|
||||
elem != obj.end();
|
||||
++elem) {
|
||||
if (elem->first == "mean_stat") {
|
||||
VLOG(2) << "mean_stat:" << elem->second;
|
||||
// const picojson::value tmp =
|
||||
// elem->second.get(0);//<picojson::array>();
|
||||
double tmp =
|
||||
elem->second.get(0).get<double>(); //<picojson::array>();
|
||||
VLOG(2) << "tmp: " << tmp;
|
||||
}
|
||||
if (elem->first == "var_stat") {
|
||||
VLOG(2) << "var_stat: " << elem->second;
|
||||
}
|
||||
if (elem->first == "frame_num") {
|
||||
VLOG(2) << "frame_num: " << elem->second;
|
||||
}
|
||||
}
|
||||
|
||||
const picojson::value::array& mean_stat =
|
||||
value.get("mean_stat").get<picojson::array>();
|
||||
std::vector<kaldi::BaseFloat> mean_stat_vec;
|
||||
for (auto it = mean_stat.begin(); it != mean_stat.end(); it++) {
|
||||
mean_stat_vec.push_back((*it).get<double>());
|
||||
}
|
||||
|
||||
const picojson::value::array& var_stat =
|
||||
value.get("var_stat").get<picojson::array>();
|
||||
std::vector<kaldi::BaseFloat> var_stat_vec;
|
||||
for (auto it = var_stat.begin(); it != var_stat.end(); it++) {
|
||||
var_stat_vec.push_back((*it).get<double>());
|
||||
}
|
||||
|
||||
kaldi::int32 frame_num = value.get("frame_num").get<int64_t>();
|
||||
LOG(INFO) << "nframe: " << frame_num;
|
||||
|
||||
size_t mean_size = mean_stat_vec.size();
|
||||
kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
|
||||
for (size_t idx = 0; idx < mean_size; ++idx) {
|
||||
cmvn_stats(0, idx) = mean_stat_vec[idx];
|
||||
cmvn_stats(1, idx) = var_stat_vec[idx];
|
||||
}
|
||||
cmvn_stats(0, mean_size) = frame_num;
|
||||
VLOG(2) << cmvn_stats;
|
||||
|
||||
kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
|
||||
LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path;
|
||||
LOG(INFO) << "Binary: " << FLAGS_binary;
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,123 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
|
||||
//
|
||||
#include "frontend/audio/feature-fbank.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "frontend/audio/feature-functions.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Replaces each of the first n values at in_out with its square root,
// in place.
static void Sqrt(float *in_out, int32_t n) {
    float *cur = in_out;
    for (int32_t remaining = n; remaining != 0; --remaining, ++cur) {
        *cur = std::sqrt(*cur);
    }
}
|
||||
|
||||
// Writes the text produced by FbankOptions::ToString() to os and returns os.
std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
    return os << opts.ToString();
}
|
||||
|
||||
// Builds a computer for the given options: sizes the real FFT to the padded
// window size, caches log(energy_floor) when a positive floor is configured,
// and pre-builds the mel banks for VTLN warp factor 1.0.
FbankComputer::FbankComputer(const FbankOptions &opts)
    : opts_(opts),
      // Always give the member a defined value; it is only overwritten when a
      // positive energy floor is configured.
      log_energy_floor_(0.0f),
      rfft_(opts.frame_opts.PaddedWindowSize()) {
    if (opts.energy_floor > 0.0f) {
        log_energy_floor_ = logf(opts.energy_floor);
    }

    // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
    // [note: this call caches it.]
    GetMelBanks(1.0f);
}
|
||||
|
||||
// Frees every MelBanks object held in the mel_banks_ cache.
FbankComputer::~FbankComputer() {
    for (auto &entry : mel_banks_) {
        delete entry.second;
    }
}
|
||||
|
||||
// Returns the mel banks for the given VTLN warp factor, creating and caching
// them in mel_banks_ on first use. The cache keeps ownership of the object.
const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
    auto cached = mel_banks_.find(vtln_warp);
    if (cached != mel_banks_.end()) {
        return cached->second;
    }

    MelBanks *created =
        new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
    mel_banks_[vtln_warp] = created;
    return created;
}
|
||||
|
||||
void FbankComputer::Compute(float signal_raw_log_energy,
|
||||
float vtln_warp,
|
||||
std::vector<float> *signal_frame,
|
||||
float *feature) {
|
||||
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
|
||||
|
||||
CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
|
||||
|
||||
// Compute energy after window function (not the raw one).
|
||||
if (opts_.use_energy && !opts_.raw_energy) {
|
||||
signal_raw_log_energy =
|
||||
std::log(std::max<float>(InnerProduct(signal_frame->data(),
|
||||
signal_frame->data(),
|
||||
signal_frame->size()),
|
||||
std::numeric_limits<float>::epsilon()));
|
||||
}
|
||||
rfft_.Compute(signal_frame->data()); // signal_frame is modified in-place
|
||||
ComputePowerSpectrum(signal_frame);
|
||||
|
||||
// Use magnitude instead of power if requested.
|
||||
if (!opts_.use_power) {
|
||||
Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
|
||||
}
|
||||
|
||||
int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
|
||||
|
||||
// Its length is opts_.mel_opts.num_bins
|
||||
float *mel_energies = feature + mel_offset;
|
||||
|
||||
// Sum with mel filter banks over the power spectrum
|
||||
mel_banks.Compute(signal_frame->data(), mel_energies);
|
||||
|
||||
if (opts_.use_log_fbank) {
|
||||
// Avoid log of zero (which should be prevented anyway by dithering).
|
||||
for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
|
||||
auto t = std::max(mel_energies[i],
|
||||
std::numeric_limits<float>::epsilon());
|
||||
mel_energies[i] = std::log(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy energy as first value (or the last, if htk_compat == true).
|
||||
if (opts_.use_energy) {
|
||||
if (opts_.energy_floor > 0.0 &&
|
||||
signal_raw_log_energy < log_energy_floor_) {
|
||||
signal_raw_log_energy = log_energy_floor_;
|
||||
}
|
||||
int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
|
||||
feature[energy_index] = signal_raw_log_energy;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,137 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-fbank.h
|
||||
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "frontend/audio/feature-window.h"
|
||||
#include "frontend/audio/mel-computations.h"
|
||||
#include "frontend/audio/rfft.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
struct FbankOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
MelBanksOptions mel_opts;
|
||||
// append an extra dimension with energy to the filter banks
|
||||
bool use_energy = false;
|
||||
float energy_floor = 0.0f; // active iff use_energy==true
|
||||
|
||||
// If true, compute log_energy before preemphasis and windowing
|
||||
// If false, compute log_energy after preemphasis ans windowing
|
||||
bool raw_energy = true; // active iff use_energy==true
|
||||
|
||||
// If true, put energy last (if using energy)
|
||||
// If false, put energy first
|
||||
bool htk_compat = false; // active iff use_energy==true
|
||||
|
||||
// if true (default), produce log-filterbank, else linear
|
||||
bool use_log_fbank = true;
|
||||
|
||||
// if true (default), use power in filterbank
|
||||
// analysis, else magnitude.
|
||||
bool use_power = true;
|
||||
|
||||
FbankOptions() { mel_opts.num_bins = 23; }
|
||||
|
||||
std::string ToString() const {
|
||||
std::ostringstream os;
|
||||
os << "frame_opts: \n";
|
||||
os << frame_opts << "\n";
|
||||
os << "\n";
|
||||
|
||||
os << "mel_opts: \n";
|
||||
os << mel_opts << "\n";
|
||||
|
||||
os << "use_energy: " << use_energy << "\n";
|
||||
os << "energy_floor: " << energy_floor << "\n";
|
||||
os << "raw_energy: " << raw_energy << "\n";
|
||||
os << "htk_compat: " << htk_compat << "\n";
|
||||
os << "use_log_fbank: " << use_log_fbank << "\n";
|
||||
os << "use_power: " << use_power << "\n";
|
||||
return os.str();
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
|
||||
|
||||
class FbankComputer {
|
||||
public:
|
||||
using Options = FbankOptions;
|
||||
|
||||
explicit FbankComputer(const FbankOptions &opts);
|
||||
~FbankComputer();
|
||||
|
||||
int32_t Dim() const {
|
||||
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
|
||||
}
|
||||
|
||||
// if true, compute log_energy_pre_window but after dithering and dc removal
|
||||
bool NeedRawLogEnergy() const {
|
||||
return opts_.use_energy && opts_.raw_energy;
|
||||
}
|
||||
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
const FbankOptions &GetOptions() const { return opts_; }
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the
|
||||
signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written. It should be pre-allocated.
|
||||
*/
|
||||
void Compute(float signal_raw_log_energy,
|
||||
float vtln_warp,
|
||||
std::vector<float> *signal_frame,
|
||||
float *feature);
|
||||
|
||||
private:
|
||||
const MelBanks *GetMelBanks(float vtln_warp);
|
||||
|
||||
FbankOptions opts_;
|
||||
float log_energy_floor_;
|
||||
std::map<float, MelBanks *> mel_banks_; // float is VTLN coefficient.
|
||||
Rfft rfft_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-functions.cc
|
||||
|
||||
#include "frontend/audio/feature-functions.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Converts a packed half-spectrum, stored in place as
// [real0, realN/2, real1, im1, real2, im2, ...], into a power spectrum held in
// the first (dim / 2) + 1 elements. The remaining elements are left with
// unspecified contents. Vectors with fewer than two elements are left
// untouched, because the packed layout needs at least real0 and realN/2.
void ComputePowerSpectrum(std::vector<float> *complex_fft) {
    const int32_t dim = static_cast<int32_t>(complex_fft->size());
    if (dim < 2) {
        return;  // avoids reading p[0] / p[1] out of bounds
    }

    float *p = complex_fft->data();
    const int32_t half_dim = dim / 2;

    // Bin 0 and the Nyquist bin are purely real and share the first two slots,
    // so grab them before the loop overwrites p[1].
    const float first_energy = p[0] * p[0];
    const float last_energy = p[1] * p[1];

    for (int32_t i = 1; i < half_dim; ++i) {
        const float real = p[i * 2];
        const float im = p[i * 2 + 1];
        p[i] = real * real + im * im;
    }
    p[0] = first_energy;
    p[half_dim] = last_energy;  // Will actually never be used, and anyway
    // if the signal has been bandlimited sensibly this should be zero.
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,38 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-functions.h
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
||||
#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
||||
|
||||
#include <vector>
|
||||
namespace knf {
|
||||
|
||||
// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
|
||||
// functions in csrc/rfft.h), and converts it into
|
||||
// a power spectrum. If the complex FFT is a vector of size n (representing
|
||||
// half of the complex FFT of a real signal of size n, as described there),
|
||||
// this function computes in the first (n/2) + 1 elements of it, the
|
||||
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
|
||||
// remaining (n/2) - 1 elements are undefined at output.
|
||||
|
||||
void ComputePowerSpectrum(std::vector<float> *complex_fft);
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
@ -0,0 +1,247 @@
|
||||
// kaldi-native-fbank/csrc/feature-window.cc
|
||||
//
|
||||
// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-window.cc
|
||||
|
||||
#include "frontend/audio/feature-window.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Writes the text produced by FrameExtractionOptions::ToString() to os and
// returns os.
std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
    return os << opts.ToString();
}
|
||||
|
||||
// Precomputes the window coefficients for opts.window_type, one per sample of
// the (unpadded) window. Aborts via LOG(FATAL) on an unknown window type.
FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
    : window_(opts.WindowSize()) {
    const int32_t num_taps = opts.WindowSize();
    CHECK_GT(num_taps, 0);

    const std::string &type = opts.window_type;
    float *coeffs = window_.data();

    const double a = M_2PI / (num_taps - 1);
    for (int32_t i = 0; i < num_taps; i++) {
        const double x = static_cast<double>(i);
        if (type == "hanning") {
            coeffs[i] = 0.5 - 0.5 * cos(a * x);
        } else if (type == "sine") {
            // Compared with the textbook formula, note that
            // 0.5 * a == M_PI / (num_taps - 1).
            coeffs[i] = sin(0.5 * a * x);
        } else if (type == "hamming") {
            coeffs[i] = 0.54 - 0.46 * cos(a * x);
        } else if (type == "povey") {
            // Like hamming, but goes to zero at the edges.
            coeffs[i] = pow(0.5 - 0.5 * cos(a * x), 0.85);
        } else if (type == "rectangular") {
            coeffs[i] = 1.0;
        } else if (type == "blackman") {
            coeffs[i] = opts.blackman_coeff - 0.5 * cos(a * x) +
                        (0.5 - opts.blackman_coeff) * cos(2 * a * x);
        } else {
            LOG(FATAL) << "Invalid window type " << opts.window_type;
        }
    }
}
|
||||
|
||||
void FeatureWindowFunction::Apply(float *wave) const {
|
||||
int32_t window_size = window_.size();
|
||||
const float *p = window_.data();
|
||||
for (int32_t k = 0; k != window_size; ++k) {
|
||||
wave[k] *= p[k];
|
||||
}
|
||||
}
|
||||
|
||||
int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
|
||||
int64_t frame_shift = opts.WindowShift();
|
||||
if (opts.snip_edges) {
|
||||
return frame * frame_shift;
|
||||
} else {
|
||||
int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
|
||||
beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
|
||||
return beginning_of_frame;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t NumFrames(int64_t num_samples,
|
||||
const FrameExtractionOptions &opts,
|
||||
bool flush /*= true*/) {
|
||||
int64_t frame_shift = opts.WindowShift();
|
||||
int64_t frame_length = opts.WindowSize();
|
||||
if (opts.snip_edges) {
|
||||
// with --snip-edges=true (the default), we use a HTK-like approach to
|
||||
// determining the number of frames-- all frames have to fit completely
|
||||
// into
|
||||
// the waveform, and the first frame begins at sample zero.
|
||||
if (num_samples < frame_length)
|
||||
return 0;
|
||||
else
|
||||
return (1 + ((num_samples - frame_length) / frame_shift));
|
||||
// You can understand the expression above as follows: 'num_samples -
|
||||
// frame_length' is how much room we have to shift the frame within the
|
||||
// waveform; 'frame_shift' is how much we shift it each time; and the
|
||||
// ratio
|
||||
// is how many times we can shift it (integer arithmetic rounds down).
|
||||
} else {
|
||||
// if --snip-edges=false, the number of frames is determined by rounding
|
||||
// the
|
||||
// (file-length / frame-shift) to the nearest integer. The point of
|
||||
// this
|
||||
// formula is to make the number of frames an obvious and predictable
|
||||
// function of the frame shift and signal length, which makes many
|
||||
// segmentation-related questions simpler.
|
||||
//
|
||||
// Because integer division in C++ rounds toward zero, we add (half the
|
||||
// frame-shift minus epsilon) before dividing, to have the effect of
|
||||
// rounding towards the closest integer.
|
||||
int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
|
||||
|
||||
if (flush) return num_frames;
|
||||
|
||||
// note: 'end' always means the last plus one, i.e. one past the last.
|
||||
int64_t end_sample_of_last_frame =
|
||||
FirstSampleOfFrame(num_frames - 1, opts) + frame_length;
|
||||
|
||||
// the following code is optimized more for clarity than efficiency.
|
||||
// If flush == false, we can't output frames that extend past the end
|
||||
// of the signal.
|
||||
while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
|
||||
num_frames--;
|
||||
end_sample_of_last_frame -= frame_shift;
|
||||
}
|
||||
return num_frames;
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractWindow(int64_t sample_offset,
|
||||
const std::vector<float> &wave,
|
||||
int32_t f,
|
||||
const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
std::vector<float> *window,
|
||||
float *log_energy_pre_window /*= nullptr*/) {
|
||||
CHECK(sample_offset >= 0 && wave.size() != 0);
|
||||
|
||||
int32_t frame_length = opts.WindowSize();
|
||||
int32_t frame_length_padded = opts.PaddedWindowSize();
|
||||
|
||||
int64_t num_samples = sample_offset + wave.size();
|
||||
int64_t start_sample = FirstSampleOfFrame(f, opts);
|
||||
int64_t end_sample = start_sample + frame_length;
|
||||
|
||||
if (opts.snip_edges) {
|
||||
CHECK(start_sample >= sample_offset && end_sample <= num_samples);
|
||||
} else {
|
||||
CHECK(sample_offset == 0 || start_sample >= sample_offset);
|
||||
}
|
||||
|
||||
if (window->size() != frame_length_padded) {
|
||||
window->resize(frame_length_padded);
|
||||
}
|
||||
|
||||
// wave_start and wave_end are start and end indexes into 'wave', for the
|
||||
// piece of wave that we're trying to extract.
|
||||
int32_t wave_start = int32_t(start_sample - sample_offset);
|
||||
int32_t wave_end = wave_start + frame_length;
|
||||
|
||||
if (wave_start >= 0 && wave_end <= wave.size()) {
|
||||
// the normal case-- no edge effects to consider.
|
||||
std::copy(wave.begin() + wave_start,
|
||||
wave.begin() + wave_start + frame_length,
|
||||
window->data());
|
||||
} else {
|
||||
// Deal with any end effects by reflection, if needed. This code will
|
||||
// only
|
||||
// be reached for about two frames per utterance, so we don't concern
|
||||
// ourselves excessively with efficiency.
|
||||
int32_t wave_dim = wave.size();
|
||||
for (int32_t s = 0; s < frame_length; ++s) {
|
||||
int32_t s_in_wave = s + wave_start;
|
||||
while (s_in_wave < 0 || s_in_wave >= wave_dim) {
|
||||
// reflect around the beginning or end of the wave.
|
||||
// e.g. -1 -> 0, -2 -> 1.
|
||||
// dim -> dim - 1, dim + 1 -> dim - 2.
|
||||
// the code supports repeated reflections, although this
|
||||
// would only be needed in pathological cases.
|
||||
if (s_in_wave < 0)
|
||||
s_in_wave = -s_in_wave - 1;
|
||||
else
|
||||
s_in_wave = 2 * wave_dim - 1 - s_in_wave;
|
||||
}
|
||||
(*window)[s] = wave[s_in_wave];
|
||||
}
|
||||
}
|
||||
|
||||
ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
|
||||
}
|
||||
|
||||
// Subtracts the mean of d[0..n) from each of those n values, in place.
static void RemoveDcOffset(float *d, int32_t n) {
    float total = 0;
    for (const float *p = d; p != d + n; ++p) {
        total += *p;
    }

    const float mean = total / n;

    for (float *p = d; p != d + n; ++p) {
        *p -= mean;
    }
}
|
||||
|
||||
// Returns the sum over i in [0, n) of a[i] * b[i], accumulated in float in
// index order.
float InnerProduct(const float *a, const float *b, int32_t n) {
    float acc = 0;
    const float *const a_end = a + n;
    for (; a != a_end; ++a, ++b) {
        acc += *a * *b;
    }
    return acc;
}
|
||||
|
||||
// Applies pre-emphasis in place: d[i] -= preemph_coeff * d[i - 1], walking
// from the last sample down so each step still sees the original d[i - 1].
// d[0] uses itself as its predecessor. A coefficient of 0 is a no-op; any
// other coefficient must lie in [0, 1].
static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
    if (preemph_coeff == 0.0) {
        return;
    }

    CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);

    int32_t i = n - 1;
    while (i > 0) {
        d[i] -= preemph_coeff * d[i - 1];
        --i;
    }
    d[0] -= preemph_coeff * d[0];
}
|
||||
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
float *window,
|
||||
float *log_energy_pre_window /*= nullptr*/) {
|
||||
int32_t frame_length = opts.WindowSize();
|
||||
|
||||
// TODO(fangjun): Remove dither
|
||||
CHECK_EQ(opts.dither, 0);
|
||||
|
||||
if (opts.remove_dc_offset) {
|
||||
RemoveDcOffset(window, frame_length);
|
||||
}
|
||||
|
||||
if (log_energy_pre_window != NULL) {
|
||||
float energy =
|
||||
std::max<float>(InnerProduct(window, window, frame_length),
|
||||
std::numeric_limits<float>::epsilon());
|
||||
*log_energy_pre_window = std::log(energy);
|
||||
}
|
||||
|
||||
if (opts.preemph_coeff != 0.0) {
|
||||
Preemphasize(window, frame_length, opts.preemph_coeff);
|
||||
}
|
||||
|
||||
window_function.Apply(window);
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,183 @@
|
||||
// kaldi-native-fbank/csrc/feature-window.h
|
||||
//
|
||||
// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-window.h
|
||||
|
||||
#ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
||||
#define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "base/log.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Returns the smallest power of two that is >= n. n must be positive.
// (Bit-smearing trick, copied from kaldi/src/base/kaldi-math.cc.)
inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
    CHECK_GT(n, 0);
    int32_t v = n - 1;
    // Smear the highest set bit into every lower position: shifts 1,2,4,8,16.
    for (int32_t shift = 1; shift <= 16; shift <<= 1) {
        v |= v >> shift;
    }
    return v + 1;
}
|
||||
|
||||
struct FrameExtractionOptions {
|
||||
float samp_freq = 16000;
|
||||
float frame_shift_ms = 10.0f; // in milliseconds.
|
||||
float frame_length_ms = 25.0f; // in milliseconds.
|
||||
float dither = 1.0f; // Amount of dithering, 0.0 means no dither.
|
||||
float preemph_coeff = 0.97f; // Preemphasis coefficient.
|
||||
bool remove_dc_offset = true; // Subtract mean of wave before FFT.
|
||||
std::string window_type = "povey"; // e.g. Hamming window
|
||||
// May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
|
||||
// "povey" is a window I made to be similar to Hamming but to go to zero at
|
||||
// the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think
|
||||
// the
|
||||
// Hamming window makes sense as a windowing function.
|
||||
bool round_to_power_of_two = true;
|
||||
float blackman_coeff = 0.42f;
|
||||
bool snip_edges = true;
|
||||
// bool allow_downsample = false;
|
||||
// bool allow_upsample = false;
|
||||
|
||||
// Used for streaming feature extraction. It indicates the number
|
||||
// of feature frames to keep in the recycling vector. -1 means to
|
||||
// keep all feature frames.
|
||||
int32_t max_feature_vectors = -1;
|
||||
|
||||
int32_t WindowShift() const {
|
||||
return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
|
||||
}
|
||||
int32_t WindowSize() const {
|
||||
return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
|
||||
}
|
||||
int32_t PaddedWindowSize() const {
|
||||
return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
|
||||
: WindowSize());
|
||||
}
|
||||
std::string ToString() const {
|
||||
std::ostringstream os;
|
||||
#define KNF_PRINT(x) os << #x << ": " << x << "\n"
|
||||
KNF_PRINT(samp_freq);
|
||||
KNF_PRINT(frame_shift_ms);
|
||||
KNF_PRINT(frame_length_ms);
|
||||
KNF_PRINT(dither);
|
||||
KNF_PRINT(preemph_coeff);
|
||||
KNF_PRINT(remove_dc_offset);
|
||||
KNF_PRINT(window_type);
|
||||
KNF_PRINT(round_to_power_of_two);
|
||||
KNF_PRINT(blackman_coeff);
|
||||
KNF_PRINT(snip_edges);
|
||||
// KNF_PRINT(allow_downsample);
|
||||
// KNF_PRINT(allow_upsample);
|
||||
KNF_PRINT(max_feature_vectors);
|
||||
#undef KNF_PRINT
|
||||
return os.str();
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
|
||||
|
||||
// Holds the per-sample window coefficients (window_) and applies them to a
// frame of audio in place.
class FeatureWindowFunction {
|
||||
public:
|
||||
// Default construction leaves window_ empty.
FeatureWindowFunction() = default;
|
||||
// Builds the window from the frame-extraction options; the definition is
// not part of this header (presumably it fills window_ with
// opts.WindowSize() coefficients -- confirm in feature-window.cc).
explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
|
||||
/**
|
||||
* @param wave Pointer to a 1-D array of shape [window_size].
|
||||
* It is modified in-place: wave[i] = wave[i] * window_[i].
|
||||
*
|
||||
*/
|
||||
void Apply(float *wave) const;
|
||||
|
||||
private:
|
||||
std::vector<float> window_; // of size opts.WindowSize()
|
||||
};
|
||||
|
||||
int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
|
||||
|
||||
/**
|
||||
This function returns the number of frames that we can extract from a wave
|
||||
file with the given number of samples in it (assumed to have the same
|
||||
sampling rate as specified in 'opts').
|
||||
|
||||
@param [in] num_samples The number of samples in the wave file.
|
||||
@param [in] opts The frame-extraction options class
|
||||
|
||||
@param [in] flush True if we are asserting that this number of samples
|
||||
is 'all there is', false if we are expecting more data to possibly come in. This
|
||||
only makes a difference to the answer
|
||||
if opts.snip_edges == false. For offline feature extraction you always want
|
||||
flush == true. In an online-decoding context, once you know (or decide) that
|
||||
no more data is coming in, you'd call it with flush == true at the end to
|
||||
flush out any remaining data.
|
||||
*/
|
||||
int32_t NumFrames(int64_t num_samples,
|
||||
const FrameExtractionOptions &opts,
|
||||
bool flush = true);
|
||||
|
||||
/*
|
||||
ExtractWindow() extracts a windowed frame of waveform (possibly with a
|
||||
power-of-two, padded size, depending on the config), including all the
|
||||
processing done by ProcessWindow().
|
||||
|
||||
@param [in] sample_offset If 'wave' is not the entire waveform, but
|
||||
part of it to the left has been discarded, then the
|
||||
number of samples prior to 'wave' that we have
|
||||
already discarded. Set this to zero if you are
|
||||
processing the entire waveform in one piece, or
|
||||
if you get 'no matching function' compilation
|
||||
errors when updating the code.
|
||||
@param [in] wave The waveform
|
||||
@param [in] f The frame index to be extracted, with
|
||||
0 <= f < NumFrames(sample_offset + wave.size(), opts, true)
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function, as derived from the
|
||||
options class.
|
||||
@param [out] window The windowed, possibly-padded waveform to be
|
||||
extracted. Will be resized as needed.
|
||||
@param [out] log_energy_pre_window If non-NULL, the log-energy of
|
||||
the signal prior to pre-emphasis and multiplying by
|
||||
the windowing function will be written to here.
|
||||
*/
|
||||
void ExtractWindow(int64_t sample_offset,
|
||||
const std::vector<float> &wave,
|
||||
int32_t f,
|
||||
const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
std::vector<float> *window,
|
||||
float *log_energy_pre_window = nullptr);
|
||||
|
||||
/**
|
||||
This function does all the windowing steps after actually
|
||||
extracting the windowed signal: depending on the
|
||||
configuration, it does dithering, dc offset removal,
|
||||
preemphasis, and multiplication by the windowing function.
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function-- should have
|
||||
been initialized using 'opts'.
|
||||
@param [in,out] window A vector of size opts.WindowSize(). Note:
|
||||
it will typically be a sub-vector of a larger vector of size
|
||||
opts.PaddedWindowSize(), with the remaining samples zero,
|
||||
as the FFT code is more efficient if it operates on data with
|
||||
power-of-two size.
|
||||
@param [out] log_energy_pre_window If non-NULL, then after dithering and
|
||||
DC offset removal, this function will write to this pointer the log of
|
||||
the total energy (i.e. sum-squared) of the frame.
|
||||
*/
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
float *window,
|
||||
float *log_energy_pre_window = nullptr);
|
||||
|
||||
// Compute the inner product of two vectors
|
||||
float InnerProduct(const float *a, const float *b, int32_t n);
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,277 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/mel-computations.cc
|
||||
|
||||
#include "frontend/audio/mel-computations.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#include "frontend/audio/feature-window.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Writes the text produced by MelBanksOptions::ToString() to `os` and
// returns the stream so that insertions can be chained.
std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
    return os << opts.ToString();
}
|
||||
|
||||
float MelBanks::VtlnWarpFreq(
|
||||
float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
|
||||
float vtln_high_cutoff,
|
||||
float low_freq, // upper+lower frequency cutoffs in mel computation
|
||||
float high_freq,
|
||||
float vtln_warp_factor,
|
||||
float freq) {
|
||||
/// This computes a VTLN warping function that is not the same as HTK's one,
|
||||
/// but has similar inputs (this function has the advantage of never
|
||||
/// producing
|
||||
/// empty bins).
|
||||
|
||||
/// This function computes a warp function F(freq), defined between low_freq
|
||||
/// and high_freq inclusive, with the following properties:
|
||||
/// F(low_freq) == low_freq
|
||||
/// F(high_freq) == high_freq
|
||||
/// The function is continuous and piecewise linear with two inflection
|
||||
/// points.
|
||||
/// The lower inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at frequency l, determined as described below.
|
||||
/// The higher inflection point is at a frequency h, determined as
|
||||
/// described below.
|
||||
/// If l <= f <= h, then F(f) = f/vtln_warp_factor.
|
||||
/// If the higher inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
|
||||
/// Since (by the last point) F(h) == h/vtln_warp_factor, then
|
||||
/// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
|
||||
/// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
|
||||
/// = vtln_high_cutoff * min(1, vtln_warp_factor).
|
||||
/// If the lower inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
|
||||
/// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
|
||||
/// = vtln_low_cutoff * max(1, vtln_warp_factor)
|
||||
|
||||
if (freq < low_freq || freq > high_freq)
|
||||
return freq; // in case this gets called
|
||||
// for out-of-range frequencies, just return the freq.
|
||||
|
||||
CHECK_GT(vtln_low_cutoff, low_freq);
|
||||
CHECK_LT(vtln_high_cutoff, high_freq);
|
||||
|
||||
float one = 1.0f;
|
||||
float l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
|
||||
float h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
|
||||
float scale = 1.0f / vtln_warp_factor;
|
||||
float Fl = scale * l; // F(l);
|
||||
float Fh = scale * h; // F(h);
|
||||
CHECK(l > low_freq && h < high_freq);
|
||||
// slope of left part of the 3-piece linear function
|
||||
float scale_left = (Fl - low_freq) / (l - low_freq);
|
||||
// [slope of center part is just "scale"]
|
||||
|
||||
// slope of right part of the 3-piece linear function
|
||||
float scale_right = (high_freq - Fh) / (high_freq - h);
|
||||
|
||||
if (freq < l) {
|
||||
return low_freq + scale_left * (freq - low_freq);
|
||||
} else if (freq < h) {
|
||||
return scale * freq;
|
||||
} else { // freq >= h
|
||||
return high_freq + scale_right * (freq - high_freq);
|
||||
}
|
||||
}
|
||||
|
||||
float MelBanks::VtlnWarpMelFreq(
|
||||
float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
|
||||
float vtln_high_cutoff,
|
||||
float low_freq, // upper+lower frequency cutoffs in mel computation
|
||||
float high_freq,
|
||||
float vtln_warp_factor,
|
||||
float mel_freq) {
|
||||
return MelScale(VtlnWarpFreq(vtln_low_cutoff,
|
||||
vtln_high_cutoff,
|
||||
low_freq,
|
||||
high_freq,
|
||||
vtln_warp_factor,
|
||||
InverseMelScale(mel_freq)));
|
||||
}
|
||||
|
||||
// Builds the triangular mel filterbank.
//
// @param opts             Mel options (number of bins, frequency cutoffs,
//                         VTLN cutoffs, debug / HTK flags).
// @param frame_opts       Supplies samp_freq and PaddedWindowSize().
// @param vtln_warp_factor VTLN warp factor; the bin edges are warped only
//                         when it differs from 1.0.
//
// Invalid configurations are reported through LOG(FATAL) / CHECK.
MelBanks::MelBanks(const MelBanksOptions &opts,
                   const FrameExtractionOptions &frame_opts,
                   float vtln_warp_factor)
    : debug_(opts.debug_mel), htk_mode_(opts.htk_mode) {
    const int32_t num_bins = opts.num_bins;
    if (num_bins < 3) LOG(FATAL) << "Must have at least 3 mel bins";

    const float sample_freq = frame_opts.samp_freq;
    const int32_t window_length_padded = frame_opts.PaddedWindowSize();
    CHECK_EQ(window_length_padded % 2, 0);

    const int32_t num_fft_bins = window_length_padded / 2;
    const float nyquist = 0.5f * sample_freq;

    // A non-positive opts.high_freq is an offset from the Nyquist frequency.
    const float low_freq = opts.low_freq;
    const float high_freq = (opts.high_freq > 0.0f)
                                ? opts.high_freq
                                : nyquist + opts.high_freq;

    if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
        high_freq > nyquist || high_freq <= low_freq) {
        LOG(FATAL) << "Bad values in options: low-freq " << low_freq
                   << " and high-freq " << high_freq << " vs. nyquist "
                   << nyquist;
    }

    // fft-bin width [think of it as Nyquist-freq / half-window-length]
    const float fft_bin_width = sample_freq / window_length_padded;

    const float mel_low_freq = MelScale(low_freq);
    const float mel_high_freq = MelScale(high_freq);

    // Divide by num_bins + 1 because of end-effects where the bins spread
    // out to the sides.
    const float mel_freq_delta =
        (mel_high_freq - mel_low_freq) / (num_bins + 1);

    // A negative vtln_high is an offset from the Nyquist frequency.
    const float vtln_low = opts.vtln_low;
    float vtln_high = opts.vtln_high;
    if (vtln_high < 0.0f) {
        vtln_high += nyquist;
    }

    if (vtln_warp_factor != 1.0f &&
        (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
         vtln_high <= 0.0f || vtln_high >= high_freq ||
         vtln_high <= vtln_low)) {
        LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
                   << " and vtln-high " << vtln_high << ", versus "
                   << "low-freq " << low_freq << " and high-freq " << high_freq;
    }

    // The mel value of each FFT bin's center frequency does not depend on
    // the mel bin being built, so compute it once instead of once per bin.
    std::vector<float> fft_bin_mels(num_fft_bins);
    for (int32_t i = 0; i < num_fft_bins; ++i) {
        fft_bin_mels[i] = MelScale(fft_bin_width * i);
    }

    bins_.resize(num_bins);
    center_freqs_.resize(num_bins);

    for (int32_t bin = 0; bin < num_bins; ++bin) {
        float left_mel = mel_low_freq + bin * mel_freq_delta;
        float center_mel = mel_low_freq + (bin + 1) * mel_freq_delta;
        float right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

        if (vtln_warp_factor != 1.0f) {
            left_mel = VtlnWarpMelFreq(vtln_low,
                                       vtln_high,
                                       low_freq,
                                       high_freq,
                                       vtln_warp_factor,
                                       left_mel);
            center_mel = VtlnWarpMelFreq(vtln_low,
                                         vtln_high,
                                         low_freq,
                                         high_freq,
                                         vtln_warp_factor,
                                         center_mel);
            right_mel = VtlnWarpMelFreq(vtln_low,
                                        vtln_high,
                                        low_freq,
                                        high_freq,
                                        vtln_warp_factor,
                                        right_mel);
        }
        center_freqs_[bin] = InverseMelScale(center_mel);

        // this_bin will be a vector of coefficients that is only
        // nonzero where this mel bin is active.
        std::vector<float> this_bin(num_fft_bins);

        int32_t first_index = -1, last_index = -1;
        for (int32_t i = 0; i < num_fft_bins; ++i) {
            const float mel = fft_bin_mels[i];
            if (mel > left_mel && mel < right_mel) {
                // Rising edge up to the center, falling edge after it.
                float weight;
                if (mel <= center_mel)
                    weight = (mel - left_mel) / (center_mel - left_mel);
                else
                    weight = (right_mel - mel) / (right_mel - center_mel);
                this_bin[i] = weight;
                if (first_index == -1) first_index = i;
                last_index = i;
            }
        }
        CHECK(first_index != -1 && last_index >= first_index &&
              "You may have set num_mel_bins too large.");

        // Store only the active range [first_index, last_index] together
        // with its offset into the FFT bins.
        bins_[bin].first = first_index;
        bins_[bin].second.assign(this_bin.begin() + first_index,
                                 this_bin.begin() + last_index + 1);

        // Replicate a bug in HTK, for testing purposes.
        if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
            bins_[bin].second[0] = 0.0;
        }
    }  // for (int32_t bin = 0; bin < num_bins; ++bin) {

    if (debug_) {
        std::ostringstream os;
        for (size_t i = 0; i < bins_.size(); i++) {
            os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
            for (auto k : bins_[i].second) os << k << ", ";
            os << "\n";
        }
        LOG(INFO) << os.str();
    }
}
|
||||
|
||||
// "power_spectrum" contains fft energies.
|
||||
void MelBanks::Compute(const float *power_spectrum,
|
||||
float *mel_energies_out) const {
|
||||
int32_t num_bins = bins_.size();
|
||||
|
||||
for (int32_t i = 0; i < num_bins; i++) {
|
||||
int32_t offset = bins_[i].first;
|
||||
const auto &v = bins_[i].second;
|
||||
float energy = 0;
|
||||
for (int32_t k = 0; k != v.size(); ++k) {
|
||||
energy += v[k] * power_spectrum[k + offset];
|
||||
}
|
||||
|
||||
// HTK-like flooring- for testing purposes (we prefer dither)
|
||||
if (htk_mode_ && energy < 1.0) {
|
||||
energy = 1.0;
|
||||
}
|
||||
|
||||
mel_energies_out[i] = energy;
|
||||
|
||||
// The following assert was added due to a problem with OpenBlas that
|
||||
// we had at one point (it was a bug in that library). Just to detect
|
||||
// it early.
|
||||
CHECK_EQ(energy, energy); // check that energy is not nan
|
||||
}
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "MEL BANKS:\n");
|
||||
for (int32_t i = 0; i < num_bins; i++)
|
||||
fprintf(stderr, " %f", mel_energies_out[i]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,120 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// This file is copied/modified from kaldi/src/feat/mel-computations.h
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
|
||||
#include "frontend/audio/feature-window.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Configuration of the mel filterbank.
struct MelBanksOptions {
    // Number of triangular bins, e.g. 25.
    int32_t num_bins = 25;

    // Lower frequency cutoff, e.g. 20.
    float low_freq = 20;

    // Upper frequency cutoff: 0 means no cutoff; a negative value is added
    // to the Nyquist frequency to get the cutoff.
    float high_freq = 0;

    // Lower cutoff of the VTLN warping function.
    float vtln_low = 100;

    // Upper cutoff of the VTLN warping function: a negative value is added
    // to the Nyquist frequency to get the cutoff.
    float vtln_high = -500;

    bool debug_mel = false;

    // htk_mode is a "hidden" config, it does not show up on command line.
    // Enables more exact compatibility with HTK, for testing purposes.
    // Affects mel-energy flooring and reproduces a bug in HTK.
    bool htk_mode = false;

    // Returns one "name: value\n" line per option, in declaration order.
    std::string ToString() const {
        std::ostringstream text;
        text << "num_bins: " << num_bins << "\n"
             << "low_freq: " << low_freq << "\n"
             << "high_freq: " << high_freq << "\n"
             << "vtln_low: " << vtln_low << "\n"
             << "vtln_high: " << vtln_high << "\n"
             << "debug_mel: " << debug_mel << "\n"
             << "htk_mode: " << htk_mode << "\n";
        return text.str();
    }
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
|
||||
|
||||
class MelBanks {
|
||||
public:
|
||||
static inline float InverseMelScale(float mel_freq) {
|
||||
return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
|
||||
}
|
||||
|
||||
static inline float MelScale(float freq) {
|
||||
return 1127.0f * logf(1.0f + freq / 700.0f);
|
||||
}
|
||||
|
||||
static float VtlnWarpFreq(
|
||||
float vtln_low_cutoff,
|
||||
float vtln_high_cutoff, // discontinuities in warp func
|
||||
float low_freq,
|
||||
float high_freq, // upper+lower frequency cutoffs in
|
||||
// the mel computation
|
||||
float vtln_warp_factor,
|
||||
float freq);
|
||||
|
||||
static float VtlnWarpMelFreq(float vtln_low_cutoff,
|
||||
float vtln_high_cutoff,
|
||||
float low_freq,
|
||||
float high_freq,
|
||||
float vtln_warp_factor,
|
||||
float mel_freq);
|
||||
|
||||
// TODO(fangjun): Remove vtln_warp_factor
|
||||
MelBanks(const MelBanksOptions &opts,
|
||||
const FrameExtractionOptions &frame_opts,
|
||||
float vtln_warp_factor);
|
||||
|
||||
/// Compute Mel energies (note: not log energies).
|
||||
/// At input, "fft_energies" contains the FFT energies (not log).
|
||||
///
|
||||
/// @param fft_energies 1-D array of size num_fft_bins/2+1
|
||||
/// @param mel_energies_out 1-D array of size num_mel_bins
|
||||
void Compute(const float *fft_energies, float *mel_energies_out) const;
|
||||
|
||||
int32_t NumBins() const { return bins_.size(); }
|
||||
|
||||
private:
|
||||
// center frequencies of bins, numbered from 0 ... num_bins-1.
|
||||
// Needed by GetCenterFreqs().
|
||||
std::vector<float> center_freqs_;
|
||||
|
||||
// the "bins_" vector is a vector, one for each bin, of a pair:
|
||||
// (the first nonzero fft-bin), (the vector of weights).
|
||||
std::vector<std::pair<int32_t, std::vector<float>>> bins_;
|
||||
|
||||
// TODO(fangjun): Remove debug_ and htk_mode_
|
||||
bool debug_;
|
||||
bool htk_mode_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "frontend/audio/rfft.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "base/log.h"
|
||||
|
||||
// see fftsg.c
|
||||
#ifdef __cplusplus
|
||||
extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
|
||||
#else
|
||||
void rdft(int n, int isgn, double *a, int *ip, double *w);
|
||||
#endif
|
||||
|
||||
namespace knf {
|
||||
// Implementation of Rfft on top of rdft() from fftsg.c. It owns the two
// work areas that rdft() takes alongside the data (ip_ and w_).
class Rfft::RfftImpl {
  public:
    // @param n  Transform size; must be a power of two and at least 2.
    //
    // n is validated before it is used to size the work areas, so an
    // invalid size fails the check instead of reaching sqrt() / the vector
    // constructors with a non-positive value.
    explicit RfftImpl(int32_t n)
        : n_(CheckedSize(n)),
          ip_(static_cast<size_t>(2 + std::sqrt(n_ / 2))),
          w_(n_ / 2) {}

    // In-place forward transform of n_ floats. rdft() works on doubles, so
    // the data is widened into a temporary buffer and narrowed back.
    void Compute(float *in_out) {
        std::vector<double> d(in_out, in_out + n_);

        Compute(d.data());

        for (int32_t i = 0; i < n_; ++i) {
            in_out[i] = static_cast<float>(d[i]);
        }
    }

    // In-place forward transform of n_ doubles.
    void Compute(double *in_out) {
        // 1 means forward fft
        rdft(n_, 1, in_out, ip_.data(), w_.data());
    }

  private:
    // Returns n unchanged after checking that it is a valid transform
    // size. The bit test alone would also accept 0 and 1.
    static int32_t CheckedSize(int32_t n) {
        CHECK_EQ(n >= 2, true);
        CHECK_EQ(n & (n - 1), 0);
        return n;
    }

    int32_t n_;                // transform size
    std::vector<int32_t> ip_;  // integer work area passed to rdft()
    std::vector<double> w_;    // floating-point work area passed to rdft()
};
|
||||
|
||||
// Creates the implementation object for an n-point transform; n is passed
// to RfftImpl, which checks it.
Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
|
||||
|
||||
// Defined in this file, where RfftImpl is a complete type, so that the
// std::unique_ptr<RfftImpl> member can be destroyed.
Rfft::~Rfft() = default;
|
||||
|
||||
// Both overloads forward to the implementation object.
void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
|
||||
void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace knf {
|
||||
|
||||
// n-point Real discrete Fourier transform
|
||||
// where n is a power of 2. n >= 2
|
||||
//
|
||||
// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
|
||||
// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
|
||||
// Public handle for the real FFT. The implementation lives in RfftImpl,
// which is only forward-declared here and held through a unique_ptr.
class Rfft {
|
||||
public:
|
||||
// @param n Number of fft bins. It should be a power of 2.
|
||||
explicit Rfft(int32_t n);
|
||||
~Rfft();
|
||||
|
||||
/** @param in_out A 1-D array of size n.
|
||||
* On return:
|
||||
* in_out[0] = R[0]
|
||||
* in_out[1] = R[n/2]
|
||||
* for 0 < k < n/2,
|
||||
* in_out[2*k] = R[k]
|
||||
* in_out[2*k+1] = I[k]
|
||||
*
|
||||
*/
|
||||
void Compute(float *in_out);
|
||||
// Same transform and output layout as above, for double data.
void Compute(double *in_out);
|
||||
|
||||
private:
|
||||
class RfftImpl;
|
||||
std::unique_ptr<RfftImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
Loading…
Reference in new issue