[audio]replace kaldi fbank with kaldi-native-fbank in paddleaudio (#2799)
* replace kaldi_fbank with kaldi-native-fbank in paddleaudio * fix mac
pull/2801/head
parent
964211a81b
commit
d7a6268bcc
@ -0,0 +1,22 @@
|
||||
# Sources include their headers as "kaldi-native-fbank/csrc/<name>.h", so the
# include root is two directories above this one.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../)

# Core feature-extraction library.
add_library(kaldi-native-fbank-core
  feature-fbank.cc
  feature-functions.cc
  feature-window.cc
  fftsg.c
  log.cc
  mel-computations.cc
  rfft.cc
)

# log.h uses std::call_once(), which needs -pthread at link time on
# non-Windows platforms.
if(NOT WIN32)
  target_link_libraries(kaldi-native-fbank-core -pthread)
endif()

# Forward each optional-header switch that is enabled as a private
# compile definition with value 1.
foreach(knf_switch KNF_HAVE_EXECINFO_H KNF_HAVE_CXXABI_H)
  if(${knf_switch})
    target_compile_definitions(kaldi-native-fbank-core PRIVATE ${knf_switch}=1)
  endif()
endforeach()
|
@ -0,0 +1,117 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
|
||||
//
|
||||
#include "kaldi-native-fbank/csrc/feature-fbank.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-functions.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Replaces each of the first n values of in_out with its square root,
// in place.
static void Sqrt(float *in_out, int32_t n) {
  float *const end = in_out + n;
  for (float *value = in_out; value != end; ++value) {
    *value = std::sqrt(*value);
  }
}
|
||||
|
||||
// Streams the textual form produced by FbankOptions::ToString().
std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
  return os << opts.ToString();
}
|
||||
|
||||
// Builds a computer for the given options.
//
// log_energy_floor_ always gets a defined value: 0 when the floor is disabled
// (opts.energy_floor <= 0), otherwise log(opts.energy_floor).  Compute() only
// consults it when the floor is enabled, so the 0 is never used as a floor;
// it merely avoids leaving the member indeterminate.
FbankComputer::FbankComputer(const FbankOptions &opts)
    : opts_(opts),
      log_energy_floor_(0.0f),
      rfft_(opts.frame_opts.PaddedWindowSize()) {
  if (opts.energy_floor > 0.0f) {
    // std::log(float) is the float overload; <cmath> is only guaranteed to
    // declare it in namespace std, unlike the unqualified logf.
    log_energy_floor_ = std::log(opts.energy_floor);
  }

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0f);
}
|
||||
|
||||
// Frees every cached MelBanks; the map stores owning raw pointers.
FbankComputer::~FbankComputer() {
  for (auto &entry : mel_banks_) {
    delete entry.second;
  }
}
|
||||
|
||||
// Returns the MelBanks for the given VTLN warp factor, creating and caching
// it on first use.  The returned pointer stays owned by this object.
const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
  const auto cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }

  MelBanks *created = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
}
|
||||
|
||||
void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
|
||||
std::vector<float> *signal_frame, float *feature) {
|
||||
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
|
||||
|
||||
KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
|
||||
|
||||
// Compute energy after window function (not the raw one).
|
||||
if (opts_.use_energy && !opts_.raw_energy) {
|
||||
signal_raw_log_energy = std::log(
|
||||
std::max<float>(InnerProduct(signal_frame->data(), signal_frame->data(),
|
||||
signal_frame->size()),
|
||||
std::numeric_limits<float>::epsilon()));
|
||||
}
|
||||
rfft_.Compute(signal_frame->data()); // signal_frame is modified in-place
|
||||
ComputePowerSpectrum(signal_frame);
|
||||
|
||||
// Use magnitude instead of power if requested.
|
||||
if (!opts_.use_power) {
|
||||
Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
|
||||
}
|
||||
|
||||
int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
|
||||
|
||||
// Its length is opts_.mel_opts.num_bins
|
||||
float *mel_energies = feature + mel_offset;
|
||||
|
||||
// Sum with mel filter banks over the power spectrum
|
||||
mel_banks.Compute(signal_frame->data(), mel_energies);
|
||||
|
||||
if (opts_.use_log_fbank) {
|
||||
// Avoid log of zero (which should be prevented anyway by dithering).
|
||||
for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
|
||||
auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
|
||||
mel_energies[i] = std::log(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy energy as first value (or the last, if htk_compat == true).
|
||||
if (opts_.use_energy) {
|
||||
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
|
||||
signal_raw_log_energy = log_energy_floor_;
|
||||
}
|
||||
int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
|
||||
feature[energy_index] = signal_raw_log_energy;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,132 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-fbank.h
|
||||
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-window.h"
|
||||
#include "kaldi-native-fbank/csrc/mel-computations.h"
|
||||
#include "kaldi-native-fbank/csrc/rfft.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
struct FbankOptions {
|
||||
FrameExtractionOptions frame_opts;
|
||||
MelBanksOptions mel_opts;
|
||||
// append an extra dimension with energy to the filter banks
|
||||
bool use_energy = false;
|
||||
float energy_floor = 0.0f; // active iff use_energy==true
|
||||
|
||||
// If true, compute log_energy before preemphasis and windowing
|
||||
// If false, compute log_energy after preemphasis ans windowing
|
||||
bool raw_energy = true; // active iff use_energy==true
|
||||
|
||||
// If true, put energy last (if using energy)
|
||||
// If false, put energy first
|
||||
bool htk_compat = false; // active iff use_energy==true
|
||||
|
||||
// if true (default), produce log-filterbank, else linear
|
||||
bool use_log_fbank = true;
|
||||
|
||||
// if true (default), use power in filterbank
|
||||
// analysis, else magnitude.
|
||||
bool use_power = true;
|
||||
|
||||
FbankOptions() { mel_opts.num_bins = 23; }
|
||||
|
||||
std::string ToString() const {
|
||||
std::ostringstream os;
|
||||
os << "frame_opts: \n";
|
||||
os << frame_opts << "\n";
|
||||
os << "\n";
|
||||
|
||||
os << "mel_opts: \n";
|
||||
os << mel_opts << "\n";
|
||||
|
||||
os << "use_energy: " << use_energy << "\n";
|
||||
os << "energy_floor: " << energy_floor << "\n";
|
||||
os << "raw_energy: " << raw_energy << "\n";
|
||||
os << "htk_compat: " << htk_compat << "\n";
|
||||
os << "use_log_fbank: " << use_log_fbank << "\n";
|
||||
os << "use_power: " << use_power << "\n";
|
||||
return os.str();
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
|
||||
|
||||
class FbankComputer {
|
||||
public:
|
||||
using Options = FbankOptions;
|
||||
|
||||
explicit FbankComputer(const FbankOptions &opts);
|
||||
~FbankComputer();
|
||||
|
||||
int32_t Dim() const {
|
||||
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
|
||||
}
|
||||
|
||||
// if true, compute log_energy_pre_window but after dithering and dc removal
|
||||
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
|
||||
|
||||
const FrameExtractionOptions &GetFrameOptions() const {
|
||||
return opts_.frame_opts;
|
||||
}
|
||||
|
||||
const FbankOptions &GetOptions() const { return opts_; }
|
||||
|
||||
/**
|
||||
Function that computes one frame of features from
|
||||
one frame of signal.
|
||||
|
||||
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
|
||||
prior to windowing and pre-emphasis, or
|
||||
log(numeric_limits<float>::min()), whichever is greater. Must be
|
||||
ignored by this function if this class returns false from
|
||||
this->NeedsRawLogEnergy().
|
||||
@param [in] vtln_warp The VTLN warping factor that the user wants
|
||||
to be applied when computing features for this utterance. Will
|
||||
normally be 1.0, meaning no warping is to be done. The value will
|
||||
be ignored for feature types that don't support VLTN, such as
|
||||
spectrogram features.
|
||||
@param [in] signal_frame One frame of the signal,
|
||||
as extracted using the function ExtractWindow() using the options
|
||||
returned by this->GetFrameOptions(). The function will use the
|
||||
vector as a workspace, which is why it's a non-const pointer.
|
||||
@param [out] feature Pointer to a vector of size this->Dim(), to which
|
||||
the computed feature will be written. It should be pre-allocated.
|
||||
*/
|
||||
void Compute(float signal_raw_log_energy, float vtln_warp,
|
||||
std::vector<float> *signal_frame, float *feature);
|
||||
|
||||
private:
|
||||
const MelBanks *GetMelBanks(float vtln_warp);
|
||||
|
||||
FbankOptions opts_;
|
||||
float log_energy_floor_;
|
||||
std::map<float, MelBanks *> mel_banks_; // float is VTLN coefficient.
|
||||
Rfft rfft_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
|
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-functions.cc
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-functions.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Converts a packed half-spectrum into a power spectrum, in place.
//
// On input complex_fft holds [real0, realN/2, real1, im1, real2, im2, ...].
// On output its first dim/2 + 1 elements hold the energies of the bins from
// zero up to the Nyquist frequency; the remaining elements are left as they
// were.
//
// Vectors with fewer than two elements cannot hold the packed DC/Nyquist
// pair; they are left untouched instead of being read out of bounds.
void ComputePowerSpectrum(std::vector<float> *complex_fft) {
  const int32_t dim = static_cast<int32_t>(complex_fft->size());
  if (dim < 2) {
    return;
  }

  float *p = complex_fft->data();
  const int32_t half_dim = dim / 2;

  // The DC and Nyquist terms are purely real and share the first two slots;
  // save them before the loop overwrites p[1].
  const float first_energy = p[0] * p[0];
  const float last_energy = p[1] * p[1];

  for (int32_t i = 1; i < half_dim; ++i) {
    const float real = p[i * 2];
    const float im = p[i * 2 + 1];
    p[i] = real * real + im * im;
  }

  p[0] = first_energy;
  p[half_dim] = last_energy;  // Will actually never be used, and anyway
  // if the signal has been bandlimited sensibly this should be zero.
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,38 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-functions.h
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
||||
#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
||||
|
||||
#include <vector>
|
||||
namespace knf {
|
||||
|
||||
// ComputePowerSpectrum takes a complex FFT (as produced by the FFT
// functions in csrc/rfft.h) and converts it, in place, into
// a power spectrum.  If the complex FFT is a vector of size n (representing
// half of the complex FFT of a real signal of size n, as described there),
// this function computes in the first (n/2) + 1 elements of it, the
// energies of the fft bins from zero to the Nyquist frequency.  Contents of the
// remaining (n/2) - 1 elements are undefined at output.
|
||||
|
||||
void ComputePowerSpectrum(std::vector<float> *complex_fft);
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
|
@ -0,0 +1,236 @@
|
||||
// kaldi-native-fbank/csrc/feature-window.cc
|
||||
//
|
||||
// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-window.cc
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-window.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Streams the textual form produced by FrameExtractionOptions::ToString().
std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
  return os << opts.ToString();
}
|
||||
|
||||
// Precomputes the window coefficients for opts.window_type over
// opts.WindowSize() samples.  An unknown window type is reported through
// KNF_LOG(FATAL).
FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
    : window_(opts.WindowSize()) {
  int32_t frame_length = opts.WindowSize();
  KNF_CHECK_GT(frame_length, 0);

  const std::string &type = opts.window_type;
  float *coeffs = window_.data();

  // Angular step such that the last sample sits at a phase of 2*pi.
  const double a = M_2PI / (frame_length - 1);

  for (int32_t i = 0; i != frame_length; ++i) {
    const double i_fl = static_cast<double>(i);

    if (type == "hanning") {
      coeffs[i] = 0.5 - 0.5 * std::cos(a * i_fl);
    } else if (type == "sine") {
      // Compared with the usual textbook formula, note that
      // 0.5 * a = M_PI / (frame_length - 1).
      coeffs[i] = std::sin(0.5 * a * i_fl);
    } else if (type == "hamming") {
      coeffs[i] = 0.54 - 0.46 * std::cos(a * i_fl);
    } else if (type == "povey") {
      // Like hamming, but goes to zero at the edges.
      coeffs[i] = std::pow(0.5 - 0.5 * std::cos(a * i_fl), 0.85);
    } else if (type == "rectangular") {
      coeffs[i] = 1.0;
    } else if (type == "blackman") {
      coeffs[i] = opts.blackman_coeff - 0.5 * std::cos(a * i_fl) +
                  (0.5 - opts.blackman_coeff) * std::cos(2 * a * i_fl);
    } else {
      KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
    }
  }
}
|
||||
|
||||
void FeatureWindowFunction::Apply(float *wave) const {
|
||||
int32_t window_size = window_.size();
|
||||
const float *p = window_.data();
|
||||
for (int32_t k = 0; k != window_size; ++k) {
|
||||
wave[k] *= p[k];
|
||||
}
|
||||
}
|
||||
|
||||
int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
|
||||
int64_t frame_shift = opts.WindowShift();
|
||||
if (opts.snip_edges) {
|
||||
return frame * frame_shift;
|
||||
} else {
|
||||
int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2,
|
||||
beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
|
||||
return beginning_of_frame;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
|
||||
bool flush /*= true*/) {
|
||||
int64_t frame_shift = opts.WindowShift();
|
||||
int64_t frame_length = opts.WindowSize();
|
||||
if (opts.snip_edges) {
|
||||
// with --snip-edges=true (the default), we use a HTK-like approach to
|
||||
// determining the number of frames-- all frames have to fit completely into
|
||||
// the waveform, and the first frame begins at sample zero.
|
||||
if (num_samples < frame_length)
|
||||
return 0;
|
||||
else
|
||||
return (1 + ((num_samples - frame_length) / frame_shift));
|
||||
// You can understand the expression above as follows: 'num_samples -
|
||||
// frame_length' is how much room we have to shift the frame within the
|
||||
// waveform; 'frame_shift' is how much we shift it each time; and the ratio
|
||||
// is how many times we can shift it (integer arithmetic rounds down).
|
||||
} else {
|
||||
// if --snip-edges=false, the number of frames is determined by rounding the
|
||||
// (file-length / frame-shift) to the nearest integer. The point of this
|
||||
// formula is to make the number of frames an obvious and predictable
|
||||
// function of the frame shift and signal length, which makes many
|
||||
// segmentation-related questions simpler.
|
||||
//
|
||||
// Because integer division in C++ rounds toward zero, we add (half the
|
||||
// frame-shift minus epsilon) before dividing, to have the effect of
|
||||
// rounding towards the closest integer.
|
||||
int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
|
||||
|
||||
if (flush) return num_frames;
|
||||
|
||||
// note: 'end' always means the last plus one, i.e. one past the last.
|
||||
int64_t end_sample_of_last_frame =
|
||||
FirstSampleOfFrame(num_frames - 1, opts) + frame_length;
|
||||
|
||||
// the following code is optimized more for clarity than efficiency.
|
||||
// If flush == false, we can't output frames that extend past the end
|
||||
// of the signal.
|
||||
while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
|
||||
num_frames--;
|
||||
end_sample_of_last_frame -= frame_shift;
|
||||
}
|
||||
return num_frames;
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
|
||||
int32_t f, const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
std::vector<float> *window,
|
||||
float *log_energy_pre_window /*= nullptr*/) {
|
||||
KNF_CHECK(sample_offset >= 0 && wave.size() != 0);
|
||||
|
||||
int32_t frame_length = opts.WindowSize();
|
||||
int32_t frame_length_padded = opts.PaddedWindowSize();
|
||||
|
||||
int64_t num_samples = sample_offset + wave.size();
|
||||
int64_t start_sample = FirstSampleOfFrame(f, opts);
|
||||
int64_t end_sample = start_sample + frame_length;
|
||||
|
||||
if (opts.snip_edges) {
|
||||
KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
|
||||
} else {
|
||||
KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
|
||||
}
|
||||
|
||||
if (window->size() != frame_length_padded) {
|
||||
window->resize(frame_length_padded);
|
||||
}
|
||||
|
||||
// wave_start and wave_end are start and end indexes into 'wave', for the
|
||||
// piece of wave that we're trying to extract.
|
||||
int32_t wave_start = int32_t(start_sample - sample_offset);
|
||||
int32_t wave_end = wave_start + frame_length;
|
||||
|
||||
if (wave_start >= 0 && wave_end <= wave.size()) {
|
||||
// the normal case-- no edge effects to consider.
|
||||
std::copy(wave.begin() + wave_start,
|
||||
wave.begin() + wave_start + frame_length, window->data());
|
||||
} else {
|
||||
// Deal with any end effects by reflection, if needed. This code will only
|
||||
// be reached for about two frames per utterance, so we don't concern
|
||||
// ourselves excessively with efficiency.
|
||||
int32_t wave_dim = wave.size();
|
||||
for (int32_t s = 0; s < frame_length; ++s) {
|
||||
int32_t s_in_wave = s + wave_start;
|
||||
while (s_in_wave < 0 || s_in_wave >= wave_dim) {
|
||||
// reflect around the beginning or end of the wave.
|
||||
// e.g. -1 -> 0, -2 -> 1.
|
||||
// dim -> dim - 1, dim + 1 -> dim - 2.
|
||||
// the code supports repeated reflections, although this
|
||||
// would only be needed in pathological cases.
|
||||
if (s_in_wave < 0)
|
||||
s_in_wave = -s_in_wave - 1;
|
||||
else
|
||||
s_in_wave = 2 * wave_dim - 1 - s_in_wave;
|
||||
}
|
||||
(*window)[s] = wave[s_in_wave];
|
||||
}
|
||||
}
|
||||
|
||||
ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
|
||||
}
|
||||
|
||||
// Subtracts the mean of the first n samples of d from each of them, in place.
static void RemoveDcOffset(float *d, int32_t n) {
  float total = 0;
  for (int32_t k = 0; k != n; ++k) {
    total += d[k];
  }

  const float mean = total / n;

  float *const end = d + n;
  for (float *sample = d; sample != end; ++sample) {
    *sample -= mean;
  }
}
|
||||
|
||||
// Returns the dot product of the first n elements of a and b, accumulated in
// single precision from index 0 upwards.
float InnerProduct(const float *a, const float *b, int32_t n) {
  float acc = 0;
  int32_t k = 0;
  while (k != n) {
    acc += a[k] * b[k];
    ++k;
  }
  return acc;
}
|
||||
|
||||
// Applies the pre-emphasis filter d[i] -= preemph_coeff * d[i - 1] in place,
// walking from the last sample down so each step reads the unmodified
// predecessor.  The first sample uses itself as its predecessor.  A
// coefficient of exactly zero is a no-op.
static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
  if (preemph_coeff != 0.0) {
    KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);

    int32_t idx = n - 1;
    while (idx > 0) {
      d[idx] -= preemph_coeff * d[idx - 1];
      --idx;
    }
    d[0] -= preemph_coeff * d[0];
  }
}
|
||||
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function, float *window,
|
||||
float *log_energy_pre_window /*= nullptr*/) {
|
||||
int32_t frame_length = opts.WindowSize();
|
||||
|
||||
// TODO(fangjun): Remove dither
|
||||
KNF_CHECK_EQ(opts.dither, 0);
|
||||
|
||||
if (opts.remove_dc_offset) {
|
||||
RemoveDcOffset(window, frame_length);
|
||||
}
|
||||
|
||||
if (log_energy_pre_window != NULL) {
|
||||
float energy = std::max<float>(InnerProduct(window, window, frame_length),
|
||||
std::numeric_limits<float>::epsilon());
|
||||
*log_energy_pre_window = std::log(energy);
|
||||
}
|
||||
|
||||
if (opts.preemph_coeff != 0.0) {
|
||||
Preemphasize(window, frame_length, opts.preemph_coeff);
|
||||
}
|
||||
|
||||
window_function.Apply(window);
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,178 @@
|
||||
// kaldi-native-fbank/csrc/feature-window.h
|
||||
//
|
||||
// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/feature-window.h
|
||||
|
||||
#ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
||||
#define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/log.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Rounds n (which must be positive) up to a power of two; a value that is
// already a power of two is returned unchanged.
// Copied from kaldi/src/base/kaldi-math.cc.
inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
  KNF_CHECK_GT(n, 0);
  --n;
  // Smear the highest set bit into every lower position (shifts 1, 2, 4, 8,
  // 16), then add one.
  for (int32_t shift = 1; shift <= 16; shift *= 2) {
    n |= n >> shift;
  }
  return n + 1;
}
|
||||
|
||||
struct FrameExtractionOptions {
|
||||
float samp_freq = 16000;
|
||||
float frame_shift_ms = 10.0f; // in milliseconds.
|
||||
float frame_length_ms = 25.0f; // in milliseconds.
|
||||
float dither = 1.0f; // Amount of dithering, 0.0 means no dither.
|
||||
float preemph_coeff = 0.97f; // Preemphasis coefficient.
|
||||
bool remove_dc_offset = true; // Subtract mean of wave before FFT.
|
||||
std::string window_type = "povey"; // e.g. Hamming window
|
||||
// May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
|
||||
// "povey" is a window I made to be similar to Hamming but to go to zero at
|
||||
// the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
|
||||
// Hamming window makes sense as a windowing function.
|
||||
bool round_to_power_of_two = true;
|
||||
float blackman_coeff = 0.42f;
|
||||
bool snip_edges = true;
|
||||
// bool allow_downsample = false;
|
||||
// bool allow_upsample = false;
|
||||
|
||||
// Used for streaming feature extraction. It indicates the number
|
||||
// of feature frames to keep in the recycling vector. -1 means to
|
||||
// keep all feature frames.
|
||||
int32_t max_feature_vectors = -1;
|
||||
|
||||
int32_t WindowShift() const {
|
||||
return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
|
||||
}
|
||||
int32_t WindowSize() const {
|
||||
return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
|
||||
}
|
||||
int32_t PaddedWindowSize() const {
|
||||
return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
|
||||
: WindowSize());
|
||||
}
|
||||
std::string ToString() const {
|
||||
std::ostringstream os;
|
||||
#define KNF_PRINT(x) os << #x << ": " << x << "\n"
|
||||
KNF_PRINT(samp_freq);
|
||||
KNF_PRINT(frame_shift_ms);
|
||||
KNF_PRINT(frame_length_ms);
|
||||
KNF_PRINT(dither);
|
||||
KNF_PRINT(preemph_coeff);
|
||||
KNF_PRINT(remove_dc_offset);
|
||||
KNF_PRINT(window_type);
|
||||
KNF_PRINT(round_to_power_of_two);
|
||||
KNF_PRINT(blackman_coeff);
|
||||
KNF_PRINT(snip_edges);
|
||||
// KNF_PRINT(allow_downsample);
|
||||
// KNF_PRINT(allow_upsample);
|
||||
KNF_PRINT(max_feature_vectors);
|
||||
#undef KNF_PRINT
|
||||
return os.str();
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
|
||||
|
||||
// Holds the precomputed coefficients of a windowing function and applies
// them to frames.
class FeatureWindowFunction {
 public:
  // Creates an empty window (no coefficients).
  FeatureWindowFunction() = default;
  // Precomputes the coefficients for opts.window_type and opts.WindowSize().
  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
  /**
   * @param wave Pointer to a 1-D array of shape [window_size].
   *             It is modified in-place: wave[i] = wave[i] * window_[i].
   */
  void Apply(float *wave) const;

 private:
  std::vector<float> window_;  // of size opts.WindowSize()
};
|
||||
|
||||
int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
|
||||
|
||||
/**
|
||||
This function returns the number of frames that we can extract from a wave
|
||||
file with the given number of samples in it (assumed to have the same
|
||||
sampling rate as specified in 'opts').
|
||||
|
||||
@param [in] num_samples The number of samples in the wave file.
|
||||
@param [in] opts The frame-extraction options class
|
||||
|
||||
@param [in] flush True if we are asserting that this number of samples
is 'all there is', false if we are expecting more data to possibly come in. This
only makes a difference to the answer
if opts.snip_edges == false. For offline feature extraction you always want
flush == true. In an online-decoding context, once you know (or decide) that
no more data is coming in, you'd call it with flush == true at the end to
flush out any remaining data.
|
||||
*/
|
||||
int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
|
||||
bool flush = true);
|
||||
|
||||
/*
|
||||
ExtractWindow() extracts a windowed frame of waveform (possibly with a
|
||||
power-of-two, padded size, depending on the config), including all the
|
||||
processing done by ProcessWindow().
|
||||
|
||||
@param [in] sample_offset If 'wave' is not the entire waveform, but
|
||||
part of it to the left has been discarded, then the
|
||||
number of samples prior to 'wave' that we have
|
||||
already discarded. Set this to zero if you are
|
||||
processing the entire waveform in one piece, or
|
||||
if you get 'no matching function' compilation
|
||||
errors when updating the code.
|
||||
@param [in] wave The waveform
|
||||
@param [in] f The frame index to be extracted, with
|
||||
0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function, as derived from the
|
||||
options class.
|
||||
@param [out] window The windowed, possibly-padded waveform to be
|
||||
extracted. Will be resized as needed.
|
||||
@param [out] log_energy_pre_window If non-NULL, the log-energy of
|
||||
the signal prior to pre-emphasis and multiplying by
|
||||
the windowing function will be written to here.
|
||||
*/
|
||||
void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
|
||||
int32_t f, const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function,
|
||||
std::vector<float> *window,
|
||||
float *log_energy_pre_window = nullptr);
|
||||
|
||||
/**
|
||||
This function does all the windowing steps after actually
|
||||
extracting the windowed signal: depending on the
|
||||
configuration, it does dithering, dc offset removal,
|
||||
preemphasis, and multiplication by the windowing function.
|
||||
@param [in] opts The options class to be used
|
||||
@param [in] window_function The windowing function-- should have
|
||||
been initialized using 'opts'.
|
||||
@param [in,out] window A vector of size opts.WindowSize(). Note:
|
||||
it will typically be a sub-vector of a larger vector of size
|
||||
opts.PaddedWindowSize(), with the remaining samples zero,
|
||||
as the FFT code is more efficient if it operates on data with
|
||||
power-of-two size.
|
||||
@param [out] log_energy_pre_window If non-NULL, then after dithering and
|
||||
DC offset removal, this function will write to this pointer the log of
|
||||
the total energy (i.e. sum-squared) of the frame.
|
||||
*/
|
||||
void ProcessWindow(const FrameExtractionOptions &opts,
|
||||
const FeatureWindowFunction &window_function, float *window,
|
||||
float *log_energy_pre_window = nullptr);
|
||||
|
||||
// Compute the inner product of two vectors
|
||||
float InnerProduct(const float *a, const float *b, int32_t n);
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,143 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Stack trace related stuff is from kaldi.
|
||||
* Refer to
|
||||
* https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
|
||||
*/
|
||||
|
||||
#include "kaldi-native-fbank/csrc/log.h"
|
||||
|
||||
#ifdef KNF_HAVE_EXECINFO_H
|
||||
#include <execinfo.h> // To get stack trace in error messages.
|
||||
#ifdef KNF_HAVE_CXXABI_H
|
||||
#include <cxxabi.h> // For name demangling.
|
||||
// Useful to decode the stack trace, but only used if we have execinfo.h
|
||||
#endif // KNF_HAVE_CXXABI_H
|
||||
#endif // KNF_HAVE_EXECINFO_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <string>
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Returns the current local time formatted as "yyyy-mm-dd hh:mm:ss".
//
// Returns an empty string if the current time cannot be converted to a
// calendar time (std::localtime() reports that by returning a null pointer).
std::string GetDateTimeStr() {
  const std::time_t now = std::time(nullptr);

  // std::localtime() may fail; never dereference its result unchecked.
  const std::tm *local = std::localtime(&now);
  if (local == nullptr) {
    return "";
  }

  // Copy the result right away so later calls cannot overwrite it.
  const std::tm tm = *local;

  std::ostringstream os;
  os << std::put_time(&tm, "%F %T");  // yyyy-mm-dd hh:mm:ss
  return os.str();
}
|
||||
|
||||
// Locates a mangled C++ symbol inside one line of a stack trace.
//
// The symbol is taken to start at the first '_' that directly follows a ' '
// or a '(' and to end right before the next ' ' or '+'.
//
// @param trace_name  One line of the textual stack trace.
// @param begin  On return, index of the leading '_', or std::string::npos
//               if no symbol start was found.
// @param end    On return, index one past the last symbol character. It is
//               written only when a symbol start was found.
// @return true if both ends of the symbol were found.
static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin,
                              std::size_t *end) {
  *begin = std::string::npos;

  // Walk over every '_' from index 1 on; index 0 has no preceding character.
  std::size_t pos = trace_name.find('_', 1);
  while (pos != std::string::npos) {
    const char prev = trace_name[pos - 1];
    if (prev == ' ' || prev == '(') {
      *begin = pos;
      break;
    }
    pos = trace_name.find('_', pos + 1);
  }

  if (*begin == std::string::npos) return false;

  *end = trace_name.find_first_of(" +", *begin);
  return *end != std::string::npos;
}
|
||||
|
||||
#ifdef KNF_HAVE_EXECINFO_H
// Replaces the mangled C++ symbol inside one stack-trace line with its
// demangled form.
//
// The line is returned unchanged when cxxabi.h is not available, when no
// symbol can be located in it, or when demangling the symbol fails.
//
// Supported line formats:
//
// Linux:
//    ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
//
// Mac:
//    0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
//
// The extracted name, e.g. '_ZN5kaldi13UnitTestErrorEv', is demangled into
// a readable name like kaldi::UnitTestError.
static std::string Demangle(const std::string &trace_name) {
#ifdef KNF_HAVE_CXXABI_H
  std::size_t first = 0;
  std::size_t last = 0;
  if (!LocateSymbolRange(trace_name, &first, &last)) {
    return trace_name;
  }

  std::string readable = trace_name.substr(first, last - first);

  int status = 0;
  char *buffer = abi::__cxa_demangle(readable.c_str(), 0, 0, &status);
  if (status == 0 && buffer != nullptr) {
    readable = buffer;
    free(buffer);  // the buffer returned by __cxa_demangle is ours to free
  }

  // Stitch the line back together: prefix + symbol + suffix.
  std::string result = trace_name.substr(0, first);
  result += readable;
  result += trace_name.substr(last, std::string::npos);
  return result;
#else   // KNF_HAVE_CXXABI_H
  return trace_name;
#endif  // KNF_HAVE_CXXABI_H
}
#endif  // KNF_HAVE_EXECINFO_H
|
||||
|
||||
// Returns a human-readable stack trace of the calling thread.
//
// The result is empty when execinfo.h is not available (KNF_HAVE_EXECINFO_H
// undefined) or when backtrace_symbols() fails. Otherwise it starts with a
// "[ Stack-Trace: ]" header followed by one (demangled) frame per line.
std::string GetStackTrace() {
  std::string ans;
#ifdef KNF_HAVE_EXECINFO_H
  constexpr const std::size_t kMaxTraceSize = 50;
  constexpr const std::size_t kMaxTracePrint = 50;  // Must be even.

  // Capture the raw return addresses, then turn them into strings.
  void *frames[kMaxTraceSize];
  const std::size_t num_frames = backtrace(frames, kMaxTraceSize);
  char **symbols = backtrace_symbols(frames, num_frames);
  if (symbols == nullptr) {
    return ans;
  }

  // Appends frames [first, last) to `ans`, one per line.
  auto append_frames = [&ans, symbols](std::size_t first, std::size_t last) {
    for (std::size_t i = first; i < last; ++i) {
      ans += Demangle(symbols[i]) + "\n";
    }
  };

  ans += "[ Stack-Trace: ]\n";
  if (num_frames > kMaxTracePrint) {
    // Too many frames: print only the first and the last
    // kMaxTracePrint / 2 of them, separated by an ellipsis.
    // NOTE(review): with kMaxTracePrint == kMaxTraceSize this branch looks
    // unreachable, assuming backtrace() never returns more than the buffer
    // size it was given -- confirm.
    append_frames(0, kMaxTracePrint / 2);
    ans += ".\n.\n.\n";
    append_frames(num_frames - kMaxTracePrint / 2, num_frames);
    if (num_frames == kMaxTraceSize) {
      ans += ".\n.\n.\n";  // Stack was too long, probably a bug.
    }
  } else {
    append_frames(0, num_frames);
  }

  // We must free the array of pointers allocated by backtrace_symbols(),
  // but not the strings themselves.
  free(symbols);
#endif  // KNF_HAVE_EXECINFO_H
  return ans;
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,347 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// The content in this file is copied/modified from
|
||||
// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_LOG_H_
|
||||
|
||||
#include <stdio.h>

#include <cstdint>
#include <cstdlib>
#include <mutex>  // NOLINT
#include <sstream>
#include <stdexcept>
#include <string>
|
||||
|
||||
namespace knf {
|
||||
|
||||
#if defined(NDEBUG)
|
||||
constexpr bool kDisableDebug = true;
|
||||
#else
|
||||
constexpr bool kDisableDebug = false;
|
||||
#endif
|
||||
|
||||
// Severity of a log message, ordered from most verbose to most severe.
// The enumerators are numbered consecutively starting at 0, and the logging
// code relies on that ordering when comparing levels.
enum class LogLevel {
  kTrace,    // 0
  kDebug,    // 1
  kInfo,     // 2
  kWarning,  // 3
  kError,    // 4
  kFatal,    // 5, print message and abort the program
};
|
||||
|
||||
// They are used in KNF_LOG(xxx), so their names
|
||||
// do not follow the google c++ code style
|
||||
//
|
||||
// You can use them in the following way:
|
||||
//
|
||||
// KNF_LOG(TRACE) << "some message";
|
||||
// KNF_LOG(DEBUG) << "some message";
|
||||
#ifndef _MSC_VER
|
||||
constexpr LogLevel TRACE = LogLevel::kTrace;
|
||||
constexpr LogLevel DEBUG = LogLevel::kDebug;
|
||||
constexpr LogLevel INFO = LogLevel::kInfo;
|
||||
constexpr LogLevel WARNING = LogLevel::kWarning;
|
||||
constexpr LogLevel ERROR = LogLevel::kError;
|
||||
constexpr LogLevel FATAL = LogLevel::kFatal;
|
||||
#else
|
||||
#define TRACE LogLevel::kTrace
|
||||
#define DEBUG LogLevel::kDebug
|
||||
#define INFO LogLevel::kInfo
|
||||
#define WARNING LogLevel::kWarning
|
||||
#define ERROR LogLevel::kError
|
||||
#define FATAL LogLevel::kFatal
|
||||
#endif
|
||||
|
||||
std::string GetStackTrace();
|
||||
|
||||
/* Return the current log level.
|
||||
|
||||
|
||||
If the current log level is TRACE, then all logged messages are printed out.
|
||||
|
||||
If the current log level is DEBUG, log messages with "TRACE" level are not
|
||||
shown and all other levels are printed out.
|
||||
|
||||
Similarly, if the current log level is INFO, log message with "TRACE" and
|
||||
"DEBUG" are not shown and all other levels are printed out.
|
||||
|
||||
If it is FATAL, then only FATAL messages are shown.
|
||||
*/
|
||||
// Returns the minimum level a message needs in order to be printed.
//
// The level is read from the environment variable KNF_LOG_LEVEL the first
// time this function is called and is cached afterwards. It is INFO when the
// variable is unset or holds an unknown value; in the latter case a
// diagnostic is printed to stderr.
inline LogLevel GetCurrentLogLevel() {
  // The initializer of a function-local static runs exactly once.
  static const LogLevel log_level = []() -> LogLevel {
    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
    if (env_log_level == nullptr) return INFO;

    const std::string s = env_log_level;
    if (s == "TRACE") return TRACE;
    if (s == "DEBUG") return DEBUG;
    if (s == "INFO") return INFO;
    if (s == "WARNING") return WARNING;
    if (s == "ERROR") return ERROR;
    if (s == "FATAL") return FATAL;

    fprintf(stderr,
            "Unknown KNF_LOG_LEVEL: %s"
            "\nSupported values are: "
            "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL",
            s.c_str());
    return INFO;
  }();

  return log_level;
}
|
||||
|
||||
// Returns true if the environment variable KNF_ABORT is set (to any value,
// including an empty one). The environment is inspected only on the first
// call; the answer is cached afterwards.
inline bool EnableAbort() {
  // The initializer of a function-local static runs exactly once.
  static const bool enable_abort = (std::getenv("KNF_ABORT") != nullptr);
  return enable_abort;
}
|
||||
|
||||
class Logger {
|
||||
public:
|
||||
Logger(const char *filename, const char *func_name, uint32_t line_num,
|
||||
LogLevel level)
|
||||
: filename_(filename),
|
||||
func_name_(func_name),
|
||||
line_num_(line_num),
|
||||
level_(level) {
|
||||
cur_level_ = GetCurrentLogLevel();
|
||||
fprintf(stderr, "here\n");
|
||||
switch (level) {
|
||||
case TRACE:
|
||||
if (cur_level_ <= TRACE) fprintf(stderr, "[T] ");
|
||||
break;
|
||||
case DEBUG:
|
||||
if (cur_level_ <= DEBUG) fprintf(stderr, "[D] ");
|
||||
break;
|
||||
case INFO:
|
||||
if (cur_level_ <= INFO) fprintf(stderr, "[I] ");
|
||||
break;
|
||||
case WARNING:
|
||||
if (cur_level_ <= WARNING) fprintf(stderr, "[W] ");
|
||||
break;
|
||||
case ERROR:
|
||||
if (cur_level_ <= ERROR) fprintf(stderr, "[E] ");
|
||||
break;
|
||||
case FATAL:
|
||||
if (cur_level_ <= FATAL) fprintf(stderr, "[F] ");
|
||||
break;
|
||||
}
|
||||
|
||||
if (cur_level_ <= level_) {
|
||||
fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name);
|
||||
}
|
||||
}
|
||||
|
||||
~Logger() noexcept(false) {
|
||||
static constexpr const char *kErrMsg = R"(
|
||||
Some bad things happened. Please read the above error messages and stack
|
||||
trace. If you are using Python, the following command may be helpful:
|
||||
|
||||
gdb --args python /path/to/your/code.py
|
||||
|
||||
(You can use `gdb` to debug the code. Please consider compiling
|
||||
a debug version of KNF.).
|
||||
|
||||
If you are unable to fix it, please open an issue at:
|
||||
|
||||
https://github.com/csukuangfj/kaldi-native-fbank/issues/new
|
||||
)";
|
||||
fprintf(stderr, "\n");
|
||||
if (level_ == FATAL) {
|
||||
std::string stack_trace = GetStackTrace();
|
||||
if (!stack_trace.empty()) {
|
||||
fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
|
||||
}
|
||||
|
||||
fflush(nullptr);
|
||||
|
||||
#ifndef __ANDROID_API__
|
||||
if (EnableAbort()) {
|
||||
// NOTE: abort() will terminate the program immediately without
|
||||
// printing the Python stack backtrace.
|
||||
abort();
|
||||
}
|
||||
|
||||
throw std::runtime_error(kErrMsg);
|
||||
#else
|
||||
abort();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
const Logger &operator<<(bool b) const {
|
||||
if (cur_level_ <= level_) {
|
||||
fprintf(stderr, b ? "true" : "false");
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(int8_t i) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%d", i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(const char *s) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%s", s);
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(int32_t i) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%d", i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(uint32_t i) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%u", i);
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(uint64_t i) const {
|
||||
if (cur_level_ <= level_)
|
||||
fprintf(stderr, "%llu", (long long unsigned int)i); // NOLINT
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(int64_t i) const {
|
||||
if (cur_level_ <= level_)
|
||||
fprintf(stderr, "%lli", (long long int)i); // NOLINT
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(float f) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%f", f);
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Logger &operator<<(double d) const {
|
||||
if (cur_level_ <= level_) fprintf(stderr, "%f", d);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const Logger &operator<<(const T &t) const {
|
||||
// require T overloads operator<<
|
||||
std::ostringstream os;
|
||||
os << t;
|
||||
return *this << os.str().c_str();
|
||||
}
|
||||
|
||||
// specialization to fix compile error: `stringstream << nullptr` is ambiguous
|
||||
const Logger &operator<<(const std::nullptr_t &null) const {
|
||||
if (cur_level_ <= level_) *this << "(null)";
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
const char *filename_;
|
||||
const char *func_name_;
|
||||
uint32_t line_num_;
|
||||
LogLevel level_;
|
||||
LogLevel cur_level_;
|
||||
};
|
||||
|
||||
class Voidifier {
|
||||
public:
|
||||
void operator&(const Logger &)const {}
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \
|
||||
defined(__PRETTY_FUNCTION__)
|
||||
// for clang and GCC
|
||||
#define KNF_FUNC __PRETTY_FUNCTION__
|
||||
#else
|
||||
// for other compilers
|
||||
#define KNF_FUNC __func__
|
||||
#endif
|
||||
|
||||
#define KNF_STATIC_ASSERT(x) static_assert(x, "")
|
||||
|
||||
#define KNF_CHECK(x) \
|
||||
(x) ? (void)0 \
|
||||
: ::knf::Voidifier() & \
|
||||
::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
|
||||
<< "Check failed: " << #x << " "
|
||||
|
||||
// WARNING: x and y may be evaluated multiple times, but this happens only
|
||||
// when the check fails. Since the program aborts if it fails, we don't think
|
||||
// the extra evaluation of x and y matters.
|
||||
//
|
||||
// CAUTION: we recommend the following use case:
|
||||
//
|
||||
// auto x = Foo();
|
||||
// auto y = Bar();
|
||||
// KNF_CHECK_EQ(x, y) << "Some message";
|
||||
//
|
||||
// And please avoid
|
||||
//
|
||||
// KNF_CHECK_EQ(Foo(), Bar());
|
||||
//
|
||||
// if `Foo()` or `Bar()` causes some side effects, e.g., changing some
|
||||
// local static variables or global variables.
|
||||
#define _KNF_CHECK_OP(x, y, op) \
|
||||
((x)op(y)) ? (void)0 \
|
||||
: ::knf::Voidifier() & \
|
||||
::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
|
||||
<< "Check failed: " << #x << " " << #op << " " << #y \
|
||||
<< " (" << (x) << " vs. " << (y) << ") "
|
||||
|
||||
#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
|
||||
#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
|
||||
#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
|
||||
#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
|
||||
#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
|
||||
#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)
|
||||
|
||||
#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// For debug check
|
||||
// ------------------------------------------------------------
|
||||
// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank,
|
||||
// the following macros are in fact empty and do nothing.
|
||||
|
||||
#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)
|
||||
|
||||
#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)
|
||||
|
||||
#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)
|
||||
|
||||
#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)
|
||||
|
||||
#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)
|
||||
|
||||
#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)
|
||||
|
||||
#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)
|
||||
|
||||
#define KNF_DLOG(x) \
|
||||
::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_LOG_H_
|
@ -0,0 +1,256 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file is copied/modified from kaldi/src/feat/mel-computations.cc
|
||||
|
||||
#include "kaldi-native-fbank/csrc/mel-computations.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-window.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Writes the text produced by MelBanksOptions::ToString() to `os` and
// returns `os` so that calls can be chained.
std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
  return os << opts.ToString();
}
|
||||
|
||||
float MelBanks::VtlnWarpFreq(
|
||||
float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
|
||||
float vtln_high_cutoff,
|
||||
float low_freq, // upper+lower frequency cutoffs in mel computation
|
||||
float high_freq, float vtln_warp_factor, float freq) {
|
||||
/// This computes a VTLN warping function that is not the same as HTK's one,
|
||||
/// but has similar inputs (this function has the advantage of never producing
|
||||
/// empty bins).
|
||||
|
||||
/// This function computes a warp function F(freq), defined between low_freq
|
||||
/// and high_freq inclusive, with the following properties:
|
||||
/// F(low_freq) == low_freq
|
||||
/// F(high_freq) == high_freq
|
||||
/// The function is continuous and piecewise linear with two inflection
|
||||
/// points.
|
||||
/// The lower inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at frequency l, determined as described below.
|
||||
/// The higher inflection point is at a frequency h, determined as
|
||||
/// described below.
|
||||
/// If l <= f <= h, then F(f) = f/vtln_warp_factor.
|
||||
/// If the higher inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
|
||||
/// Since (by the last point) F(h) == h/vtln_warp_factor, then
|
||||
/// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
|
||||
/// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
|
||||
/// = vtln_high_cutoff * min(1, vtln_warp_factor).
|
||||
/// If the lower inflection point (measured in terms of the unwarped
|
||||
/// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
|
||||
/// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
|
||||
/// = vtln_low_cutoff * max(1, vtln_warp_factor)
|
||||
|
||||
if (freq < low_freq || freq > high_freq)
|
||||
return freq; // in case this gets called
|
||||
// for out-of-range frequencies, just return the freq.
|
||||
|
||||
KNF_CHECK_GT(vtln_low_cutoff, low_freq);
|
||||
KNF_CHECK_LT(vtln_high_cutoff, high_freq);
|
||||
|
||||
float one = 1.0f;
|
||||
float l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
|
||||
float h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
|
||||
float scale = 1.0f / vtln_warp_factor;
|
||||
float Fl = scale * l; // F(l);
|
||||
float Fh = scale * h; // F(h);
|
||||
KNF_CHECK(l > low_freq && h < high_freq);
|
||||
// slope of left part of the 3-piece linear function
|
||||
float scale_left = (Fl - low_freq) / (l - low_freq);
|
||||
// [slope of center part is just "scale"]
|
||||
|
||||
// slope of right part of the 3-piece linear function
|
||||
float scale_right = (high_freq - Fh) / (high_freq - h);
|
||||
|
||||
if (freq < l) {
|
||||
return low_freq + scale_left * (freq - low_freq);
|
||||
} else if (freq < h) {
|
||||
return scale * freq;
|
||||
} else { // freq >= h
|
||||
return high_freq + scale_right * (freq - high_freq);
|
||||
}
|
||||
}
|
||||
|
||||
float MelBanks::VtlnWarpMelFreq(
|
||||
float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
|
||||
float vtln_high_cutoff,
|
||||
float low_freq, // upper+lower frequency cutoffs in mel computation
|
||||
float high_freq, float vtln_warp_factor, float mel_freq) {
|
||||
return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
|
||||
high_freq, vtln_warp_factor,
|
||||
InverseMelScale(mel_freq)));
|
||||
}
|
||||
|
||||
// Builds the triangular mel filters.
//
// For every mel bin, the constructor stores the index of the first FFT bin
// with a nonzero weight together with the weights of the FFT bins from
// there up to the last one that falls inside the triangle.
//
// Logs a FATAL message if fewer than 3 bins are requested, or if the
// frequency range (or, when warping is enabled, the VTLN range) is
// inconsistent.
//
// @param opts              Mel filter bank options.
// @param frame_opts        Provides the sample rate and padded window size.
// @param vtln_warp_factor  VTLN warp factor; 1.0 disables the warping.
MelBanks::MelBanks(const MelBanksOptions &opts,
                   const FrameExtractionOptions &frame_opts,
                   float vtln_warp_factor)
    : htk_mode_(opts.htk_mode) {
  const int32_t num_bins = opts.num_bins;
  if (num_bins < 3) {
    KNF_LOG(FATAL) << "Must have at least 3 mel bins";
  }

  const float sample_freq = frame_opts.samp_freq;
  const int32_t window_length_padded = frame_opts.PaddedWindowSize();
  KNF_CHECK_EQ(window_length_padded % 2, 0);

  const int32_t num_fft_bins = window_length_padded / 2;
  const float nyquist = 0.5f * sample_freq;

  // A non-positive high_freq option is an offset from the Nyquist frequency.
  const float low_freq = opts.low_freq;
  const float high_freq =
      (opts.high_freq > 0.0f) ? opts.high_freq : nyquist + opts.high_freq;

  const bool bad_freq_range = low_freq < 0.0f || low_freq >= nyquist ||
                              high_freq <= 0.0f || high_freq > nyquist ||
                              high_freq <= low_freq;
  if (bad_freq_range) {
    KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
                   << " and high-freq " << high_freq << " vs. nyquist "
                   << nyquist;
  }

  // fft-bin width [think of it as Nyquist-freq / half-window-length]
  const float fft_bin_width = sample_freq / window_length_padded;

  const float mel_low_freq = MelScale(low_freq);
  const float mel_high_freq = MelScale(high_freq);

  debug_ = opts.debug_mel;

  // divide by num_bins+1 in next line because of end-effects where the bins
  // spread out to the sides.
  const float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);

  // A negative vtln_high option is an offset from the Nyquist frequency.
  const float vtln_low = opts.vtln_low;
  float vtln_high = opts.vtln_high;
  if (vtln_high < 0.0f) {
    vtln_high += nyquist;
  }

  const bool use_vtln = (vtln_warp_factor != 1.0f);

  const bool bad_vtln_range = vtln_low < 0.0f || vtln_low <= low_freq ||
                              vtln_low >= high_freq || vtln_high <= 0.0f ||
                              vtln_high >= high_freq || vtln_high <= vtln_low;
  if (use_vtln && bad_vtln_range) {
    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
                   << " and vtln-high " << vtln_high << ", versus "
                   << "low-freq " << low_freq << " and high-freq " << high_freq;
  }

  bins_.resize(num_bins);
  center_freqs_.resize(num_bins);

  for (int32_t bin = 0; bin < num_bins; ++bin) {
    // The three corners of the triangle, on the mel scale.
    float left_mel = mel_low_freq + bin * mel_freq_delta;
    float center_mel = mel_low_freq + (bin + 1) * mel_freq_delta;
    float right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

    if (use_vtln) {
      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                 vtln_warp_factor, left_mel);
      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                   vtln_warp_factor, center_mel);
      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                  vtln_warp_factor, right_mel);
    }
    center_freqs_[bin] = InverseMelScale(center_mel);

    // this_bin will be a vector of coefficients that is only
    // nonzero where this mel bin is active.
    std::vector<float> this_bin(num_fft_bins);

    int32_t first_index = -1;
    int32_t last_index = -1;
    for (int32_t i = 0; i < num_fft_bins; ++i) {
      const float freq = (fft_bin_width * i);  // Center frequency of this fft
                                               // bin.
      const float mel = MelScale(freq);

      // Only FFT bins strictly inside the triangle get a weight.
      if (!(mel > left_mel && mel < right_mel)) continue;

      float weight;
      if (mel <= center_mel) {
        weight = (mel - left_mel) / (center_mel - left_mel);
      } else {
        weight = (right_mel - mel) / (right_mel - center_mel);
      }
      this_bin[i] = weight;

      if (first_index == -1) first_index = i;
      last_index = i;
    }
    KNF_CHECK(first_index != -1 && last_index >= first_index &&
              "You may have set num_mel_bins too large.");

    // Keep only the weights of the FFT bins first_index ... last_index.
    auto &entry = bins_[bin];
    entry.first = first_index;
    entry.second.insert(entry.second.end(), this_bin.begin() + first_index,
                        this_bin.begin() + last_index + 1);

    // Replicate a bug in HTK, for testing purposes.
    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
      entry.second[0] = 0.0;
    }
  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {

  if (debug_) {
    // Dump every filter: its offset followed by its weights.
    std::ostringstream os;
    for (size_t i = 0; i < bins_.size(); i++) {
      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
      for (auto weight : bins_[i].second) {
        os << weight << ", ";
      }
      os << "\n";
    }
    KNF_LOG(INFO) << os.str();
  }
}
|
||||
|
||||
// "power_spectrum" contains fft energies.
|
||||
void MelBanks::Compute(const float *power_spectrum,
|
||||
float *mel_energies_out) const {
|
||||
int32_t num_bins = bins_.size();
|
||||
|
||||
for (int32_t i = 0; i < num_bins; i++) {
|
||||
int32_t offset = bins_[i].first;
|
||||
const auto &v = bins_[i].second;
|
||||
float energy = 0;
|
||||
for (int32_t k = 0; k != v.size(); ++k) {
|
||||
energy += v[k] * power_spectrum[k + offset];
|
||||
}
|
||||
|
||||
// HTK-like flooring- for testing purposes (we prefer dither)
|
||||
if (htk_mode_ && energy < 1.0) {
|
||||
energy = 1.0;
|
||||
}
|
||||
|
||||
mel_energies_out[i] = energy;
|
||||
|
||||
// The following assert was added due to a problem with OpenBlas that
|
||||
// we had at one point (it was a bug in that library). Just to detect
|
||||
// it early.
|
||||
KNF_CHECK_EQ(energy, energy); // check that energy is not nan
|
||||
}
|
||||
|
||||
if (debug_) {
|
||||
fprintf(stderr, "MEL BANKS:\n");
|
||||
for (int32_t i = 0; i < num_bins; i++)
|
||||
fprintf(stderr, " %f", mel_energies_out[i]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,115 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// This file is copied/modified from kaldi/src/feat/mel-computations.h
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/feature-window.h"
|
||||
|
||||
namespace knf {
|
||||
|
||||
// Options of the mel filter bank.
struct MelBanksOptions {
  // Number of triangular bins, e.g. 25.
  int32_t num_bins = 25;

  // Lower frequency cutoff, e.g. 20.
  float low_freq = 20;

  // an upper frequency cutoff; 0 -> no cutoff, negative
  // ->added to the Nyquist frequency to get the cutoff.
  float high_freq = 0;

  // vtln lower cutoff of warping function.
  float vtln_low = 100;

  // vtln upper cutoff of warping function: if negative, added
  // to the Nyquist frequency to get the cutoff.
  float vtln_high = -500;

  bool debug_mel = false;

  // htk_mode is a "hidden" config, it does not show up on command line.
  // Enables more exact compatibility with HTK, for testing purposes.  Affects
  // mel-energy flooring and reproduces a bug in HTK.
  bool htk_mode = false;

  // Returns all options as text, one "name: value" pair per line.
  std::string ToString() const {
    std::ostringstream os;
    os << "num_bins: " << num_bins << "\n"
       << "low_freq: " << low_freq << "\n"
       << "high_freq: " << high_freq << "\n"
       << "vtln_low: " << vtln_low << "\n"
       << "vtln_high: " << vtln_high << "\n"
       << "debug_mel: " << debug_mel << "\n"
       << "htk_mode: " << htk_mode << "\n";
    return os.str();
  }
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
|
||||
|
||||
class MelBanks {
|
||||
public:
|
||||
static inline float InverseMelScale(float mel_freq) {
|
||||
return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
|
||||
}
|
||||
|
||||
static inline float MelScale(float freq) {
|
||||
return 1127.0f * logf(1.0f + freq / 700.0f);
|
||||
}
|
||||
|
||||
static float VtlnWarpFreq(
|
||||
float vtln_low_cutoff,
|
||||
float vtln_high_cutoff, // discontinuities in warp func
|
||||
float low_freq,
|
||||
float high_freq, // upper+lower frequency cutoffs in
|
||||
// the mel computation
|
||||
float vtln_warp_factor, float freq);
|
||||
|
||||
static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
|
||||
float low_freq, float high_freq,
|
||||
float vtln_warp_factor, float mel_freq);
|
||||
|
||||
// TODO(fangjun): Remove vtln_warp_factor
|
||||
MelBanks(const MelBanksOptions &opts,
|
||||
const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
|
||||
|
||||
/// Compute Mel energies (note: not log energies).
|
||||
/// At input, "fft_energies" contains the FFT energies (not log).
|
||||
///
|
||||
/// @param fft_energies 1-D array of size num_fft_bins/2+1
|
||||
/// @param mel_energies_out 1-D array of size num_mel_bins
|
||||
void Compute(const float *fft_energies, float *mel_energies_out) const;
|
||||
|
||||
int32_t NumBins() const { return bins_.size(); }
|
||||
|
||||
private:
|
||||
// center frequencies of bins, numbered from 0 ... num_bins-1.
|
||||
// Needed by GetCenterFreqs().
|
||||
std::vector<float> center_freqs_;
|
||||
|
||||
// the "bins_" vector is a vector, one for each bin, of a pair:
|
||||
// (the first nonzero fft-bin), (the vector of weights).
|
||||
std::vector<std::pair<int32_t, std::vector<float>>> bins_;
|
||||
|
||||
// TODO(fangjun): Remove debug_ and htk_mode_
|
||||
bool debug_;
|
||||
bool htk_mode_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
|
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "kaldi-native-fbank/csrc/rfft.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "kaldi-native-fbank/csrc/log.h"
|
||||
|
||||
// see fftsg.c
|
||||
#ifdef __cplusplus
|
||||
extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
|
||||
#else
|
||||
void rdft(int n, int isgn, double *a, int *ip, double *w);
|
||||
#endif
|
||||
|
||||
namespace knf {
|
||||
// Implementation of Rfft on top of rdft() from fftsg.c.
class Rfft::RfftImpl {
 public:
  // @param n  Number of points of the transform. It is checked to have at
  //           most one bit set, i.e. it is expected to be a power of 2.
  //
  // ip_ and w_ are the work areas handed to rdft(); both start out zeroed.
  // NOTE(review): their sizes presumably follow the requirements documented
  // in fftsg.c -- confirm there.
  explicit RfftImpl(int32_t n)
      : n_(n),
        ip_(static_cast<std::size_t>(2 + std::sqrt(n / 2))),
        w_(n / 2) {
    KNF_CHECK_EQ(n & (n - 1), 0);
  }

  // In-place forward transform of n float values. The work is done in
  // double precision on a temporary copy.
  void Compute(float *in_out) {
    std::vector<double> buffer(n_);
    for (int32_t i = 0; i < n_; ++i) {
      buffer[i] = in_out[i];
    }

    Compute(buffer.data());

    for (int32_t i = 0; i < n_; ++i) {
      in_out[i] = buffer[i];
    }
  }

  // In-place forward transform of n double values.
  void Compute(double *in_out) {
    constexpr int kForward = 1;  // 1 means forward fft
    rdft(n_, kForward, in_out, ip_.data(), w_.data());
  }

 private:
  int32_t n_;                // number of points
  std::vector<int32_t> ip_;  // integer work area of rdft()
  std::vector<double> w_;    // double work area of rdft()
};
|
||||
|
||||
// Creates the hidden implementation object for an n-point transform; n is
// passed through to RfftImpl unchanged.
Rfft::Rfft(int32_t n)
    : impl_(std::make_unique<RfftImpl>(n)) {
}
|
||||
|
||||
// Defaulted here rather than in the header: at this point RfftImpl is a
// complete type, which the std::unique_ptr<RfftImpl> member needs in order
// to destroy the implementation object.
Rfft::~Rfft() = default;
|
||||
|
||||
// Single precision overload: hands the buffer to the implementation object.
void Rfft::Compute(float *in_out) {
  impl_->Compute(in_out);
}
|
||||
// Double precision overload: hands the buffer to the implementation object.
void Rfft::Compute(double *in_out) {
  impl_->Compute(in_out);
}
|
||||
|
||||
} // namespace knf
|
@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
*
|
||||
* See LICENSE for clarification regarding multiple authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
||||
#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace knf {
|
||||
|
||||
// n-point Real discrete Fourier transform
|
||||
// where n is a power of 2. n >= 2
|
||||
//
|
||||
// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
|
||||
// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
|
||||
class Rfft {
|
||||
public:
|
||||
// @param n Number of fft bins. It should be a power of 2 (and, as noted
// above the class, n >= 2).
|
||||
explicit Rfft(int32_t n);
|
||||
// Declared here and defined in rfft.cc, where RfftImpl is a complete type.
~Rfft();
|
||||
|
||||
/** Computes the forward transform in place.
 *
 * @param in_out A 1-D array of size n.
|
||||
* On return:
|
||||
* in_out[0] = R[0]
|
||||
* in_out[1] = R[n/2]
|
||||
* for 0 < k < n/2,
|
||||
* in_out[2*k] = R[k]
|
||||
* in_out[2*k+1] = I[k]
|
||||
*
 * Together with in_out[0] and in_out[1] this fills all n entries.
|
||||
*/
|
||||
void Compute(float *in_out);
|
||||
// Same output layout as the float overload, for double precision data.
void Compute(double *in_out);
|
||||
|
||||
private:
|
||||
// Only forward-declared here; the class is defined in rfft.cc.
class RfftImpl;
|
||||
// Owns the implementation object; both Compute() overloads forward to it.
std::unique_ptr<RfftImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace knf
|
||||
|
||||
#endif // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
|
@ -1,111 +0,0 @@
|
||||
# Kaldi is compiled without OpenFST here; thirdparty/kaldi/base/kaldi-types.h
# is the place to look for what this define affects.
add_definitions("-DCOMPILE_WITHOUT_OPENFST")

# Copy the Kaldi sources from speechx next to this file. The presence of the
# `base` directory is used as the marker that the copy was already done.
if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base)
    file(COPY
        ../../../../speechx/speechx/kaldi/base
        ../../../../speechx/speechx/kaldi/feat
        ../../../../speechx/speechx/kaldi/matrix
        ../../../../speechx/speechx/kaldi/util
        DESTINATION ${CMAKE_CURRENT_LIST_DIR})
endif()
|
||||
|
||||
# kaldi-base: static library built from the sources under base/.
add_library(kaldi-base STATIC
    base/io-funcs.cc
    base/kaldi-error.cc
    base/kaldi-math.cc
    base/kaldi-utils.cc
    base/timer.cc)
# Headers are included relative to this directory by the target and its users.
target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# kaldi-matrix: static library built from the sources under matrix/.
add_library(kaldi-matrix STATIC
    matrix/compressed-matrix.cc
    matrix/matrix-functions.cc
    matrix/kaldi-matrix.cc
    matrix/kaldi-vector.cc
    matrix/optimization.cc
    matrix/packed-matrix.cc
    matrix/qr.cc
    matrix/sparse-matrix.cc
    matrix/sp-matrix.cc
    matrix/srfft.cc
    matrix/tp-matrix.cc)
target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# The OpenBLAS dependency goes by a different name under MSVC.
if (MSVC)
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
else()
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
endif()
|
||||
|
||||
# kaldi-util: static library built from the sources under util/.
add_library(kaldi-util STATIC
    util/kaldi-holder.cc
    util/kaldi-io.cc
    util/kaldi-semaphore.cc
    util/kaldi-table.cc
    util/kaldi-thread.cc
    util/parse-options.cc
    util/simple-io-funcs.cc
    util/simple-options.cc
    util/text-utils.cc)
target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# kaldi-feat-common: the feat/ sources that both kaldi-mfcc and kaldi-fbank
# link against.
add_library(kaldi-feat-common STATIC
    feat/cmvn.cc
    feat/feature-functions.cc
    feat/feature-window.cc
    feat/mel-computations.cc
    feat/pitch-functions.cc
    feat/resample.cc
    feat/signal.cc
    feat/wave-reader.cc)
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
|
||||
# kaldi-mfcc: feat/feature-mfcc.cc on top of the common feature library.
add_library(kaldi-mfcc STATIC feat/feature-mfcc.cc)
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
|
||||
# kaldi-fbank: feat/feature-fbank.cc on top of the common feature library.
add_library(kaldi-fbank STATIC feat/feature-fbank.cc)
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
|
||||
# Paths of the static archives produced by the targets above, spelled out as
# files in the current binary directory.
set(KALDI_LIBRARIES
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a)

# libkaldi: interface target that bundles all Kaldi libraries for consumers.
add_library(libkaldi INTERFACE)
target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
# The archives are linked by file path on some platforms, so the build order
# has to be stated explicitly.
add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)

if (APPLE)
    # Archive files plus OpenBLAS and the gfortran runtime libraries.
    target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib)
elseif (MSVC)
    # Link the CMake targets directly instead of the .a file paths.
    target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas)
else()
    # Whole-archive linking inside a linker group.
    target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group)
endif()
|
Loading…
Reference in new issue