add normalizer

pull/1400/head
SmileGoat 4 years ago
parent 88275aff05
commit a01fa866a4

@ -89,7 +89,7 @@ bool LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) const {
// Compute spectrogram feat, return num frames
// todo: refactor later (SmileGoat)
int32 LinearSpectrogram::Compute(const vector<float>& wave,
bool LinearSpectrogram::Compute(const vector<float>& wave,
vector<vector<float>>& feat) {
int num_samples = wave.size();
const int& frame_length = opts.frame_opts.WindowSize();
@ -99,7 +99,7 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
const float scale = hanning_window_energy_ * frame_shift;
if (num_samples < frame_length) {
return 0;
return true;
}
int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
@ -118,7 +118,7 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
v.assign(data.begin(), data.end());
if (NumpyFft(&v, fft_real, fft_img)) {
LOG(ERROR)<< i << " fft compute occurs error, please checkout the input data";
return -1;
return false;
}
feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz
@ -135,5 +135,5 @@ int32 LinearSpectrogram::Compute(const vector<float>& wave,
// log added eps=1e-14
feat[i][j] = std::log(feat[i][j] + 1e-14);
}
return 0;
return true;
}

@ -28,7 +28,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
private:
void Hanning(std::vector<kaldi::BaseFloat>& data) const;
kaldi::int32 Compute(const std::vector<kaldi::BaseFloat>& wave,
std::vector<std::vector<kaldi::BaseFloat>>& feat) const;
std::vector<std::vector<kaldi::BaseFloat>>& feat);
bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
std::vector<kaldi::BaseFloat>* real,
std::vector<kaldi::BaseFloat>* img) const;

@ -0,0 +1,97 @@
#include "frontend/normalizer.h"
// Constructs a DecibelNormalizer from its options and an upstream extractor.
// NOTE(review): the body is empty, so neither `opts` nor `pre_extractor` is
// retained here, even though Compute() reads option values (target_db,
// max_gain_db, convert_int_float) -- confirm where those are meant to be
// stored.
DecibelNormalizer::DecibelNormalizer(
const DecibelNormalizerOptions& opts,
const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {
}
// Stub: the body is empty, so `input` is currently ignored.
// NOTE(review): presumably meant to receive incoming waveform samples for
// later normalization -- confirm the intended behavior before relying on it.
void DecibelNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
}
// Stub: the body is empty, so `feat` is left exactly as the caller passed it.
// NOTE(review): presumably meant to hand back normalized samples -- confirm
// the intended behavior before relying on it.
void DecibelNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
}
bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>* feat) {
// calculate db rms
float rms_db = 0.0;
float mean_square = 0.0;
float gain = 0.0;
vector<BaseFloat> smaples;
samples.resize(input.Size());
for (int32 i = 0; i < samples.size(); ++i) {
samples[i] = input(i);
}
// square
for (auto &d : samples) {
if (_opts.convert_int_float) {
d = d * WAVE_FLOAT_NORMALIZATION;
}
mean_square += d * d;
}
// mean
mean_square /= samples.size();
rms_db = 10 * std::log10(mean_square);
gain = opts.target_db - rms_db;
if (gain > opts.max_gain_db) {
LOG(ERROR) << "Unable to normalize segment to " << opts.target_db << "dB,"
<< "because the the probable gain have exceeds opts.max_gain_db"
<< opts.max_gain_db << "dB.";
return false;
}
// Note that this is an in-place transformation.
for (auto &item : samples) {
// python item *= 10.0 ** (gain / 20.0)
item *= std::pow(10.0, gain / 20.0);
}
return true;
}
// Constructs a PPNormalizer that keeps a copy of `opts`.
//
// The parameter type is NormalizerOptions, matching the class declaration.
// `_initialized` starts out false; this constructor does not fill `mean_`
// or `variance_`.
// NOTE(review): `pre_extractor` is not retained and `opts.mean_std_path` is
// not read here -- confirm where the statistics are meant to be loaded.
PPNormalizer::PPNormalizer(
    const NormalizerOptions& opts,
    const std::unique_ptr<FeatureExtractorInterface>& pre_extractor)
    : _initialized(false),
      _opts(opts) {
}
// Stub: the body is empty, so `input` is currently ignored.
// NOTE(review): presumably meant to receive incoming data for later
// normalization -- confirm the intended behavior before relying on it.
void PPNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
}
// Stub: the body is empty, so `feat` is left exactly as the caller passed it.
// NOTE(review): presumably meant to hand back normalized features -- confirm
// the intended behavior before relying on it.
void PPNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
}
bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>>* feat) {
if ((input.Dim() % mean_.Dim()) == 0) {
LOG(ERROR) << "CMVN dimension is wrong!";
return false;
}
try {
int32 size = mean_.Dim();
feat->Resize(input.Dim());
for (int32 row_idx = 0; row_idx < j; ++row_idx) {
int32 base_idx = row_idx * size;
for (int32 idx = 0; idx < mean_.Dim(); ++idx) {
(*feat)(base_idx + idx) = (input(base_dix + idx) - mean_(idx))* variance_(idx);
}
}
} catch(const std::exception& e) {
std::cerr << e.what() << '\n';
return false;
}
return true;
}

@ -0,0 +1,65 @@
#pragma once
#include "frontend/feature_extractor_interface.h"
namespace ppspeech {
// Options for decibel normalization of a segment.
struct DecibelNormalizerOptions {
    // Target db for db normalization.
    float target_db;
    // Max gain db for db normalization.
    float max_gain_db;
    // If true, convert int samples to float before normalizing.
    bool convert_int_float;

    // Defaults: target -20 dB, maximum gain 300 dB, no int->float conversion.
    DecibelNormalizerOptions() :
        target_db(-20),
        max_gain_db(300.0),
        convert_int_float(false) {}

    // Registers every option with `opts` under its command-line name.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("target-db", &target_db, "target db for db normalization");
        opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization");
        opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float");
    }
};
class DecibelNormalizer : public FeatureExtractorInterface {
public:
explict DecibelNormalizer(const DecibelNormalizerOptions& opts,
const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
virtual size_t Dim() const;
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>>* feat);
private:
};
// Settings for the mean/std normalizer.
struct NormalizerOptions {
    // Value of the "mean-std" option (mean std file); empty by default.
    std::string mean_std_path;

    NormalizerOptions()
        : mean_std_path() {
    }

    // Exposes the mean/std file option on the given options registry.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("mean-std",
                       &mean_std_path,
                       "mean std file");
    }
};
// todo refactor later (SmileGoat)
// Feature-extractor stage that applies per-dimension mean/variance
// normalization using the `mean_` and `variance_` vectors.
class PPNormalizer : public FeatureExtractorInterface {
  public:
    // opts:          normalization settings (mean/std file path).
    // pre_extractor: the extractor placed before this stage.
    explicit PPNormalizer(const NormalizerOptions& opts,
                          const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
    ~PPNormalizer() {}
    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
    virtual size_t Dim() const;
    // Normalizes `input` into `feat`; returns false on a dimension mismatch.
    // `feat` is an output pointer, matching the out-of-line definition and
    // DecibelNormalizer::Compute.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
                 kaldi::Vector<kaldi::BaseFloat>* feat);

  private:
    bool _initialized;
    // Per-dimension statistics; their length is the feature dimension.
    kaldi::Vector<float> mean_;
    kaldi::Vector<float> variance_;
    NormalizerOptions _opts;
};
} // namespace ppspeech
Loading…
Cancel
Save