From a01fa866a4b20d9a8dffd672431c0fdd7fd2ceb1 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Fri, 28 Jan 2022 14:21:26 +0800 Subject: [PATCH] add normalizer --- .../speechx/frontend/linear_spectrogram.cc | 8 +- speechx/speechx/frontend/linear_spectrogram.h | 2 +- speechx/speechx/frontend/normalizer.cc | 97 +++++++++++++++++++ speechx/speechx/frontend/normalizer.h | 65 +++++++++++++ 4 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 speechx/speechx/frontend/normalizer.cc create mode 100644 speechx/speechx/frontend/normalizer.h diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 327c3f578..df8a29f97 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -89,7 +89,7 @@ bool LinearSpectrogram::ReadFeats(Matrix* feats) const { // Compute spectrogram feat, return num frames // todo: refactor later (SmileGoat) -int32 LinearSpectrogram::Compute(const vector& wave, +bool LinearSpectrogram::Compute(const vector& wave, vector>& feat) { int num_samples = wave.size(); const int& frame_length = opts.frame_opts.WindowSize(); @@ -99,7 +99,7 @@ int32 LinearSpectrogram::Compute(const vector& wave, const float scale = hanning_window_energy_ * frame_shift; if (num_samples < frame_length) { - return 0; + return true; } int num_frames = 1 + ((num_samples - frame_length) / frame_shift); @@ -118,7 +118,7 @@ int32 LinearSpectrogram::Compute(const vector& wave, v.assign(data.begin(), data.end()); if (NumpyFft(&v, fft_real, fft_img)) { LOG(ERROR)<< i << " fft compute occurs error, please checkout the input data"; - return -1; + return false; } feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz @@ -135,5 +135,5 @@ int32 LinearSpectrogram::Compute(const vector& wave, // log added eps=1e-14 feat[i][j] = std::log(feat[i][j] + 1e-14); } - return 0; + return true; } diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index b69050d1b..981f92eaa 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -28,7 +28,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { private: void Hanning(std::vector& data) const; kaldi::int32 Compute(const std::vector& wave, - std::vector>& feat) const; + std::vector>& feat); bool NumpyFft(std::vector* v, std::vector* real, std::vector* img) const; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc new file mode 100644 index 000000000..9a384484f --- /dev/null +++ b/speechx/speechx/frontend/normalizer.cc @@ -0,0 +1,97 @@ + +#include "frontend/normalizer.h" + +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, + const std::unique_ptr& pre_extractor) { + +} + +void DecibelNormalizer::AcceptWavefrom(const kaldi::Vector& input) { + +} + +void DecibelNormalizer::Read(kaldi::Vector* feat) { + +} + +bool DecibelNormalizer::Compute(const Vector& input, + kaldi::Vector* feat) { + // calculate db rms + float rms_db = 0.0; + float mean_square = 0.0; + float gain = 0.0; + vector smaples; + samples.resize(input.Size()); + for (int32 i = 0; i < samples.size(); ++i) { + samples[i] = input(i); + } + + // square + for (auto &d : samples) { + if (_opts.convert_int_float) { + d = d * WAVE_FLOAT_NORMALIZATION; + } + mean_square += d * d; + } + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts.target_db - rms_db; + + if (gain > opts.max_gain_db) { + LOG(ERROR) << "Unable to normalize segment to " << opts.target_db << "dB," + << "because the the probable gain have exceeds opts.max_gain_db" + << opts.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto &item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + return true; +} + + +PPNormalizer::PPNormalizer( + const PPNormalizerOptions& opts, + const std::unique_ptr& pre_extractor) { + +} + +void PPNormalizer::AcceptWavefrom(const kaldi::Vector& input) { + +} + +void PPNormalizer::Read(kaldi::Vector* feat) { + +} + +bool PPNormalizer::Compute(const Vector& input, + kaldi::Vector>* feat) { + if ((input.Dim() % mean_.Dim()) == 0) { + LOG(ERROR) << "CMVN dimension is wrong!"; + return false; + } + + try { + int32 size = mean_.Dim(); + feat->Resize(input.Dim()); + for (int32 row_idx = 0; row_idx < j; ++row_idx) { + int32 base_idx = row_idx * size; + for (int32 idx = 0; idx < mean_.Dim(); ++idx) { + (*feat)(base_idx + idx) = (input(base_dix + idx) - mean_(idx))* variance_(idx); + } + } + + } catch(const std::exception& e) { + std::cerr << e.what() << '\n'; + return false; + } + + return true; +} diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h new file mode 100644 index 000000000..f297403b3 --- /dev/null +++ b/speechx/speechx/frontend/normalizer.h @@ -0,0 +1,65 @@ + +#pragma once + +#include "frontend/feature_extractor_interface.h" + +namespace ppspeech { + + +struct DecibelNormalizerOptions { + float target_db; + float max_gain_db; + DecibelNormalizerOptions() : + target_db(-20), + max_gain_db(300.0), + convert_int_float(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("target-db", &target_db, "target db for db normalization"); + opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization"); + opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float"); + } +}; + +class DecibelNormalizer : public FeatureExtractorInterface { + public: + explict DecibelNormalizer(const DecibelNormalizerOptions& opts, + const std::unique_ptr& pre_extractor); + virtual void AcceptWavefrom(const kaldi::Vector& input); + virtual void Read(kaldi::Vector* feat); + virtual size_t Dim() const; + bool Compute(const kaldi::Vector& input, + kaldi::Vector>* feat); + private: +}; + +struct NormalizerOptions { + std::string mean_std_path; + NormalizerOptions() : + mean_std_path("") {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("mean-std", &mean_std_path, "mean std file"); + } +}; + +// todo refactor later (SmileGoat) +class PPNormalizer : public FeatureExtractorInterface { + public: + explicit PPNormalizer(const NormalizerOptions& opts, + const std::unique_ptr& pre_extractor); + ~PPNormalizer() {} + virtual void AcceptWavefrom(const kaldi::Vector& input); + virtual void Read(kaldi::Vector* feat); + virtual size_t Dim() const; + bool Compute(const kaldi::Vector& input, + kaldi::Vector>& feat); + + private: + bool _initialized; + kaldi::Vector mean_; + kaldi::Vector variance_; + NormalizerOptions _opts; +}; + +} // namespace ppspeech \ No newline at end of file