From 8d66a254dae27dbe32f92921fa18be24326c689c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:31:08 +0000 Subject: [PATCH] cmvn and db norm --- speechx/speechx/frontend/CMakeLists.txt | 3 +- .../frontend/{normalizer.cc => cmvn.cc} | 79 +-------------- speechx/speechx/frontend/cmvn.h | 34 +++++++ speechx/speechx/frontend/db_norm.cc | 95 +++++++++++++++++++ speechx/speechx/frontend/db_norm.h | 65 +++++++++++++ speechx/speechx/frontend/normalizer.h | 89 +---------------- 6 files changed, 199 insertions(+), 166 deletions(-) rename speechx/speechx/frontend/{normalizer.cc => cmvn.cc} (59%) create mode 100644 speechx/speechx/frontend/cmvn.h create mode 100644 speechx/speechx/frontend/db_norm.cc create mode 100644 speechx/speechx/frontend/db_norm.h diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index d0ec008e..35243b6e 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -1,7 +1,8 @@ project(frontend) add_library(frontend STATIC - normalizer.cc + cmvn.cc + db_norm.cc linear_spectrogram.cc audio_cache.cc feature_cache.cc diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/cmvn.cc similarity index 59% rename from speechx/speechx/frontend/normalizer.cc rename to speechx/speechx/frontend/cmvn.cc index 26f11b69..d9bba943 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/cmvn.cc @@ -1,17 +1,3 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include "frontend/normalizer.h" #include "kaldi/feat/cmvn.h" @@ -26,70 +12,7 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer( - const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor) { - base_extractor_ = std::move(base_extractor); - opts_ = opts; - dim_ = 1; -} - -void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { - base_extractor_->Accept(waves); -} - -bool DecibelNormalizer::Read(kaldi::Vector* waves) { - if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { - return false; - } - Compute(waves); - return true; -} - -bool DecibelNormalizer::Compute(VectorBase* waves) const { - // calculate db rms - BaseFloat rms_db = 0.0; - BaseFloat mean_square = 0.0; - BaseFloat gain = 0.0; - BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); - - vector samples; - samples.resize(waves->Dim()); - for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*waves)(i); - } - - // square - for (auto& d : samples) { - if (opts_.convert_int_float) { - d = d * wave_float_normlization; - } - mean_square += d * d; - } - - // mean - mean_square /= samples.size(); - rms_db = 10 * std::log10(mean_square); - gain = opts_.target_db - rms_db; - - if (gain > opts_.max_gain_db) { - LOG(ERROR) - << "Unable to normalize segment to " << opts_.target_db << "dB," - << "because the the probable gain have exceeds opts_.max_gain_db" - << opts_.max_gain_db << "dB."; - return false; - } - - // Note that this is an in-place transformation. - for (auto& item : samples) { - // python item *= 10.0 ** (gain / 20.0) - item *= std::pow(10.0, gain / 20.0); - } - std::memcpy( - waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); - return true; -} CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) @@ -185,4 +108,4 @@ void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { ApplyCmvn(stats_, var_norm_, feats); } -} // namespace ppspeech +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/cmvn.h b/speechx/speechx/frontend/cmvn.h new file mode 100644 index 00000000..fdf2a87a --- /dev/null +++ b/speechx/speechx/frontend/cmvn.h @@ -0,0 +1,34 @@ +#pragma once + +#include "base/common.h" +#include "frontend/frontend_itf.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +namespace ppspeech { + +class CMVN : public FrontendInterface { + public: + explicit CMVN(std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& inputs); + + // the length of feats = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* feats); + // the dim_ is the feautre dim. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + virtual void Reset() { base_extractor_->Reset(); } + + private: + void Compute(kaldi::VectorBase* feats) const; + void ApplyCMVN(kaldi::MatrixBase* feats); + kaldi::Matrix stats_; + std::unique_ptr base_extractor_; + size_t dim_; + bool var_norm_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/db_norm.cc b/speechx/speechx/frontend/db_norm.cc new file mode 100644 index 00000000..830af13b --- /dev/null +++ b/speechx/speechx/frontend/db_norm.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "frontend/normalizer.h" +#include "kaldi/feat/cmvn.h" +#include "kaldi/util/kaldi-io.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor) { + base_extractor_ = std::move(base_extractor); + opts_ = opts; + dim_ = 1; +} + +void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); +} + +bool DecibelNormalizer::Read(kaldi::Vector* waves) { + if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { + return false; + } + Compute(waves); + return true; +} + +bool DecibelNormalizer::Compute(VectorBase* waves) const { + // calculate db rms + BaseFloat rms_db = 0.0; + BaseFloat mean_square = 0.0; + BaseFloat gain = 0.0; + BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); + + vector samples; + samples.resize(waves->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + samples[i] = (*waves)(i); + } + + // square + for (auto& d : samples) { + if (opts_.convert_int_float) { + d = d * wave_float_normlization; + } + mean_square += d * d; + } + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts_.target_db - rms_db; + + if (gain > opts_.max_gain_db) { + LOG(ERROR) + << "Unable to normalize segment to " << opts_.target_db << "dB," + << "because the the probable gain have exceeds opts_.max_gain_db" + << opts_.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto& item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + std::memcpy( + waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); + return true; +} + + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/db_norm.h b/speechx/speechx/frontend/db_norm.h new file mode 100644 index 00000000..3d371071 --- /dev/null +++ b/speechx/speechx/frontend/db_norm.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#pragma once + +#include "base/common.h" +#include "frontend/frontend_itf.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +namespace ppspeech { + +struct DecibelNormalizerOptions { + float target_db; + float max_gain_db; + bool convert_int_float; + DecibelNormalizerOptions() + : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register( + "target-db", &target_db, "target db for db normalization"); + opts->Register( + "max-gain-db", &max_gain_db, "max gain db for db normalization"); + opts->Register("convert-int-float", + &convert_int_float, + "if convert int samples to float"); + } +}; + +class DecibelNormalizer : public FrontendInterface { + public: + explicit DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& waves); + virtual bool Read(kaldi::Vector* waves); + // noramlize audio, the dim is 1. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + virtual void Reset() { base_extractor_->Reset(); } + + private: + bool Compute(kaldi::VectorBase* waves) const; + DecibelNormalizerOptions opts_; + size_t dim_; + std::unique_ptr base_extractor_; + kaldi::Vector waveform_; +}; + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index df181961..89599c2a 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -1,89 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - #pragma once -#include "base/common.h" -#include "frontend/frontend_itf.h" -#include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" - -namespace ppspeech { - -struct DecibelNormalizerOptions { - float target_db; - float max_gain_db; - bool convert_int_float; - DecibelNormalizerOptions() - : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register( - "target-db", &target_db, "target db for db normalization"); - opts->Register( - "max-gain-db", &max_gain_db, "max gain db for db normalization"); - opts->Register("convert-int-float", - &convert_int_float, - "if convert int samples to float"); - } -}; - -class DecibelNormalizer : public FrontendInterface { - public: - explicit DecibelNormalizer( - const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& waves); - virtual bool Read(kaldi::Vector* waves); - // noramlize audio, the dim is 1. - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } - - private: - bool Compute(kaldi::VectorBase* waves) const; - DecibelNormalizerOptions opts_; - size_t dim_; - std::unique_ptr base_extractor_; - kaldi::Vector waveform_; -}; - - -class CMVN : public FrontendInterface { - public: - explicit CMVN(std::string cmvn_file, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); - - // the length of feats = feature_row * feature_dim, - // the Matrix is squashed into Vector - virtual bool Read(kaldi::Vector* feats); - // the dim_ is the feautre dim. - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } - - private: - void Compute(kaldi::VectorBase* feats) const; - void ApplyCMVN(kaldi::MatrixBase* feats); - kaldi::Matrix stats_; - std::unique_ptr base_extractor_; - size_t dim_; - bool var_norm_; -}; - -} // namespace ppspeech \ No newline at end of file +#include "frontend/cmvn.h" +#include "frontend/db_norm.h" \ No newline at end of file