From a9f4ce47a34bbd62c88090ef9a6e3498dbfc669a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:24:16 +0000 Subject: [PATCH 1/5] frontend itf --- speechx/examples/feat/linear_spectrogram_main.cc | 12 ++++++------ speechx/speechx/frontend/audio_cache.h | 4 ++-- speechx/speechx/frontend/data_cache.h | 4 ++-- speechx/speechx/frontend/fbank.h | 4 ++-- speechx/speechx/frontend/feature_cache.cc | 2 +- speechx/speechx/frontend/feature_cache.h | 8 ++++---- .../frontend/feature_extractor_controller.h | 13 ------------- .../frontend/feature_extractor_controller_impl.h | 13 ------------- ...eature_extractor_interface.h => frontend_itf.h} | 2 +- speechx/speechx/frontend/linear_spectrogram.cc | 2 +- speechx/speechx/frontend/linear_spectrogram.h | 8 ++++---- speechx/speechx/frontend/normalizer.cc | 4 ++-- speechx/speechx/frontend/normalizer.h | 14 +++++++------- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/nnet/decodable.h | 6 +++--- 15 files changed, 36 insertions(+), 62 deletions(-) delete mode 100644 speechx/speechx/frontend/feature_extractor_controller.h delete mode 100644 speechx/speechx/frontend/feature_extractor_controller_impl.h rename speechx/speechx/frontend/{feature_extractor_interface.h => frontend_itf.h} (97%) diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index e1f0a8954..c29d2b21f 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio_cache.h" #include "frontend/data_cache.h" #include "frontend/feature_cache.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "frontend/normalizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" @@ -170,13 +170,13 @@ int main(int argc, char* argv[]) { // feature pipeline: wave cache --> decibel_normalizer --> hanning // window -->linear_spectrogram --> global cmvn -> feat cache - // std::unique_ptr data_source(new + // std::unique_ptr data_source(new // ppspeech::DataCache()); - std::unique_ptr data_source( + std::unique_ptr data_source( new ppspeech::AudioCache()); ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr db_norm( + std::unique_ptr db_norm( new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); ppspeech::LinearSpectrogramOptions opt; @@ -185,10 +185,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; - std::unique_ptr linear_spectrogram( + std::unique_ptr linear_spectrogram( new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); - std::unique_ptr cmvn( + std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_write_path, std::move(linear_spectrogram))); diff --git a/speechx/speechx/frontend/audio_cache.h b/speechx/speechx/frontend/audio_cache.h index b6c82c69e..f48da12b7 100644 --- a/speechx/speechx/frontend/audio_cache.h +++ b/speechx/speechx/frontend/audio_cache.h @@ -16,12 +16,12 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { // waves cache -class AudioCache : public FeatureExtractorInterface { +class AudioCache : public FrontendInterface { public: explicit AudioCache(int buffer_size = kint16max); diff --git a/speechx/speechx/frontend/data_cache.h b/speechx/speechx/frontend/data_cache.h index dea51d76e..b8ce6bf65 100644 --- a/speechx/speechx/frontend/data_cache.h +++ b/speechx/speechx/frontend/data_cache.h @@ -17,13 +17,13 @@ #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { // A data source for testing different frontend module. // It accepts waves or feats. -class DataCache : public FeatureExtractorInterface { +class DataCache : public FrontendInterface { public: explicit DataCache() { finished_ = false; } diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/fbank.h index 7d9cf4221..68267b3d0 100644 --- a/speechx/speechx/frontend/fbank.h +++ b/speechx/speechx/frontend/fbank.h @@ -20,10 +20,10 @@ namespace ppspeech { -class FbankExtractor : FeatureExtractorInterface { +class FbankExtractor : FrontendInterface { public: explicit FbankExtractor(const FbankOptions& opts, - share_ptr pre_extractor); + share_ptr pre_extractor); virtual void AcceptWaveform( const kaldi::Vector& input) = 0; virtual void Read(kaldi::Vector* feat) = 0; diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index dad6907ce..53b7076d5 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -24,7 +24,7 @@ using kaldi::SubVector; using std::unique_ptr; FeatureCache::FeatureCache( - int max_size, unique_ptr base_extractor) { + int max_size, unique_ptr base_extractor) { max_size_ = max_size; base_extractor_ = std::move(base_extractor); } diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index f52b9b0f6..1281ec35a 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -15,15 +15,15 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { -class FeatureCache : public FeatureExtractorInterface { +class FeatureCache : public FrontendInterface { public: explicit FeatureCache( int32 max_size = kint16max, - std::unique_ptr base_extractor = NULL); + std::unique_ptr base_extractor = NULL); // Feed feats or waves virtual void Accept(const kaldi::VectorBase& inputs); @@ -53,7 +53,7 @@ class FeatureCache : public FeatureExtractorInterface { bool Compute(); size_t max_size_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; std::mutex mutex_; std::queue> cache_; diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h deleted file mode 100644 index 0544a1e29..000000000 --- a/speechx/speechx/frontend/feature_extractor_controller.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h deleted file mode 100644 index 0544a1e29..000000000 --- a/speechx/speechx/frontend/feature_extractor_controller_impl.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/frontend_itf.h similarity index 97% rename from speechx/speechx/frontend/feature_extractor_interface.h rename to speechx/speechx/frontend/frontend_itf.h index 5da2526b9..7913cc7c0 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/frontend_itf.h @@ -19,7 +19,7 @@ namespace ppspeech { -class FeatureExtractorInterface { +class FrontendInterface { public: // Feed inputs: features(2D saved in 1D) or waveforms(1D). virtual void Accept(const kaldi::VectorBase& inputs) = 0; diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 41bc8743a..2ba00785a 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -27,7 +27,7 @@ using std::vector; LinearSpectrogram::LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor) { + std::unique_ptr base_extractor) { opts_ = opts; base_extractor_ = std::move(base_extractor); int32 window_size = opts.frame_opts.WindowSize(); diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index 10853904d..136441efe 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/feat/feature-window.h" namespace ppspeech { @@ -35,11 +35,11 @@ struct LinearSpectrogramOptions { } }; -class LinearSpectrogram : public FeatureExtractorInterface { +class LinearSpectrogram : public FrontendInterface { public: explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& inputs); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature @@ -61,7 +61,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); }; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 524125619..26f11b692 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -28,7 +28,7 @@ using std::unique_ptr; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor) { + std::unique_ptr base_extractor) { base_extractor_ = std::move(base_extractor); opts_ = opts; dim_ = 1; @@ -92,7 +92,7 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const { } CMVN::CMVN(std::string cmvn_file, - unique_ptr base_extractor) + unique_ptr base_extractor) : var_norm_(true) { base_extractor_ = std::move(base_extractor); bool binary; diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 352d1e167..df1819612 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" @@ -40,11 +40,11 @@ struct DecibelNormalizerOptions { } }; -class DecibelNormalizer : public FeatureExtractorInterface { +class DecibelNormalizer : public FrontendInterface { public: explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. @@ -57,15 +57,15 @@ class DecibelNormalizer : public FeatureExtractorInterface { bool Compute(kaldi::VectorBase* waves) const; DecibelNormalizerOptions opts_; size_t dim_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; kaldi::Vector waveform_; }; -class CMVN : public FeatureExtractorInterface { +class CMVN : public FrontendInterface { public: explicit CMVN(std::string cmvn_file, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& inputs); // the length of feats = feature_row * feature_dim, @@ -81,7 +81,7 @@ class CMVN : public FeatureExtractorInterface { void Compute(kaldi::VectorBase* feats) const; void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; size_t dim_; bool var_norm_; }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index e6315d07a..542168d24 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -22,7 +22,7 @@ using std::vector; using kaldi::Vector; Decodable::Decodable(const std::shared_ptr& nnet, - const std::shared_ptr& frontend) + const std::shared_ptr& frontend) : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {} void Decodable::Acceptlikelihood(const Matrix& likelihood) { diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 7938b5823..ef17601fa 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -13,7 +13,7 @@ // limitations under the License. #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/decodable-itf.h" #include "nnet/nnet_interface.h" @@ -26,7 +26,7 @@ class Decodable : public kaldi::DecodableInterface { public: explicit Decodable( const std::shared_ptr& nnet, - const std::shared_ptr& frontend); + const std::shared_ptr& frontend); // void Init(DecodableOpts config); virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); virtual bool IsLastFrame(int32 frame) const; @@ -41,7 +41,7 @@ class Decodable : public kaldi::DecodableInterface { private: bool AdvanceChunk(); - std::shared_ptr frontend_; + std::shared_ptr frontend_; std::shared_ptr nnet_; kaldi::Matrix nnet_cache_; // std::vector> nnet_cache_; From 8d66a254dae27dbe32f92921fa18be24326c689c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:31:08 +0000 Subject: [PATCH 2/5] cmvn and db norm --- speechx/speechx/frontend/CMakeLists.txt | 3 +- .../frontend/{normalizer.cc => cmvn.cc} | 79 +-------------- speechx/speechx/frontend/cmvn.h | 34 +++++++ speechx/speechx/frontend/db_norm.cc | 95 +++++++++++++++++++ speechx/speechx/frontend/db_norm.h | 65 +++++++++++++ speechx/speechx/frontend/normalizer.h | 89 +---------------- 6 files changed, 199 insertions(+), 166 deletions(-) rename speechx/speechx/frontend/{normalizer.cc => cmvn.cc} (59%) create mode 100644 speechx/speechx/frontend/cmvn.h create mode 100644 speechx/speechx/frontend/db_norm.cc create mode 100644 speechx/speechx/frontend/db_norm.h diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index d0ec008ee..35243b6e3 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -1,7 +1,8 @@ project(frontend) add_library(frontend STATIC - normalizer.cc + cmvn.cc + db_norm.cc linear_spectrogram.cc audio_cache.cc feature_cache.cc diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/cmvn.cc similarity index 59% rename from speechx/speechx/frontend/normalizer.cc rename to speechx/speechx/frontend/cmvn.cc index 26f11b692..d9bba9435 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/cmvn.cc @@ -1,17 +1,3 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include "frontend/normalizer.h" #include "kaldi/feat/cmvn.h" @@ -26,70 +12,7 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer( - const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor) { - base_extractor_ = std::move(base_extractor); - opts_ = opts; - dim_ = 1; -} - -void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { - base_extractor_->Accept(waves); -} - -bool DecibelNormalizer::Read(kaldi::Vector* waves) { - if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { - return false; - } - Compute(waves); - return true; -} - -bool DecibelNormalizer::Compute(VectorBase* waves) const { - // calculate db rms - BaseFloat rms_db = 0.0; - BaseFloat mean_square = 0.0; - BaseFloat gain = 0.0; - BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); - - vector samples; - samples.resize(waves->Dim()); - for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*waves)(i); - } - - // square - for (auto& d : samples) { - if (opts_.convert_int_float) { - d = d * wave_float_normlization; - } - mean_square += d * d; - } - - // mean - mean_square /= samples.size(); - rms_db = 10 * std::log10(mean_square); - gain = opts_.target_db - rms_db; - - if (gain > opts_.max_gain_db) { - LOG(ERROR) - << "Unable to normalize segment to " << opts_.target_db << "dB," - << "because the the probable gain have exceeds opts_.max_gain_db" - << opts_.max_gain_db << "dB."; - return false; - } - - // Note that this is an in-place transformation. - for (auto& item : samples) { - // python item *= 10.0 ** (gain / 20.0) - item *= std::pow(10.0, gain / 20.0); - } - std::memcpy( - waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); - return true; -} CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) @@ -185,4 +108,4 @@ void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { ApplyCmvn(stats_, var_norm_, feats); } -} // namespace ppspeech +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/cmvn.h b/speechx/speechx/frontend/cmvn.h new file mode 100644 index 000000000..fdf2a87a4 --- /dev/null +++ b/speechx/speechx/frontend/cmvn.h @@ -0,0 +1,34 @@ +#pragma once + +#include "base/common.h" +#include "frontend/frontend_itf.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +namespace ppspeech { + +class CMVN : public FrontendInterface { + public: + explicit CMVN(std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& inputs); + + // the length of feats = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* feats); + // the dim_ is the feautre dim. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + virtual void Reset() { base_extractor_->Reset(); } + + private: + void Compute(kaldi::VectorBase* feats) const; + void ApplyCMVN(kaldi::MatrixBase* feats); + kaldi::Matrix stats_; + std::unique_ptr base_extractor_; + size_t dim_; + bool var_norm_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/db_norm.cc b/speechx/speechx/frontend/db_norm.cc new file mode 100644 index 000000000..830af13be --- /dev/null +++ b/speechx/speechx/frontend/db_norm.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "frontend/normalizer.h" +#include "kaldi/feat/cmvn.h" +#include "kaldi/util/kaldi-io.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor) { + base_extractor_ = std::move(base_extractor); + opts_ = opts; + dim_ = 1; +} + +void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); +} + +bool DecibelNormalizer::Read(kaldi::Vector* waves) { + if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { + return false; + } + Compute(waves); + return true; +} + +bool DecibelNormalizer::Compute(VectorBase* waves) const { + // calculate db rms + BaseFloat rms_db = 0.0; + BaseFloat mean_square = 0.0; + BaseFloat gain = 0.0; + BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); + + vector samples; + samples.resize(waves->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + samples[i] = (*waves)(i); + } + + // square + for (auto& d : samples) { + if (opts_.convert_int_float) { + d = d * wave_float_normlization; + } + mean_square += d * d; + } + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts_.target_db - rms_db; + + if (gain > opts_.max_gain_db) { + LOG(ERROR) + << "Unable to normalize segment to " << opts_.target_db << "dB," + << "because the the probable gain have exceeds opts_.max_gain_db" + << opts_.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto& item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + std::memcpy( + waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); + return true; +} + + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/db_norm.h b/speechx/speechx/frontend/db_norm.h new file mode 100644 index 000000000..3d3710715 --- /dev/null +++ b/speechx/speechx/frontend/db_norm.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#pragma once + +#include "base/common.h" +#include "frontend/frontend_itf.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +namespace ppspeech { + +struct DecibelNormalizerOptions { + float target_db; + float max_gain_db; + bool convert_int_float; + DecibelNormalizerOptions() + : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register( + "target-db", &target_db, "target db for db normalization"); + opts->Register( + "max-gain-db", &max_gain_db, "max gain db for db normalization"); + opts->Register("convert-int-float", + &convert_int_float, + "if convert int samples to float"); + } +}; + +class DecibelNormalizer : public FrontendInterface { + public: + explicit DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& waves); + virtual bool Read(kaldi::Vector* waves); + // noramlize audio, the dim is 1. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + virtual void Reset() { base_extractor_->Reset(); } + + private: + bool Compute(kaldi::VectorBase* waves) const; + DecibelNormalizerOptions opts_; + size_t dim_; + std::unique_ptr base_extractor_; + kaldi::Vector waveform_; +}; + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index df1819612..89599c2a6 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -1,89 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - #pragma once -#include "base/common.h" -#include "frontend/frontend_itf.h" -#include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" - -namespace ppspeech { - -struct DecibelNormalizerOptions { - float target_db; - float max_gain_db; - bool convert_int_float; - DecibelNormalizerOptions() - : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register( - "target-db", &target_db, "target db for db normalization"); - opts->Register( - "max-gain-db", &max_gain_db, "max gain db for db normalization"); - opts->Register("convert-int-float", - &convert_int_float, - "if convert int samples to float"); - } -}; - -class DecibelNormalizer : public FrontendInterface { - public: - explicit DecibelNormalizer( - const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& waves); - virtual bool Read(kaldi::Vector* waves); - // noramlize audio, the dim is 1. - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } - - private: - bool Compute(kaldi::VectorBase* waves) const; - DecibelNormalizerOptions opts_; - size_t dim_; - std::unique_ptr base_extractor_; - kaldi::Vector waveform_; -}; - - -class CMVN : public FrontendInterface { - public: - explicit CMVN(std::string cmvn_file, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); - - // the length of feats = feature_row * feature_dim, - // the Matrix is squashed into Vector - virtual bool Read(kaldi::Vector* feats); - // the dim_ is the feautre dim. - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } - - private: - void Compute(kaldi::VectorBase* feats) const; - void ApplyCMVN(kaldi::MatrixBase* feats); - kaldi::Matrix stats_; - std::unique_ptr base_extractor_; - size_t dim_; - bool var_norm_; -}; - -} // namespace ppspeech \ No newline at end of file +#include "frontend/cmvn.h" +#include "frontend/db_norm.h" \ No newline at end of file From 42c7537ce629a7fc717d27e5fc36073d86f8fce6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:43:39 +0000 Subject: [PATCH 3/5] frontend to audio dir --- speechx/examples/decoder/offline_decoder_main.cc | 2 +- .../decoder/offline_decoder_sliding_chunk_main.cc | 2 +- speechx/examples/feat/linear_spectrogram_main.cc | 13 ++++++------- speechx/speechx/frontend/CMakeLists.txt | 11 +---------- speechx/speechx/frontend/audio/CMakeLists.txt | 11 +++++++++++ .../speechx/frontend/{ => audio}/audio_cache.cc | 2 +- .../speechx/frontend/{ => audio}/audio_cache.h | 2 +- speechx/speechx/frontend/{ => audio}/cmvn.cc | 3 +-- speechx/speechx/frontend/{ => audio}/cmvn.h | 2 +- speechx/speechx/frontend/{ => audio}/data_cache.h | 2 +- speechx/speechx/frontend/{ => audio}/db_norm.cc | 2 +- speechx/speechx/frontend/{ => audio}/db_norm.h | 2 +- speechx/speechx/frontend/{ => audio}/fbank.h | 0 .../speechx/frontend/{ => audio}/feature_cache.cc | 2 +- .../speechx/frontend/{ => audio}/feature_cache.h | 2 +- .../speechx/frontend/{ => audio}/frontend_itf.h | 0 .../frontend/{ => audio}/linear_spectrogram.cc | 2 +- .../frontend/{ => audio}/linear_spectrogram.h | 2 +- speechx/speechx/frontend/{ => audio}/mfcc.h | 0 speechx/speechx/frontend/audio/normalizer.h | 4 ++++ speechx/speechx/frontend/normalizer.h | 4 ---- speechx/speechx/frontend/window.h | 15 --------------- speechx/speechx/nnet/decodable.h | 2 +- 23 files changed, 36 insertions(+), 51 deletions(-) rename speechx/speechx/frontend/{ => audio}/audio_cache.cc (98%) rename speechx/speechx/frontend/{ => audio}/audio_cache.h (97%) rename speechx/speechx/frontend/{ => audio}/cmvn.cc (98%) rename speechx/speechx/frontend/{ => audio}/cmvn.h (94%) rename speechx/speechx/frontend/{ => audio}/data_cache.h (97%) rename speechx/speechx/frontend/{ => audio}/db_norm.cc (98%) rename speechx/speechx/frontend/{ => audio}/db_norm.h (97%) rename speechx/speechx/frontend/{ => audio}/fbank.h (100%) rename speechx/speechx/frontend/{ => audio}/feature_cache.cc (97%) rename speechx/speechx/frontend/{ => audio}/feature_cache.h (97%) rename speechx/speechx/frontend/{ => audio}/frontend_itf.h (100%) rename speechx/speechx/frontend/{ => audio}/linear_spectrogram.cc (98%) rename speechx/speechx/frontend/{ => audio}/linear_spectrogram.h (97%) rename speechx/speechx/frontend/{ => audio}/mfcc.h (100%) create mode 100644 speechx/speechx/frontend/audio/normalizer.h delete mode 100644 speechx/speechx/frontend/normalizer.h delete mode 100644 speechx/speechx/frontend/window.h diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index 6bd83b9b1..9a9c14a0c 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -17,7 +17,7 @@ #include "base/flags.h" #include "base/log.h" #include "decoder/ctc_beam_search_decoder.h" -#include "frontend/data_cache.h" +#include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc index 4d5ffe145..7f6c572ca 100644 --- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc +++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc @@ -17,7 +17,7 @@ #include "base/flags.h" #include "base/log.h" #include "decoder/ctc_beam_search_decoder.h" -#include "frontend/data_cache.h" +#include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index c29d2b21f..8f32bac2a 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -14,19 +14,18 @@ // todo refactor, repalce with gtest -#include "frontend/linear_spectrogram.h" #include "base/flags.h" #include "base/log.h" -#include "frontend/audio_cache.h" -#include "frontend/data_cache.h" -#include "frontend/feature_cache.h" -#include "frontend/frontend_itf.h" -#include "frontend/normalizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -#include +#include "frontend/audio/linear_spectrogram.h" +#include "frontend/audio/audio_cache.h" +#include "frontend/audio/data_cache.h" +#include "frontend/audio/feature_cache.h" +#include "frontend/audio/frontend_itf.h" +#include "frontend/audio/normalizer.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index 35243b6e3..7d10fdec9 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -1,11 +1,2 @@ -project(frontend) -add_library(frontend STATIC - cmvn.cc - db_norm.cc - linear_spectrogram.cc - audio_cache.cc - feature_cache.cc -) - -target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file +add_subdirectory(audio) \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index e69de29bb..35243b6e3 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -0,0 +1,11 @@ +project(frontend) + +add_library(frontend STATIC + cmvn.cc + db_norm.cc + linear_spectrogram.cc + audio_cache.cc + feature_cache.cc +) + +target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file diff --git a/speechx/speechx/frontend/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc similarity index 98% rename from speechx/speechx/frontend/audio_cache.cc rename to speechx/speechx/frontend/audio/audio_cache.cc index d44ed592c..c3233e595 100644 --- a/speechx/speechx/frontend/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/audio_cache.h" +#include "frontend/audio/audio_cache.h" #include "kaldi/base/timer.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h similarity index 97% rename from speechx/speechx/frontend/audio_cache.h rename to speechx/speechx/frontend/audio/audio_cache.h index f48da12b7..17e1a8389 100644 --- a/speechx/speechx/frontend/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc similarity index 98% rename from speechx/speechx/frontend/cmvn.cc rename to speechx/speechx/frontend/audio/cmvn.cc index d9bba9435..706492b7c 100644 --- a/speechx/speechx/frontend/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -1,5 +1,5 @@ -#include "frontend/normalizer.h" +#include "frontend/audio/cmvn.h" #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" @@ -13,7 +13,6 @@ using kaldi::SubVector; using std::unique_ptr; - CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { diff --git a/speechx/speechx/frontend/cmvn.h b/speechx/speechx/frontend/audio/cmvn.h similarity index 94% rename from speechx/speechx/frontend/cmvn.h rename to speechx/speechx/frontend/audio/cmvn.h index fdf2a87a4..b3cfbb11a 100644 --- a/speechx/speechx/frontend/cmvn.h +++ b/speechx/speechx/frontend/audio/cmvn.h @@ -1,7 +1,7 @@ #pragma once #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" diff --git a/speechx/speechx/frontend/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h similarity index 97% rename from speechx/speechx/frontend/data_cache.h rename to speechx/speechx/frontend/audio/data_cache.h index b8ce6bf65..a812278ce 100644 --- a/speechx/speechx/frontend/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -17,7 +17,7 @@ #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc similarity index 98% rename from speechx/speechx/frontend/db_norm.cc rename to speechx/speechx/frontend/audio/db_norm.cc index 830af13be..931e932d6 100644 --- a/speechx/speechx/frontend/db_norm.cc +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -13,7 +13,7 @@ // limitations under the License. -#include "frontend/normalizer.h" +#include "frontend/audio/db_norm.h" #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" diff --git a/speechx/speechx/frontend/db_norm.h b/speechx/speechx/frontend/audio/db_norm.h similarity index 97% rename from speechx/speechx/frontend/db_norm.h rename to speechx/speechx/frontend/audio/db_norm.h index 3d3710715..425971437 100644 --- a/speechx/speechx/frontend/db_norm.h +++ b/speechx/speechx/frontend/audio/db_norm.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/audio/fbank.h similarity index 100% rename from speechx/speechx/frontend/fbank.h rename to speechx/speechx/frontend/audio/fbank.h diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc similarity index 97% rename from speechx/speechx/frontend/feature_cache.cc rename to speechx/speechx/frontend/audio/feature_cache.cc index 53b7076d5..d7bea61ad 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/feature_cache.h" +#include "frontend/audio/feature_cache.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h similarity index 97% rename from speechx/speechx/frontend/feature_cache.h rename to speechx/speechx/frontend/audio/feature_cache.h index 1281ec35a..99961b5e2 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -15,7 +15,7 @@ #pragma once #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/frontend_itf.h b/speechx/speechx/frontend/audio/frontend_itf.h similarity index 100% rename from speechx/speechx/frontend/frontend_itf.h rename to speechx/speechx/frontend/audio/frontend_itf.h diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc similarity index 98% rename from speechx/speechx/frontend/linear_spectrogram.cc rename to speechx/speechx/frontend/audio/linear_spectrogram.cc index 2ba00785a..827b8eccf 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/linear_spectrogram.h" +#include "frontend/audio/linear_spectrogram.h" #include "kaldi/base/kaldi-math.h" #include "kaldi/matrix/matrix-functions.h" diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h similarity index 97% rename from speechx/speechx/frontend/linear_spectrogram.h rename to speechx/speechx/frontend/audio/linear_spectrogram.h index 136441efe..bbf8d6853 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/audio/linear_spectrogram.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/feat/feature-window.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h similarity index 100% rename from speechx/speechx/frontend/mfcc.h rename to speechx/speechx/frontend/audio/mfcc.h diff --git a/speechx/speechx/frontend/audio/normalizer.h b/speechx/speechx/frontend/audio/normalizer.h new file mode 100644 index 000000000..df9e4b751 --- /dev/null +++ b/speechx/speechx/frontend/audio/normalizer.h @@ -0,0 +1,4 @@ +#pragma once + +#include "frontend/audio/cmvn.h" +#include "frontend/audio/db_norm.h" \ No newline at end of file diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h deleted file mode 100644 index 89599c2a6..000000000 --- a/speechx/speechx/frontend/normalizer.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#include "frontend/cmvn.h" -#include "frontend/db_norm.h" \ No newline at end of file diff --git a/speechx/speechx/frontend/window.h b/speechx/speechx/frontend/window.h deleted file mode 100644 index 70d6307ec..000000000 --- a/speechx/speechx/frontend/window.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// extract the window of kaldi feat. diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index ef17601fa..c75a0f4de 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -13,7 +13,7 @@ // limitations under the License. #include "base/common.h" -#include "frontend/frontend_itf.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/decodable-itf.h" #include "nnet/nnet_interface.h" From 9071b9597de42bccbd34202d664ee834fb5fc34b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:49:03 +0000 Subject: [PATCH 4/5] format code --- .../examples/feat/linear_spectrogram_main.cc | 7 +++---- speechx/speechx/frontend/audio/cmvn.cc | 17 +++++++++++++++-- speechx/speechx/frontend/audio/cmvn.h | 14 ++++++++++++++ speechx/speechx/frontend/audio/feature_cache.cc | 4 ++-- speechx/speechx/frontend/audio/normalizer.h | 14 ++++++++++++++ speechx/speechx/nnet/decodable.h | 5 ++--- 6 files changed, 50 insertions(+), 11 deletions(-) diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index 8f32bac2a..ca76d85c7 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -20,11 +20,11 @@ #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -#include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" +#include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); @@ -187,9 +187,8 @@ int main(int argc, char* argv[]) { std::unique_ptr linear_spectrogram( new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); - std::unique_ptr cmvn( - new ppspeech::CMVN(FLAGS_cmvn_write_path, - std::move(linear_spectrogram))); + std::unique_ptr cmvn(new ppspeech::CMVN( + FLAGS_cmvn_write_path, std::move(linear_spectrogram))); ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); LOG(INFO) << "feat dim: " << feature_cache.Dim(); diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 706492b7c..4c1ffd6a1 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "frontend/audio/cmvn.h" #include "kaldi/feat/cmvn.h" @@ -13,8 +27,7 @@ using kaldi::SubVector; using std::unique_ptr; -CMVN::CMVN(std::string cmvn_file, - unique_ptr base_extractor) +CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { base_extractor_ = std::move(base_extractor); bool binary; diff --git a/speechx/speechx/frontend/audio/cmvn.h b/speechx/speechx/frontend/audio/cmvn.h index b3cfbb11a..50ef5649b 100644 --- a/speechx/speechx/frontend/audio/cmvn.h +++ b/speechx/speechx/frontend/audio/cmvn.h @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once #include "base/common.h" diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index d7bea61ad..3f7f6502b 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -23,8 +23,8 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -FeatureCache::FeatureCache( - int max_size, unique_ptr base_extractor) { +FeatureCache::FeatureCache(int max_size, + unique_ptr base_extractor) { max_size_ = max_size; base_extractor_ = std::move(base_extractor); } diff --git a/speechx/speechx/frontend/audio/normalizer.h b/speechx/speechx/frontend/audio/normalizer.h index df9e4b751..dcf721dd2 100644 --- a/speechx/speechx/frontend/audio/normalizer.h +++ b/speechx/speechx/frontend/audio/normalizer.h @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once #include "frontend/audio/cmvn.h" diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index c75a0f4de..3f0aab047 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -24,9 +24,8 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable( - const std::shared_ptr& nnet, - const std::shared_ptr& frontend); + explicit Decodable(const std::shared_ptr& nnet, + const std::shared_ptr& frontend); // void Init(DecodableOpts config); virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); virtual bool IsLastFrame(int32 frame) const; From f83ec41161ef3ef8969495f34587ca6870f7cc79 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 11:11:08 +0000 Subject: [PATCH 5/5] rename nnet itf --- speechx/speechx/nnet/{nnet_interface.h => nnet_itf.h} | 0 speechx/speechx/nnet/paddle_nnet.h | 7 ++++--- 2 files changed, 4 insertions(+), 3 deletions(-) rename speechx/speechx/nnet/{nnet_interface.h => nnet_itf.h} (100%) diff --git a/speechx/speechx/nnet/nnet_interface.h b/speechx/speechx/nnet/nnet_itf.h similarity index 100% rename from speechx/speechx/nnet/nnet_interface.h rename to speechx/speechx/nnet/nnet_itf.h diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h index 30fbac9f1..906994d06 100644 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/paddle_nnet.h @@ -15,13 +15,14 @@ #pragma once -#include "base/common.h" -#include "nnet/nnet_interface.h" -#include "paddle_inference_api.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" +#include "base/common.h" +#include "nnet/nnet_itf.h" +#include "paddle_inference_api.h" + #include namespace ppspeech {