From a9f4ce47a34bbd62c88090ef9a6e3498dbfc669a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 1 Apr 2022 10:24:16 +0000 Subject: [PATCH] frontend itf --- speechx/examples/feat/linear_spectrogram_main.cc | 12 ++++++------ speechx/speechx/frontend/audio_cache.h | 4 ++-- speechx/speechx/frontend/data_cache.h | 4 ++-- speechx/speechx/frontend/fbank.h | 4 ++-- speechx/speechx/frontend/feature_cache.cc | 2 +- speechx/speechx/frontend/feature_cache.h | 8 ++++---- .../frontend/feature_extractor_controller.h | 13 ------------- .../frontend/feature_extractor_controller_impl.h | 13 ------------- ...eature_extractor_interface.h => frontend_itf.h} | 2 +- speechx/speechx/frontend/linear_spectrogram.cc | 2 +- speechx/speechx/frontend/linear_spectrogram.h | 8 ++++---- speechx/speechx/frontend/normalizer.cc | 4 ++-- speechx/speechx/frontend/normalizer.h | 14 +++++++------- speechx/speechx/nnet/decodable.cc | 2 +- speechx/speechx/nnet/decodable.h | 6 +++--- 15 files changed, 36 insertions(+), 62 deletions(-) delete mode 100644 speechx/speechx/frontend/feature_extractor_controller.h delete mode 100644 speechx/speechx/frontend/feature_extractor_controller_impl.h rename speechx/speechx/frontend/{feature_extractor_interface.h => frontend_itf.h} (97%) diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index e1f0a895..c29d2b21 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -20,7 +20,7 @@ #include "frontend/audio_cache.h" #include "frontend/data_cache.h" #include "frontend/feature_cache.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "frontend/normalizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" @@ -170,13 +170,13 @@ int main(int argc, char* argv[]) { // feature pipeline: wave cache --> decibel_normalizer --> hanning // window -->linear_spectrogram --> global cmvn -> feat cache - // std::unique_ptr data_source(new + // std::unique_ptr data_source(new // ppspeech::DataCache()); - std::unique_ptr data_source( + std::unique_ptr data_source( new ppspeech::AudioCache()); ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr db_norm( + std::unique_ptr db_norm( new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); ppspeech::LinearSpectrogramOptions opt; @@ -185,10 +185,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; - std::unique_ptr linear_spectrogram( + std::unique_ptr linear_spectrogram( new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); - std::unique_ptr cmvn( + std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_write_path, std::move(linear_spectrogram))); diff --git a/speechx/speechx/frontend/audio_cache.h b/speechx/speechx/frontend/audio_cache.h index b6c82c69..f48da12b 100644 --- a/speechx/speechx/frontend/audio_cache.h +++ b/speechx/speechx/frontend/audio_cache.h @@ -16,12 +16,12 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { // waves cache -class AudioCache : public FeatureExtractorInterface { +class AudioCache : public FrontendInterface { public: explicit AudioCache(int buffer_size = kint16max); diff --git a/speechx/speechx/frontend/data_cache.h b/speechx/speechx/frontend/data_cache.h index dea51d76..b8ce6bf6 100644 --- a/speechx/speechx/frontend/data_cache.h +++ b/speechx/speechx/frontend/data_cache.h @@ -17,13 +17,13 @@ #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { // A data source for testing different frontend module. // It accepts waves or feats. -class DataCache : public FeatureExtractorInterface { +class DataCache : public FrontendInterface { public: explicit DataCache() { finished_ = false; } diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/fbank.h index 7d9cf422..68267b3d 100644 --- a/speechx/speechx/frontend/fbank.h +++ b/speechx/speechx/frontend/fbank.h @@ -20,10 +20,10 @@ namespace ppspeech { -class FbankExtractor : FeatureExtractorInterface { +class FbankExtractor : FrontendInterface { public: explicit FbankExtractor(const FbankOptions& opts, - share_ptr pre_extractor); + share_ptr pre_extractor); virtual void AcceptWaveform( const kaldi::Vector& input) = 0; virtual void Read(kaldi::Vector* feat) = 0; diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index dad6907c..53b7076d 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -24,7 +24,7 @@ using kaldi::SubVector; using std::unique_ptr; FeatureCache::FeatureCache( - int max_size, unique_ptr base_extractor) { + int max_size, unique_ptr base_extractor) { max_size_ = max_size; base_extractor_ = std::move(base_extractor); } diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index f52b9b0f..1281ec35 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -15,15 +15,15 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" namespace ppspeech { -class FeatureCache : public FeatureExtractorInterface { +class FeatureCache : public FrontendInterface { public: explicit FeatureCache( int32 max_size = kint16max, - std::unique_ptr base_extractor = NULL); + std::unique_ptr base_extractor = NULL); // Feed feats or waves virtual void Accept(const kaldi::VectorBase& inputs); @@ -53,7 +53,7 @@ class FeatureCache : public FeatureExtractorInterface { bool Compute(); size_t max_size_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; std::mutex mutex_; std::queue> cache_; diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h deleted file mode 100644 index 0544a1e2..00000000 --- a/speechx/speechx/frontend/feature_extractor_controller.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h deleted file mode 100644 index 0544a1e2..00000000 --- a/speechx/speechx/frontend/feature_extractor_controller_impl.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/frontend_itf.h similarity index 97% rename from speechx/speechx/frontend/feature_extractor_interface.h rename to speechx/speechx/frontend/frontend_itf.h index 5da2526b..7913cc7c 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/frontend_itf.h @@ -19,7 +19,7 @@ namespace ppspeech { -class FeatureExtractorInterface { +class FrontendInterface { public: // Feed inputs: features(2D saved in 1D) or waveforms(1D). virtual void Accept(const kaldi::VectorBase& inputs) = 0; diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 41bc8743..2ba00785 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -27,7 +27,7 @@ using std::vector; LinearSpectrogram::LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor) { + std::unique_ptr base_extractor) { opts_ = opts; base_extractor_ = std::move(base_extractor); int32 window_size = opts.frame_opts.WindowSize(); diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index 10853904..136441ef 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/feat/feature-window.h" namespace ppspeech { @@ -35,11 +35,11 @@ struct LinearSpectrogramOptions { } }; -class LinearSpectrogram : public FeatureExtractorInterface { +class LinearSpectrogram : public FrontendInterface { public: explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& inputs); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature @@ -61,7 +61,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); }; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 52412561..26f11b69 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -28,7 +28,7 @@ using std::unique_ptr; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor) { + std::unique_ptr base_extractor) { base_extractor_ = std::move(base_extractor); opts_ = opts; dim_ = 1; @@ -92,7 +92,7 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const { } CMVN::CMVN(std::string cmvn_file, - unique_ptr base_extractor) + unique_ptr base_extractor) : var_norm_(true) { base_extractor_ = std::move(base_extractor); bool binary; diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 352d1e16..df181961 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" @@ -40,11 +40,11 @@ struct DecibelNormalizerOptions { } }; -class DecibelNormalizer : public FeatureExtractorInterface { +class DecibelNormalizer : public FrontendInterface { public: explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. @@ -57,15 +57,15 @@ class DecibelNormalizer : public FeatureExtractorInterface { bool Compute(kaldi::VectorBase* waves) const; DecibelNormalizerOptions opts_; size_t dim_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; kaldi::Vector waveform_; }; -class CMVN : public FeatureExtractorInterface { +class CMVN : public FrontendInterface { public: explicit CMVN(std::string cmvn_file, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& inputs); // the length of feats = feature_row * feature_dim, @@ -81,7 +81,7 @@ class CMVN : public FeatureExtractorInterface { void Compute(kaldi::VectorBase* feats) const; void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; size_t dim_; bool var_norm_; }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index e6315d07..542168d2 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -22,7 +22,7 @@ using std::vector; using kaldi::Vector; Decodable::Decodable(const std::shared_ptr& nnet, - const std::shared_ptr& frontend) + const std::shared_ptr& frontend) : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {} void Decodable::Acceptlikelihood(const Matrix& likelihood) { diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 7938b582..ef17601f 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -13,7 +13,7 @@ // limitations under the License. #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/decodable-itf.h" #include "nnet/nnet_interface.h" @@ -26,7 +26,7 @@ class Decodable : public kaldi::DecodableInterface { public: explicit Decodable( const std::shared_ptr& nnet, - const std::shared_ptr& frontend); + const std::shared_ptr& frontend); // void Init(DecodableOpts config); virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); virtual bool IsLastFrame(int32 frame) const; @@ -41,7 +41,7 @@ class Decodable : public kaldi::DecodableInterface { private: bool AdvanceChunk(); - std::shared_ptr frontend_; + std::shared_ptr frontend_; std::shared_ptr nnet_; kaldi::Matrix nnet_cache_; // std::vector> nnet_cache_;