diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index 6bd83b9b1..9a9c14a0c 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -17,7 +17,7 @@ #include "base/flags.h" #include "base/log.h" #include "decoder/ctc_beam_search_decoder.h" -#include "frontend/data_cache.h" +#include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc index 4d5ffe145..7f6c572ca 100644 --- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc +++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc @@ -17,7 +17,7 @@ #include "base/flags.h" #include "base/log.h" #include "decoder/ctc_beam_search_decoder.h" -#include "frontend/data_cache.h" +#include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index e1f0a8954..ca76d85c7 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -14,19 +14,18 @@ // todo refactor, repalce with gtest -#include "frontend/linear_spectrogram.h" #include "base/flags.h" #include "base/log.h" -#include "frontend/audio_cache.h" -#include "frontend/data_cache.h" -#include "frontend/feature_cache.h" -#include "frontend/feature_extractor_interface.h" -#include "frontend/normalizer.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -#include +#include "frontend/audio/audio_cache.h" +#include "frontend/audio/data_cache.h" +#include "frontend/audio/feature_cache.h" +#include "frontend/audio/frontend_itf.h" +#include "frontend/audio/linear_spectrogram.h" +#include "frontend/audio/normalizer.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); @@ -170,13 +169,13 @@ int main(int argc, char* argv[]) { // feature pipeline: wave cache --> decibel_normalizer --> hanning // window -->linear_spectrogram --> global cmvn -> feat cache - // std::unique_ptr data_source(new + // std::unique_ptr data_source(new // ppspeech::DataCache()); - std::unique_ptr data_source( + std::unique_ptr data_source( new ppspeech::AudioCache()); ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr db_norm( + std::unique_ptr db_norm( new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); ppspeech::LinearSpectrogramOptions opt; @@ -185,12 +184,11 @@ int main(int argc, char* argv[]) { LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; - std::unique_ptr linear_spectrogram( + std::unique_ptr linear_spectrogram( new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); - std::unique_ptr cmvn( - new ppspeech::CMVN(FLAGS_cmvn_write_path, - std::move(linear_spectrogram))); + std::unique_ptr cmvn(new ppspeech::CMVN( + FLAGS_cmvn_write_path, std::move(linear_spectrogram))); ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); LOG(INFO) << "feat dim: " << feature_cache.Dim(); diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index d0ec008ee..7d10fdec9 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -1,10 +1,2 @@ -project(frontend) -add_library(frontend STATIC - normalizer.cc - linear_spectrogram.cc - audio_cache.cc - feature_cache.cc -) - -target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file +add_subdirectory(audio) \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index e69de29bb..35243b6e3 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -0,0 +1,11 @@ +project(frontend) + +add_library(frontend STATIC + cmvn.cc + db_norm.cc + linear_spectrogram.cc + audio_cache.cc + feature_cache.cc +) + +target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file diff --git a/speechx/speechx/frontend/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc similarity index 98% rename from speechx/speechx/frontend/audio_cache.cc rename to speechx/speechx/frontend/audio/audio_cache.cc index d44ed592c..c3233e595 100644 --- a/speechx/speechx/frontend/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/audio_cache.h" +#include "frontend/audio/audio_cache.h" #include "kaldi/base/timer.h" namespace ppspeech { diff --git a/speechx/speechx/frontend/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h similarity index 94% rename from speechx/speechx/frontend/audio_cache.h rename to speechx/speechx/frontend/audio/audio_cache.h index b6c82c69e..17e1a8389 100644 --- a/speechx/speechx/frontend/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -16,12 +16,12 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { // waves cache -class AudioCache : public FeatureExtractorInterface { +class AudioCache : public FrontendInterface { public: explicit AudioCache(int buffer_size = kint16max); diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/audio/cmvn.cc similarity index 67% rename from speechx/speechx/frontend/normalizer.cc rename to speechx/speechx/frontend/audio/cmvn.cc index 524125619..4c1ffd6a1 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -13,7 +13,7 @@ // limitations under the License. -#include "frontend/normalizer.h" +#include "frontend/audio/cmvn.h" #include "kaldi/feat/cmvn.h" #include "kaldi/util/kaldi-io.h" @@ -26,73 +26,8 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer( - const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor) { - base_extractor_ = std::move(base_extractor); - opts_ = opts; - dim_ = 1; -} - -void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { - base_extractor_->Accept(waves); -} - -bool DecibelNormalizer::Read(kaldi::Vector* waves) { - if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { - return false; - } - Compute(waves); - return true; -} - -bool DecibelNormalizer::Compute(VectorBase* waves) const { - // calculate db rms - BaseFloat rms_db = 0.0; - BaseFloat mean_square = 0.0; - BaseFloat gain = 0.0; - BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); - - vector samples; - samples.resize(waves->Dim()); - for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*waves)(i); - } - - // square - for (auto& d : samples) { - if (opts_.convert_int_float) { - d = d * wave_float_normlization; - } - mean_square += d * d; - } - - // mean - mean_square /= samples.size(); - rms_db = 10 * std::log10(mean_square); - gain = opts_.target_db - rms_db; - - if (gain > opts_.max_gain_db) { - LOG(ERROR) - << "Unable to normalize segment to " << opts_.target_db << "dB," - << "because the the probable gain have exceeds opts_.max_gain_db" - << opts_.max_gain_db << "dB."; - return false; - } - - // Note that this is an in-place transformation. - for (auto& item : samples) { - // python item *= 10.0 ** (gain / 20.0) - item *= std::pow(10.0, gain / 20.0); - } - - std::memcpy( - waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); - return true; -} -CMVN::CMVN(std::string cmvn_file, - unique_ptr base_extractor) +CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { base_extractor_ = std::move(base_extractor); bool binary; @@ -185,4 +120,4 @@ void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { ApplyCmvn(stats_, var_norm_, feats); } -} // namespace ppspeech +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/cmvn.h b/speechx/speechx/frontend/audio/cmvn.h new file mode 100644 index 000000000..50ef5649b --- /dev/null +++ b/speechx/speechx/frontend/audio/cmvn.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/common.h" +#include "frontend/audio/frontend_itf.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +namespace ppspeech { + +class CMVN : public FrontendInterface { + public: + explicit CMVN(std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& inputs); + + // the length of feats = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* feats); + // the dim_ is the feautre dim. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + virtual void Reset() { base_extractor_->Reset(); } + + private: + void Compute(kaldi::VectorBase* feats) const; + void ApplyCMVN(kaldi::MatrixBase* feats); + kaldi::Matrix stats_; + std::unique_ptr base_extractor_; + size_t dim_; + bool var_norm_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h similarity index 93% rename from speechx/speechx/frontend/data_cache.h rename to speechx/speechx/frontend/audio/data_cache.h index dea51d76e..a812278ce 100644 --- a/speechx/speechx/frontend/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -17,13 +17,13 @@ #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { // A data source for testing different frontend module. // It accepts waves or feats. -class DataCache : public FeatureExtractorInterface { +class DataCache : public FrontendInterface { public: explicit DataCache() { finished_ = false; } diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc new file mode 100644 index 000000000..931e932d6 --- /dev/null +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "frontend/audio/db_norm.h" +#include "kaldi/feat/cmvn.h" +#include "kaldi/util/kaldi-io.h" + +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor) { + base_extractor_ = std::move(base_extractor); + opts_ = opts; + dim_ = 1; +} + +void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); +} + +bool DecibelNormalizer::Read(kaldi::Vector* waves) { + if (base_extractor_->Read(waves) == false || waves->Dim() == 0) { + return false; + } + Compute(waves); + return true; +} + +bool DecibelNormalizer::Compute(VectorBase* waves) const { + // calculate db rms + BaseFloat rms_db = 0.0; + BaseFloat mean_square = 0.0; + BaseFloat gain = 0.0; + BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); + + vector samples; + samples.resize(waves->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + samples[i] = (*waves)(i); + } + + // square + for (auto& d : samples) { + if (opts_.convert_int_float) { + d = d * wave_float_normlization; + } + mean_square += d * d; + } + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts_.target_db - rms_db; + + if (gain > opts_.max_gain_db) { + LOG(ERROR) + << "Unable to normalize segment to " << opts_.target_db << "dB," + << "because the the probable gain have exceeds opts_.max_gain_db" + << opts_.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto& item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + std::memcpy( + waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size()); + return true; +} + + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/audio/db_norm.h similarity index 62% rename from speechx/speechx/frontend/normalizer.h rename to speechx/speechx/frontend/audio/db_norm.h index 352d1e167..425971437 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/audio/db_norm.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" @@ -40,11 +40,11 @@ struct DecibelNormalizerOptions { } }; -class DecibelNormalizer : public FeatureExtractorInterface { +class DecibelNormalizer : public FrontendInterface { public: explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. @@ -57,33 +57,9 @@ class DecibelNormalizer : public FeatureExtractorInterface { bool Compute(kaldi::VectorBase* waves) const; DecibelNormalizerOptions opts_; size_t dim_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; kaldi::Vector waveform_; }; -class CMVN : public FeatureExtractorInterface { - public: - explicit CMVN(std::string cmvn_file, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); - - // the length of feats = feature_row * feature_dim, - // the Matrix is squashed into Vector - virtual bool Read(kaldi::Vector* feats); - // the dim_ is the feautre dim. - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } - - private: - void Compute(kaldi::VectorBase* feats) const; - void ApplyCMVN(kaldi::MatrixBase* feats); - kaldi::Matrix stats_; - std::unique_ptr base_extractor_; - size_t dim_; - bool var_norm_; -}; - } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/audio/fbank.h similarity index 88% rename from speechx/speechx/frontend/fbank.h rename to speechx/speechx/frontend/audio/fbank.h index 7d9cf4221..68267b3d0 100644 --- a/speechx/speechx/frontend/fbank.h +++ b/speechx/speechx/frontend/audio/fbank.h @@ -20,10 +20,10 @@ namespace ppspeech { -class FbankExtractor : FeatureExtractorInterface { +class FbankExtractor : FrontendInterface { public: explicit FbankExtractor(const FbankOptions& opts, - share_ptr pre_extractor); + share_ptr pre_extractor); virtual void AcceptWaveform( const kaldi::Vector& input) = 0; virtual void Read(kaldi::Vector* feat) = 0; diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc similarity index 92% rename from speechx/speechx/frontend/feature_cache.cc rename to speechx/speechx/frontend/audio/feature_cache.cc index dad6907ce..3f7f6502b 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/feature_cache.h" +#include "frontend/audio/feature_cache.h" namespace ppspeech { @@ -23,8 +23,8 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -FeatureCache::FeatureCache( - int max_size, unique_ptr base_extractor) { +FeatureCache::FeatureCache(int max_size, + unique_ptr base_extractor) { max_size_ = max_size; base_extractor_ = std::move(base_extractor); } diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h similarity index 87% rename from speechx/speechx/frontend/feature_cache.h rename to speechx/speechx/frontend/audio/feature_cache.h index f52b9b0f6..99961b5e2 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -15,15 +15,15 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" namespace ppspeech { -class FeatureCache : public FeatureExtractorInterface { +class FeatureCache : public FrontendInterface { public: explicit FeatureCache( int32 max_size = kint16max, - std::unique_ptr base_extractor = NULL); + std::unique_ptr base_extractor = NULL); // Feed feats or waves virtual void Accept(const kaldi::VectorBase& inputs); @@ -53,7 +53,7 @@ class FeatureCache : public FeatureExtractorInterface { bool Compute(); size_t max_size_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; std::mutex mutex_; std::queue> cache_; diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/audio/frontend_itf.h similarity index 97% rename from speechx/speechx/frontend/feature_extractor_interface.h rename to speechx/speechx/frontend/audio/frontend_itf.h index 5da2526b9..7913cc7c0 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/audio/frontend_itf.h @@ -19,7 +19,7 @@ namespace ppspeech { -class FeatureExtractorInterface { +class FrontendInterface { public: // Feed inputs: features(2D saved in 1D) or waveforms(1D). virtual void Accept(const kaldi::VectorBase& inputs) = 0; diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc similarity index 97% rename from speechx/speechx/frontend/linear_spectrogram.cc rename to speechx/speechx/frontend/audio/linear_spectrogram.cc index 41bc8743a..827b8eccf 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "frontend/linear_spectrogram.h" +#include "frontend/audio/linear_spectrogram.h" #include "kaldi/base/kaldi-math.h" #include "kaldi/matrix/matrix-functions.h" @@ -27,7 +27,7 @@ using std::vector; LinearSpectrogram::LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor) { + std::unique_ptr base_extractor) { opts_ = opts; base_extractor_ = std::move(base_extractor); int32 window_size = opts.frame_opts.WindowSize(); diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h similarity index 89% rename from speechx/speechx/frontend/linear_spectrogram.h rename to speechx/speechx/frontend/audio/linear_spectrogram.h index 10853904d..bbf8d6853 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/audio/linear_spectrogram.h @@ -16,7 +16,7 @@ #pragma once #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/feat/feature-window.h" namespace ppspeech { @@ -35,11 +35,11 @@ struct LinearSpectrogramOptions { } }; -class LinearSpectrogram : public FeatureExtractorInterface { +class LinearSpectrogram : public FrontendInterface { public: explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); + std::unique_ptr base_extractor); virtual void Accept(const kaldi::VectorBase& inputs); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature @@ -61,7 +61,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - std::unique_ptr base_extractor_; + std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); }; diff --git a/speechx/speechx/frontend/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h similarity index 100% rename from speechx/speechx/frontend/mfcc.h rename to speechx/speechx/frontend/audio/mfcc.h diff --git a/speechx/speechx/frontend/window.h b/speechx/speechx/frontend/audio/normalizer.h similarity index 88% rename from speechx/speechx/frontend/window.h rename to speechx/speechx/frontend/audio/normalizer.h index 70d6307ec..dcf721dd2 100644 --- a/speechx/speechx/frontend/window.h +++ b/speechx/speechx/frontend/audio/normalizer.h @@ -12,4 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// extract the window of kaldi feat. +#pragma once + +#include "frontend/audio/cmvn.h" +#include "frontend/audio/db_norm.h" \ No newline at end of file diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h deleted file mode 100644 index 0544a1e29..000000000 --- a/speechx/speechx/frontend/feature_extractor_controller.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h deleted file mode 100644 index 0544a1e29..000000000 --- a/speechx/speechx/frontend/feature_extractor_controller_impl.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index e6315d07a..542168d24 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -22,7 +22,7 @@ using std::vector; using kaldi::Vector; Decodable::Decodable(const std::shared_ptr& nnet, - const std::shared_ptr& frontend) + const std::shared_ptr& frontend) : frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {} void Decodable::Acceptlikelihood(const Matrix& likelihood) { diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 7938b5823..3f0aab047 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -13,7 +13,7 @@ // limitations under the License. #include "base/common.h" -#include "frontend/feature_extractor_interface.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/matrix/kaldi-matrix.h" #include "nnet/decodable-itf.h" #include "nnet/nnet_interface.h" @@ -24,9 +24,8 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable( - const std::shared_ptr& nnet, - const std::shared_ptr& frontend); + explicit Decodable(const std::shared_ptr& nnet, + const std::shared_ptr& frontend); // void Init(DecodableOpts config); virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); virtual bool IsLastFrame(int32 frame) const; @@ -41,7 +40,7 @@ class Decodable : public kaldi::DecodableInterface { private: bool AdvanceChunk(); - std::shared_ptr frontend_; + std::shared_ptr frontend_; std::shared_ptr nnet_; kaldi::Matrix nnet_cache_; // std::vector> nnet_cache_; diff --git a/speechx/speechx/nnet/nnet_interface.h b/speechx/speechx/nnet/nnet_itf.h similarity index 100% rename from speechx/speechx/nnet/nnet_interface.h rename to speechx/speechx/nnet/nnet_itf.h diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h index 30fbac9f1..906994d06 100644 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/paddle_nnet.h @@ -15,13 +15,14 @@ #pragma once -#include "base/common.h" -#include "nnet/nnet_interface.h" -#include "paddle_inference_api.h" #include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" +#include "base/common.h" +#include "nnet/nnet_itf.h" +#include "paddle_inference_api.h" + #include namespace ppspeech {