From c82cf0d0866dc34e9547fc02ab9e4d42ed16679b Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Mon, 7 Mar 2022 20:51:53 +0800 Subject: [PATCH 1/7] add raw_audio & feature_cache --- .../examples/feat/linear-spectrogram-main.cc | 257 ------------------ speechx/speechx/frontend/feature_cache.cc | 38 +++ speechx/speechx/frontend/feature_cache.h | 21 ++ speechx/speechx/frontend/raw_audio.cc | 60 ++++ speechx/speechx/frontend/raw_audio.h | 34 +++ 5 files changed, 153 insertions(+), 257 deletions(-) delete mode 100644 speechx/examples/feat/linear-spectrogram-main.cc create mode 100644 speechx/speechx/frontend/feature_cache.cc create mode 100644 speechx/speechx/frontend/feature_cache.h create mode 100644 speechx/speechx/frontend/raw_audio.cc create mode 100644 speechx/speechx/frontend/raw_audio.h diff --git a/speechx/examples/feat/linear-spectrogram-main.cc b/speechx/examples/feat/linear-spectrogram-main.cc deleted file mode 100644 index 3e2342c2..00000000 --- a/speechx/examples/feat/linear-spectrogram-main.cc +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// todo refactor, repalce with gtest - -#include "base/flags.h" -#include "base/log.h" -#include "frontend/feature_extractor_interface.h" -#include "frontend/linear_spectrogram.h" -#include "frontend/normalizer.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - -DEFINE_string(wav_rspecifier, "", "test wav path"); -DEFINE_string(feature_wspecifier, "", "test wav ark"); -DEFINE_string(feature_check_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); - - -std::vector mean_{ - -13730251.531853663, -12982852.199316509, -13673844.299583456, - -13089406.559646806, -12673095.524938712, -12823859.223276224, - -13590267.158903603, -14257618.467152044, -14374605.116185192, - -14490009.21822485, -14849827.158924166, -15354435.470563512, - -15834149.206532761, -16172971.985514281, -16348740.496746974, - -16423536.699409386, -16556246.263649225, -16744088.772748645, - -16916184.08510357, -17054034.840031497, -17165612.509455364, - -17255955.470915023, -17322572.527648456, -17408943.862033736, - -17521554.799865916, -17620623.254924215, -17699792.395918526, - -17723364.411134344, -17741483.4433254, -17747426.888704527, - -17733315.928209435, -17748780.160905756, -17808336.883775543, - -17895918.671983004, -18009812.59173023, -18098188.66548325, - -18195798.958462656, -18293617.62980999, -18397432.92077201, - -18505834.787318766, -18585451.8100908, -18652438.235649142, - -18700960.306275308, -18734944.58792185, -18737426.313365128, - -18735347.165987637, -18738813.444170244, -18737086.848890636, - -18731576.2474336, -18717405.44095871, -18703089.25545657, - -18691014.546456724, -18692460.568905357, -18702119.628629155, - -18727710.621126678, -18761582.72034647, -18806745.835547544, - -18850674.8692112, -18884431.510951452, -18919999.992506847, - -18939303.799078144, -18952946.273760635, -18980289.22996379, - -19011610.17803294, -19040948.61805145, -19061021.429847397, - -19112055.53768819, -19149667.414264943, -19201127.05091321, - -19270250.82564605, -19334606.883057203, -19390513.336589377, - -19444176.259208687, -19502755.000038862, -19544333.014549147, - -19612668.183176614, -19681902.19006569, -19771969.951249883, - -19873329.723376893, -19996752.59235844, -20110031.131400537, - -20231658.612529557, -20319378.894054495, -20378534.45718066, - -20413332.089584175, -20438147.844177883, -20443710.248040095, - -20465457.02238927, -20488610.969337028, -20516295.16424432, - -20541423.795738827, -20553192.874953747, -20573605.50701977, - -20577871.61936797, -20571807.008916274, -20556242.38912231, - -20542199.30819195, -20521239.063551214, -20519150.80004532, - -20527204.80248933, -20536933.769257784, -20543470.522332076, - -20549700.089992985, -20551525.24958494, -20554873.406493705, - -20564277.65794227, -20572211.740052115, -20574305.69550465, - -20575494.450104576, -20567092.577932164, -20549302.929608088, - -20545445.11878376, -20546625.326603737, -20549190.03499401, - -20554824.947828256, -20568341.378989458, -20577582.331383612, - -20577980.519402675, -20566603.03458152, -20560131.592262644, - -20552166.469060015, -20549063.06763577, -20544490.562339947, - -20539817.82346569, -20528747.715731595, -20518026.24576161, - -20510977.844974525, -20506874.36087992, -20506731.11977665, - -20510482.133420516, -20507760.92101862, -20494644.834457114, - -20480107.89304893, -20461312.091867123, -20442941.75080173, - -20426123.02834838, -20424607.675283, -20426810.369107097, - -20434024.50097819, -20437404.75544205, -20447688.63916367, - -20460893.335563846, -20482922.735127095, -20503610.119434915, - -20527062.76448319, -20557830.035128627, -20593274.72068722, - -20632528.452965066, -20673637.471334763, -20733106.97143075, - -20842921.0447562, -21054357.83621519, -21416569.534189366, - -21978460.272811692, -22753170.052172784, -23671344.10563395, - -24613499.293358143, -25406477.12230188, -25884377.82156489, - -26049040.62791664, -26996879.104431007}; -std::vector variance_{ - 213747175.10846674, 188395815.34302503, 212706429.10966414, - 199109025.81461075, 189235901.23864496, 194901336.53253657, - 217481594.29306737, 238689869.12327808, 243977501.24115244, - 248479623.6431067, 259766741.47116545, 275516766.7790273, - 291271202.3691234, 302693239.8220509, 308627358.3997694, - 311143911.38788426, 315446105.07731867, 321705430.9341829, - 327458907.4659941, 332245072.43223983, 336251717.5935284, - 339694069.7639722, 342188204.4322228, 345587110.31313115, - 349903086.2875232, 353660214.20643026, 356700344.5270885, - 357665362.3529641, 358493352.05658793, 358857951.620328, - 358375239.52774596, 358899733.6342954, 361051818.3511561, - 364361716.05025816, 368750322.3771452, 372047800.6462831, - 375655861.1349018, 379358519.1980013, 383327605.3935181, - 387458599.282341, 390434692.3406868, 392994486.35057056, - 394874418.04603153, 396230525.79763395, 396365592.0414835, - 396334819.8242737, 396488353.19250053, 396438877.00744957, - 396197980.4459586, 395590921.6672991, 395001107.62072515, - 394528291.7318225, 394593110.424006, 395018405.59353715, - 396110577.5415993, 397506704.0371068, 399400197.4657644, - 401243568.2468382, 402687134.7805103, 404136047.2872507, - 404883170.001883, 405522253.219517, 406660365.3626476, - 407919346.0991902, 409045348.5384909, 409759588.7889818, - 411974821.8564483, 413489718.78201455, 415535392.56684107, - 418466481.97674364, 421104678.35678065, 423405392.5200779, - 425550570.40798235, 427929423.9579701, 429585274.253478, - 432368493.55181056, 435193587.13513297, 438886855.20476013, - 443058876.8633751, 448181232.5093362, 452883835.6332396, - 458056721.77926534, 461816531.22735566, 464363620.1970998, - 465886343.5057493, 466928872.0651, 467180536.42647296, - 468111848.70714295, 469138695.3071312, 470378429.6930793, - 471517958.7132626, 472109050.4262365, 473087417.0177867, - 473381322.04648733, 473220195.85483915, 472666071.8998819, - 472124669.87879956, 471298571.411737, 471251033.2902761, - 471672676.43128747, 472177147.2193172, 472572361.7711908, - 472968783.7751127, 473156295.4164052, 473398034.82676554, - 473897703.5203811, 474328271.33112127, 474452670.98002136, - 474549003.99284613, 474252887.13567275, 473557462.909069, - 473483385.85193115, 473609738.04855174, 473746944.82085115, - 474016729.91696435, 474617321.94138587, 475045097.237122, - 475125402.586558, 474664112.9824912, 474426247.5800283, - 474104075.42796475, 473978219.7273978, 473773171.7798875, - 473578534.69508696, 473102924.16904145, 472651240.5232615, - 472374383.1810912, 472209479.6956096, 472202298.8921673, - 472370090.76781124, 472220933.99374026, 471625467.37106377, - 470994646.51883453, 470182428.9637543, 469348211.5939578, - 468570387.4467277, 468540442.7225135, 468672018.90414184, - 468994346.9533251, 469138757.58201426, 469553915.95710236, - 470134523.38582784, 471082421.62055486, 471962316.51804745, - 472939745.1708408, 474250621.5944825, 475773933.43199486, - 477465399.71087736, 479218782.61382693, 481752299.7930922, - 486608947.8984568, 496119403.2067917, 512730085.5704984, - 539048915.2641417, 576285298.3548826, 621610270.2240586, - 669308196.4436442, 710656993.5957186, 736344437.3725077, - 745481288.0241544, 801121432.9925804}; -int count_ = 912592; - -void WriteMatrix() { - kaldi::Matrix cmvn_stats(2, mean_.size() + 1); - for (size_t idx = 0; idx < mean_.size(); ++idx) { - cmvn_stats(0, idx) = mean_[idx]; - cmvn_stats(1, idx) = variance_[idx]; - } - cmvn_stats(0, mean_.size()) = count_; - kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - kaldi::SequentialTableReader wav_reader( - FLAGS_wav_rspecifier); - kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); - kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer( - FLAGS_feature_check_wspecifier); - WriteMatrix(); - - // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning - // window -->linear_spectrogram --> cmvn - int32 num_done = 0, num_err = 0; - ppspeech::LinearSpectrogramOptions opt; - opt.frame_opts.frame_length_ms = 20; - opt.frame_opts.frame_shift_ms = 10; - ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt)); - ppspeech::LinearSpectrogram linear_spectrogram( - opt, std::move(base_feature_extractor)); - - ppspeech::CMVN cmvn(FLAGS_cmvn_write_path); - - float streaming_chunk = 0.36; - int sample_rate = 16000; - int chunk_sample_size = streaming_chunk * sample_rate; - - LOG(INFO) << mean_.size(); - for (size_t i = 0; i < mean_.size(); i++) { - mean_[i] /= count_; - variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i]; - if (variance_[i] < 1.0e-20) { - variance_[i] = 1.0e-20; - } - variance_[i] = 1.0 / std::sqrt(variance_[i]); - } - - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); - const kaldi::WaveData& wave_data = wav_reader.Value(); - - int32 this_channel = 0; - kaldi::SubVector waveform(wave_data.Data(), - this_channel); - int tot_samples = waveform.Dim(); - int sample_offset = 0; - std::vector> feats; - int feature_rows = 0; - while (sample_offset < tot_samples) { - int cur_chunk_size = - std::min(chunk_sample_size, tot_samples - sample_offset); - kaldi::Vector wav_chunk(cur_chunk_size); - for (int i = 0; i < cur_chunk_size; ++i) { - wav_chunk(i) = waveform(sample_offset + i); - } - kaldi::Matrix features; - linear_spectrogram.AcceptWaveform(wav_chunk); - linear_spectrogram.ReadFeats(&features); - - feats.push_back(features); - sample_offset += cur_chunk_size; - feature_rows += features.NumRows(); - } - - int cur_idx = 0; - kaldi::Matrix features(feature_rows, - feats[0].NumCols()); - for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { - features(cur_idx, col_idx) = - (feat(row_idx, col_idx) - mean_[col_idx]) * - variance_[col_idx]; - } - ++cur_idx; - } - } - feat_writer.Write(utt, features); - - cur_idx = 0; - kaldi::Matrix features_check(feature_rows, - feats[0].NumCols()); - for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { - features_check(cur_idx, col_idx) = feat(row_idx, col_idx); - } - kaldi::SubVector row_feat(features_check, cur_idx); - cmvn.ApplyCMVN(true, &row_feat); - ++cur_idx; - } - } - feat_cmvn_check_writer.Write(utt, features_check); - - if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; - num_done++; - } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); -} diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc new file mode 100644 index 00000000..07f2cbf7 --- /dev/null +++ b/speechx/speechx/frontend/feature_cache.cc @@ -0,0 +1,38 @@ +#include "frontend/feature_cache.h" + +void FeatureCache::AcceptWaveform(const kaldi::VectorBase& input) { + base_extractor_->AcceptWaveform(input); + // feed current data + while (base_extractor_->IsLastFrame()) { + Compute(); + } +} + +// pop feature chunk +void FeatureCache::Read(kaldi::VectorBase* feat) { + std::lock_guard lock(mutex_); + while (cache_.empty()) { + ready_read_condition_.wait(lock); + } + feat->CopyFromVec(cache_.front()); + cache_.pop(); + ready_feed_condition_.notify_one(); +} + +// read all data from base_feature_extractor_ into cache_ +void FeatureCache::Compute() { + // compute and feed + Vector feature_chunk(base_extractor_->Dim()); + base_extractor_->Read(&feature_chunk); + std::lock_guard lock(mutex_); + while (cache_.size() >= max_size_) { + ready_feed_condition_.wait(lock); + } + cache_.push(feature_chunk); + ready_read_condition_.notify_one(); +} + +// compute the last chunk data && set feed finished +void FeatureCache::InputFinishd() { + Compute(); +} diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h new file mode 100644 index 00000000..71dc455c --- /dev/null +++ b/speechx/speechx/frontend/feature_cache.h @@ -0,0 +1,21 @@ +#include "frontend/feature_extractor_interface.h" + +class FeatureCache { + public: + explicit FeatureCache(FeatureExtractorInterface base_extractor); + void AcceptWaveform(const kaldi::VectorBase& input); + void Read(kaldi::VectorBase* feat); + void Dim() { return base_extractor_->Dim(); } + void SetFinished(); + bool IsFinished(); + + private: + bool finished_; + mutable std::mutex mutex_; + size_t max_size; + std::queue> cache_; + std::shared_ptr base_extractor_; + std::condition_variable ready_feed_condition_; + std::condition_variable ready_read_condition_; + DISALLOW_COPY_AND_ASSGIN(FeatureCache); +}; \ No newline at end of file diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc new file mode 100644 index 00000000..f4584828 --- /dev/null +++ b/speechx/speechx/frontend/raw_audio.cc @@ -0,0 +1,60 @@ +#include "frontend/raw_audio.h" +#include "kaldi/base/timer.h" + +namespace ppspeech { + +RawAudioSource::RawAudioSource(int buffer_size = 65536) + : finished_(false), + data_length_(0), + start_(0), + timeout_(5) { + ring_buffer_.resize(buffer_size); +} + +// todo length > buffer size, condition_var +bool RawAudioSource::AcceptWaveform(const VectorBase& data) { + std::lock_guard lock(mutex_); + for (size_t idx = 0; idx < data.Dim(); ++idx) { + ring_buffer_[idx % ring_buffer_.size()] = data(idx); + } + data_length_ += length; +} + +// todo length > buffer size +//bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { + //std::lock_guard lock(mutex_); + //for (size_t idx = 0; idx < length; ++idx) { + //ring_buffer_[idx % ring_buffer_.size()] = data[idx]; + //} + //data_length_ += length; + //finish_condition_.notify_one(); +//} + +bool RawAudioSource::Read(Vector* feats) { + size_t chunk_size = feats->Dim(); + Timer timer; + if (chunk_size > data_length_) { + while (true) { + int32 elapsed = static_cat(timer.Elapsed() * 1000); + if (finished_ || > timeout_) { + chunk_size = data_length_; + feats->Resize(chunk_size); + break; + } + sleep(1); + } + } + std::lock_guard lock(mutex_); + for (size_t idx = 0; idx < chunk_size; ++idx) { + feats->Data()[idx] = ring_buffer_[idx]; + } + data_length_ -= chunk_size; + start_ = (start_ + chunk_size) % ring_buffer_.size(); + finish_condition_.notify_one(); +} + +//size_t RawAudioSource::GetDataLength() { +// return data_length_; +//} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h new file mode 100644 index 00000000..24a4b2e8 --- /dev/null +++ b/speechx/speechx/frontend/raw_audio.h @@ -0,0 +1,34 @@ + +#pragma once + +#include "frontend/feature_extractor_interface.h" +#include "base/common.h" + +#pragma once + +namespace ppspeech { + +class RawAudioSource { + public: + RawAudioSource(int buffer_size = kint16max); + virtual void AcceptWaveform(kaldi::BaseFloat* data, int length); + void AcceptWaveformByByte(char* data, lnt length) {} + void AcceptWaveformByShort(kaldi::int16* data, int length) {} + + // read chunk data in buffer + bool Read(VectorBase* feats); + void SetFinished() { finished_ = true; } + bool IsFinished() { return finished_; } + + private: + vector ring_buffer_; + size_t start_; + size_t data_length_; + bool finished_; + mutable std::mutex mutext_; + std::condition_variable ready_read_condition_; + std::condition_variable ready_feed_condition_; + kaldi::int32 timeout_; +}; + +} // namespace ppspeech \ No newline at end of file From 8ec6a8c0a8fc521ebe68fca4cde8cd3eb8b538d2 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Mon, 7 Mar 2022 20:57:40 +0800 Subject: [PATCH 2/7] add streaming_feat_main --- .../examples/feat/linear_spectrogram_main.cc | 257 ++++++++++++++++++ speechx/examples/feat/streaming_feat_main.cc | 56 ++++ 2 files changed, 313 insertions(+) create mode 100644 speechx/examples/feat/linear_spectrogram_main.cc create mode 100644 speechx/examples/feat/streaming_feat_main.cc diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc new file mode 100644 index 00000000..3e2342c2 --- /dev/null +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// todo refactor, repalce with gtest + +#include "base/flags.h" +#include "base/log.h" +#include "frontend/feature_extractor_interface.h" +#include "frontend/linear_spectrogram.h" +#include "frontend/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" + +DEFINE_string(wav_rspecifier, "", "test wav path"); +DEFINE_string(feature_wspecifier, "", "test wav ark"); +DEFINE_string(feature_check_wspecifier, "", "test wav ark"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); + + +std::vector mean_{ + -13730251.531853663, -12982852.199316509, -13673844.299583456, + -13089406.559646806, -12673095.524938712, -12823859.223276224, + -13590267.158903603, -14257618.467152044, -14374605.116185192, + -14490009.21822485, -14849827.158924166, -15354435.470563512, + -15834149.206532761, -16172971.985514281, -16348740.496746974, + -16423536.699409386, -16556246.263649225, -16744088.772748645, + -16916184.08510357, -17054034.840031497, -17165612.509455364, + -17255955.470915023, -17322572.527648456, -17408943.862033736, + -17521554.799865916, -17620623.254924215, -17699792.395918526, + -17723364.411134344, -17741483.4433254, -17747426.888704527, + -17733315.928209435, -17748780.160905756, -17808336.883775543, + -17895918.671983004, -18009812.59173023, -18098188.66548325, + -18195798.958462656, -18293617.62980999, -18397432.92077201, + -18505834.787318766, -18585451.8100908, -18652438.235649142, + -18700960.306275308, -18734944.58792185, -18737426.313365128, + -18735347.165987637, -18738813.444170244, -18737086.848890636, + -18731576.2474336, -18717405.44095871, -18703089.25545657, + -18691014.546456724, -18692460.568905357, -18702119.628629155, + -18727710.621126678, -18761582.72034647, -18806745.835547544, + -18850674.8692112, -18884431.510951452, -18919999.992506847, + -18939303.799078144, -18952946.273760635, -18980289.22996379, + -19011610.17803294, -19040948.61805145, -19061021.429847397, + -19112055.53768819, -19149667.414264943, -19201127.05091321, + -19270250.82564605, -19334606.883057203, -19390513.336589377, + -19444176.259208687, -19502755.000038862, -19544333.014549147, + -19612668.183176614, -19681902.19006569, -19771969.951249883, + -19873329.723376893, -19996752.59235844, -20110031.131400537, + -20231658.612529557, -20319378.894054495, -20378534.45718066, + -20413332.089584175, -20438147.844177883, -20443710.248040095, + -20465457.02238927, -20488610.969337028, -20516295.16424432, + -20541423.795738827, -20553192.874953747, -20573605.50701977, + -20577871.61936797, -20571807.008916274, -20556242.38912231, + -20542199.30819195, -20521239.063551214, -20519150.80004532, + -20527204.80248933, -20536933.769257784, -20543470.522332076, + -20549700.089992985, -20551525.24958494, -20554873.406493705, + -20564277.65794227, -20572211.740052115, -20574305.69550465, + -20575494.450104576, -20567092.577932164, -20549302.929608088, + -20545445.11878376, -20546625.326603737, -20549190.03499401, + -20554824.947828256, -20568341.378989458, -20577582.331383612, + -20577980.519402675, -20566603.03458152, -20560131.592262644, + -20552166.469060015, -20549063.06763577, -20544490.562339947, + -20539817.82346569, -20528747.715731595, -20518026.24576161, + -20510977.844974525, -20506874.36087992, -20506731.11977665, + -20510482.133420516, -20507760.92101862, -20494644.834457114, + -20480107.89304893, -20461312.091867123, -20442941.75080173, + -20426123.02834838, -20424607.675283, -20426810.369107097, + -20434024.50097819, -20437404.75544205, -20447688.63916367, + -20460893.335563846, -20482922.735127095, -20503610.119434915, + -20527062.76448319, -20557830.035128627, -20593274.72068722, + -20632528.452965066, -20673637.471334763, -20733106.97143075, + -20842921.0447562, -21054357.83621519, -21416569.534189366, + -21978460.272811692, -22753170.052172784, -23671344.10563395, + -24613499.293358143, -25406477.12230188, -25884377.82156489, + -26049040.62791664, -26996879.104431007}; +std::vector variance_{ + 213747175.10846674, 188395815.34302503, 212706429.10966414, + 199109025.81461075, 189235901.23864496, 194901336.53253657, + 217481594.29306737, 238689869.12327808, 243977501.24115244, + 248479623.6431067, 259766741.47116545, 275516766.7790273, + 291271202.3691234, 302693239.8220509, 308627358.3997694, + 311143911.38788426, 315446105.07731867, 321705430.9341829, + 327458907.4659941, 332245072.43223983, 336251717.5935284, + 339694069.7639722, 342188204.4322228, 345587110.31313115, + 349903086.2875232, 353660214.20643026, 356700344.5270885, + 357665362.3529641, 358493352.05658793, 358857951.620328, + 358375239.52774596, 358899733.6342954, 361051818.3511561, + 364361716.05025816, 368750322.3771452, 372047800.6462831, + 375655861.1349018, 379358519.1980013, 383327605.3935181, + 387458599.282341, 390434692.3406868, 392994486.35057056, + 394874418.04603153, 396230525.79763395, 396365592.0414835, + 396334819.8242737, 396488353.19250053, 396438877.00744957, + 396197980.4459586, 395590921.6672991, 395001107.62072515, + 394528291.7318225, 394593110.424006, 395018405.59353715, + 396110577.5415993, 397506704.0371068, 399400197.4657644, + 401243568.2468382, 402687134.7805103, 404136047.2872507, + 404883170.001883, 405522253.219517, 406660365.3626476, + 407919346.0991902, 409045348.5384909, 409759588.7889818, + 411974821.8564483, 413489718.78201455, 415535392.56684107, + 418466481.97674364, 421104678.35678065, 423405392.5200779, + 425550570.40798235, 427929423.9579701, 429585274.253478, + 432368493.55181056, 435193587.13513297, 438886855.20476013, + 443058876.8633751, 448181232.5093362, 452883835.6332396, + 458056721.77926534, 461816531.22735566, 464363620.1970998, + 465886343.5057493, 466928872.0651, 467180536.42647296, + 468111848.70714295, 469138695.3071312, 470378429.6930793, + 471517958.7132626, 472109050.4262365, 473087417.0177867, + 473381322.04648733, 473220195.85483915, 472666071.8998819, + 472124669.87879956, 471298571.411737, 471251033.2902761, + 471672676.43128747, 472177147.2193172, 472572361.7711908, + 472968783.7751127, 473156295.4164052, 473398034.82676554, + 473897703.5203811, 474328271.33112127, 474452670.98002136, + 474549003.99284613, 474252887.13567275, 473557462.909069, + 473483385.85193115, 473609738.04855174, 473746944.82085115, + 474016729.91696435, 474617321.94138587, 475045097.237122, + 475125402.586558, 474664112.9824912, 474426247.5800283, + 474104075.42796475, 473978219.7273978, 473773171.7798875, + 473578534.69508696, 473102924.16904145, 472651240.5232615, + 472374383.1810912, 472209479.6956096, 472202298.8921673, + 472370090.76781124, 472220933.99374026, 471625467.37106377, + 470994646.51883453, 470182428.9637543, 469348211.5939578, + 468570387.4467277, 468540442.7225135, 468672018.90414184, + 468994346.9533251, 469138757.58201426, 469553915.95710236, + 470134523.38582784, 471082421.62055486, 471962316.51804745, + 472939745.1708408, 474250621.5944825, 475773933.43199486, + 477465399.71087736, 479218782.61382693, 481752299.7930922, + 486608947.8984568, 496119403.2067917, 512730085.5704984, + 539048915.2641417, 576285298.3548826, 621610270.2240586, + 669308196.4436442, 710656993.5957186, 736344437.3725077, + 745481288.0241544, 801121432.9925804}; +int count_ = 912592; + +void WriteMatrix() { + kaldi::Matrix cmvn_stats(2, mean_.size() + 1); + for (size_t idx = 0; idx < mean_.size(); ++idx) { + cmvn_stats(0, idx) = mean_[idx]; + cmvn_stats(1, idx) = variance_[idx]; + } + cmvn_stats(0, mean_.size()) = count_; + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); +} + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); + kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer( + FLAGS_feature_check_wspecifier); + WriteMatrix(); + + // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning + // window -->linear_spectrogram --> cmvn + int32 num_done = 0, num_err = 0; + ppspeech::LinearSpectrogramOptions opt; + opt.frame_opts.frame_length_ms = 20; + opt.frame_opts.frame_shift_ms = 10; + ppspeech::DecibelNormalizerOptions db_norm_opt; + std::unique_ptr base_feature_extractor( + new ppspeech::DecibelNormalizer(db_norm_opt)); + ppspeech::LinearSpectrogram linear_spectrogram( + opt, std::move(base_feature_extractor)); + + ppspeech::CMVN cmvn(FLAGS_cmvn_write_path); + + float streaming_chunk = 0.36; + int sample_rate = 16000; + int chunk_sample_size = streaming_chunk * sample_rate; + + LOG(INFO) << mean_.size(); + for (size_t i = 0; i < mean_.size(); i++) { + mean_[i] /= count_; + variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i]; + if (variance_[i] < 1.0e-20) { + variance_[i] = 1.0e-20; + } + variance_[i] = 1.0 / std::sqrt(variance_[i]); + } + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + int sample_offset = 0; + std::vector> feats; + int feature_rows = 0; + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + kaldi::Matrix features; + linear_spectrogram.AcceptWaveform(wav_chunk); + linear_spectrogram.ReadFeats(&features); + + feats.push_back(features); + sample_offset += cur_chunk_size; + feature_rows += features.NumRows(); + } + + int cur_idx = 0; + kaldi::Matrix features(feature_rows, + feats[0].NumCols()); + for (auto feat : feats) { + for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { + for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { + features(cur_idx, col_idx) = + (feat(row_idx, col_idx) - mean_[col_idx]) * + variance_[col_idx]; + } + ++cur_idx; + } + } + feat_writer.Write(utt, features); + + cur_idx = 0; + kaldi::Matrix features_check(feature_rows, + feats[0].NumCols()); + for (auto feat : feats) { + for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { + for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { + features_check(cur_idx, col_idx) = feat(row_idx, col_idx); + } + kaldi::SubVector row_feat(features_check, cur_idx); + cmvn.ApplyCMVN(true, &row_feat); + ++cur_idx; + } + } + feat_cmvn_check_writer.Write(utt, features_check); + + if (num_done % 50 == 0 && num_done != 0) + KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; + } + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc new file mode 100644 index 00000000..29133045 --- /dev/null +++ b/speechx/examples/feat/streaming_feat_main.cc @@ -0,0 +1,56 @@ +// todo refactor, repalce with gtest + +#include "frontend/linear_spectrogram.h" +#include "frontend/normalizer.h" +#include "frontend/feature_extractor_interface.h" +#include "kaldi/util/table-types.h" +#include "base/log.h" +#include "base/flags.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" + +DEFINE_string(wav_rspecifier, "", "test wav path"); +DEFINE_string(feature_wspecifier, "", "test wav ark"); +DEFINE_string(cmvn_path, "./cmvn.ark", "test wav ark"); + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialTableReader wav_reader(FLAGS_wav_rspecifier); + kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); + + // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn + // --> feature_cache + int32 num_done = 0, num_err = 0; + ppspeech::LinearSpectrogramOptions opt; + opt.frame_opts.frame_length_ms = 20; + opt.frame_opts.frame_shift_ms = 10; + ppspeech::DecibelNormalizerOptions db_norm_opt; + std::unique_ptr base_feature_extractor( + new ppspeech::DecibelNormalizer(db_norm_opt)); + + std::shared_ptr linear_spectrogram( + new ppspeech::LinearSpectrogram(opt, base_feature_extractor)); + + std::shared_ptr cmvn( + new ppspeech::CMVN(FLAGS_cmvn_path, linear_spectrogram); + ppspeech::FeatureCache(cmvn); + + float streaming_chunk = 0.36; + int sample_rate = 16000; + int chunk_sample_size = streaming_chunk * sample_rate; + // thread 1 feed feature + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData &wave_data = wav_reader.Value(); + + if (num_done % 50 == 0 && num_done != 0) + KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; + } + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} From ac0e417032601e78621b845c1431b173362428d0 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Tue, 8 Mar 2022 20:40:20 +0800 Subject: [PATCH 3/7] make streaming pipeline work --- speechx/examples/feat/streaming_feat_main.cc | 75 ++++- speechx/speechx/base/common.h | 11 +- speechx/speechx/frontend/CMakeLists.txt | 3 +- .../frontend/feature_extractor_interface.h | 5 +- .../speechx/frontend/linear_spectrogram.cc | 229 +++++++------- speechx/speechx/frontend/linear_spectrogram.h | 43 ++- speechx/speechx/frontend/normalizer.cc | 289 +++++++++--------- speechx/speechx/frontend/normalizer.h | 70 ++--- 8 files changed, 366 insertions(+), 359 deletions(-) diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc index 29133045..b3ee9842 100644 --- a/speechx/examples/feat/streaming_feat_main.cc +++ b/speechx/examples/feat/streaming_feat_main.cc @@ -1,17 +1,34 @@ // todo refactor, repalce with gtest +#include "base/log.h" +#include "base/flags.h" #include "frontend/linear_spectrogram.h" #include "frontend/normalizer.h" #include "frontend/feature_extractor_interface.h" +#include "frontend/raw_audio.h" #include "kaldi/util/table-types.h" -#include "base/log.h" -#include "base/flags.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" DEFINE_string(wav_rspecifier, "", "test wav path"); DEFINE_string(feature_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_path, "./cmvn.ark", "test wav ark"); +DEFINE_string(feature_check_wspecifier, "", "test wav ark"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); + + +std::vector mean_{-13730251.531853663, -12982852.199316509, -13673844.299583456, -13089406.559646806, -12673095.524938712, -12823859.223276224, -13590267.158903603, -14257618.467152044, -14374605.116185192, -14490009.21822485, -14849827.158924166, -15354435.470563512, -15834149.206532761, -16172971.985514281, -16348740.496746974, -16423536.699409386, -16556246.263649225, -16744088.772748645, -16916184.08510357, -17054034.840031497, -17165612.509455364, -17255955.470915023, -17322572.527648456, -17408943.862033736, -17521554.799865916, -17620623.254924215, -17699792.395918526, -17723364.411134344, -17741483.4433254, -17747426.888704527, -17733315.928209435, -17748780.160905756, -17808336.883775543, -17895918.671983004, -18009812.59173023, -18098188.66548325, -18195798.958462656, -18293617.62980999, -18397432.92077201, -18505834.787318766, -18585451.8100908, -18652438.235649142, -18700960.306275308, -18734944.58792185, -18737426.313365128, -18735347.165987637, -18738813.444170244, -18737086.848890636, -18731576.2474336, -18717405.44095871, -18703089.25545657, -18691014.546456724, -18692460.568905357, -18702119.628629155, -18727710.621126678, -18761582.72034647, -18806745.835547544, -18850674.8692112, -18884431.510951452, -18919999.992506847, -18939303.799078144, -18952946.273760635, -18980289.22996379, -19011610.17803294, -19040948.61805145, -19061021.429847397, -19112055.53768819, -19149667.414264943, -19201127.05091321, -19270250.82564605, -19334606.883057203, -19390513.336589377, -19444176.259208687, -19502755.000038862, -19544333.014549147, -19612668.183176614, -19681902.19006569, -19771969.951249883, -19873329.723376893, -19996752.59235844, -20110031.131400537, -20231658.612529557, -20319378.894054495, -20378534.45718066, -20413332.089584175, -20438147.844177883, -20443710.248040095, -20465457.02238927, -20488610.969337028, -20516295.16424432, -20541423.795738827, -20553192.874953747, -20573605.50701977, -20577871.61936797, -20571807.008916274, -20556242.38912231, -20542199.30819195, -20521239.063551214, -20519150.80004532, -20527204.80248933, -20536933.769257784, -20543470.522332076, -20549700.089992985, -20551525.24958494, -20554873.406493705, -20564277.65794227, -20572211.740052115, -20574305.69550465, -20575494.450104576, -20567092.577932164, -20549302.929608088, -20545445.11878376, -20546625.326603737, -20549190.03499401, -20554824.947828256, -20568341.378989458, -20577582.331383612, -20577980.519402675, -20566603.03458152, -20560131.592262644, -20552166.469060015, -20549063.06763577, -20544490.562339947, -20539817.82346569, -20528747.715731595, -20518026.24576161, -20510977.844974525, -20506874.36087992, -20506731.11977665, -20510482.133420516, -20507760.92101862, -20494644.834457114, -20480107.89304893, -20461312.091867123, -20442941.75080173, -20426123.02834838, -20424607.675283, -20426810.369107097, -20434024.50097819, -20437404.75544205, -20447688.63916367, -20460893.335563846, -20482922.735127095, -20503610.119434915, -20527062.76448319, -20557830.035128627, -20593274.72068722, -20632528.452965066, -20673637.471334763, -20733106.97143075, -20842921.0447562, -21054357.83621519, -21416569.534189366, -21978460.272811692, -22753170.052172784, -23671344.10563395, -24613499.293358143, -25406477.12230188, -25884377.82156489, -26049040.62791664, -26996879.104431007}; +std::vector variance_{213747175.10846674, 188395815.34302503, 212706429.10966414, 199109025.81461075, 189235901.23864496, 194901336.53253657, 217481594.29306737, 238689869.12327808, 243977501.24115244, 248479623.6431067, 259766741.47116545, 275516766.7790273, 291271202.3691234, 302693239.8220509, 308627358.3997694, 311143911.38788426, 315446105.07731867, 321705430.9341829, 327458907.4659941, 332245072.43223983, 336251717.5935284, 339694069.7639722, 342188204.4322228, 345587110.31313115, 349903086.2875232, 353660214.20643026, 356700344.5270885, 357665362.3529641, 358493352.05658793, 358857951.620328, 358375239.52774596, 358899733.6342954, 361051818.3511561, 364361716.05025816, 368750322.3771452, 372047800.6462831, 375655861.1349018, 379358519.1980013, 383327605.3935181, 387458599.282341, 390434692.3406868, 392994486.35057056, 394874418.04603153, 396230525.79763395, 396365592.0414835, 396334819.8242737, 396488353.19250053, 396438877.00744957, 396197980.4459586, 395590921.6672991, 395001107.62072515, 394528291.7318225, 394593110.424006, 395018405.59353715, 396110577.5415993, 397506704.0371068, 399400197.4657644, 401243568.2468382, 402687134.7805103, 404136047.2872507, 404883170.001883, 405522253.219517, 406660365.3626476, 407919346.0991902, 409045348.5384909, 409759588.7889818, 411974821.8564483, 413489718.78201455, 415535392.56684107, 418466481.97674364, 421104678.35678065, 423405392.5200779, 425550570.40798235, 427929423.9579701, 429585274.253478, 432368493.55181056, 435193587.13513297, 438886855.20476013, 443058876.8633751, 448181232.5093362, 452883835.6332396, 458056721.77926534, 461816531.22735566, 464363620.1970998, 465886343.5057493, 466928872.0651, 467180536.42647296, 468111848.70714295, 469138695.3071312, 470378429.6930793, 471517958.7132626, 472109050.4262365, 473087417.0177867, 473381322.04648733, 473220195.85483915, 472666071.8998819, 472124669.87879956, 471298571.411737, 471251033.2902761, 471672676.43128747, 472177147.2193172, 472572361.7711908, 472968783.7751127, 473156295.4164052, 473398034.82676554, 473897703.5203811, 474328271.33112127, 474452670.98002136, 474549003.99284613, 474252887.13567275, 473557462.909069, 473483385.85193115, 473609738.04855174, 473746944.82085115, 474016729.91696435, 474617321.94138587, 475045097.237122, 475125402.586558, 474664112.9824912, 474426247.5800283, 474104075.42796475, 473978219.7273978, 473773171.7798875, 473578534.69508696, 473102924.16904145, 472651240.5232615, 472374383.1810912, 472209479.6956096, 472202298.8921673, 472370090.76781124, 472220933.99374026, 471625467.37106377, 470994646.51883453, 470182428.9637543, 469348211.5939578, 468570387.4467277, 468540442.7225135, 468672018.90414184, 468994346.9533251, 469138757.58201426, 469553915.95710236, 470134523.38582784, 471082421.62055486, 471962316.51804745, 472939745.1708408, 474250621.5944825, 475773933.43199486, 477465399.71087736, 479218782.61382693, 481752299.7930922, 486608947.8984568, 496119403.2067917, 512730085.5704984, 539048915.2641417, 576285298.3548826, 621610270.2240586, 669308196.4436442, 710656993.5957186, 736344437.3725077, 745481288.0241544, 801121432.9925804}; +int count_ = 912592; + +void WriteMatrix() { + kaldi::Matrix cmvn_stats(2, mean_.size()+ 1); + for (size_t idx = 0; idx < mean_.size(); ++idx) { + cmvn_stats(0, idx) = mean_[idx]; + cmvn_stats(1, idx) = variance_[idx]; + } + cmvn_stats(0, mean_.size()) = count_; + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); +} int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -19,33 +36,69 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader(FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); + WriteMatrix(); // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn - // --> feature_cache int32 num_done = 0, num_err = 0; + std::unique_ptr data_source(new ppspeech::RawDataSource()); + ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; ppspeech::DecibelNormalizerOptions db_norm_opt; std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt)); + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); - std::shared_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opt, base_feature_extractor)); + std::unique_ptr linear_spectrogram( + new ppspeech::LinearSpectrogram(opt, std::move(base_feature_extractor))); - std::shared_ptr cmvn( - new ppspeech::CMVN(FLAGS_cmvn_path, linear_spectrogram); - ppspeech::FeatureCache(cmvn); + ppspeech::CMVN cmvn(FLAGS_cmvn_write_path, std::move(linear_spectrogram)); float streaming_chunk = 0.36; int sample_rate = 16000; int chunk_sample_size = streaming_chunk * sample_rate; - // thread 1 feed feature for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData &wave_data = wav_reader.Value(); + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), this_channel); + int tot_samples = waveform.Dim(); + int sample_offset = 0; + std::vector> feats; + int feature_rows = 0; + while (sample_offset < tot_samples) { + int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + kaldi::Vector features; + cmvn.AcceptWaveform(wav_chunk); + cmvn.Read(&features); + + std::cout << wav_chunk(0) << std::endl; + std::cout << features(0) << std::endl; + + feats.push_back(features); + sample_offset += cur_chunk_size; + feature_rows += features.Dim() / cmvn.Dim(); + } + + int cur_idx = 0; + kaldi::Matrix features(feature_rows, cmvn.Dim()); + for (auto feat : feats) { + int num_rows = feat.Dim() / cmvn.Dim(); + for (int row_idx = 0; row_idx < num_rows; ++row_idx) { + for (int col_idx = 0; col_idx < cmvn.Dim(); ++col_idx) { + features(cur_idx, col_idx) = feat(row_idx*cmvn.Dim() + col_idx); + } + ++cur_idx; + } + } + feat_writer.Write(utt, features); + if (num_done % 50 == 0 && num_done != 0) KALDI_VLOG(2) << "Processed " << num_done << " utterances"; num_done++; diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 3b58f73c..ac01a977 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -15,22 +15,23 @@ #pragma once #include -#include #include #include +#include #include #include -#include #include #include #include #include #include +#include #include #include -#include +#include +#include -#include "base/basic_types.h" -#include "base/flags.h" #include "base/log.h" +#include "base/flags.h" +#include "base/basic_types.h" #include "base/macros.h" diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index da81a481..e43bd182 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -2,7 +2,8 @@ project(frontend) add_library(frontend STATIC normalizer.cc - linear_spectrogram.cc + linear_spectrogram.cc + raw_audio.cc ) target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index e39f5e46..fc06f24a 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,9 +21,8 @@ namespace ppspeech { class FeatureExtractorInterface { public: - virtual void AcceptWaveform( - const kaldi::VectorBase& input) = 0; - virtual void Read(kaldi::VectorBase* feat) = 0; + virtual void AcceptWaveform(const kaldi::VectorBase& input) = 0; + virtual void Read(kaldi::Vector* feat) = 0; virtual size_t Dim() const = 0; }; diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 6c008c39..ed4c2977 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -25,153 +25,146 @@ using kaldi::VectorBase; using kaldi::Matrix; using std::vector; -// todo remove later +//todo remove later void CopyVector2StdVector_(const VectorBase& input, - vector* output) { - if (input.Dim() == 0) return; - output->resize(input.Dim()); - for (size_t idx = 0; idx < input.Dim(); ++idx) { - (*output)[idx] = input(idx); - } + vector* output) { + if (input.Dim() == 0) return; + output->resize(input.Dim()); + for (size_t idx = 0; idx < input.Dim(); ++idx) { + (*output)[idx] = input(idx); + } } void CopyStdVector2Vector_(const vector& input, - Vector* output) { - if (input.empty()) return; - output->Resize(input.size()); - for (size_t idx = 0; idx < input.size(); ++idx) { - (*output)(idx) = input[idx]; - } + Vector* output) { + if (input.empty()) return; + output->Resize(input.size()); + for (size_t idx = 0; idx < input.size(); ++idx) { + (*output)(idx) = input[idx]; + } } LinearSpectrogram::LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor) { - opts_ = opts; - base_extractor_ = std::move(base_extractor); - int32 window_size = opts.frame_opts.WindowSize(); - int32 window_shift = opts.frame_opts.WindowShift(); - fft_points_ = window_size; - hanning_window_.resize(window_size); - - double a = M_2PI / (window_size - 1); - hanning_window_energy_ = 0; - for (int i = 0; i < window_size; ++i) { - hanning_window_[i] = 0.5 - 0.5 * cos(a * i); - hanning_window_energy_ += hanning_window_[i] * hanning_window_[i]; - } - - dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz + opts_ = opts; + base_extractor_ = std::move(base_extractor); + int32 window_size = opts.frame_opts.WindowSize(); + int32 window_shift = opts.frame_opts.WindowShift(); + fft_points_ = window_size; + chunk_sample_size_ = + static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq); + hanning_window_.resize(window_size); + + double a = M_2PI / (window_size - 1); + hanning_window_energy_ = 0; + for (int i = 0; i < window_size; ++i) { + hanning_window_[i] = 0.5 - 0.5 * cos(a * i); + hanning_window_energy_ += hanning_window_[i] * hanning_window_[i]; + } + + dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz } void LinearSpectrogram::AcceptWaveform(const VectorBase& input) { base_extractor_->AcceptWaveform(input); } +void LinearSpectrogram::Read(Vector* feat) { + Vector input_feats(chunk_sample_size_); + base_extractor_->Read(&input_feats); + vector input_feats_vec(input_feats.Dim()); + CopyVector2StdVector_(input_feats, &input_feats_vec); + //for (int idx = 0; idx < input_feats.Dim(); ++idx) { + // input_feats_vec[idx] = input_feats(idx); + //} + vector> result; + Compute(input_feats_vec, result); + int32 feat_size = 0; + if (result.size() != 0) { + feat_size = result.size() * result[0].size(); + } + feat->Resize(feat_size); + for (size_t idx = 0; idx < feat_size; ++idx) { + (*feat)(idx) = result[idx / dim_][idx % dim_]; + } + return; +} + void LinearSpectrogram::Hanning(vector* data) const { - CHECK_GE(data->size(), hanning_window_.size()); + CHECK_GE(data->size(), hanning_window_.size()); - for (size_t i = 0; i < hanning_window_.size(); ++i) { - data->at(i) *= hanning_window_[i]; - } + for (size_t i = 0; i < hanning_window_.size(); ++i) { + data->at(i) *= hanning_window_[i]; + } } bool LinearSpectrogram::NumpyFft(vector* v, vector* real, vector* img) const { - Vector v_tmp; - CopyStdVector2Vector_(*v, &v_tmp); - RealFft(&v_tmp, true); - CopyVector2StdVector_(v_tmp, v); - real->push_back(v->at(0)); - img->push_back(0); - for (int i = 1; i < v->size() / 2; i++) { - real->push_back(v->at(2 * i)); - img->push_back(v->at(2 * i + 1)); - } - real->push_back(v->at(1)); - img->push_back(0); - - return true; -} - -// todo remove later -void LinearSpectrogram::ReadFeats(Matrix* feats) { - Vector tmp; - waveform_.Resize(base_extractor_->Dim()); - Compute(tmp, &waveform_); - vector> result; - vector feats_vec; - CopyVector2StdVector_(waveform_, &feats_vec); - Compute(feats_vec, result); - feats->Resize(result.size(), result[0].size()); - for (int row_idx = 0; row_idx < result.size(); ++row_idx) { - for (int col_idx = 0; col_idx < result[0].size(); ++col_idx) { - (*feats)(row_idx, col_idx) = result[row_idx][col_idx]; - } - } - waveform_.Resize(0); -} - -void LinearSpectrogram::Read(VectorBase* feat) { - // todo - return; -} - -// only for test, remove later -// todo: compute the feature frame by frame. -void LinearSpectrogram::Compute(const VectorBase& input, - VectorBase* feature) { - base_extractor_->Read(feature); + Vector v_tmp; + CopyStdVector2Vector_(*v, &v_tmp); + RealFft(&v_tmp, true); + CopyVector2StdVector_(v_tmp, v); + real->push_back(v->at(0)); + img->push_back(0); + for (int i = 1; i < v->size() / 2; i++) { + real->push_back(v->at(2 * i)); + img->push_back(v->at(2 * i + 1)); + } + real->push_back(v->at(1)); + img->push_back(0); + + return true; } // Compute spectrogram feat, only for test, remove later // todo: refactor later (SmileGoat) bool LinearSpectrogram::Compute(const vector& wave, vector>& feat) { - int num_samples = wave.size(); - const int& frame_length = opts_.frame_opts.WindowSize(); - const int& sample_rate = opts_.frame_opts.samp_freq; - const int& frame_shift = opts_.frame_opts.WindowShift(); - const int& fft_points = fft_points_; - const float scale = hanning_window_energy_ * sample_rate; - - if (num_samples < frame_length) { - return true; - } - - int num_frames = 1 + ((num_samples - frame_length) / frame_shift); - feat.resize(num_frames); - vector fft_real((fft_points_ / 2 + 1), 0); - vector fft_img((fft_points_ / 2 + 1), 0); - vector v(frame_length, 0); - vector power((fft_points / 2 + 1)); - - for (int i = 0; i < num_frames; ++i) { - vector data(wave.data() + i * frame_shift, - wave.data() + i * frame_shift + frame_length); - Hanning(&data); - fft_img.clear(); - fft_real.clear(); - v.assign(data.begin(), data.end()); - NumpyFft(&v, &fft_real, &fft_img); - - feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz - for (int j = 0; j < (fft_points / 2 + 1); ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - feat[i][j] = power[j]; - - if (j == 0 || j == feat[0].size() - 1) { - feat[i][j] /= scale; - } else { - feat[i][j] *= (2.0 / scale); - } - - // log added eps=1e-14 - feat[i][j] = std::log(feat[i][j] + 1e-14); - } + int num_samples = wave.size(); + const int& frame_length = opts_.frame_opts.WindowSize(); + const int& sample_rate = opts_.frame_opts.samp_freq; + const int& frame_shift = opts_.frame_opts.WindowShift(); + const int& fft_points = fft_points_; + const float scale = hanning_window_energy_ * sample_rate; + + if (num_samples < frame_length) { + return true; + } + + int num_frames = 1 + ((num_samples - frame_length) / frame_shift); + feat.resize(num_frames); + vector fft_real((fft_points_ / 2 + 1), 0); + vector fft_img((fft_points_ / 2 + 1), 0); + vector v(frame_length, 0); + vector power((fft_points / 2 + 1)); + + for (int i = 0; i < num_frames; ++i) { + vector data(wave.data() + i * frame_shift, + wave.data() + i * frame_shift + frame_length); + Hanning(&data); + fft_img.clear(); + fft_real.clear(); + v.assign(data.begin(), data.end()); + NumpyFft(&v, &fft_real, &fft_img); + + feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz + for (int j = 0; j < (fft_points / 2 + 1); ++j) { + power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; + feat[i][j] = power[j]; + + if (j == 0 || j == feat[0].size() - 1) { + feat[i][j] /= scale; + } else { + feat[i][j] *= (2.0 / scale); + } + + // log added eps=1e-14 + feat[i][j] = std::log(feat[i][j] + 1e-14); } - return true; + } + return true; } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index 20b5e4b5..e4dc3e33 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -1,45 +1,35 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #pragma once -#include "base/common.h" #include "frontend/feature_extractor_interface.h" #include "kaldi/feat/feature-window.h" +#include "base/common.h" namespace ppspeech { struct LinearSpectrogramOptions { kaldi::FrameExtractionOptions frame_opts; - LinearSpectrogramOptions() : frame_opts() {} - - void Register(kaldi::OptionsItf* opts) { frame_opts.Register(opts); } + kaldi::BaseFloat streaming_chunk; + LinearSpectrogramOptions(): + streaming_chunk(0.36), + frame_opts() {} + + void Register(kaldi::OptionsItf* opts) { + opts->Register("streaming-chunk", &streaming_chunk, "streaming chunk size"); + frame_opts.Register(opts); + } }; class LinearSpectrogram : public FeatureExtractorInterface { public: - explicit LinearSpectrogram( - const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); + explicit LinearSpectrogram(const LinearSpectrogramOptions& opts, + std::unique_ptr base_extractor); + virtual void AcceptWaveform(const kaldi::VectorBase& input); + virtual void Read(kaldi::Vector* feat); virtual size_t Dim() const { return dim_; } void ReadFeats(kaldi::Matrix* feats); - private: + private: void Hanning(std::vector* data) const; bool Compute(const std::vector& wave, std::vector>& feat); @@ -54,8 +44,9 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - kaldi::Vector waveform_; // remove later, todo(SmileGoat) + kaldi::Vector waveform_; // remove later, todo(SmileGoat) std::unique_ptr base_extractor_; + int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); }; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index abf798e5..69c9ab59 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -1,17 +1,3 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include "frontend/normalizer.h" #include "kaldi/feat/cmvn.h" @@ -24,175 +10,176 @@ using kaldi::VectorBase; using kaldi::BaseFloat; using std::vector; using kaldi::SubVector; +using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts) { - opts_ = opts; - dim_ = 0; +DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor) { + base_extractor_ = std::move(base_extractor); + opts_ = opts; + dim_ = 0; } - -void DecibelNormalizer::AcceptWaveform( - const kaldi::VectorBase& input) { - dim_ = input.Dim(); - waveform_.Resize(input.Dim()); - waveform_.CopyFromVec(input); + +void DecibelNormalizer::AcceptWaveform(const kaldi::VectorBase& input) { + //dim_ = input.Dim(); + //waveform_.Resize(input.Dim()); + //waveform_.CopyFromVec(input); + base_extractor_->AcceptWaveform(input); } -void DecibelNormalizer::Read(kaldi::VectorBase* feat) { - if (waveform_.Dim() == 0) return; - Compute(waveform_, feat); +void DecibelNormalizer::Read(kaldi::Vector* feat) { + // if (waveform_.Dim() == 0) return; + base_extractor_->Read(feat); + Compute(feat); } -// todo remove later +//todo remove later void CopyVector2StdVector(const kaldi::VectorBase& input, vector* output) { - if (input.Dim() == 0) return; - output->resize(input.Dim()); - for (size_t idx = 0; idx < input.Dim(); ++idx) { - (*output)[idx] = input(idx); - } + if (input.Dim() == 0) return; + output->resize(input.Dim()); + for (size_t idx = 0; idx < input.Dim(); ++idx) { + (*output)[idx] = input(idx); + } } void CopyStdVector2Vector(const vector& input, VectorBase* output) { - if (input.empty()) return; - assert(input.size() == output->Dim()); - for (size_t idx = 0; idx < input.size(); ++idx) { - (*output)(idx) = input[idx]; - } + if (input.empty()) return; + assert(input.size() == output->Dim()); + for (size_t idx = 0; idx < input.size(); ++idx) { + (*output)(idx) = input[idx]; + } } -bool DecibelNormalizer::Compute(const VectorBase& input, - VectorBase* feat) const { - // calculate db rms - BaseFloat rms_db = 0.0; - BaseFloat mean_square = 0.0; - BaseFloat gain = 0.0; - BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); - - vector samples; - samples.resize(input.Dim()); - for (int32 i = 0; i < samples.size(); ++i) { - samples[i] = input(i); - } - - // square - for (auto& d : samples) { - if (opts_.convert_int_float) { - d = d * wave_float_normlization; - } - mean_square += d * d; +bool DecibelNormalizer::Compute(VectorBase* feat) const { + // calculate db rms + BaseFloat rms_db = 0.0; + BaseFloat mean_square = 0.0; + BaseFloat gain = 0.0; + BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); + + vector samples; + samples.resize(feat->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + samples[i] = (*feat)(i); + } + + // square + for (auto &d : samples) { + if (opts_.convert_int_float) { + d = d * wave_float_normlization; } - - // mean - mean_square /= samples.size(); - rms_db = 10 * std::log10(mean_square); - gain = opts_.target_db - rms_db; - - if (gain > opts_.max_gain_db) { - LOG(ERROR) - << "Unable to normalize segment to " << opts_.target_db << "dB," - << "because the the probable gain have exceeds opts_.max_gain_db" - << opts_.max_gain_db << "dB."; - return false; - } - - // Note that this is an in-place transformation. - for (auto& item : samples) { - // python item *= 10.0 ** (gain / 20.0) - item *= std::pow(10.0, gain / 20.0); - } - - CopyStdVector2Vector(samples, feat); - return true; + mean_square += d * d; + } + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts_.target_db - rms_db; + + if (gain > opts_.max_gain_db) { + LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB," + << "because the the probable gain have exceeds opts_.max_gain_db" + << opts_.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto &item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + CopyStdVector2Vector(samples, feat); + return true; } -CMVN::CMVN(std::string cmvn_file) : var_norm_(true) { +CMVN::CMVN(std::string cmvn_file, + unique_ptr base_extractor) + : var_norm_(true) { + base_extractor_ = std::move(base_extractor); bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); + dim_ = stats_.NumCols() - 1; } void CMVN::AcceptWaveform(const kaldi::VectorBase& input) { + base_extractor_->AcceptWaveform(input); return; } -void CMVN::Read(kaldi::VectorBase* feat) { return; } +void CMVN::Read(kaldi::Vector* feat) { + base_extractor_->Read(feat); + Compute(feat); + return; +} // feats contain num_frames feature. -void CMVN::ApplyCMVN(bool var_norm, VectorBase* feats) { - KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; - if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; +void CMVN::Compute(VectorBase* feats) const { + KALDI_ASSERT(feats != NULL); + int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim != 0) { + KALDI_ERR << "Dim mismatch: cmvn " + << stats_.NumRows() << 'x' << stats_.NumCols() + << ", feats " << feats->Dim() << 'x'; + } + if (stats_.NumRows() == 1 && var_norm_) { + KALDI_ERR << "You requested variance normalization but no variance stats_ " + << "are supplied."; + } + + double count = stats_(0, dim); + // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when + // computing an offset and representing it as stats_, we use a count of one. + if (count < 1.0) + KALDI_ERR << "Insufficient stats_ for cepstral mean and variance normalization: " + << "count = " << count; + + if (!var_norm_) { + Vector offset(feats->Dim()); + SubVector mean_stats(stats_.RowData(0), dim); + Vector mean_stats_apply(feats->Dim()); + //fill the datat of mean_stats in mean_stats_appy whose dim is equal with the dim of feature. + //the dim of feats = dim * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim*idx, dim); + stats_tmp.CopyFromVec(mean_stats); } - if (stats_.NumRows() == 1 && var_norm) { - KALDI_ERR - << "You requested variance normalization but no variance stats_ " - << "are supplied."; - } - - double count = stats_(0, dim); - // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when - // computing an offset and representing it as stats_, we use a count of one. - if (count < 1.0) - KALDI_ERR << "Insufficient stats_ for cepstral mean and variance " - "normalization: " - << "count = " << count; - - if (!var_norm) { - Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); - Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); - stats_tmp.CopyFromVec(mean_stats); - } - offset.AddVec(-1.0 / count, mean_stats_apply); - feats->AddVec(1.0, offset); - return; + offset.AddVec(-1.0 / count, mean_stats_apply); + feats->AddVec(1.0, offset); + return; + } + // norm(0, d) = mean offset; + // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). + kaldi::Matrix norm(2, feats->Dim()); + for (int32 d = 0; d < dim; d++) { + double mean, offset, scale; + mean = stats_(0, d)/count; + double var = (stats_(1, d)/count) - mean*mean, + floor = 1.0e-20; + if (var < floor) { + KALDI_WARN << "Flooring cepstral variance from " << var << " to " + << floor; + var = floor; } - // norm(0, d) = mean offset; - // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). - kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { - double mean, offset, scale; - mean = stats_(0, d) / count; - double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; - if (var < floor) { - KALDI_WARN << "Flooring cepstral variance from " << var << " to " - << floor; - var = floor; - } - scale = 1.0 / sqrt(var); - if (scale != scale || 1 / scale == 0.0) - KALDI_ERR - << "NaN or infinity in cepstral mean/variance computation"; - offset = -(mean * scale); - for (int32 d_skip = d; d_skip < feats->Dim();) { - norm(0, d_skip) = offset; - norm(1, d_skip) = scale; - d_skip = d_skip + dim; - } + scale = 1.0 / sqrt(var); + if (scale != scale || 1/scale == 0.0) + KALDI_ERR << "NaN or infinity in cepstral mean/variance computation"; + offset = -(mean*scale); + for (int32 d_skip = d; d_skip < feats->Dim();) { + norm(0, d_skip) = offset; + norm(1, d_skip) = scale; + d_skip = d_skip + dim; } - // Apply the normalization. - feats->MulElements(norm.Row(1)); - feats->AddVec(1.0, norm.Row(0)); + } + // Apply the normalization. + feats->MulElements(norm.Row(1)); + feats->AddVec(1.0, norm.Row(0)); } -void CMVN::ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase* feats) { - ApplyCmvn(stats_, var_norm, feats); +void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { + ApplyCmvn(stats_, var_norm_, feats); } -bool CMVN::Compute(const VectorBase& input, - VectorBase* feat) const { - return false; -} - - -} // namespace ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 6af5cdd8..13c5b8df 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -1,56 +1,40 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #pragma once #include "base/common.h" #include "frontend/feature_extractor_interface.h" -#include "kaldi/matrix/kaldi-matrix.h" #include "kaldi/util/options-itf.h" +#include "kaldi/matrix/kaldi-matrix.h" namespace ppspeech { struct DecibelNormalizerOptions { - float target_db; - float max_gain_db; - bool convert_int_float; - DecibelNormalizerOptions() - : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} + float target_db; + float max_gain_db; + bool convert_int_float; + DecibelNormalizerOptions() : + target_db(-20), + max_gain_db(300.0), + convert_int_float(false){} void Register(kaldi::OptionsItf* opts) { - opts->Register( - "target-db", &target_db, "target db for db normalization"); - opts->Register( - "max-gain-db", &max_gain_db, "max gain db for db normalization"); - opts->Register("convert-int-float", - &convert_int_float, - "if convert int samples to float"); + opts->Register("target-db", &target_db, "target db for db normalization"); + opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization"); + opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float"); } }; class DecibelNormalizer : public FeatureExtractorInterface { public: - explicit DecibelNormalizer(const DecibelNormalizerOptions& opts); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); + explicit DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor); + virtual void AcceptWaveform(const kaldi::VectorBase& input); + virtual void Read(kaldi::Vector* feat); virtual size_t Dim() const { return dim_; } - bool Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feat) const; private: + bool Compute(kaldi::VectorBase* feat) const; DecibelNormalizerOptions opts_; size_t dim_; std::unique_ptr base_extractor_; @@ -60,20 +44,18 @@ class DecibelNormalizer : public FeatureExtractorInterface { class CMVN : public FeatureExtractorInterface { public: - explicit CMVN(std::string cmvn_file); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); - virtual size_t Dim() const { return stats_.NumCols() - 1; } - bool Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feat) const; - // for test - void ApplyCMVN(bool var_norm, kaldi::VectorBase* feats); - void ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase* feats); + explicit CMVN( + std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void AcceptWaveform(const kaldi::VectorBase& input); + virtual void Read(kaldi::Vector* feat); + virtual size_t Dim() const { return dim_; } private: + void Compute(kaldi::VectorBase* feat) const; + void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; - std::shared_ptr base_extractor_; + std::unique_ptr base_extractor_; size_t dim_; bool var_norm_; }; From c769d9078124d2419cf72493945ea9f75ef7f300 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Wed, 9 Mar 2022 21:01:39 +0800 Subject: [PATCH 4/7] make feature cache& raw audio work --- speechx/examples/feat/CMakeLists.txt | 6 +- .../examples/feat/linear_spectrogram_main.cc | 76 +++-- speechx/examples/feat/streaming_feat_main.cc | 109 ------- speechx/speechx/base/common.h | 13 +- speechx/speechx/frontend/CMakeLists.txt | 3 +- speechx/speechx/frontend/feature_cache.cc | 78 ++++- speechx/speechx/frontend/feature_cache.h | 51 +++- .../frontend/feature_extractor_interface.h | 8 +- .../speechx/frontend/linear_spectrogram.cc | 217 +++++++------- speechx/speechx/frontend/linear_spectrogram.h | 40 ++- speechx/speechx/frontend/normalizer.cc | 280 ++++++++++-------- speechx/speechx/frontend/normalizer.h | 57 ++-- speechx/speechx/frontend/raw_audio.cc | 119 +++++--- speechx/speechx/frontend/raw_audio.h | 71 ++++- 14 files changed, 610 insertions(+), 518 deletions(-) delete mode 100644 speechx/examples/feat/streaming_feat_main.cc diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt index 44738e60..b8f516af 100644 --- a/speechx/examples/feat/CMakeLists.txt +++ b/speechx/examples/feat/CMakeLists.txt @@ -5,6 +5,6 @@ add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc) target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) target_link_libraries(mfcc-test kaldi-mfcc) -add_executable(linear-spectrogram-main ${CMAKE_CURRENT_SOURCE_DIR}/linear-spectrogram-main.cc) -target_include_directories(linear-spectrogram-main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(linear-spectrogram-main frontend kaldi-util kaldi-feat-common gflags glog) \ No newline at end of file +add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc) +target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog) \ No newline at end of file diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index 3e2342c2..c2ca6187 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -14,11 +14,13 @@ // todo refactor, repalce with gtest +#include "frontend/linear_spectrogram.h" #include "base/flags.h" #include "base/log.h" +#include "frontend/feature_cache.h" #include "frontend/feature_extractor_interface.h" -#include "frontend/linear_spectrogram.h" #include "frontend/normalizer.h" +#include "frontend/raw_audio.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" @@ -158,38 +160,37 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); - kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer( - FLAGS_feature_check_wspecifier); WriteMatrix(); // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; + // std::unique_ptr data_source(new + // ppspeech::RawDataSource()); + std::unique_ptr data_source( + new ppspeech::RawAudioSource()); + ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; ppspeech::DecibelNormalizerOptions db_norm_opt; std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt)); - ppspeech::LinearSpectrogram linear_spectrogram( - opt, std::move(base_feature_extractor)); + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + + std::unique_ptr linear_spectrogram( + new ppspeech::LinearSpectrogram(opt, + std::move(base_feature_extractor))); - ppspeech::CMVN cmvn(FLAGS_cmvn_write_path); + std::unique_ptr cmvn( + new ppspeech::CMVN(FLAGS_cmvn_write_path, + std::move(linear_spectrogram))); + + ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); float streaming_chunk = 0.36; int sample_rate = 16000; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << mean_.size(); - for (size_t i = 0; i < mean_.size(); i++) { - mean_[i] /= count_; - variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i]; - if (variance_[i] < 1.0e-20) { - variance_[i] = 1.0e-20; - } - variance_[i] = 1.0 / std::sqrt(variance_[i]); - } - for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); @@ -199,54 +200,45 @@ int main(int argc, char* argv[]) { this_channel); int tot_samples = waveform.Dim(); int sample_offset = 0; - std::vector> feats; + std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } - kaldi::Matrix features; - linear_spectrogram.AcceptWaveform(wav_chunk); - linear_spectrogram.ReadFeats(&features); + kaldi::Vector features; + feature_cache.AcceptWaveform(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + feature_cache.SetFinished(); + } + feature_cache.Read(&features); + if (features.Dim() == 0) break; feats.push_back(features); sample_offset += cur_chunk_size; - feature_rows += features.NumRows(); + feature_rows += features.Dim() / feature_cache.Dim(); } int cur_idx = 0; kaldi::Matrix features(feature_rows, - feats[0].NumCols()); + feature_cache.Dim()); for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { + int num_rows = feat.Dim() / feature_cache.Dim(); + for (int row_idx = 0; row_idx < num_rows; ++row_idx) { + for (size_t col_idx = 0; col_idx < feature_cache.Dim(); + ++col_idx) { features(cur_idx, col_idx) = - (feat(row_idx, col_idx) - mean_[col_idx]) * - variance_[col_idx]; + feat(row_idx * feature_cache.Dim() + col_idx); } ++cur_idx; } } feat_writer.Write(utt, features); - cur_idx = 0; - kaldi::Matrix features_check(feature_rows, - feats[0].NumCols()); - for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { - features_check(cur_idx, col_idx) = feat(row_idx, col_idx); - } - kaldi::SubVector row_feat(features_check, cur_idx); - cmvn.ApplyCMVN(true, &row_feat); - ++cur_idx; - } - } - feat_cmvn_check_writer.Write(utt, features_check); - if (num_done % 50 == 0 && num_done != 0) KALDI_VLOG(2) << "Processed " << num_done << " utterances"; num_done++; diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc deleted file mode 100644 index b3ee9842..00000000 --- a/speechx/examples/feat/streaming_feat_main.cc +++ /dev/null @@ -1,109 +0,0 @@ -// todo refactor, repalce with gtest - -#include "base/log.h" -#include "base/flags.h" -#include "frontend/linear_spectrogram.h" -#include "frontend/normalizer.h" -#include "frontend/feature_extractor_interface.h" -#include "frontend/raw_audio.h" -#include "kaldi/util/table-types.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" - -DEFINE_string(wav_rspecifier, "", "test wav path"); -DEFINE_string(feature_wspecifier, "", "test wav ark"); -DEFINE_string(feature_check_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); - - -std::vector mean_{-13730251.531853663, -12982852.199316509, -13673844.299583456, -13089406.559646806, -12673095.524938712, -12823859.223276224, -13590267.158903603, -14257618.467152044, -14374605.116185192, -14490009.21822485, -14849827.158924166, -15354435.470563512, -15834149.206532761, -16172971.985514281, -16348740.496746974, -16423536.699409386, -16556246.263649225, -16744088.772748645, -16916184.08510357, -17054034.840031497, -17165612.509455364, -17255955.470915023, -17322572.527648456, -17408943.862033736, -17521554.799865916, -17620623.254924215, -17699792.395918526, -17723364.411134344, -17741483.4433254, -17747426.888704527, -17733315.928209435, -17748780.160905756, -17808336.883775543, -17895918.671983004, -18009812.59173023, -18098188.66548325, -18195798.958462656, -18293617.62980999, -18397432.92077201, -18505834.787318766, -18585451.8100908, -18652438.235649142, -18700960.306275308, -18734944.58792185, -18737426.313365128, -18735347.165987637, -18738813.444170244, -18737086.848890636, -18731576.2474336, -18717405.44095871, -18703089.25545657, -18691014.546456724, -18692460.568905357, -18702119.628629155, -18727710.621126678, -18761582.72034647, -18806745.835547544, -18850674.8692112, -18884431.510951452, -18919999.992506847, -18939303.799078144, -18952946.273760635, -18980289.22996379, -19011610.17803294, -19040948.61805145, -19061021.429847397, -19112055.53768819, -19149667.414264943, -19201127.05091321, -19270250.82564605, -19334606.883057203, -19390513.336589377, -19444176.259208687, -19502755.000038862, -19544333.014549147, -19612668.183176614, -19681902.19006569, -19771969.951249883, -19873329.723376893, -19996752.59235844, -20110031.131400537, -20231658.612529557, -20319378.894054495, -20378534.45718066, -20413332.089584175, -20438147.844177883, -20443710.248040095, -20465457.02238927, -20488610.969337028, -20516295.16424432, -20541423.795738827, -20553192.874953747, -20573605.50701977, -20577871.61936797, -20571807.008916274, -20556242.38912231, -20542199.30819195, -20521239.063551214, -20519150.80004532, -20527204.80248933, -20536933.769257784, -20543470.522332076, -20549700.089992985, -20551525.24958494, -20554873.406493705, -20564277.65794227, -20572211.740052115, -20574305.69550465, -20575494.450104576, -20567092.577932164, -20549302.929608088, -20545445.11878376, -20546625.326603737, -20549190.03499401, -20554824.947828256, -20568341.378989458, -20577582.331383612, -20577980.519402675, -20566603.03458152, -20560131.592262644, -20552166.469060015, -20549063.06763577, -20544490.562339947, -20539817.82346569, -20528747.715731595, -20518026.24576161, -20510977.844974525, -20506874.36087992, -20506731.11977665, -20510482.133420516, -20507760.92101862, -20494644.834457114, -20480107.89304893, -20461312.091867123, -20442941.75080173, -20426123.02834838, -20424607.675283, -20426810.369107097, -20434024.50097819, -20437404.75544205, -20447688.63916367, -20460893.335563846, -20482922.735127095, -20503610.119434915, -20527062.76448319, -20557830.035128627, -20593274.72068722, -20632528.452965066, -20673637.471334763, -20733106.97143075, -20842921.0447562, -21054357.83621519, -21416569.534189366, -21978460.272811692, -22753170.052172784, -23671344.10563395, -24613499.293358143, -25406477.12230188, -25884377.82156489, -26049040.62791664, -26996879.104431007}; -std::vector variance_{213747175.10846674, 188395815.34302503, 212706429.10966414, 199109025.81461075, 189235901.23864496, 194901336.53253657, 217481594.29306737, 238689869.12327808, 243977501.24115244, 248479623.6431067, 259766741.47116545, 275516766.7790273, 291271202.3691234, 302693239.8220509, 308627358.3997694, 311143911.38788426, 315446105.07731867, 321705430.9341829, 327458907.4659941, 332245072.43223983, 336251717.5935284, 339694069.7639722, 342188204.4322228, 345587110.31313115, 349903086.2875232, 353660214.20643026, 356700344.5270885, 357665362.3529641, 358493352.05658793, 358857951.620328, 358375239.52774596, 358899733.6342954, 361051818.3511561, 364361716.05025816, 368750322.3771452, 372047800.6462831, 375655861.1349018, 379358519.1980013, 383327605.3935181, 387458599.282341, 390434692.3406868, 392994486.35057056, 394874418.04603153, 396230525.79763395, 396365592.0414835, 396334819.8242737, 396488353.19250053, 396438877.00744957, 396197980.4459586, 395590921.6672991, 395001107.62072515, 394528291.7318225, 394593110.424006, 395018405.59353715, 396110577.5415993, 397506704.0371068, 399400197.4657644, 401243568.2468382, 402687134.7805103, 404136047.2872507, 404883170.001883, 405522253.219517, 406660365.3626476, 407919346.0991902, 409045348.5384909, 409759588.7889818, 411974821.8564483, 413489718.78201455, 415535392.56684107, 418466481.97674364, 421104678.35678065, 423405392.5200779, 425550570.40798235, 427929423.9579701, 429585274.253478, 432368493.55181056, 435193587.13513297, 438886855.20476013, 443058876.8633751, 448181232.5093362, 452883835.6332396, 458056721.77926534, 461816531.22735566, 464363620.1970998, 465886343.5057493, 466928872.0651, 467180536.42647296, 468111848.70714295, 469138695.3071312, 470378429.6930793, 471517958.7132626, 472109050.4262365, 473087417.0177867, 473381322.04648733, 473220195.85483915, 472666071.8998819, 472124669.87879956, 471298571.411737, 471251033.2902761, 471672676.43128747, 472177147.2193172, 472572361.7711908, 472968783.7751127, 473156295.4164052, 473398034.82676554, 473897703.5203811, 474328271.33112127, 474452670.98002136, 474549003.99284613, 474252887.13567275, 473557462.909069, 473483385.85193115, 473609738.04855174, 473746944.82085115, 474016729.91696435, 474617321.94138587, 475045097.237122, 475125402.586558, 474664112.9824912, 474426247.5800283, 474104075.42796475, 473978219.7273978, 473773171.7798875, 473578534.69508696, 473102924.16904145, 472651240.5232615, 472374383.1810912, 472209479.6956096, 472202298.8921673, 472370090.76781124, 472220933.99374026, 471625467.37106377, 470994646.51883453, 470182428.9637543, 469348211.5939578, 468570387.4467277, 468540442.7225135, 468672018.90414184, 468994346.9533251, 469138757.58201426, 469553915.95710236, 470134523.38582784, 471082421.62055486, 471962316.51804745, 472939745.1708408, 474250621.5944825, 475773933.43199486, 477465399.71087736, 479218782.61382693, 481752299.7930922, 486608947.8984568, 496119403.2067917, 512730085.5704984, 539048915.2641417, 576285298.3548826, 621610270.2240586, 669308196.4436442, 710656993.5957186, 736344437.3725077, 745481288.0241544, 801121432.9925804}; -int count_ = 912592; - -void WriteMatrix() { - kaldi::Matrix cmvn_stats(2, mean_.size()+ 1); - for (size_t idx = 0; idx < mean_.size(); ++idx) { - cmvn_stats(0, idx) = mean_[idx]; - cmvn_stats(1, idx) = variance_[idx]; - } - cmvn_stats(0, mean_.size()) = count_; - kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); -} - -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - google::InitGoogleLogging(argv[0]); - - kaldi::SequentialTableReader wav_reader(FLAGS_wav_rspecifier); - kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); - WriteMatrix(); - - // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn - int32 num_done = 0, num_err = 0; - std::unique_ptr data_source(new ppspeech::RawDataSource()); - - ppspeech::LinearSpectrogramOptions opt; - opt.frame_opts.frame_length_ms = 20; - opt.frame_opts.frame_shift_ms = 10; - ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); - - std::unique_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opt, std::move(base_feature_extractor))); - - ppspeech::CMVN cmvn(FLAGS_cmvn_write_path, std::move(linear_spectrogram)); - - float streaming_chunk = 0.36; - int sample_rate = 16000; - int chunk_sample_size = streaming_chunk * sample_rate; - - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string utt = wav_reader.Key(); - const kaldi::WaveData &wave_data = wav_reader.Value(); - - int32 this_channel = 0; - kaldi::SubVector waveform(wave_data.Data(), this_channel); - int tot_samples = waveform.Dim(); - int sample_offset = 0; - std::vector> feats; - int feature_rows = 0; - while (sample_offset < tot_samples) { - int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); - kaldi::Vector wav_chunk(cur_chunk_size); - for (int i = 0; i < cur_chunk_size; ++i) { - wav_chunk(i) = waveform(sample_offset + i); - } - kaldi::Vector features; - cmvn.AcceptWaveform(wav_chunk); - cmvn.Read(&features); - - std::cout << wav_chunk(0) << std::endl; - std::cout << features(0) << std::endl; - - feats.push_back(features); - sample_offset += cur_chunk_size; - feature_rows += features.Dim() / cmvn.Dim(); - } - - int cur_idx = 0; - kaldi::Matrix features(feature_rows, cmvn.Dim()); - for (auto feat : feats) { - int num_rows = feat.Dim() / cmvn.Dim(); - for (int row_idx = 0; row_idx < num_rows; ++row_idx) { - for (int col_idx = 0; col_idx < cmvn.Dim(); ++col_idx) { - features(cur_idx, col_idx) = feat(row_idx*cmvn.Dim() + col_idx); - } - ++cur_idx; - } - } - feat_writer.Write(utt, features); - - if (num_done % 50 == 0 && num_done != 0) - KALDI_VLOG(2) << "Processed " << num_done << " utterances"; - num_done++; - } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); -} diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index ac01a977..7502bc5e 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -14,24 +14,25 @@ #pragma once +#include #include +#include #include #include -#include #include #include +#include #include +#include #include #include #include #include -#include #include #include -#include -#include +#include -#include "base/log.h" -#include "base/flags.h" #include "base/basic_types.h" +#include "base/flags.h" +#include "base/log.h" #include "base/macros.h" diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index e43bd182..44ca52cd 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -4,6 +4,7 @@ add_library(frontend STATIC normalizer.cc linear_spectrogram.cc raw_audio.cc + feature_cache.cc ) -target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file +target_link_libraries(frontend PUBLIC kaldi-matrix) diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index 07f2cbf7..df366a06 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -1,38 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "frontend/feature_cache.h" -void FeatureCache::AcceptWaveform(const kaldi::VectorBase& input) { +namespace ppspeech { + +using kaldi::Vector; +using kaldi::VectorBase; +using kaldi::BaseFloat; +using std::vector; +using kaldi::SubVector; +using std::unique_ptr; + +FeatureCache::FeatureCache( + int max_size, unique_ptr base_extractor) { + max_size_ = max_size; + base_extractor_ = std::move(base_extractor); +} + +void FeatureCache::AcceptWaveform( + const kaldi::VectorBase& input) { base_extractor_->AcceptWaveform(input); // feed current data - while (base_extractor_->IsLastFrame()) { - Compute(); - } + bool result = false; + do { + result = Compute(); + } while (result); } -// pop feature chunk -void FeatureCache::Read(kaldi::VectorBase* feat) { - std::lock_guard lock(mutex_); - while (cache_.empty()) { +// pop feature chunk +bool FeatureCache::Read(kaldi::Vector* feat) { + kaldi::Timer timer; + std::unique_lock lock(mutex_); + while (cache_.empty() && base_extractor_->IsFinished() == false) { ready_read_condition_.wait(lock); + BaseFloat elapsed = timer.Elapsed() * 1000; + // todo replace 1.0 with timeout_ + if (elapsed > 1.0) { + return false; + } + usleep(1000); // sleep 1 ms } + if (cache_.empty()) return false; + feat->Resize(cache_.front().Dim()); feat->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); + return true; } // read all data from base_feature_extractor_ into cache_ -void FeatureCache::Compute() { +bool FeatureCache::Compute() { // compute and feed - Vector feature_chunk(base_extractor_->Dim()); - base_extractor_->Read(&feature_chunk); - std::lock_guard lock(mutex_); + Vector feature_chunk; + bool result = base_extractor_->Read(&feature_chunk); + std::unique_lock lock(mutex_); while (cache_.size() >= max_size_) { ready_feed_condition_.wait(lock); } - cache_.push(feature_chunk); + if (feature_chunk.Dim() != 0) { + cache_.push(feature_chunk); + } ready_read_condition_.notify_one(); + return result; } -// compute the last chunk data && set feed finished -void FeatureCache::InputFinishd() { - Compute(); +void Reset() { + // std::lock_guard lock(mutex_); + return; } + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 71dc455c..c7d66251 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -1,21 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "base/common.h" #include "frontend/feature_extractor_interface.h" -class FeatureCache { +namespace ppspeech { + +class FeatureCache : public FeatureExtractorInterface { public: - explicit FeatureCache(FeatureExtractorInterface base_extractor); - void AcceptWaveform(const kaldi::VectorBase& input); - void Read(kaldi::VectorBase* feat); - void Dim() { return base_extractor_->Dim(); } - void SetFinished(); - bool IsFinished(); + explicit FeatureCache( + int32 max_size = kint16max, + std::unique_ptr base_extractor = NULL); + virtual void AcceptWaveform( + const kaldi::VectorBase& input); + virtual bool Read(kaldi::Vector* feat); + virtual size_t Dim() const { return base_extractor_->Dim(); } + virtual void SetFinished() { + base_extractor_->SetFinished(); + Compute(); + } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: + bool Compute(); + bool finished_; - mutable std::mutex mutex_; - size_t max_size; + std::mutex mutex_; + size_t max_size_; std::queue> cache_; - std::shared_ptr base_extractor_; + std::unique_ptr base_extractor_; std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - DISALLOW_COPY_AND_ASSGIN(FeatureCache); -}; \ No newline at end of file + // DISALLOW_COPY_AND_ASSGIN(FeatureCache); +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index fc06f24a..e490bc75 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,9 +21,13 @@ namespace ppspeech { class FeatureExtractorInterface { public: - virtual void AcceptWaveform(const kaldi::VectorBase& input) = 0; - virtual void Read(kaldi::Vector* feat) = 0; + virtual void AcceptWaveform( + const kaldi::VectorBase& input) = 0; + virtual bool Read(kaldi::Vector* feat) = 0; virtual size_t Dim() const = 0; + virtual void SetFinished() = 0; + virtual bool IsFinished() const = 0; + // virtual void Reset(); }; } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index ed4c2977..73cffea5 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -25,146 +25,145 @@ using kaldi::VectorBase; using kaldi::Matrix; using std::vector; -//todo remove later +// todo remove later void CopyVector2StdVector_(const VectorBase& input, - vector* output) { - if (input.Dim() == 0) return; - output->resize(input.Dim()); - for (size_t idx = 0; idx < input.Dim(); ++idx) { - (*output)[idx] = input(idx); - } + vector* output) { + if (input.Dim() == 0) return; + output->resize(input.Dim()); + for (size_t idx = 0; idx < input.Dim(); ++idx) { + (*output)[idx] = input(idx); + } } void CopyStdVector2Vector_(const vector& input, - Vector* output) { - if (input.empty()) return; - output->Resize(input.size()); - for (size_t idx = 0; idx < input.size(); ++idx) { - (*output)(idx) = input[idx]; - } + Vector* output) { + if (input.empty()) return; + output->Resize(input.size()); + for (size_t idx = 0; idx < input.size(); ++idx) { + (*output)(idx) = input[idx]; + } } LinearSpectrogram::LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor) { - opts_ = opts; - base_extractor_ = std::move(base_extractor); - int32 window_size = opts.frame_opts.WindowSize(); - int32 window_shift = opts.frame_opts.WindowShift(); - fft_points_ = window_size; - chunk_sample_size_ = - static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq); - hanning_window_.resize(window_size); - - double a = M_2PI / (window_size - 1); - hanning_window_energy_ = 0; - for (int i = 0; i < window_size; ++i) { - hanning_window_[i] = 0.5 - 0.5 * cos(a * i); - hanning_window_energy_ += hanning_window_[i] * hanning_window_[i]; - } - - dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz + opts_ = opts; + base_extractor_ = std::move(base_extractor); + int32 window_size = opts.frame_opts.WindowSize(); + int32 window_shift = opts.frame_opts.WindowShift(); + fft_points_ = window_size; + chunk_sample_size_ = + static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq); + hanning_window_.resize(window_size); + + double a = M_2PI / (window_size - 1); + hanning_window_energy_ = 0; + for (int i = 0; i < window_size; ++i) { + hanning_window_[i] = 0.5 - 0.5 * cos(a * i); + hanning_window_energy_ += hanning_window_[i] * hanning_window_[i]; + } + + dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz } void LinearSpectrogram::AcceptWaveform(const VectorBase& input) { base_extractor_->AcceptWaveform(input); } -void LinearSpectrogram::Read(Vector* feat) { - Vector input_feats(chunk_sample_size_); - base_extractor_->Read(&input_feats); - vector input_feats_vec(input_feats.Dim()); - CopyVector2StdVector_(input_feats, &input_feats_vec); - //for (int idx = 0; idx < input_feats.Dim(); ++idx) { - // input_feats_vec[idx] = input_feats(idx); - //} - vector> result; - Compute(input_feats_vec, result); - int32 feat_size = 0; - if (result.size() != 0) { - feat_size = result.size() * result[0].size(); - } - feat->Resize(feat_size); - for (size_t idx = 0; idx < feat_size; ++idx) { - (*feat)(idx) = result[idx / dim_][idx % dim_]; - } - return; +bool LinearSpectrogram::Read(Vector* feat) { + Vector input_feats(chunk_sample_size_); + bool flag = base_extractor_->Read(&input_feats); + if (flag == false || input_feats.Dim() == 0) return false; + + vector input_feats_vec(input_feats.Dim()); + CopyVector2StdVector_(input_feats, &input_feats_vec); + vector> result; + Compute(input_feats_vec, result); + int32 feat_size = 0; + if (result.size() != 0) { + feat_size = result.size() * result[0].size(); + } + feat->Resize(feat_size); + for (size_t idx = 0; idx < feat_size; ++idx) { + (*feat)(idx) = result[idx / dim_][idx % dim_]; + } + return true; } void LinearSpectrogram::Hanning(vector* data) const { - CHECK_GE(data->size(), hanning_window_.size()); + CHECK_GE(data->size(), hanning_window_.size()); - for (size_t i = 0; i < hanning_window_.size(); ++i) { - data->at(i) *= hanning_window_[i]; - } + for (size_t i = 0; i < hanning_window_.size(); ++i) { + data->at(i) *= hanning_window_[i]; + } } bool LinearSpectrogram::NumpyFft(vector* v, vector* real, vector* img) const { - Vector v_tmp; - CopyStdVector2Vector_(*v, &v_tmp); - RealFft(&v_tmp, true); - CopyVector2StdVector_(v_tmp, v); - real->push_back(v->at(0)); - img->push_back(0); - for (int i = 1; i < v->size() / 2; i++) { - real->push_back(v->at(2 * i)); - img->push_back(v->at(2 * i + 1)); - } - real->push_back(v->at(1)); - img->push_back(0); - - return true; + Vector v_tmp; + CopyStdVector2Vector_(*v, &v_tmp); + RealFft(&v_tmp, true); + CopyVector2StdVector_(v_tmp, v); + real->push_back(v->at(0)); + img->push_back(0); + for (int i = 1; i < v->size() / 2; i++) { + real->push_back(v->at(2 * i)); + img->push_back(v->at(2 * i + 1)); + } + real->push_back(v->at(1)); + img->push_back(0); + + return true; } // Compute spectrogram feat, only for test, remove later // todo: refactor later (SmileGoat) bool LinearSpectrogram::Compute(const vector& wave, vector>& feat) { - int num_samples = wave.size(); - const int& frame_length = opts_.frame_opts.WindowSize(); - const int& sample_rate = opts_.frame_opts.samp_freq; - const int& frame_shift = opts_.frame_opts.WindowShift(); - const int& fft_points = fft_points_; - const float scale = hanning_window_energy_ * sample_rate; - - if (num_samples < frame_length) { - return true; - } - - int num_frames = 1 + ((num_samples - frame_length) / frame_shift); - feat.resize(num_frames); - vector fft_real((fft_points_ / 2 + 1), 0); - vector fft_img((fft_points_ / 2 + 1), 0); - vector v(frame_length, 0); - vector power((fft_points / 2 + 1)); - - for (int i = 0; i < num_frames; ++i) { - vector data(wave.data() + i * frame_shift, - wave.data() + i * frame_shift + frame_length); - Hanning(&data); - fft_img.clear(); - fft_real.clear(); - v.assign(data.begin(), data.end()); - NumpyFft(&v, &fft_real, &fft_img); - - feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz - for (int j = 0; j < (fft_points / 2 + 1); ++j) { - power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - feat[i][j] = power[j]; - - if (j == 0 || j == feat[0].size() - 1) { - feat[i][j] /= scale; - } else { - feat[i][j] *= (2.0 / scale); - } - - // log added eps=1e-14 - feat[i][j] = std::log(feat[i][j] + 1e-14); + int num_samples = wave.size(); + const int& frame_length = opts_.frame_opts.WindowSize(); + const int& sample_rate = opts_.frame_opts.samp_freq; + const int& frame_shift = opts_.frame_opts.WindowShift(); + const int& fft_points = fft_points_; + const float scale = hanning_window_energy_ * sample_rate; + + if (num_samples < frame_length) { + return true; + } + + int num_frames = 1 + ((num_samples - frame_length) / frame_shift); + feat.resize(num_frames); + vector fft_real((fft_points_ / 2 + 1), 0); + vector fft_img((fft_points_ / 2 + 1), 0); + vector v(frame_length, 0); + vector power((fft_points / 2 + 1)); + + for (int i = 0; i < num_frames; ++i) { + vector data(wave.data() + i * frame_shift, + wave.data() + i * frame_shift + frame_length); + Hanning(&data); + fft_img.clear(); + fft_real.clear(); + v.assign(data.begin(), data.end()); + NumpyFft(&v, &fft_real, &fft_img); + + feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz + for (int j = 0; j < (fft_points / 2 + 1); ++j) { + power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; + feat[i][j] = power[j]; + + if (j == 0 || j == feat[0].size() - 1) { + feat[i][j] /= scale; + } else { + feat[i][j] *= (2.0 / scale); + } + + // log added eps=1e-14 + feat[i][j] = std::log(feat[i][j] + 1e-14); + } } - } - return true; + return true; } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index e4dc3e33..c18438eb 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -1,35 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once +#include "base/common.h" #include "frontend/feature_extractor_interface.h" #include "kaldi/feat/feature-window.h" -#include "base/common.h" namespace ppspeech { struct LinearSpectrogramOptions { kaldi::FrameExtractionOptions frame_opts; kaldi::BaseFloat streaming_chunk; - LinearSpectrogramOptions(): - streaming_chunk(0.36), - frame_opts() {} + LinearSpectrogramOptions() : streaming_chunk(0.36), frame_opts() {} void Register(kaldi::OptionsItf* opts) { - opts->Register("streaming-chunk", &streaming_chunk, "streaming chunk size"); + opts->Register( + "streaming-chunk", &streaming_chunk, "streaming chunk size"); frame_opts.Register(opts); } }; class LinearSpectrogram : public FeatureExtractorInterface { public: - explicit LinearSpectrogram(const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); - virtual void AcceptWaveform(const kaldi::VectorBase& input); - virtual void Read(kaldi::Vector* feat); + explicit LinearSpectrogram( + const LinearSpectrogramOptions& opts, + std::unique_ptr base_extractor); + virtual void AcceptWaveform( + const kaldi::VectorBase& input); + virtual bool Read(kaldi::Vector* feat); virtual size_t Dim() const { return dim_; } - void ReadFeats(kaldi::Matrix* feats); + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - private: + private: void Hanning(std::vector* data) const; bool Compute(const std::vector& wave, std::vector>& feat); @@ -44,7 +60,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - kaldi::Vector waveform_; // remove later, todo(SmileGoat) + kaldi::Vector waveform_; // remove later, todo(SmileGoat) std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 69c9ab59..8aaf33de 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -1,3 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "frontend/normalizer.h" #include "kaldi/feat/cmvn.h" @@ -12,90 +26,96 @@ using std::vector; using kaldi::SubVector; using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts, +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor) { - base_extractor_ = std::move(base_extractor); - opts_ = opts; - dim_ = 0; + base_extractor_ = std::move(base_extractor); + opts_ = opts; + dim_ = 0; } - -void DecibelNormalizer::AcceptWaveform(const kaldi::VectorBase& input) { - //dim_ = input.Dim(); - //waveform_.Resize(input.Dim()); - //waveform_.CopyFromVec(input); - base_extractor_->AcceptWaveform(input); + +void DecibelNormalizer::AcceptWaveform( + const kaldi::VectorBase& input) { + // dim_ = input.Dim(); + // waveform_.Resize(input.Dim()); + // waveform_.CopyFromVec(input); + base_extractor_->AcceptWaveform(input); } -void DecibelNormalizer::Read(kaldi::Vector* feat) { - // if (waveform_.Dim() == 0) return; - base_extractor_->Read(feat); - Compute(feat); +bool DecibelNormalizer::Read(kaldi::Vector* feat) { + // if (waveform_.Dim() == 0) return; + if (base_extractor_->Read(feat) == false || feat->Dim() == 0) { + return false; + } + Compute(feat); + return true; } -//todo remove later +// todo remove later void CopyVector2StdVector(const kaldi::VectorBase& input, vector* output) { - if (input.Dim() == 0) return; - output->resize(input.Dim()); - for (size_t idx = 0; idx < input.Dim(); ++idx) { - (*output)[idx] = input(idx); - } + if (input.Dim() == 0) return; + output->resize(input.Dim()); + for (size_t idx = 0; idx < input.Dim(); ++idx) { + (*output)[idx] = input(idx); + } } void CopyStdVector2Vector(const vector& input, VectorBase* output) { - if (input.empty()) return; - assert(input.size() == output->Dim()); - for (size_t idx = 0; idx < input.size(); ++idx) { - (*output)(idx) = input[idx]; - } + if (input.empty()) return; + assert(input.size() == output->Dim()); + for (size_t idx = 0; idx < input.size(); ++idx) { + (*output)(idx) = input[idx]; + } } bool DecibelNormalizer::Compute(VectorBase* feat) const { - // calculate db rms - BaseFloat rms_db = 0.0; - BaseFloat mean_square = 0.0; - BaseFloat gain = 0.0; - BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); - - vector samples; - samples.resize(feat->Dim()); - for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*feat)(i); - } - - // square - for (auto &d : samples) { - if (opts_.convert_int_float) { - d = d * wave_float_normlization; + // calculate db rms + BaseFloat rms_db = 0.0; + BaseFloat mean_square = 0.0; + BaseFloat gain = 0.0; + BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); + + vector samples; + samples.resize(feat->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + samples[i] = (*feat)(i); + } + + // square + for (auto& d : samples) { + if (opts_.convert_int_float) { + d = d * wave_float_normlization; + } + mean_square += d * d; } - mean_square += d * d; - } - - // mean - mean_square /= samples.size(); - rms_db = 10 * std::log10(mean_square); - gain = opts_.target_db - rms_db; - - if (gain > opts_.max_gain_db) { - LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB," - << "because the the probable gain have exceeds opts_.max_gain_db" - << opts_.max_gain_db << "dB."; - return false; - } - - // Note that this is an in-place transformation. - for (auto &item : samples) { - // python item *= 10.0 ** (gain / 20.0) - item *= std::pow(10.0, gain / 20.0); - } - - CopyStdVector2Vector(samples, feat); - return true; + + // mean + mean_square /= samples.size(); + rms_db = 10 * std::log10(mean_square); + gain = opts_.target_db - rms_db; + + if (gain > opts_.max_gain_db) { + LOG(ERROR) + << "Unable to normalize segment to " << opts_.target_db << "dB," + << "because the the probable gain have exceeds opts_.max_gain_db" + << opts_.max_gain_db << "dB."; + return false; + } + + // Note that this is an in-place transformation. + for (auto& item : samples) { + // python item *= 10.0 ** (gain / 20.0) + item *= std::pow(10.0, gain / 20.0); + } + + CopyStdVector2Vector(samples, feat); + return true; } -CMVN::CMVN(std::string cmvn_file, - unique_ptr base_extractor) +CMVN::CMVN(std::string cmvn_file, + unique_ptr base_extractor) : var_norm_(true) { base_extractor_ = std::move(base_extractor); bool binary; @@ -109,77 +129,83 @@ void CMVN::AcceptWaveform(const kaldi::VectorBase& input) { return; } -void CMVN::Read(kaldi::Vector* feat) { - base_extractor_->Read(feat); +bool CMVN::Read(kaldi::Vector* feat) { + if (base_extractor_->Read(feat) == false) { + return false; + } Compute(feat); - return; + return true; } // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { - KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; - if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " - << stats_.NumRows() << 'x' << stats_.NumCols() - << ", feats " << feats->Dim() << 'x'; - } - if (stats_.NumRows() == 1 && var_norm_) { - KALDI_ERR << "You requested variance normalization but no variance stats_ " - << "are supplied."; - } - - double count = stats_(0, dim); - // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when - // computing an offset and representing it as stats_, we use a count of one. - if (count < 1.0) - KALDI_ERR << "Insufficient stats_ for cepstral mean and variance normalization: " - << "count = " << count; - - if (!var_norm_) { - Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); - Vector mean_stats_apply(feats->Dim()); - //fill the datat of mean_stats in mean_stats_appy whose dim is equal with the dim of feature. - //the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim*idx, dim); - stats_tmp.CopyFromVec(mean_stats); + KALDI_ASSERT(feats != NULL); + int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || + feats->Dim() % dim != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' + << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; } - offset.AddVec(-1.0 / count, mean_stats_apply); - feats->AddVec(1.0, offset); - return; - } - // norm(0, d) = mean offset; - // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). - kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { - double mean, offset, scale; - mean = stats_(0, d)/count; - double var = (stats_(1, d)/count) - mean*mean, - floor = 1.0e-20; - if (var < floor) { - KALDI_WARN << "Flooring cepstral variance from " << var << " to " - << floor; - var = floor; + if (stats_.NumRows() == 1 && var_norm_) { + KALDI_ERR + << "You requested variance normalization but no variance stats_ " + << "are supplied."; + } + + double count = stats_(0, dim); + // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when + // computing an offset and representing it as stats_, we use a count of one. + if (count < 1.0) + KALDI_ERR << "Insufficient stats_ for cepstral mean and variance " + "normalization: " + << "count = " << count; + + if (!var_norm_) { + Vector offset(feats->Dim()); + SubVector mean_stats(stats_.RowData(0), dim); + Vector mean_stats_apply(feats->Dim()); + // fill the datat of mean_stats in mean_stats_appy whose dim is equal + // with the dim of feature. + // the dim of feats = dim * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, + dim); + stats_tmp.CopyFromVec(mean_stats); + } + offset.AddVec(-1.0 / count, mean_stats_apply); + feats->AddVec(1.0, offset); + return; } - scale = 1.0 / sqrt(var); - if (scale != scale || 1/scale == 0.0) - KALDI_ERR << "NaN or infinity in cepstral mean/variance computation"; - offset = -(mean*scale); - for (int32 d_skip = d; d_skip < feats->Dim();) { - norm(0, d_skip) = offset; - norm(1, d_skip) = scale; - d_skip = d_skip + dim; + // norm(0, d) = mean offset; + // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). + kaldi::Matrix norm(2, feats->Dim()); + for (int32 d = 0; d < dim; d++) { + double mean, offset, scale; + mean = stats_(0, d) / count; + double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; + if (var < floor) { + KALDI_WARN << "Flooring cepstral variance from " << var << " to " + << floor; + var = floor; + } + scale = 1.0 / sqrt(var); + if (scale != scale || 1 / scale == 0.0) + KALDI_ERR + << "NaN or infinity in cepstral mean/variance computation"; + offset = -(mean * scale); + for (int32 d_skip = d; d_skip < feats->Dim();) { + norm(0, d_skip) = offset; + norm(1, d_skip) = scale; + d_skip = d_skip + dim; + } } - } - // Apply the normalization. - feats->MulElements(norm.Row(1)); - feats->AddVec(1.0, norm.Row(0)); + // Apply the normalization. + feats->MulElements(norm.Row(1)); + feats->AddVec(1.0, norm.Row(0)); } void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { - ApplyCmvn(stats_, var_norm_, feats); + ApplyCmvn(stats_, var_norm_, feats); } -} // namespace ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 13c5b8df..189e0e2b 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -1,26 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once #include "base/common.h" #include "frontend/feature_extractor_interface.h" -#include "kaldi/util/options-itf.h" #include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" namespace ppspeech { struct DecibelNormalizerOptions { - float target_db; - float max_gain_db; - bool convert_int_float; - DecibelNormalizerOptions() : - target_db(-20), - max_gain_db(300.0), - convert_int_float(false){} + float target_db; + float max_gain_db; + bool convert_int_float; + DecibelNormalizerOptions() + : target_db(-20), max_gain_db(300.0), convert_int_float(false) {} void Register(kaldi::OptionsItf* opts) { - opts->Register("target-db", &target_db, "target db for db normalization"); - opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization"); - opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float"); + opts->Register( + "target-db", &target_db, "target db for db normalization"); + opts->Register( + "max-gain-db", &max_gain_db, "max gain db for db normalization"); + opts->Register("convert-int-float", + &convert_int_float, + "if convert int samples to float"); } }; @@ -29,9 +45,12 @@ class DecibelNormalizer : public FeatureExtractorInterface { explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform(const kaldi::VectorBase& input); - virtual void Read(kaldi::Vector* feat); + virtual void AcceptWaveform( + const kaldi::VectorBase& input); + virtual bool Read(kaldi::Vector* feat); virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: bool Compute(kaldi::VectorBase* feat) const; @@ -44,12 +63,14 @@ class DecibelNormalizer : public FeatureExtractorInterface { class CMVN : public FeatureExtractorInterface { public: - explicit CMVN( - std::string cmvn_file, - std::unique_ptr base_extractor); - virtual void AcceptWaveform(const kaldi::VectorBase& input); - virtual void Read(kaldi::Vector* feat); + explicit CMVN(std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void AcceptWaveform( + const kaldi::VectorBase& input); + virtual bool Read(kaldi::Vector* feat); virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: void Compute(kaldi::VectorBase* feat) const; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index f4584828..0c7f4d21 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -1,60 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #include "frontend/raw_audio.h" #include "kaldi/base/timer.h" namespace ppspeech { -RawAudioSource::RawAudioSource(int buffer_size = 65536) - : finished_(false), - data_length_(0), - start_(0), - timeout_(5) { - ring_buffer_.resize(buffer_size); -} - -// todo length > buffer size, condition_var -bool RawAudioSource::AcceptWaveform(const VectorBase& data) { - std::lock_guard lock(mutex_); - for (size_t idx = 0; idx < data.Dim(); ++idx) { - ring_buffer_[idx % ring_buffer_.size()] = data(idx); - } - data_length_ += length; +using kaldi::BaseFloat; +using kaldi::VectorBase; +using kaldi::Vector; + +RawAudioSource::RawAudioSource(int buffer_size) + : finished_(false), data_length_(0), start_(0), timeout_(1) { + ring_buffer_.resize(buffer_size); } -// todo length > buffer size -//bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { - //std::lock_guard lock(mutex_); - //for (size_t idx = 0; idx < length; ++idx) { - //ring_buffer_[idx % ring_buffer_.size()] = data[idx]; - //} - //data_length_ += length; - //finish_condition_.notify_one(); +void RawAudioSource::AcceptWaveform(const VectorBase& data) { + std::unique_lock lock(mutex_); + while (data_length_ + data.Dim() > ring_buffer_.size()) { + ready_feed_condition_.wait(lock); + } + for (size_t idx = 0; idx < data.Dim(); ++idx) { + ring_buffer_[idx % ring_buffer_.size()] = data(idx); + } + data_length_ += data.Dim(); + ready_read_condition_.notify_one(); +} + +// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { +// std::unique_lock lock(mutex_); +// for (size_t idx = 0; idx < length; ++idx) { +// ring_buffer_[idx % ring_buffer_.size()] = data[idx]; +//} +// data_length_ += length; +// finish_condition_.notify_one(); //} -bool RawAudioSource::Read(Vector* feats) { - size_t chunk_size = feats->Dim(); - Timer timer; - if (chunk_size > data_length_) { - while (true) { - int32 elapsed = static_cat(timer.Elapsed() * 1000); - if (finished_ || > timeout_) { +bool RawAudioSource::Read(Vector* feat) { + size_t chunk_size = feat->Dim(); + kaldi::Timer timer; + std::unique_lock lock(mutex_); + while (chunk_size > data_length_) { + // when audio is empty and no more data feed + // ready_read_condition will block in dead lock. + // ready_read_condition_.wait(lock); + int32 elapsed = static_cast(timer.Elapsed() * 1000); + if (elapsed > timeout_) { + if (finished_ == true) { // read last chunk data + break; + } + if (chunk_size > data_length_) { + return false; + } + } + usleep(100); // sleep 0.1 ms + } + + // read last chunk data + if (chunk_size > data_length_) { chunk_size = data_length_; - feats->Resize(chunk_size); - break; - } - sleep(1); + feat->Resize(chunk_size); } - } - std::lock_guard lock(mutex_); - for (size_t idx = 0; idx < chunk_size; ++idx) { - feats->Data()[idx] = ring_buffer_[idx]; - } - data_length_ -= chunk_size; - start_ = (start_ + chunk_size) % ring_buffer_.size(); - finish_condition_.notify_one(); -} -//size_t RawAudioSource::GetDataLength() { -// return data_length_; -//} + for (size_t idx = 0; idx < chunk_size; ++idx) { + feat->Data()[idx] = ring_buffer_[idx]; + } + data_length_ -= chunk_size; + start_ = (start_ + chunk_size) % ring_buffer_.size(); + ready_feed_condition_.notify_one(); + return true; +} -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index 24a4b2e8..1893da25 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -1,34 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once -#include "frontend/feature_extractor_interface.h" #include "base/common.h" +#include "frontend/feature_extractor_interface.h" #pragma once namespace ppspeech { -class RawAudioSource { +class RawAudioSource : public FeatureExtractorInterface { public: - RawAudioSource(int buffer_size = kint16max); - virtual void AcceptWaveform(kaldi::BaseFloat* data, int length); - void AcceptWaveformByByte(char* data, lnt length) {} - void AcceptWaveformByShort(kaldi::int16* data, int length) {} - - // read chunk data in buffer - bool Read(VectorBase* feats); - void SetFinished() { finished_ = true; } - bool IsFinished() { return finished_; } + explicit RawAudioSource(int buffer_size = kint16max); + virtual void AcceptWaveform(const kaldi::VectorBase& data); + virtual bool Read(kaldi::Vector* feat); + virtual size_t Dim() const { return data_length_; } + virtual void SetFinished() { + std::lock_guard lock(mutex_); + finished_ = true; + } + virtual bool IsFinished() const { return finished_; } private: - vector ring_buffer_; + std::vector ring_buffer_; size_t start_; size_t data_length_; bool finished_; - mutable std::mutex mutext_; + mutable std::mutex mutex_; std::condition_variable ready_read_condition_; std::condition_variable ready_feed_condition_; kaldi::int32 timeout_; + + DISALLOW_COPY_AND_ASSIGN(RawAudioSource); +}; + +// it is a datasource for testing different frontend module. +class RawDataSource : public FeatureExtractorInterface { + public: + explicit RawDataSource() { finished_ = false; } + virtual void AcceptWaveform( + const kaldi::VectorBase& input) { + data_ = input; + } + virtual bool Read(kaldi::Vector* feat) { + if (data_.Dim() == 0) { + return false; + } + (*feat) = data_; + data_.Resize(0); + return true; + } + virtual size_t Dim() const { return data_.Dim(); } + virtual void SetFinished() { finished_ = true; } + virtual bool IsFinished() const { return finished_; } + + private: + kaldi::Vector data_; + bool finished_; + + DISALLOW_COPY_AND_ASSIGN(RawDataSource); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech \ No newline at end of file From 7c1b432830b4a2680875b628097b4ce27f08e440 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Thu, 10 Mar 2022 09:54:32 +0800 Subject: [PATCH 5/7] format --- speechx/speechx/frontend/feature_cache.h | 4 +++- speechx/speechx/frontend/raw_audio.cc | 5 ++--- speechx/speechx/frontend/raw_audio.h | 5 +---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index c7d66251..5849cc5c 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include "base/common.h" #include "frontend/feature_extractor_interface.h" @@ -45,4 +47,4 @@ class FeatureCache : public FeatureExtractorInterface { // DISALLOW_COPY_AND_ASSGIN(FeatureCache); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 0c7f4d21..1e265a57 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -35,7 +35,6 @@ void RawAudioSource::AcceptWaveform(const VectorBase& data) { ring_buffer_[idx % ring_buffer_.size()] = data(idx); } data_length_ += data.Dim(); - ready_read_condition_.notify_one(); } // bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { @@ -53,7 +52,7 @@ bool RawAudioSource::Read(Vector* feat) { std::unique_lock lock(mutex_); while (chunk_size > data_length_) { // when audio is empty and no more data feed - // ready_read_condition will block in dead lock. + // ready_read_condition will block in dead lock. so replace with timeout_ // ready_read_condition_.wait(lock); int32 elapsed = static_cast(timer.Elapsed() * 1000); if (elapsed > timeout_) { @@ -82,4 +81,4 @@ bool RawAudioSource::Read(Vector* feat) { return true; } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index 1893da25..c3ebe559 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -18,8 +18,6 @@ #include "base/common.h" #include "frontend/feature_extractor_interface.h" -#pragma once - namespace ppspeech { class RawAudioSource : public FeatureExtractorInterface { @@ -40,7 +38,6 @@ class RawAudioSource : public FeatureExtractorInterface { size_t data_length_; bool finished_; mutable std::mutex mutex_; - std::condition_variable ready_read_condition_; std::condition_variable ready_feed_condition_; kaldi::int32 timeout_; @@ -74,4 +71,4 @@ class RawDataSource : public FeatureExtractorInterface { DISALLOW_COPY_AND_ASSIGN(RawDataSource); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech From 22fe1c9dbeaee33c46931ad11748d6346c472854 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Thu, 10 Mar 2022 15:01:45 +0800 Subject: [PATCH 6/7] rename interface & add comment to Dim() --- .../examples/feat/linear_spectrogram_main.cc | 15 ++++---- speechx/speechx/frontend/feature_cache.cc | 12 +++--- speechx/speechx/frontend/feature_cache.h | 11 ++++-- .../frontend/feature_extractor_interface.h | 13 +++++-- .../speechx/frontend/linear_spectrogram.cc | 13 ++++--- speechx/speechx/frontend/linear_spectrogram.h | 10 ++--- speechx/speechx/frontend/normalizer.cc | 37 +++++++++---------- speechx/speechx/frontend/normalizer.h | 21 +++++++---- speechx/speechx/frontend/raw_audio.cc | 31 ++++++---------- speechx/speechx/frontend/raw_audio.h | 22 ++++++----- 10 files changed, 94 insertions(+), 91 deletions(-) diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index c2ca6187..f137a52c 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -25,10 +25,9 @@ #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -DEFINE_string(wav_rspecifier, "", "test wav path"); -DEFINE_string(feature_wspecifier, "", "test wav ark"); -DEFINE_string(feature_check_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); +DEFINE_string(wav_rspecifier, "", "test wav scp path"); +DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); std::vector mean_{ @@ -165,10 +164,10 @@ int main(int argc, char* argv[]) { // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; - // std::unique_ptr data_source(new - // ppspeech::RawDataSource()); + //std::unique_ptr data_source(new + //ppspeech::RawDataCache()); std::unique_ptr data_source( - new ppspeech::RawAudioSource()); + new ppspeech::RawAudioCache()); ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; @@ -211,7 +210,7 @@ int main(int argc, char* argv[]) { wav_chunk(i) = waveform(sample_offset + i); } kaldi::Vector features; - feature_cache.AcceptWaveform(wav_chunk); + feature_cache.Accept(wav_chunk); if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index df366a06..27982f64 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -29,9 +29,9 @@ FeatureCache::FeatureCache( base_extractor_ = std::move(base_extractor); } -void FeatureCache::AcceptWaveform( - const kaldi::VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void FeatureCache::Accept( + const kaldi::VectorBase& inputs) { + base_extractor_->Accept(inputs); // feed current data bool result = false; do { @@ -40,7 +40,7 @@ void FeatureCache::AcceptWaveform( } // pop feature chunk -bool FeatureCache::Read(kaldi::Vector* feat) { +bool FeatureCache::Read(kaldi::Vector* output_feats) { kaldi::Timer timer; std::unique_lock lock(mutex_); while (cache_.empty() && base_extractor_->IsFinished() == false) { @@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector* feat) { usleep(1000); // sleep 1 ms } if (cache_.empty()) return false; - feat->Resize(cache_.front().Dim()); - feat->CopyFromVec(cache_.front()); + output_feats->Resize(cache_.front().Dim()); + output_feats->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); return true; diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 5849cc5c..9442fe1f 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -24,12 +24,15 @@ class FeatureCache : public FeatureExtractorInterface { explicit FeatureCache( int32 max_size = kint16max, std::unique_ptr base_extractor = NULL); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs); + // output_feats dim = num_frames * feature_dim + virtual bool Read(kaldi::Vector* output_feats); + // feature cache only cache feature which from base extractor virtual size_t Dim() const { return base_extractor_->Dim(); } virtual void SetFinished() { base_extractor_->SetFinished(); + // read the last chunk data Compute(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } @@ -44,7 +47,7 @@ class FeatureCache : public FeatureExtractorInterface { std::unique_ptr base_extractor_; std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - // DISALLOW_COPY_AND_ASSGIN(FeatureCache); + //DISALLOW_COPY_AND_ASSGIN(FeatureCache); }; } // namespace ppspeech diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index e490bc75..70fa93ae 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,13 +21,18 @@ namespace ppspeech { class FeatureExtractorInterface { public: - virtual void AcceptWaveform( - const kaldi::VectorBase& input) = 0; - virtual bool Read(kaldi::Vector* feat) = 0; + // accept input data + virtual void Accept( + const kaldi::VectorBase& inputs) = 0; + // get the processed result + // the length of output = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* outputs) = 0; + // the Dim is the feature dim virtual size_t Dim() const = 0; virtual void SetFinished() = 0; virtual bool IsFinished() const = 0; // virtual void Reset(); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 73cffea5..c0ae553f 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -66,11 +66,11 @@ LinearSpectrogram::LinearSpectrogram( dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz } -void LinearSpectrogram::AcceptWaveform(const VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void LinearSpectrogram::Accept(const VectorBase& inputs) { + base_extractor_->Accept(inputs); } -bool LinearSpectrogram::Read(Vector* feat) { +bool LinearSpectrogram::Read(Vector* output_feats) { Vector input_feats(chunk_sample_size_); bool flag = base_extractor_->Read(&input_feats); if (flag == false || input_feats.Dim() == 0) return false; @@ -83,9 +83,10 @@ bool LinearSpectrogram::Read(Vector* feat) { if (result.size() != 0) { feat_size = result.size() * result[0].size(); } - feat->Resize(feat_size); + output_feats->Resize(feat_size); + // todo refactor (SimleGoat) for (size_t idx = 0; idx < feat_size; ++idx) { - (*feat)(idx) = result[idx / dim_][idx % dim_]; + (*output_feats)(idx) = result[idx / dim_][idx % dim_]; } return true; } @@ -117,7 +118,7 @@ bool LinearSpectrogram::NumpyFft(vector* v, return true; } -// Compute spectrogram feat, only for test, remove later +// Compute spectrogram feat // todo: refactor later (SmileGoat) bool LinearSpectrogram::Compute(const vector& wave, vector>& feat) { diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index c18438eb..5c73f207 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -38,9 +38,10 @@ class LinearSpectrogram : public FeatureExtractorInterface { explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs); + virtual bool Read(kaldi::Vector* output_feats); + // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } @@ -49,8 +50,6 @@ class LinearSpectrogram : public FeatureExtractorInterface { void Hanning(std::vector* data) const; bool Compute(const std::vector& wave, std::vector>& feat); - void Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feature); bool NumpyFft(std::vector* v, std::vector* real, std::vector* img) const; @@ -60,7 +59,6 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - kaldi::Vector waveform_; // remove later, todo(SmileGoat) std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 8aaf33de..3af44c38 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -31,23 +31,20 @@ DecibelNormalizer::DecibelNormalizer( std::unique_ptr base_extractor) { base_extractor_ = std::move(base_extractor); opts_ = opts; - dim_ = 0; + dim_ = 1; } -void DecibelNormalizer::AcceptWaveform( - const kaldi::VectorBase& input) { - // dim_ = input.Dim(); - // waveform_.Resize(input.Dim()); - // waveform_.CopyFromVec(input); - base_extractor_->AcceptWaveform(input); +void DecibelNormalizer::Accept( + const kaldi::VectorBase& inputs_wave) { + base_extractor_->Accept(inputs_wave); } -bool DecibelNormalizer::Read(kaldi::Vector* feat) { - // if (waveform_.Dim() == 0) return; - if (base_extractor_->Read(feat) == false || feat->Dim() == 0) { +bool DecibelNormalizer::Read(kaldi::Vector* outputs_wave) { + if (base_extractor_->Read(outputs_wave) == false || + outputs_wave->Dim() == 0) { return false; } - Compute(feat); + Compute(outputs_wave); return true; } @@ -70,7 +67,7 @@ void CopyStdVector2Vector(const vector& input, } } -bool DecibelNormalizer::Compute(VectorBase* feat) const { +bool DecibelNormalizer::Compute(VectorBase* feats) const { // calculate db rms BaseFloat rms_db = 0.0; BaseFloat mean_square = 0.0; @@ -78,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase* feat) const { BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); vector samples; - samples.resize(feat->Dim()); + samples.resize(feats->Dim()); for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*feat)(i); + samples[i] = (*feats)(i); } // square @@ -110,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase* feat) const { item *= std::pow(10.0, gain / 20.0); } - CopyStdVector2Vector(samples, feat); + CopyStdVector2Vector(samples, feats); return true; } @@ -124,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file, dim_ = stats_.NumCols() - 1; } -void CMVN::AcceptWaveform(const kaldi::VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void CMVN::Accept(const kaldi::VectorBase& feats) { + base_extractor_->Accept(feats); return; } -bool CMVN::Read(kaldi::Vector* feat) { - if (base_extractor_->Read(feat) == false) { +bool CMVN::Read(kaldi::Vector* outputs) { + if (base_extractor_->Read(outputs) == false) { return false; } - Compute(feat); + Compute(outputs); return true; } diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 189e0e2b..ab333624 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -45,15 +45,16 @@ class DecibelNormalizer : public FeatureExtractorInterface { explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs_wave); + virtual bool Read(kaldi::Vector* outputs_wave); + // noramlize audio, the dim is 1. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: - bool Compute(kaldi::VectorBase* feat) const; + bool Compute(kaldi::VectorBase* feats) const; DecibelNormalizerOptions opts_; size_t dim_; std::unique_ptr base_extractor_; @@ -65,15 +66,19 @@ class CMVN : public FeatureExtractorInterface { public: explicit CMVN(std::string cmvn_file, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& feats); + + // the length of outputs = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* outputs); + // the dim_ is the feautre dim. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: - void Compute(kaldi::VectorBase* feat) const; + void Compute(kaldi::VectorBase* feats) const; void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; std::unique_ptr base_extractor_; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 1e265a57..7cfeb9e4 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -21,33 +21,25 @@ using kaldi::BaseFloat; using kaldi::VectorBase; using kaldi::Vector; -RawAudioSource::RawAudioSource(int buffer_size) +RawAudioCache::RawAudioCache(int buffer_size) : finished_(false), data_length_(0), start_(0), timeout_(1) { ring_buffer_.resize(buffer_size); } -void RawAudioSource::AcceptWaveform(const VectorBase& data) { +void RawAudioCache::Accept(const VectorBase& input_audio) { std::unique_lock lock(mutex_); - while (data_length_ + data.Dim() > ring_buffer_.size()) { + while (data_length_ + input_audio.Dim() > ring_buffer_.size()) { ready_feed_condition_.wait(lock); } - for (size_t idx = 0; idx < data.Dim(); ++idx) { - ring_buffer_[idx % ring_buffer_.size()] = data(idx); + for (size_t idx = 0; idx < input_audio.Dim(); ++idx) { + int32 buffer_idx = (idx + start_) % ring_buffer_.size(); + ring_buffer_[buffer_idx] = input_audio(idx); } - data_length_ += data.Dim(); + data_length_ += input_audio.Dim(); } -// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { -// std::unique_lock lock(mutex_); -// for (size_t idx = 0; idx < length; ++idx) { -// ring_buffer_[idx % ring_buffer_.size()] = data[idx]; -//} -// data_length_ += length; -// finish_condition_.notify_one(); -//} - -bool RawAudioSource::Read(Vector* feat) { - size_t chunk_size = feat->Dim(); +bool RawAudioCache::Read(Vector* output_audio) { + size_t chunk_size = output_audio->Dim(); kaldi::Timer timer; std::unique_lock lock(mutex_); while (chunk_size > data_length_) { @@ -69,11 +61,12 @@ bool RawAudioSource::Read(Vector* feat) { // read last chunk data if (chunk_size > data_length_) { chunk_size = data_length_; - feat->Resize(chunk_size); + output_audio->Resize(chunk_size); } for (size_t idx = 0; idx < chunk_size; ++idx) { - feat->Data()[idx] = ring_buffer_[idx]; + int buff_idx = (start_ + idx) % ring_buffer_.size(); + output_audio->Data()[idx] = ring_buffer_[buff_idx]; } data_length_ -= chunk_size; start_ = (start_ + chunk_size) % ring_buffer_.size(); diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index c3ebe559..c3f5a0e1 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -20,12 +20,13 @@ namespace ppspeech { -class RawAudioSource : public FeatureExtractorInterface { +class RawAudioCache : public FeatureExtractorInterface { public: - explicit RawAudioSource(int buffer_size = kint16max); - virtual void AcceptWaveform(const kaldi::VectorBase& data); - virtual bool Read(kaldi::Vector* feat); - virtual size_t Dim() const { return data_length_; } + explicit RawAudioCache(int buffer_size = kint16max); + virtual void Accept(const kaldi::VectorBase& input_audio); + virtual bool Read(kaldi::Vector* output_audio); + // the audio dim is 1 + virtual size_t Dim() const { return 1; } virtual void SetFinished() { std::lock_guard lock(mutex_); finished_ = true; @@ -41,14 +42,14 @@ class RawAudioSource : public FeatureExtractorInterface { std::condition_variable ready_feed_condition_; kaldi::int32 timeout_; - DISALLOW_COPY_AND_ASSIGN(RawAudioSource); + DISALLOW_COPY_AND_ASSIGN(RawAudioCache); }; // it is a datasource for testing different frontend module. -class RawDataSource : public FeatureExtractorInterface { +class RawDataCache: public FeatureExtractorInterface { public: - explicit RawDataSource() { finished_ = false; } - virtual void AcceptWaveform( + explicit RawDataCache() { finished_ = false; } + virtual void Accept( const kaldi::VectorBase& input) { data_ = input; } @@ -60,6 +61,7 @@ class RawDataSource : public FeatureExtractorInterface { data_.Resize(0); return true; } + //the dim is data_ length virtual size_t Dim() const { return data_.Dim(); } virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } @@ -68,7 +70,7 @@ class RawDataSource : public FeatureExtractorInterface { kaldi::Vector data_; bool finished_; - DISALLOW_COPY_AND_ASSIGN(RawDataSource); + DISALLOW_COPY_AND_ASSIGN(RawDataCache); }; } // namespace ppspeech From 027feae9f233014b842f1fa2d6921e77f38f69d4 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Thu, 10 Mar 2022 16:28:11 +0800 Subject: [PATCH 7/7] rename arg of Accept & Read --- speechx/speechx/frontend/feature_cache.cc | 6 ++-- speechx/speechx/frontend/feature_cache.h | 4 +-- .../frontend/feature_extractor_interface.h | 3 +- .../speechx/frontend/linear_spectrogram.cc | 30 +++++++++---------- speechx/speechx/frontend/linear_spectrogram.h | 6 ++-- speechx/speechx/frontend/normalizer.cc | 30 +++++++++---------- speechx/speechx/frontend/normalizer.h | 12 ++++---- speechx/speechx/frontend/raw_audio.cc | 18 +++++------ speechx/speechx/frontend/raw_audio.h | 15 +++++----- 9 files changed, 63 insertions(+), 61 deletions(-) diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index 27982f64..b353df16 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -40,7 +40,7 @@ void FeatureCache::Accept( } // pop feature chunk -bool FeatureCache::Read(kaldi::Vector* output_feats) { +bool FeatureCache::Read(kaldi::Vector* feats) { kaldi::Timer timer; std::unique_lock lock(mutex_); while (cache_.empty() && base_extractor_->IsFinished() == false) { @@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector* output_feats) { usleep(1000); // sleep 1 ms } if (cache_.empty()) return false; - output_feats->Resize(cache_.front().Dim()); - output_feats->CopyFromVec(cache_.front()); + feats->Resize(cache_.front().Dim()); + feats->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); return true; diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 9442fe1f..03b11f57 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -26,8 +26,8 @@ class FeatureCache : public FeatureExtractorInterface { std::unique_ptr base_extractor = NULL); virtual void Accept( const kaldi::VectorBase& inputs); - // output_feats dim = num_frames * feature_dim - virtual bool Read(kaldi::Vector* output_feats); + // feats dim = num_frames * feature_dim + virtual bool Read(kaldi::Vector* feats); // feature cache only cache feature which from base extractor virtual size_t Dim() const { return base_extractor_->Dim(); } virtual void SetFinished() { diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index 70fa93ae..64cc67f3 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,7 +21,8 @@ namespace ppspeech { class FeatureExtractorInterface { public: - // accept input data + // accept input data, accept feature or raw waves which decided + // by the base_extractor virtual void Accept( const kaldi::VectorBase& inputs) = 0; // get the processed result diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index c0ae553f..7491716c 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -70,7 +70,7 @@ void LinearSpectrogram::Accept(const VectorBase& inputs) { base_extractor_->Accept(inputs); } -bool LinearSpectrogram::Read(Vector* output_feats) { +bool LinearSpectrogram::Read(Vector* feats) { Vector input_feats(chunk_sample_size_); bool flag = base_extractor_->Read(&input_feats); if (flag == false || input_feats.Dim() == 0) return false; @@ -83,10 +83,10 @@ bool LinearSpectrogram::Read(Vector* output_feats) { if (result.size() != 0) { feat_size = result.size() * result[0].size(); } - output_feats->Resize(feat_size); + feats->Resize(feat_size); // todo refactor (SimleGoat) for (size_t idx = 0; idx < feat_size; ++idx) { - (*output_feats)(idx) = result[idx / dim_][idx % dim_]; + (*feats)(idx) = result[idx / dim_][idx % dim_]; } return true; } @@ -120,9 +120,9 @@ bool LinearSpectrogram::NumpyFft(vector* v, // Compute spectrogram feat // todo: refactor later (SmileGoat) -bool LinearSpectrogram::Compute(const vector& wave, - vector>& feat) { - int num_samples = wave.size(); +bool LinearSpectrogram::Compute(const vector& waves, + vector>& feats) { + int num_samples = waves.size(); const int& frame_length = opts_.frame_opts.WindowSize(); const int& sample_rate = opts_.frame_opts.samp_freq; const int& frame_shift = opts_.frame_opts.WindowShift(); @@ -134,34 +134,34 @@ bool LinearSpectrogram::Compute(const vector& wave, } int num_frames = 1 + ((num_samples - frame_length) / frame_shift); - feat.resize(num_frames); + feats.resize(num_frames); vector fft_real((fft_points_ / 2 + 1), 0); vector fft_img((fft_points_ / 2 + 1), 0); vector v(frame_length, 0); vector power((fft_points / 2 + 1)); for (int i = 0; i < num_frames; ++i) { - vector data(wave.data() + i * frame_shift, - wave.data() + i * frame_shift + frame_length); + vector data(waves.data() + i * frame_shift, + waves.data() + i * frame_shift + frame_length); Hanning(&data); fft_img.clear(); fft_real.clear(); v.assign(data.begin(), data.end()); NumpyFft(&v, &fft_real, &fft_img); - feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz + feats[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz for (int j = 0; j < (fft_points / 2 + 1); ++j) { power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - feat[i][j] = power[j]; + feats[i][j] = power[j]; - if (j == 0 || j == feat[0].size() - 1) { - feat[i][j] /= scale; + if (j == 0 || j == feats[0].size() - 1) { + feats[i][j] /= scale; } else { - feat[i][j] *= (2.0 / scale); + feats[i][j] *= (2.0 / scale); } // log added eps=1e-14 - feat[i][j] = std::log(feat[i][j] + 1e-14); + feats[i][j] = std::log(feats[i][j] + 1e-14); } } return true; diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index 5c73f207..790263d9 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -40,7 +40,7 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::unique_ptr base_extractor); virtual void Accept( const kaldi::VectorBase& inputs); - virtual bool Read(kaldi::Vector* output_feats); + virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } @@ -48,8 +48,8 @@ class LinearSpectrogram : public FeatureExtractorInterface { private: void Hanning(std::vector* data) const; - bool Compute(const std::vector& wave, - std::vector>& feat); + bool Compute(const std::vector& waves, + std::vector>& feats); bool NumpyFft(std::vector* v, std::vector* real, std::vector* img) const; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 3af44c38..fbb2b645 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -35,16 +35,16 @@ DecibelNormalizer::DecibelNormalizer( } void DecibelNormalizer::Accept( - const kaldi::VectorBase& inputs_wave) { - base_extractor_->Accept(inputs_wave); + const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); } -bool DecibelNormalizer::Read(kaldi::Vector* outputs_wave) { - if (base_extractor_->Read(outputs_wave) == false || - outputs_wave->Dim() == 0) { +bool DecibelNormalizer::Read(kaldi::Vector* waves) { + if (base_extractor_->Read(waves) == false || + waves->Dim() == 0) { return false; } - Compute(outputs_wave); + Compute(waves); return true; } @@ -67,7 +67,7 @@ void CopyStdVector2Vector(const vector& input, } } -bool DecibelNormalizer::Compute(VectorBase* feats) const { +bool DecibelNormalizer::Compute(VectorBase* waves) const { // calculate db rms BaseFloat rms_db = 0.0; BaseFloat mean_square = 0.0; @@ -75,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase* feats) const { BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); vector samples; - samples.resize(feats->Dim()); + samples.resize(waves->Dim()); for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*feats)(i); + samples[i] = (*waves)(i); } // square @@ -107,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase* feats) const { item *= std::pow(10.0, gain / 20.0); } - CopyStdVector2Vector(samples, feats); + CopyStdVector2Vector(samples, waves); return true; } @@ -121,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file, dim_ = stats_.NumCols() - 1; } -void CMVN::Accept(const kaldi::VectorBase& feats) { - base_extractor_->Accept(feats); +void CMVN::Accept(const kaldi::VectorBase& inputs) { + base_extractor_->Accept(inputs); return; } -bool CMVN::Read(kaldi::Vector* outputs) { - if (base_extractor_->Read(outputs) == false) { +bool CMVN::Read(kaldi::Vector* feats) { + if (base_extractor_->Read(feats) == false) { return false; } - Compute(outputs); + Compute(feats); return true; } diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index ab333624..b9daa853 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -46,15 +46,15 @@ class DecibelNormalizer : public FeatureExtractorInterface { const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); virtual void Accept( - const kaldi::VectorBase& inputs_wave); - virtual bool Read(kaldi::Vector* outputs_wave); + const kaldi::VectorBase& waves); + virtual bool Read(kaldi::Vector* waves); // noramlize audio, the dim is 1. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: - bool Compute(kaldi::VectorBase* feats) const; + bool Compute(kaldi::VectorBase* waves) const; DecibelNormalizerOptions opts_; size_t dim_; std::unique_ptr base_extractor_; @@ -67,11 +67,11 @@ class CMVN : public FeatureExtractorInterface { explicit CMVN(std::string cmvn_file, std::unique_ptr base_extractor); virtual void Accept( - const kaldi::VectorBase& feats); + const kaldi::VectorBase& inputs); - // the length of outputs = feature_row * feature_dim, + // the length of feats = feature_row * feature_dim, // the Matrix is squashed into Vector - virtual bool Read(kaldi::Vector* outputs); + virtual bool Read(kaldi::Vector* feats); // the dim_ is the feautre dim. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 7cfeb9e4..0f3d83ec 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -26,20 +26,20 @@ RawAudioCache::RawAudioCache(int buffer_size) ring_buffer_.resize(buffer_size); } -void RawAudioCache::Accept(const VectorBase& input_audio) { +void RawAudioCache::Accept(const VectorBase& waves) { std::unique_lock lock(mutex_); - while (data_length_ + input_audio.Dim() > ring_buffer_.size()) { + while (data_length_ + waves.Dim() > ring_buffer_.size()) { ready_feed_condition_.wait(lock); } - for (size_t idx = 0; idx < input_audio.Dim(); ++idx) { + for (size_t idx = 0; idx < waves.Dim(); ++idx) { int32 buffer_idx = (idx + start_) % ring_buffer_.size(); - ring_buffer_[buffer_idx] = input_audio(idx); + ring_buffer_[buffer_idx] = waves(idx); } - data_length_ += input_audio.Dim(); + data_length_ += waves.Dim(); } -bool RawAudioCache::Read(Vector* output_audio) { - size_t chunk_size = output_audio->Dim(); +bool RawAudioCache::Read(Vector* waves) { + size_t chunk_size = waves->Dim(); kaldi::Timer timer; std::unique_lock lock(mutex_); while (chunk_size > data_length_) { @@ -61,12 +61,12 @@ bool RawAudioCache::Read(Vector* output_audio) { // read last chunk data if (chunk_size > data_length_) { chunk_size = data_length_; - output_audio->Resize(chunk_size); + waves->Resize(chunk_size); } for (size_t idx = 0; idx < chunk_size; ++idx) { int buff_idx = (start_ + idx) % ring_buffer_.size(); - output_audio->Data()[idx] = ring_buffer_[buff_idx]; + waves->Data()[idx] = ring_buffer_[buff_idx]; } data_length_ -= chunk_size; start_ = (start_ + chunk_size) % ring_buffer_.size(); diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index c3f5a0e1..996b6e78 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -23,8 +23,8 @@ namespace ppspeech { class RawAudioCache : public FeatureExtractorInterface { public: explicit RawAudioCache(int buffer_size = kint16max); - virtual void Accept(const kaldi::VectorBase& input_audio); - virtual bool Read(kaldi::Vector* output_audio); + virtual void Accept(const kaldi::VectorBase& waves); + virtual bool Read(kaldi::Vector* waves); // the audio dim is 1 virtual size_t Dim() const { return 1; } virtual void SetFinished() { @@ -45,19 +45,20 @@ class RawAudioCache : public FeatureExtractorInterface { DISALLOW_COPY_AND_ASSIGN(RawAudioCache); }; -// it is a datasource for testing different frontend module. +// it is a data source to test different frontend module. +// it Accepts waves or feats. class RawDataCache: public FeatureExtractorInterface { public: explicit RawDataCache() { finished_ = false; } virtual void Accept( - const kaldi::VectorBase& input) { - data_ = input; + const kaldi::VectorBase& inputs) { + data_ = inputs; } - virtual bool Read(kaldi::Vector* feat) { + virtual bool Read(kaldi::Vector* feats) { if (data_.Dim() == 0) { return false; } - (*feat) = data_; + (*feats) = data_; data_.Resize(0); return true; }