diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt index 44738e60..b8f516af 100644 --- a/speechx/examples/feat/CMakeLists.txt +++ b/speechx/examples/feat/CMakeLists.txt @@ -5,6 +5,6 @@ add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc) target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) target_link_libraries(mfcc-test kaldi-mfcc) -add_executable(linear-spectrogram-main ${CMAKE_CURRENT_SOURCE_DIR}/linear-spectrogram-main.cc) -target_include_directories(linear-spectrogram-main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(linear-spectrogram-main frontend kaldi-util kaldi-feat-common gflags glog) \ No newline at end of file +add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc) +target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog) \ No newline at end of file diff --git a/speechx/examples/feat/linear-spectrogram-main.cc b/speechx/examples/feat/linear_spectrogram_main.cc similarity index 83% rename from speechx/examples/feat/linear-spectrogram-main.cc rename to speechx/examples/feat/linear_spectrogram_main.cc index 3e2342c2..f137a52c 100644 --- a/speechx/examples/feat/linear-spectrogram-main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -14,19 +14,20 @@ // todo refactor, repalce with gtest +#include "frontend/linear_spectrogram.h" #include "base/flags.h" #include "base/log.h" +#include "frontend/feature_cache.h" #include "frontend/feature_extractor_interface.h" -#include "frontend/linear_spectrogram.h" #include "frontend/normalizer.h" +#include "frontend/raw_audio.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -DEFINE_string(wav_rspecifier, "", "test wav path"); -DEFINE_string(feature_wspecifier, 
"", "test wav ark"); -DEFINE_string(feature_check_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); +DEFINE_string(wav_rspecifier, "", "test wav scp path"); +DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); std::vector mean_{ @@ -158,38 +159,37 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); - kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer( - FLAGS_feature_check_wspecifier); WriteMatrix(); // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; + //std::unique_ptr data_source(new + //ppspeech::RawDataCache()); + std::unique_ptr data_source( + new ppspeech::RawAudioCache()); + ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; ppspeech::DecibelNormalizerOptions db_norm_opt; std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt)); - ppspeech::LinearSpectrogram linear_spectrogram( - opt, std::move(base_feature_extractor)); + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + + std::unique_ptr linear_spectrogram( + new ppspeech::LinearSpectrogram(opt, + std::move(base_feature_extractor))); - ppspeech::CMVN cmvn(FLAGS_cmvn_write_path); + std::unique_ptr cmvn( + new ppspeech::CMVN(FLAGS_cmvn_write_path, + std::move(linear_spectrogram))); + + ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); float streaming_chunk = 0.36; int sample_rate = 16000; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << mean_.size(); - for (size_t i = 0; i < mean_.size(); i++) { - mean_[i] /= count_; - variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i]; - if (variance_[i] < 1.0e-20) { - variance_[i] = 
1.0e-20; - } - variance_[i] = 1.0 / std::sqrt(variance_[i]); - } - for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); @@ -199,54 +199,45 @@ int main(int argc, char* argv[]) { this_channel); int tot_samples = waveform.Dim(); int sample_offset = 0; - std::vector> feats; + std::vector> feats; int feature_rows = 0; while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); + kaldi::Vector wav_chunk(cur_chunk_size); for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } - kaldi::Matrix features; - linear_spectrogram.AcceptWaveform(wav_chunk); - linear_spectrogram.ReadFeats(&features); + kaldi::Vector features; + feature_cache.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + feature_cache.SetFinished(); + } + feature_cache.Read(&features); + if (features.Dim() == 0) break; feats.push_back(features); sample_offset += cur_chunk_size; - feature_rows += features.NumRows(); + feature_rows += features.Dim() / feature_cache.Dim(); } int cur_idx = 0; kaldi::Matrix features(feature_rows, - feats[0].NumCols()); + feature_cache.Dim()); for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { + int num_rows = feat.Dim() / feature_cache.Dim(); + for (int row_idx = 0; row_idx < num_rows; ++row_idx) { + for (size_t col_idx = 0; col_idx < feature_cache.Dim(); + ++col_idx) { features(cur_idx, col_idx) = - (feat(row_idx, col_idx) - mean_[col_idx]) * - variance_[col_idx]; + feat(row_idx * feature_cache.Dim() + col_idx); } ++cur_idx; } } feat_writer.Write(utt, features); - cur_idx = 0; - kaldi::Matrix features_check(feature_rows, - feats[0].NumCols()); - for (auto feat : feats) { - for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) { - for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) { - 
features_check(cur_idx, col_idx) = feat(row_idx, col_idx); - } - kaldi::SubVector row_feat(features_check, cur_idx); - cmvn.ApplyCMVN(true, &row_feat); - ++cur_idx; - } - } - feat_cmvn_check_writer.Write(utt, features_check); - if (num_done % 50 == 0 && num_done != 0) KALDI_VLOG(2) << "Processed " << num_done << " utterances"; num_done++; diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h index 3b58f73c..7502bc5e 100644 --- a/speechx/speechx/base/common.h +++ b/speechx/speechx/base/common.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt index da81a481..44ca52cd 100644 --- a/speechx/speechx/frontend/CMakeLists.txt +++ b/speechx/speechx/frontend/CMakeLists.txt @@ -2,7 +2,9 @@ project(frontend) add_library(frontend STATIC normalizer.cc - linear_spectrogram.cc + linear_spectrogram.cc + raw_audio.cc + feature_cache.cc ) -target_link_libraries(frontend PUBLIC kaldi-matrix) \ No newline at end of file +target_link_libraries(frontend PUBLIC kaldi-matrix) diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc new file mode 100644 index 00000000..b353df16 --- /dev/null +++ b/speechx/speechx/frontend/feature_cache.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/feature_cache.h"
+
+#include <chrono>
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::vector;
+using kaldi::SubVector;
+using std::unique_ptr;
+
+// Wraps base_extractor and buffers at most max_size of its output chunks.
+FeatureCache::FeatureCache(
+    int max_size, unique_ptr<FeatureExtractorInterface> base_extractor)
+    : finished_(false),
+      max_size_(max_size),
+      base_extractor_(std::move(base_extractor)) {}
+
+// Forward inputs to the base extractor, then drain its output into cache_.
+void FeatureCache::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
+    // keep pulling chunks until the base extractor has nothing more to give
+    while (Compute()) {
+    }
+}
+
+// Pop the oldest cached chunk into *feats; false if none arrived in time.
+bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    const std::chrono::milliseconds timeout(1);  // todo replace with timeout_
+    std::unique_lock<std::mutex> lock(mutex_);
+    // Bounded wait: a plain wait() blocks forever when no other thread
+    // feeds the cache, which made the old elapsed-time check unreachable.
+    ready_read_condition_.wait_for(lock, timeout, [this] {
+        return !cache_.empty() || base_extractor_->IsFinished();
+    });
+    if (cache_.empty()) return false;
+    feats->Resize(cache_.front().Dim());
+    feats->CopyFromVec(cache_.front());
+    cache_.pop();
+    ready_feed_condition_.notify_one();
+    return true;
+}
+
+// Read one chunk from base_extractor_ into cache_; returns the base
+// extractor's Read() result so Accept() knows when to stop.
+bool FeatureCache::Compute() {
+    Vector<BaseFloat> feature_chunk;
+    bool result = base_extractor_->Read(&feature_chunk);
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (feature_chunk.Dim() != 0) {
+        // an empty chunk needs no room: never block on a full cache for it
+        while (cache_.size() >= max_size_) {
+            ready_feed_condition_.wait(lock);
+        }
+        cache_.push(feature_chunk);
+    }
+    ready_read_condition_.notify_one();
+    return result;
+}
+
+// Placeholder free function: FeatureCache declares no Reset() member yet.
+void Reset() {
+    // std::lock_guard<std::mutex> lock(mutex_);
+    return;
+}
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
new file mode 100644
index 00000000..03b11f57
--- /dev/null
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/feature_extractor_interface.h"
+
+namespace ppspeech {
+
+// Buffers the output chunks of a base extractor in a bounded queue.
+// Accept() pushes data through the base extractor into the queue and
+// Read() pops one chunk at a time.
+class FeatureCache : public FeatureExtractorInterface {
+  public:
+    // max_size bounds the number of cached chunks. base_extractor must be
+    // non-null before any other method is called: every method forwards to it.
+    explicit FeatureCache(
+        int32 max_size = kint16max,
+        std::unique_ptr<FeatureExtractorInterface> base_extractor = nullptr);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    // feats dim = num_frames * feature_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    // the cache only stores features produced by the base extractor,
+    // so its dim is the base extractor's dim
+    virtual size_t Dim() const { return base_extractor_->Dim(); }
+    virtual void SetFinished() {
+        base_extractor_->SetFinished();
+        // read the last chunk data
+        Compute();
+    }
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+  private:
+    // pulls one chunk from the base extractor into cache_
+    bool Compute();
+
+    bool finished_ = false;
+    std::mutex mutex_;
+    size_t max_size_;
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> cache_;
+    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
+    std::condition_variable ready_feed_condition_;
+    std::condition_variable ready_read_condition_;
+    DISALLOW_COPY_AND_ASSIGN(FeatureCache);
+};
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
index e39f5e46..64cc67f3 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ 
-21,10 +21,19 @@ namespace ppspeech { class FeatureExtractorInterface { public: - virtual void AcceptWaveform( - const kaldi::VectorBase& input) = 0; - virtual void Read(kaldi::VectorBase* feat) = 0; + // accept input data, accept feature or raw waves which decided + // by the base_extractor + virtual void Accept( + const kaldi::VectorBase& inputs) = 0; + // get the processed result + // the length of output = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* outputs) = 0; + // the Dim is the feature dim virtual size_t Dim() const = 0; + virtual void SetFinished() = 0; + virtual bool IsFinished() const = 0; + // virtual void Reset(); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 6c008c39..7491716c 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -52,6 +52,8 @@ LinearSpectrogram::LinearSpectrogram( int32 window_size = opts.frame_opts.WindowSize(); int32 window_shift = opts.frame_opts.WindowShift(); fft_points_ = window_size; + chunk_sample_size_ = + static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq); hanning_window_.resize(window_size); double a = M_2PI / (window_size - 1); @@ -64,8 +66,29 @@ LinearSpectrogram::LinearSpectrogram( dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz } -void LinearSpectrogram::AcceptWaveform(const VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void LinearSpectrogram::Accept(const VectorBase& inputs) { + base_extractor_->Accept(inputs); +} + +bool LinearSpectrogram::Read(Vector* feats) { + Vector input_feats(chunk_sample_size_); + bool flag = base_extractor_->Read(&input_feats); + if (flag == false || input_feats.Dim() == 0) return false; + + vector input_feats_vec(input_feats.Dim()); + CopyVector2StdVector_(input_feats, &input_feats_vec); + 
vector> result; + Compute(input_feats_vec, result); + int32 feat_size = 0; + if (result.size() != 0) { + feat_size = result.size() * result[0].size(); + } + feats->Resize(feat_size); + // todo refactor (SimleGoat) + for (size_t idx = 0; idx < feat_size; ++idx) { + (*feats)(idx) = result[idx / dim_][idx % dim_]; + } + return true; } void LinearSpectrogram::Hanning(vector* data) const { @@ -95,41 +118,11 @@ bool LinearSpectrogram::NumpyFft(vector* v, return true; } -// todo remove later -void LinearSpectrogram::ReadFeats(Matrix* feats) { - Vector tmp; - waveform_.Resize(base_extractor_->Dim()); - Compute(tmp, &waveform_); - vector> result; - vector feats_vec; - CopyVector2StdVector_(waveform_, &feats_vec); - Compute(feats_vec, result); - feats->Resize(result.size(), result[0].size()); - for (int row_idx = 0; row_idx < result.size(); ++row_idx) { - for (int col_idx = 0; col_idx < result[0].size(); ++col_idx) { - (*feats)(row_idx, col_idx) = result[row_idx][col_idx]; - } - } - waveform_.Resize(0); -} - -void LinearSpectrogram::Read(VectorBase* feat) { - // todo - return; -} - -// only for test, remove later -// todo: compute the feature frame by frame. 
-void LinearSpectrogram::Compute(const VectorBase& input, - VectorBase* feature) { - base_extractor_->Read(feature); -} - -// Compute spectrogram feat, only for test, remove later +// Compute spectrogram feat // todo: refactor later (SmileGoat) -bool LinearSpectrogram::Compute(const vector& wave, - vector>& feat) { - int num_samples = wave.size(); +bool LinearSpectrogram::Compute(const vector& waves, + vector>& feats) { + int num_samples = waves.size(); const int& frame_length = opts_.frame_opts.WindowSize(); const int& sample_rate = opts_.frame_opts.samp_freq; const int& frame_shift = opts_.frame_opts.WindowShift(); @@ -141,34 +134,34 @@ bool LinearSpectrogram::Compute(const vector& wave, } int num_frames = 1 + ((num_samples - frame_length) / frame_shift); - feat.resize(num_frames); + feats.resize(num_frames); vector fft_real((fft_points_ / 2 + 1), 0); vector fft_img((fft_points_ / 2 + 1), 0); vector v(frame_length, 0); vector power((fft_points / 2 + 1)); for (int i = 0; i < num_frames; ++i) { - vector data(wave.data() + i * frame_shift, - wave.data() + i * frame_shift + frame_length); + vector data(waves.data() + i * frame_shift, + waves.data() + i * frame_shift + frame_length); Hanning(&data); fft_img.clear(); fft_real.clear(); v.assign(data.begin(), data.end()); NumpyFft(&v, &fft_real, &fft_img); - feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz + feats[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz for (int j = 0; j < (fft_points / 2 + 1); ++j) { power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; - feat[i][j] = power[j]; + feats[i][j] = power[j]; - if (j == 0 || j == feat[0].size() - 1) { - feat[i][j] /= scale; + if (j == 0 || j == feats[0].size() - 1) { + feats[i][j] /= scale; } else { - feat[i][j] *= (2.0 / scale); + feats[i][j] *= (2.0 / scale); } // log added eps=1e-14 - feat[i][j] = std::log(feat[i][j] + 1e-14); + feats[i][j] = std::log(feats[i][j] + 1e-14); } } return true; diff --git 
a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index 20b5e4b5..790263d9 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -23,9 +23,14 @@ namespace ppspeech { struct LinearSpectrogramOptions { kaldi::FrameExtractionOptions frame_opts; - LinearSpectrogramOptions() : frame_opts() {} + kaldi::BaseFloat streaming_chunk; + LinearSpectrogramOptions() : streaming_chunk(0.36), frame_opts() {} - void Register(kaldi::OptionsItf* opts) { frame_opts.Register(opts); } + void Register(kaldi::OptionsItf* opts) { + opts->Register( + "streaming-chunk", &streaming_chunk, "streaming chunk size"); + frame_opts.Register(opts); + } }; class LinearSpectrogram : public FeatureExtractorInterface { @@ -33,18 +38,18 @@ class LinearSpectrogram : public FeatureExtractorInterface { explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); + virtual void Accept( + const kaldi::VectorBase& inputs); + virtual bool Read(kaldi::Vector* feats); + // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } - void ReadFeats(kaldi::Matrix* feats); + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: void Hanning(std::vector* data) const; - bool Compute(const std::vector& wave, - std::vector>& feat); - void Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feature); + bool Compute(const std::vector& waves, + std::vector>& feats); bool NumpyFft(std::vector* v, std::vector* real, std::vector* img) const; @@ -54,8 +59,8 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - kaldi::Vector waveform_; 
// remove later, todo(SmileGoat) std::unique_ptr base_extractor_; + int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); }; diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index abf798e5..fbb2b645 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -24,22 +24,28 @@ using kaldi::VectorBase; using kaldi::BaseFloat; using std::vector; using kaldi::SubVector; +using std::unique_ptr; -DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts) { +DecibelNormalizer::DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor) { + base_extractor_ = std::move(base_extractor); opts_ = opts; - dim_ = 0; + dim_ = 1; } -void DecibelNormalizer::AcceptWaveform( - const kaldi::VectorBase& input) { - dim_ = input.Dim(); - waveform_.Resize(input.Dim()); - waveform_.CopyFromVec(input); +void DecibelNormalizer::Accept( + const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); } -void DecibelNormalizer::Read(kaldi::VectorBase* feat) { - if (waveform_.Dim() == 0) return; - Compute(waveform_, feat); +bool DecibelNormalizer::Read(kaldi::Vector* waves) { + if (base_extractor_->Read(waves) == false || + waves->Dim() == 0) { + return false; + } + Compute(waves); + return true; } // todo remove later @@ -61,8 +67,7 @@ void CopyStdVector2Vector(const vector& input, } } -bool DecibelNormalizer::Compute(const VectorBase& input, - VectorBase* feat) const { +bool DecibelNormalizer::Compute(VectorBase* waves) const { // calculate db rms BaseFloat rms_db = 0.0; BaseFloat mean_square = 0.0; @@ -70,9 +75,9 @@ bool DecibelNormalizer::Compute(const VectorBase& input, BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); vector samples; - samples.resize(input.Dim()); - for (int32 i = 0; i < samples.size(); ++i) { - samples[i] = input(i); + samples.resize(waves->Dim()); + for (size_t i = 0; i < samples.size(); ++i) { + 
samples[i] = (*waves)(i); } // square @@ -102,24 +107,35 @@ bool DecibelNormalizer::Compute(const VectorBase& input, item *= std::pow(10.0, gain / 20.0); } - CopyStdVector2Vector(samples, feat); + CopyStdVector2Vector(samples, waves); return true; } -CMVN::CMVN(std::string cmvn_file) : var_norm_(true) { +CMVN::CMVN(std::string cmvn_file, + unique_ptr base_extractor) + : var_norm_(true) { + base_extractor_ = std::move(base_extractor); bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); + dim_ = stats_.NumCols() - 1; } -void CMVN::AcceptWaveform(const kaldi::VectorBase& input) { +void CMVN::Accept(const kaldi::VectorBase& inputs) { + base_extractor_->Accept(inputs); return; } -void CMVN::Read(kaldi::VectorBase* feat) { return; } +bool CMVN::Read(kaldi::Vector* feats) { + if (base_extractor_->Read(feats) == false) { + return false; + } + Compute(feats); + return true; +} // feats contain num_frames feature. -void CMVN::ApplyCMVN(bool var_norm, VectorBase* feats) { +void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); int32 dim = stats_.NumCols() - 1; if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || @@ -127,7 +143,7 @@ void CMVN::ApplyCMVN(bool var_norm, VectorBase* feats) { KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; } - if (stats_.NumRows() == 1 && var_norm) { + if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR << "You requested variance normalization but no variance stats_ " << "are supplied."; @@ -141,7 +157,7 @@ void CMVN::ApplyCMVN(bool var_norm, VectorBase* feats) { "normalization: " << "count = " << count; - if (!var_norm) { + if (!var_norm_) { Vector offset(feats->Dim()); SubVector mean_stats(stats_.RowData(0), dim); Vector mean_stats_apply(feats->Dim()); @@ -185,14 +201,8 @@ void CMVN::ApplyCMVN(bool var_norm, VectorBase* feats) { feats->AddVec(1.0, norm.Row(0)); } -void CMVN::ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase* 
feats) { - ApplyCmvn(stats_, var_norm, feats); +void CMVN::ApplyCMVN(kaldi::MatrixBase* feats) { + ApplyCmvn(stats_, var_norm_, feats); } -bool CMVN::Compute(const VectorBase& input, - VectorBase* feat) const { - return false; -} - - } // namespace ppspeech diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 6af5cdd8..b9daa853 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -42,15 +42,19 @@ struct DecibelNormalizerOptions { class DecibelNormalizer : public FeatureExtractorInterface { public: - explicit DecibelNormalizer(const DecibelNormalizerOptions& opts); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); + explicit DecibelNormalizer( + const DecibelNormalizerOptions& opts, + std::unique_ptr base_extractor); + virtual void Accept( + const kaldi::VectorBase& waves); + virtual bool Read(kaldi::Vector* waves); + // noramlize audio, the dim is 1. 
virtual size_t Dim() const { return dim_; } - bool Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feat) const; + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: + bool Compute(kaldi::VectorBase* waves) const; DecibelNormalizerOptions opts_; size_t dim_; std::unique_ptr base_extractor_; @@ -60,20 +64,24 @@ class DecibelNormalizer : public FeatureExtractorInterface { class CMVN : public FeatureExtractorInterface { public: - explicit CMVN(std::string cmvn_file); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual void Read(kaldi::VectorBase* feat); - virtual size_t Dim() const { return stats_.NumCols() - 1; } - bool Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feat) const; - // for test - void ApplyCMVN(bool var_norm, kaldi::VectorBase* feats); - void ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase* feats); + explicit CMVN(std::string cmvn_file, + std::unique_ptr base_extractor); + virtual void Accept( + const kaldi::VectorBase& inputs); + + // the length of feats = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* feats); + // the dim_ is the feautre dim. + virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { base_extractor_->SetFinished(); } + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: + void Compute(kaldi::VectorBase* feats) const; + void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; - std::shared_ptr base_extractor_; + std::unique_ptr base_extractor_; size_t dim_; bool var_norm_; }; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc new file mode 100644 index 00000000..0f3d83ec --- /dev/null +++ b/speechx/speechx/frontend/raw_audio.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/raw_audio.h"
+#include "kaldi/base/timer.h"
+
+namespace ppspeech {
+
+using kaldi::BaseFloat;
+using kaldi::VectorBase;
+using kaldi::Vector;
+
+// Ring buffer of raw samples; buffer_size is its capacity in samples.
+RawAudioCache::RawAudioCache(int buffer_size)
+    : start_(0), data_length_(0), finished_(false), timeout_(1) {
+    ring_buffer_.resize(buffer_size);
+}
+
+// Append waves behind the unread samples, blocking while the buffer is full.
+void RawAudioCache::Accept(const VectorBase<BaseFloat>& waves) {
+    const size_t num_samples = waves.Dim();
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (data_length_ + num_samples > ring_buffer_.size()) {
+        ready_feed_condition_.wait(lock);
+    }
+    // the write position is the end of the unread region, not start_ itself,
+    // otherwise samples that were not read yet get overwritten
+    const size_t write_pos = start_ + data_length_;
+    for (size_t idx = 0; idx < num_samples; ++idx) {
+        ring_buffer_[(write_pos + idx) % ring_buffer_.size()] = waves(idx);
+    }
+    data_length_ += num_samples;
+}
+
+// Pop waves->Dim() samples into *waves, polling up to timeout_ ms for them.
+// Returns false on timeout; once finished, returns whatever is left.
+bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
+    size_t chunk_size = waves->Dim();
+    kaldi::Timer timer;
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (chunk_size > data_length_ && !finished_) {
+        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
+        if (elapsed > timeout_) return false;
+        // poll with the lock released so Accept()/SetFinished() can proceed
+        lock.unlock();
+        usleep(100);  // sleep 0.1 ms
+        lock.lock();
+    }
+
+    // read last chunk data
+    if (chunk_size > data_length_) {
+        chunk_size = data_length_;
+        waves->Resize(chunk_size);
+    }
+
+    for (size_t idx = 0; idx < chunk_size; ++idx) {
+        const size_t buff_idx = (start_ + idx) % ring_buffer_.size();
+        waves->Data()[idx] = ring_buffer_[buff_idx];
+    }
+    data_length_ -= chunk_size;
+    start_ = (start_ + chunk_size) % ring_buffer_.size();
+    ready_feed_condition_.notify_one();
+    return true;
+}
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
new file mode 100644
index 00000000..996b6e78
--- /dev/null
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/feature_extractor_interface.h"
+
+namespace ppspeech {
+
+// Ring buffer of raw audio samples: Accept() appends, Read() pops a chunk.
+class RawAudioCache : public FeatureExtractorInterface {
+  public:
+    explicit RawAudioCache(int buffer_size = kint16max);
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
+    virtual size_t Dim() const { return 1; }  // the audio dim is 1
+    virtual void SetFinished() {
+        std::lock_guard<std::mutex> lock(mutex_);
+        finished_ = true;
+    }
+    // finished_ is written under mutex_, so read it under mutex_ as well
+    virtual bool IsFinished() const {
+        std::lock_guard<std::mutex> lock(mutex_);
+        return finished_;
+    }
+
+  private:
+    std::vector<kaldi::BaseFloat> ring_buffer_;
+    size_t start_;        // index of the oldest unread sample
+    size_t data_length_;  // number of unread samples
+    bool finished_;
+    mutable std::mutex mutex_;
+    std::condition_variable ready_feed_condition_;
+    kaldi::int32 timeout_;  // Read() timeout in ms
+
+    DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
+};
+
+// it is a data source to test different frontend module.
+// it Accepts waves or feats.
+class RawDataCache : public FeatureExtractorInterface {
+  public:
+    explicit RawDataCache() { finished_ = false; }
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+        data_ = inputs;
+    }
+    // hand the stored data out once; false until Accept() is called again
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+        if (data_.Dim() == 0) return false;
+        (*feats) = data_;
+        data_.Resize(0);
+        return true;
+    }
+    virtual size_t Dim() const { return data_.Dim(); }  // dim is data_ length
+    virtual void SetFinished() { finished_ = true; }
+    virtual bool IsFinished() const { return finished_; }
+
+  private:
+    kaldi::Vector<kaldi::BaseFloat> data_;
+    bool finished_;
+
+    DISALLOW_COPY_AND_ASSIGN(RawDataCache);
+};
+
+}  // namespace ppspeech