diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index c2ca6187..f137a52c 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -25,10 +25,9 @@ #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" -DEFINE_string(wav_rspecifier, "", "test wav path"); -DEFINE_string(feature_wspecifier, "", "test wav ark"); -DEFINE_string(feature_check_wspecifier, "", "test wav ark"); -DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark"); +DEFINE_string(wav_rspecifier, "", "test wav scp path"); +DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); std::vector mean_{ @@ -165,10 +164,10 @@ int main(int argc, char* argv[]) { // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning // window -->linear_spectrogram --> cmvn int32 num_done = 0, num_err = 0; - // std::unique_ptr data_source(new - // ppspeech::RawDataSource()); + //std::unique_ptr data_source(new + //ppspeech::RawDataCache()); std::unique_ptr data_source( - new ppspeech::RawAudioSource()); + new ppspeech::RawAudioCache()); ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; @@ -211,7 +210,7 @@ int main(int argc, char* argv[]) { wav_chunk(i) = waveform(sample_offset + i); } kaldi::Vector features; - feature_cache.AcceptWaveform(wav_chunk); + feature_cache.Accept(wav_chunk); if (cur_chunk_size < chunk_sample_size) { feature_cache.SetFinished(); } diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc index df366a06..27982f64 100644 --- a/speechx/speechx/frontend/feature_cache.cc +++ b/speechx/speechx/frontend/feature_cache.cc @@ -29,9 +29,9 @@ FeatureCache::FeatureCache( base_extractor_ = std::move(base_extractor); } -void FeatureCache::AcceptWaveform( - const kaldi::VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void FeatureCache::Accept( + const kaldi::VectorBase& inputs) { + base_extractor_->Accept(inputs); // feed current data bool result = false; do { @@ -40,7 +40,7 @@ void FeatureCache::AcceptWaveform( } // pop feature chunk -bool FeatureCache::Read(kaldi::Vector* feat) { +bool FeatureCache::Read(kaldi::Vector* output_feats) { kaldi::Timer timer; std::unique_lock lock(mutex_); while (cache_.empty() && base_extractor_->IsFinished() == false) { @@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector* feat) { usleep(1000); // sleep 1 ms } if (cache_.empty()) return false; - feat->Resize(cache_.front().Dim()); - feat->CopyFromVec(cache_.front()); + output_feats->Resize(cache_.front().Dim()); + output_feats->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); return true; diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index 5849cc5c..9442fe1f 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -24,12 +24,15 @@ class FeatureCache : public FeatureExtractorInterface { explicit FeatureCache( int32 max_size = kint16max, std::unique_ptr base_extractor = NULL); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs); + // output_feats dim = num_frames * feature_dim + virtual bool Read(kaldi::Vector* output_feats); + // feature cache only cache feature which from base extractor virtual size_t Dim() const { return base_extractor_->Dim(); } virtual void SetFinished() { base_extractor_->SetFinished(); + // read the last chunk data Compute(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } @@ -44,7 +47,7 @@ class FeatureCache : public FeatureExtractorInterface { std::unique_ptr base_extractor_; std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - // DISALLOW_COPY_AND_ASSGIN(FeatureCache); + //DISALLOW_COPY_AND_ASSGIN(FeatureCache); }; } // namespace ppspeech diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h index e490bc75..70fa93ae 100644 --- a/speechx/speechx/frontend/feature_extractor_interface.h +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -21,13 +21,18 @@ namespace ppspeech { class FeatureExtractorInterface { public: - virtual void AcceptWaveform( - const kaldi::VectorBase& input) = 0; - virtual bool Read(kaldi::Vector* feat) = 0; + // accept input data + virtual void Accept( + const kaldi::VectorBase& inputs) = 0; + // get the processed result + // the length of output = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* outputs) = 0; + // the Dim is the feature dim virtual size_t Dim() const = 0; virtual void SetFinished() = 0; virtual bool IsFinished() const = 0; // virtual void Reset(); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc index 73cffea5..c0ae553f 100644 --- a/speechx/speechx/frontend/linear_spectrogram.cc +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -66,11 +66,11 @@ LinearSpectrogram::LinearSpectrogram( dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz } -void LinearSpectrogram::AcceptWaveform(const VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void LinearSpectrogram::Accept(const VectorBase& inputs) { + base_extractor_->Accept(inputs); } -bool LinearSpectrogram::Read(Vector* feat) { +bool LinearSpectrogram::Read(Vector* output_feats) { Vector input_feats(chunk_sample_size_); bool flag = base_extractor_->Read(&input_feats); if (flag == false || input_feats.Dim() == 0) return false; @@ -83,9 +83,10 @@ bool LinearSpectrogram::Read(Vector* feat) { if (result.size() != 0) { feat_size = result.size() * result[0].size(); } - feat->Resize(feat_size); + output_feats->Resize(feat_size); + // todo refactor (SimleGoat) for (size_t idx = 0; idx < feat_size; ++idx) { - (*feat)(idx) = result[idx / dim_][idx % dim_]; + (*output_feats)(idx) = result[idx / dim_][idx % dim_]; } return true; } @@ -117,7 +118,7 @@ bool LinearSpectrogram::NumpyFft(vector* v, return true; } -// Compute spectrogram feat, only for test, remove later +// Compute spectrogram feat // todo: refactor later (SmileGoat) bool LinearSpectrogram::Compute(const vector& wave, vector>& feat) { diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h index c18438eb..5c73f207 100644 --- a/speechx/speechx/frontend/linear_spectrogram.h +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -38,9 +38,10 @@ class LinearSpectrogram : public FeatureExtractorInterface { explicit LinearSpectrogram( const LinearSpectrogramOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs); + virtual bool Read(kaldi::Vector* output_feats); + // the dim_ is the dim of single frame feature virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } @@ -49,8 +50,6 @@ class LinearSpectrogram : public FeatureExtractorInterface { void Hanning(std::vector* data) const; bool Compute(const std::vector& wave, std::vector>& feat); - void Compute(const kaldi::VectorBase& input, - kaldi::VectorBase* feature); bool NumpyFft(std::vector* v, std::vector* real, std::vector* img) const; @@ -60,7 +59,6 @@ class LinearSpectrogram : public FeatureExtractorInterface { std::vector hanning_window_; kaldi::BaseFloat hanning_window_energy_; LinearSpectrogramOptions opts_; - kaldi::Vector waveform_; // remove later, todo(SmileGoat) std::unique_ptr base_extractor_; int chunk_sample_size_; DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 8aaf33de..3af44c38 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -31,23 +31,20 @@ DecibelNormalizer::DecibelNormalizer( std::unique_ptr base_extractor) { base_extractor_ = std::move(base_extractor); opts_ = opts; - dim_ = 0; + dim_ = 1; } -void DecibelNormalizer::AcceptWaveform( - const kaldi::VectorBase& input) { - // dim_ = input.Dim(); - // waveform_.Resize(input.Dim()); - // waveform_.CopyFromVec(input); - base_extractor_->AcceptWaveform(input); +void DecibelNormalizer::Accept( + const kaldi::VectorBase& inputs_wave) { + base_extractor_->Accept(inputs_wave); } -bool DecibelNormalizer::Read(kaldi::Vector* feat) { - // if (waveform_.Dim() == 0) return; - if (base_extractor_->Read(feat) == false || feat->Dim() == 0) { +bool DecibelNormalizer::Read(kaldi::Vector* outputs_wave) { + if (base_extractor_->Read(outputs_wave) == false || + outputs_wave->Dim() == 0) { return false; } - Compute(feat); + Compute(outputs_wave); return true; } @@ -70,7 +67,7 @@ void CopyStdVector2Vector(const vector& input, } } -bool DecibelNormalizer::Compute(VectorBase* feat) const { +bool DecibelNormalizer::Compute(VectorBase* feats) const { // calculate db rms BaseFloat rms_db = 0.0; BaseFloat mean_square = 0.0; @@ -78,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase* feat) const { BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); vector samples; - samples.resize(feat->Dim()); + samples.resize(feats->Dim()); for (size_t i = 0; i < samples.size(); ++i) { - samples[i] = (*feat)(i); + samples[i] = (*feats)(i); } // square @@ -110,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase* feat) const { item *= std::pow(10.0, gain / 20.0); } - CopyStdVector2Vector(samples, feat); + CopyStdVector2Vector(samples, feats); return true; } @@ -124,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file, dim_ = stats_.NumCols() - 1; } -void CMVN::AcceptWaveform(const kaldi::VectorBase& input) { - base_extractor_->AcceptWaveform(input); +void CMVN::Accept(const kaldi::VectorBase& feats) { + base_extractor_->Accept(feats); return; } -bool CMVN::Read(kaldi::Vector* feat) { - if (base_extractor_->Read(feat) == false) { +bool CMVN::Read(kaldi::Vector* outputs) { + if (base_extractor_->Read(outputs) == false) { return false; } - Compute(feat); + Compute(outputs); return true; } diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h index 189e0e2b..ab333624 100644 --- a/speechx/speechx/frontend/normalizer.h +++ b/speechx/speechx/frontend/normalizer.h @@ -45,15 +45,16 @@ class DecibelNormalizer : public FeatureExtractorInterface { explicit DecibelNormalizer( const DecibelNormalizerOptions& opts, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& inputs_wave); + virtual bool Read(kaldi::Vector* outputs_wave); + // noramlize audio, the dim is 1. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: - bool Compute(kaldi::VectorBase* feat) const; + bool Compute(kaldi::VectorBase* feats) const; DecibelNormalizerOptions opts_; size_t dim_; std::unique_ptr base_extractor_; @@ -65,15 +66,19 @@ class CMVN : public FeatureExtractorInterface { public: explicit CMVN(std::string cmvn_file, std::unique_ptr base_extractor); - virtual void AcceptWaveform( - const kaldi::VectorBase& input); - virtual bool Read(kaldi::Vector* feat); + virtual void Accept( + const kaldi::VectorBase& feats); + + // the length of outputs = feature_row * feature_dim, + // the Matrix is squashed into Vector + virtual bool Read(kaldi::Vector* outputs); + // the dim_ is the feautre dim. virtual size_t Dim() const { return dim_; } virtual void SetFinished() { base_extractor_->SetFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } private: - void Compute(kaldi::VectorBase* feat) const; + void Compute(kaldi::VectorBase* feats) const; void ApplyCMVN(kaldi::MatrixBase* feats); kaldi::Matrix stats_; std::unique_ptr base_extractor_; diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc index 1e265a57..7cfeb9e4 100644 --- a/speechx/speechx/frontend/raw_audio.cc +++ b/speechx/speechx/frontend/raw_audio.cc @@ -21,33 +21,25 @@ using kaldi::BaseFloat; using kaldi::VectorBase; using kaldi::Vector; -RawAudioSource::RawAudioSource(int buffer_size) +RawAudioCache::RawAudioCache(int buffer_size) : finished_(false), data_length_(0), start_(0), timeout_(1) { ring_buffer_.resize(buffer_size); } -void RawAudioSource::AcceptWaveform(const VectorBase& data) { +void RawAudioCache::Accept(const VectorBase& input_audio) { std::unique_lock lock(mutex_); - while (data_length_ + data.Dim() > ring_buffer_.size()) { + while (data_length_ + input_audio.Dim() > ring_buffer_.size()) { ready_feed_condition_.wait(lock); } - for (size_t idx = 0; idx < data.Dim(); ++idx) { - ring_buffer_[idx % ring_buffer_.size()] = data(idx); + for (size_t idx = 0; idx < input_audio.Dim(); ++idx) { + int32 buffer_idx = (idx + start_) % ring_buffer_.size(); + ring_buffer_[buffer_idx] = input_audio(idx); } - data_length_ += data.Dim(); + data_length_ += input_audio.Dim(); } -// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { -// std::unique_lock lock(mutex_); -// for (size_t idx = 0; idx < length; ++idx) { -// ring_buffer_[idx % ring_buffer_.size()] = data[idx]; -//} -// data_length_ += length; -// finish_condition_.notify_one(); -//} - -bool RawAudioSource::Read(Vector* feat) { - size_t chunk_size = feat->Dim(); +bool RawAudioCache::Read(Vector* output_audio) { + size_t chunk_size = output_audio->Dim(); kaldi::Timer timer; std::unique_lock lock(mutex_); while (chunk_size > data_length_) { @@ -69,11 +61,12 @@ bool RawAudioSource::Read(Vector* feat) { // read last chunk data if (chunk_size > data_length_) { chunk_size = data_length_; - feat->Resize(chunk_size); + output_audio->Resize(chunk_size); } for (size_t idx = 0; idx < chunk_size; ++idx) { - feat->Data()[idx] = ring_buffer_[idx]; + int buff_idx = (start_ + idx) % ring_buffer_.size(); + output_audio->Data()[idx] = ring_buffer_[buff_idx]; } data_length_ -= chunk_size; start_ = (start_ + chunk_size) % ring_buffer_.size(); diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index c3ebe559..c3f5a0e1 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -20,12 +20,13 @@ namespace ppspeech { -class RawAudioSource : public FeatureExtractorInterface { +class RawAudioCache : public FeatureExtractorInterface { public: - explicit RawAudioSource(int buffer_size = kint16max); - virtual void AcceptWaveform(const kaldi::VectorBase& data); - virtual bool Read(kaldi::Vector* feat); - virtual size_t Dim() const { return data_length_; } + explicit RawAudioCache(int buffer_size = kint16max); + virtual void Accept(const kaldi::VectorBase& input_audio); + virtual bool Read(kaldi::Vector* output_audio); + // the audio dim is 1 + virtual size_t Dim() const { return 1; } virtual void SetFinished() { std::lock_guard lock(mutex_); finished_ = true; @@ -41,14 +42,14 @@ class RawAudioSource : public FeatureExtractorInterface { std::condition_variable ready_feed_condition_; kaldi::int32 timeout_; - DISALLOW_COPY_AND_ASSIGN(RawAudioSource); + DISALLOW_COPY_AND_ASSIGN(RawAudioCache); }; // it is a datasource for testing different frontend module. -class RawDataSource : public FeatureExtractorInterface { +class RawDataCache: public FeatureExtractorInterface { public: - explicit RawDataSource() { finished_ = false; } - virtual void AcceptWaveform( + explicit RawDataCache() { finished_ = false; } + virtual void Accept( const kaldi::VectorBase& input) { data_ = input; } @@ -60,6 +61,7 @@ class RawDataSource : public FeatureExtractorInterface { data_.Resize(0); return true; } + //the dim is data_ length virtual size_t Dim() const { return data_.Dim(); } virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } @@ -68,7 +70,7 @@ class RawDataSource : public FeatureExtractorInterface { kaldi::Vector data_; bool finished_; - DISALLOW_COPY_AND_ASSIGN(RawDataSource); + DISALLOW_COPY_AND_ASSIGN(RawDataCache); }; } // namespace ppspeech