rename interface & add comment to Dim()

2 years ago · 22fe1c9dbe
parent 7c1b432830
commit 22fe1c9dbe
10 changed files with 94 additions and 91 deletions
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@ -25,10 +25,9 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"

-DEFINE_string(wav_rspecifier, "", "test wav path");
-DEFINE_string(feature_wspecifier, "", "test wav ark");
-DEFINE_string(feature_check_wspecifier, "", "test wav ark");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
+DEFINE_string(wav_rspecifier, "", "test wav scp path");
+DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");


 std::vector<float> mean_{
@ -165,10 +164,10 @@ int main(int argc, char* argv[]) {
    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
    // window -->linear_spectrogram --> cmvn
    int32 num_done = 0, num_err = 0;
-    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
-    // ppspeech::RawDataSource());
+    //std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
+     //ppspeech::RawDataCache());
    std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
-        new ppspeech::RawAudioSource());
+        new ppspeech::RawAudioCache());

    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
@ -211,7 +210,7 @@ int main(int argc, char* argv[]) {
                wav_chunk(i) = waveform(sample_offset + i);
            }
            kaldi::Vector<BaseFloat> features;
-            feature_cache.AcceptWaveform(wav_chunk);
+            feature_cache.Accept(wav_chunk);
            if (cur_chunk_size < chunk_sample_size) {
                feature_cache.SetFinished();
            }
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@ -29,9 +29,9 @@ FeatureCache::FeatureCache(
    base_extractor_ = std::move(base_extractor);
 }

-void FeatureCache::AcceptWaveform(
-    const kaldi::VectorBase<kaldi::BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void FeatureCache::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
    // feed current data
    bool result = false;
    do {
@ -40,7 +40,7 @@ void FeatureCache::AcceptWaveform(
 }

 // pop feature chunk
-bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* output_feats) {
    kaldi::Timer timer;
    std::unique_lock<std::mutex> lock(mutex_);
    while (cache_.empty() && base_extractor_->IsFinished() == false) {
@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
        usleep(1000);  // sleep 1 ms
    }
    if (cache_.empty()) return false;
-    feat->Resize(cache_.front().Dim());
-    feat->CopyFromVec(cache_.front());
+    output_feats->Resize(cache_.front().Dim());
+    output_feats->CopyFromVec(cache_.front());
    cache_.pop();
    ready_feed_condition_.notify_one();
    return true;
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@ -24,12 +24,15 @@ class FeatureCache : public FeatureExtractorInterface {
    explicit FeatureCache(
        int32 max_size = kint16max,
        std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    // output_feats dim = num_frames * feature_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    // the feature cache only caches features produced by the base extractor
    virtual size_t Dim() const { return base_extractor_->Dim(); }
    virtual void SetFinished() {
        base_extractor_->SetFinished();
+        // read the last chunk data
        Compute();
    }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
@ -44,7 +47,7 @@ class FeatureCache : public FeatureExtractorInterface {
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;
-    //    DISALLOW_COPY_AND_ASSGIN(FeatureCache);
+    //DISALLOW_COPY_AND_ASSGIN(FeatureCache);
 };

 }  // namespace ppspeech
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -21,13 +21,18 @@ namespace ppspeech {

 class FeatureExtractorInterface {
  public:
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    // accept input data
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
+    // get the processed result
+    // the length of output = feature_row * feature_dim,
+    // the Matrix is squashed into Vector
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs) = 0;
+    // the Dim is the feature dim
    virtual size_t Dim() const = 0;
    virtual void SetFinished() = 0;
    virtual bool IsFinished() const = 0;
    // virtual void Reset();
 };

-}  // namespace ppspeech
+}  // namespace ppspeech
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@ -66,11 +66,11 @@ LinearSpectrogram::LinearSpectrogram(
    dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }

-void LinearSpectrogram::AcceptWaveform(const VectorBase<BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
 }

-bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
+bool LinearSpectrogram::Read(Vector<BaseFloat>* output_feats) {
    Vector<BaseFloat> input_feats(chunk_sample_size_);
    bool flag = base_extractor_->Read(&input_feats);
    if (flag == false || input_feats.Dim() == 0) return false;
@ -83,9 +83,10 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
    if (result.size() != 0) {
        feat_size = result.size() * result[0].size();
    }
-    feat->Resize(feat_size);
+    output_feats->Resize(feat_size);
+    // todo refactor (SmileGoat)
    for (size_t idx = 0; idx < feat_size; ++idx) {
-        (*feat)(idx) = result[idx / dim_][idx % dim_];
+        (*output_feats)(idx) = result[idx / dim_][idx % dim_];
    }
    return true;
 }
@ -117,7 +118,7 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
    return true;
 }

-// Compute spectrogram feat, only for test, remove later
+// Compute spectrogram feat
 // todo: refactor later (SmileGoat)
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                vector<vector<float>>& feat) {
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -38,9 +38,10 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    explicit LinearSpectrogram(
        const LinearSpectrogramOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
@ -49,8 +50,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    void Hanning(std::vector<kaldi::BaseFloat>* data) const;
    bool Compute(const std::vector<kaldi::BaseFloat>& wave,
                 std::vector<std::vector<kaldi::BaseFloat>>& feat);
-    void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                 kaldi::VectorBase<kaldi::BaseFloat>* feature);
    bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
                  std::vector<kaldi::BaseFloat>* real,
                  std::vector<kaldi::BaseFloat>* img) const;
@ -60,7 +59,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    std::vector<kaldi::BaseFloat> hanning_window_;
    kaldi::BaseFloat hanning_window_energy_;
    LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> waveform_;  // remove later, todo(SmileGoat)
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    int chunk_sample_size_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@ -31,23 +31,20 @@ DecibelNormalizer::DecibelNormalizer(
    std::unique_ptr<FeatureExtractorInterface> base_extractor) {
    base_extractor_ = std::move(base_extractor);
    opts_ = opts;
-    dim_ = 0;
+    dim_ = 1;
 }

-void DecibelNormalizer::AcceptWaveform(
-    const kaldi::VectorBase<BaseFloat>& input) {
-    // dim_ = input.Dim();
-    // waveform_.Resize(input.Dim());
-    // waveform_.CopyFromVec(input);
-    base_extractor_->AcceptWaveform(input);
+void DecibelNormalizer::Accept(
+    const kaldi::VectorBase<BaseFloat>& inputs_wave) {
+    base_extractor_->Accept(inputs_wave);
 }

-bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) {
-    // if (waveform_.Dim() == 0) return;
-    if (base_extractor_->Read(feat) == false || feat->Dim() == 0) {
+bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* outputs_wave) {
+    if (base_extractor_->Read(outputs_wave) == false || 
+        outputs_wave->Dim() == 0) {
        return false;
    }
-    Compute(feat);
+    Compute(outputs_wave);
    return true;
 }

@ -70,7 +67,7 @@ void CopyStdVector2Vector(const vector<BaseFloat>& input,
    }
 }

-bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
+bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
    // calculate db rms
    BaseFloat rms_db = 0.0;
    BaseFloat mean_square = 0.0;
@ -78,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
    BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));

    vector<BaseFloat> samples;
-    samples.resize(feat->Dim());
+    samples.resize(feats->Dim());
    for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = (*feat)(i);
+        samples[i] = (*feats)(i);
    }

    // square
@ -110,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
        item *= std::pow(10.0, gain / 20.0);
    }

-    CopyStdVector2Vector(samples, feat);
+    CopyStdVector2Vector(samples, feats);
    return true;
 }

@ -124,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file,
    dim_ = stats_.NumCols() - 1;
 }

-void CMVN::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& feats) {
+    base_extractor_->Accept(feats);
    return;
 }

-bool CMVN::Read(kaldi::Vector<BaseFloat>* feat) {
-    if (base_extractor_->Read(feat) == false) {
+bool CMVN::Read(kaldi::Vector<BaseFloat>* outputs) {
+    if (base_extractor_->Read(outputs) == false) {
        return false;
    }
-    Compute(feat);
+    Compute(outputs);
    return true;
 }

--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@ -45,15 +45,16 @@ class DecibelNormalizer : public FeatureExtractorInterface {
    explicit DecibelNormalizer(
        const DecibelNormalizerOptions& opts,
        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs_wave);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs_wave);
+    // normalize audio, the dim is 1.
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

  private:
-    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
+    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
    DecibelNormalizerOptions opts_;
    size_t dim_;
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
@ -65,15 +66,19 @@ class CMVN : public FeatureExtractorInterface {
  public:
    explicit CMVN(std::string cmvn_file,
                  std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& feats);
+
+    // the length of outputs = feature_row * feature_dim,
+    // the Matrix is squashed into Vector
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs);
+    // the dim_ is the feature dim.
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

  private:
-    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
+    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
    void ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats);
    kaldi::Matrix<double> stats_;
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@ -21,33 +21,25 @@ using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;

-RawAudioSource::RawAudioSource(int buffer_size)
+RawAudioCache::RawAudioCache(int buffer_size)
    : finished_(false), data_length_(0), start_(0), timeout_(1) {
    ring_buffer_.resize(buffer_size);
 }

-void RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
+// Appends input_audio to the ring buffer. Waits on ready_feed_condition_
+// while the buffer does not have room for the whole chunk.
+void RawAudioCache::Accept(const VectorBase<BaseFloat>& input_audio) {
    std::unique_lock<std::mutex> lock(mutex_);
-    while (data_length_ + data.Dim() > ring_buffer_.size()) {
+    while (data_length_ + input_audio.Dim() > ring_buffer_.size()) {
        ready_feed_condition_.wait(lock);
    }
-    for (size_t idx = 0; idx < data.Dim(); ++idx) {
-        ring_buffer_[idx % ring_buffer_.size()] = data(idx);
+    for (size_t idx = 0; idx < input_audio.Dim(); ++idx) {
+        // write after the data_length_ unread samples that begin at start_;
+        // starting at start_ itself would overwrite data not yet read
+        int32 buffer_idx = (idx + start_ + data_length_) % ring_buffer_.size();
+        ring_buffer_[buffer_idx] = input_audio(idx);
    }
-    data_length_ += data.Dim();
+    data_length_ += input_audio.Dim();
 }

-// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
-// std::unique_lock<std::mutex> lock(mutex_);
-// for (size_t idx = 0; idx < length; ++idx) {
-// ring_buffer_[idx % ring_buffer_.size()] = data[idx];
-//}
-// data_length_ += length;
-// finish_condition_.notify_one();
-//}
-
-bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
-    size_t chunk_size = feat->Dim();
+bool RawAudioCache::Read(Vector<BaseFloat>* output_audio) {
+    size_t chunk_size = output_audio->Dim();
    kaldi::Timer timer;
    std::unique_lock<std::mutex> lock(mutex_);
    while (chunk_size > data_length_) {
@ -69,11 +61,12 @@ bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
    // read last chunk data
    if (chunk_size > data_length_) {
        chunk_size = data_length_;
-        feat->Resize(chunk_size);
+        output_audio->Resize(chunk_size);
    }

    for (size_t idx = 0; idx < chunk_size; ++idx) {
-        feat->Data()[idx] = ring_buffer_[idx];
+        int buff_idx = (start_ + idx) % ring_buffer_.size();
+        output_audio->Data()[idx] = ring_buffer_[buff_idx];
    }
    data_length_ -= chunk_size;
    start_ = (start_ + chunk_size) % ring_buffer_.size();
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@ -20,12 +20,13 @@

 namespace ppspeech {

-class RawAudioSource : public FeatureExtractorInterface {
+class RawAudioCache : public FeatureExtractorInterface {
  public:
-    explicit RawAudioSource(int buffer_size = kint16max);
-    virtual void AcceptWaveform(const kaldi::VectorBase<BaseFloat>& data);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
-    virtual size_t Dim() const { return data_length_; }
+    explicit RawAudioCache(int buffer_size = kint16max);
+    virtual void Accept(const kaldi::VectorBase<BaseFloat>& input_audio);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_audio);
+    // the audio dim is 1
+    virtual size_t Dim() const { return 1; }
    virtual void SetFinished() {
        std::lock_guard<std::mutex> lock(mutex_);
        finished_ = true;
@ -41,14 +42,14 @@ class RawAudioSource : public FeatureExtractorInterface {
    std::condition_variable ready_feed_condition_;
    kaldi::int32 timeout_;

-    DISALLOW_COPY_AND_ASSIGN(RawAudioSource);
+    DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
 };

 // it is a datasource for testing different frontend module.
-class RawDataSource : public FeatureExtractorInterface {
+class RawDataCache: public FeatureExtractorInterface {
  public:
-    explicit RawDataSource() { finished_ = false; }
-    virtual void AcceptWaveform(
+    explicit RawDataCache() { finished_ = false; }
+    virtual void Accept(
        const kaldi::VectorBase<kaldi::BaseFloat>& input) {
        data_ = input;
    }
@ -60,6 +61,7 @@ class RawDataSource : public FeatureExtractorInterface {
        data_.Resize(0);
        return true;
    }
+    // the dim is the length of data_
    virtual size_t Dim() const { return data_.Dim(); }
    virtual void SetFinished() { finished_ = true; }
    virtual bool IsFinished() const { return finished_; }
@ -68,7 +70,7 @@ class RawDataSource : public FeatureExtractorInterface {
    kaldi::Vector<kaldi::BaseFloat> data_;
    bool finished_;

-    DISALLOW_COPY_AND_ASSIGN(RawDataSource);
+    DISALLOW_COPY_AND_ASSIGN(RawDataCache);
 };

 }  // namespace ppspeech