rename interface & add comment to Dim()

pull/1542/head
SmileGoat 4 years ago
parent 7c1b432830
commit 22fe1c9dbe

@ -25,10 +25,9 @@
#include "kaldi/util/kaldi-io.h" #include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h" #include "kaldi/util/table-types.h"
DEFINE_string(wav_rspecifier, "", "test wav path"); DEFINE_string(wav_rspecifier, "", "test wav scp path");
DEFINE_string(feature_wspecifier, "", "test wav ark"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(feature_check_wspecifier, "", "test wav ark"); DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
std::vector<float> mean_{ std::vector<float> mean_{
@ -166,9 +165,9 @@ int main(int argc, char* argv[]) {
// window -->linear_spectrogram --> cmvn // window -->linear_spectrogram --> cmvn
int32 num_done = 0, num_err = 0; int32 num_done = 0, num_err = 0;
//std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new //std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
// ppspeech::RawDataSource()); //ppspeech::RawDataCache());
std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source( std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
new ppspeech::RawAudioSource()); new ppspeech::RawAudioCache());
ppspeech::LinearSpectrogramOptions opt; ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_length_ms = 20;
@ -211,7 +210,7 @@ int main(int argc, char* argv[]) {
wav_chunk(i) = waveform(sample_offset + i); wav_chunk(i) = waveform(sample_offset + i);
} }
kaldi::Vector<BaseFloat> features; kaldi::Vector<BaseFloat> features;
feature_cache.AcceptWaveform(wav_chunk); feature_cache.Accept(wav_chunk);
if (cur_chunk_size < chunk_sample_size) { if (cur_chunk_size < chunk_sample_size) {
feature_cache.SetFinished(); feature_cache.SetFinished();
} }

@ -29,9 +29,9 @@ FeatureCache::FeatureCache(
base_extractor_ = std::move(base_extractor); base_extractor_ = std::move(base_extractor);
} }
void FeatureCache::AcceptWaveform( void FeatureCache::Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input) { const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
base_extractor_->AcceptWaveform(input); base_extractor_->Accept(inputs);
// feed current data // feed current data
bool result = false; bool result = false;
do { do {
@ -40,7 +40,7 @@ void FeatureCache::AcceptWaveform(
} }
// pop feature chunk // pop feature chunk
bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) { bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* output_feats) {
kaldi::Timer timer; kaldi::Timer timer;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
while (cache_.empty() && base_extractor_->IsFinished() == false) { while (cache_.empty() && base_extractor_->IsFinished() == false) {
@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
usleep(1000); // sleep 1 ms usleep(1000); // sleep 1 ms
} }
if (cache_.empty()) return false; if (cache_.empty()) return false;
feat->Resize(cache_.front().Dim()); output_feats->Resize(cache_.front().Dim());
feat->CopyFromVec(cache_.front()); output_feats->CopyFromVec(cache_.front());
cache_.pop(); cache_.pop();
ready_feed_condition_.notify_one(); ready_feed_condition_.notify_one();
return true; return true;

@ -24,12 +24,15 @@ class FeatureCache : public FeatureExtractorInterface {
explicit FeatureCache( explicit FeatureCache(
int32 max_size = kint16max, int32 max_size = kint16max,
std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL); std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
virtual void AcceptWaveform( virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input); const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat); // output_feats dim = num_frames * feature_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
// the feature cache only caches features that come from the base extractor
virtual size_t Dim() const { return base_extractor_->Dim(); } virtual size_t Dim() const { return base_extractor_->Dim(); }
virtual void SetFinished() { virtual void SetFinished() {
base_extractor_->SetFinished(); base_extractor_->SetFinished();
// read the last chunk data
Compute(); Compute();
} }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

@ -21,9 +21,14 @@ namespace ppspeech {
class FeatureExtractorInterface { class FeatureExtractorInterface {
public: public:
virtual void AcceptWaveform( // accept input data
const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0; virtual void Accept(
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0; const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
// get the processed result
// the length of output = feature_row * feature_dim,
// the Matrix is squashed into Vector
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs) = 0;
// the Dim is the feature dim
virtual size_t Dim() const = 0; virtual size_t Dim() const = 0;
virtual void SetFinished() = 0; virtual void SetFinished() = 0;
virtual bool IsFinished() const = 0; virtual bool IsFinished() const = 0;

@ -66,11 +66,11 @@ LinearSpectrogram::LinearSpectrogram(
dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz
} }
void LinearSpectrogram::AcceptWaveform(const VectorBase<BaseFloat>& input) { void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
base_extractor_->AcceptWaveform(input); base_extractor_->Accept(inputs);
} }
bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) { bool LinearSpectrogram::Read(Vector<BaseFloat>* output_feats) {
Vector<BaseFloat> input_feats(chunk_sample_size_); Vector<BaseFloat> input_feats(chunk_sample_size_);
bool flag = base_extractor_->Read(&input_feats); bool flag = base_extractor_->Read(&input_feats);
if (flag == false || input_feats.Dim() == 0) return false; if (flag == false || input_feats.Dim() == 0) return false;
@ -83,9 +83,10 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
if (result.size() != 0) { if (result.size() != 0) {
feat_size = result.size() * result[0].size(); feat_size = result.size() * result[0].size();
} }
feat->Resize(feat_size); output_feats->Resize(feat_size);
// todo refactor (SmileGoat)
for (size_t idx = 0; idx < feat_size; ++idx) { for (size_t idx = 0; idx < feat_size; ++idx) {
(*feat)(idx) = result[idx / dim_][idx % dim_]; (*output_feats)(idx) = result[idx / dim_][idx % dim_];
} }
return true; return true;
} }
@ -117,7 +118,7 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
return true; return true;
} }
// Compute spectrogram feat, only for test, remove later // Compute spectrogram feat
// todo: refactor later (SmileGoat) // todo: refactor later (SmileGoat)
bool LinearSpectrogram::Compute(const vector<float>& wave, bool LinearSpectrogram::Compute(const vector<float>& wave,
vector<vector<float>>& feat) { vector<vector<float>>& feat) {

@ -38,9 +38,10 @@ class LinearSpectrogram : public FeatureExtractorInterface {
explicit LinearSpectrogram( explicit LinearSpectrogram(
const LinearSpectrogramOptions& opts, const LinearSpectrogramOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void AcceptWaveform( virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input); const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
@ -49,8 +50,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
void Hanning(std::vector<kaldi::BaseFloat>* data) const; void Hanning(std::vector<kaldi::BaseFloat>* data) const;
bool Compute(const std::vector<kaldi::BaseFloat>& wave, bool Compute(const std::vector<kaldi::BaseFloat>& wave,
std::vector<std::vector<kaldi::BaseFloat>>& feat); std::vector<std::vector<kaldi::BaseFloat>>& feat);
void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
kaldi::VectorBase<kaldi::BaseFloat>* feature);
bool NumpyFft(std::vector<kaldi::BaseFloat>* v, bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
std::vector<kaldi::BaseFloat>* real, std::vector<kaldi::BaseFloat>* real,
std::vector<kaldi::BaseFloat>* img) const; std::vector<kaldi::BaseFloat>* img) const;
@ -60,7 +59,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
std::vector<kaldi::BaseFloat> hanning_window_; std::vector<kaldi::BaseFloat> hanning_window_;
kaldi::BaseFloat hanning_window_energy_; kaldi::BaseFloat hanning_window_energy_;
LinearSpectrogramOptions opts_; LinearSpectrogramOptions opts_;
kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
std::unique_ptr<FeatureExtractorInterface> base_extractor_; std::unique_ptr<FeatureExtractorInterface> base_extractor_;
int chunk_sample_size_; int chunk_sample_size_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);

@ -31,23 +31,20 @@ DecibelNormalizer::DecibelNormalizer(
std::unique_ptr<FeatureExtractorInterface> base_extractor) { std::unique_ptr<FeatureExtractorInterface> base_extractor) {
base_extractor_ = std::move(base_extractor); base_extractor_ = std::move(base_extractor);
opts_ = opts; opts_ = opts;
dim_ = 0; dim_ = 1;
} }
void DecibelNormalizer::AcceptWaveform( void DecibelNormalizer::Accept(
const kaldi::VectorBase<BaseFloat>& input) { const kaldi::VectorBase<BaseFloat>& inputs_wave) {
// dim_ = input.Dim(); base_extractor_->Accept(inputs_wave);
// waveform_.Resize(input.Dim());
// waveform_.CopyFromVec(input);
base_extractor_->AcceptWaveform(input);
} }
bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) { bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* outputs_wave) {
// if (waveform_.Dim() == 0) return; if (base_extractor_->Read(outputs_wave) == false ||
if (base_extractor_->Read(feat) == false || feat->Dim() == 0) { outputs_wave->Dim() == 0) {
return false; return false;
} }
Compute(feat); Compute(outputs_wave);
return true; return true;
} }
@ -70,7 +67,7 @@ void CopyStdVector2Vector(const vector<BaseFloat>& input,
} }
} }
bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const { bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
// calculate db rms // calculate db rms
BaseFloat rms_db = 0.0; BaseFloat rms_db = 0.0;
BaseFloat mean_square = 0.0; BaseFloat mean_square = 0.0;
@ -78,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1)); BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
vector<BaseFloat> samples; vector<BaseFloat> samples;
samples.resize(feat->Dim()); samples.resize(feats->Dim());
for (size_t i = 0; i < samples.size(); ++i) { for (size_t i = 0; i < samples.size(); ++i) {
samples[i] = (*feat)(i); samples[i] = (*feats)(i);
} }
// square // square
@ -110,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
item *= std::pow(10.0, gain / 20.0); item *= std::pow(10.0, gain / 20.0);
} }
CopyStdVector2Vector(samples, feat); CopyStdVector2Vector(samples, feats);
return true; return true;
} }
@ -124,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file,
dim_ = stats_.NumCols() - 1; dim_ = stats_.NumCols() - 1;
} }
void CMVN::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) { void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& feats) {
base_extractor_->AcceptWaveform(input); base_extractor_->Accept(feats);
return; return;
} }
bool CMVN::Read(kaldi::Vector<BaseFloat>* feat) { bool CMVN::Read(kaldi::Vector<BaseFloat>* outputs) {
if (base_extractor_->Read(feat) == false) { if (base_extractor_->Read(outputs) == false) {
return false; return false;
} }
Compute(feat); Compute(outputs);
return true; return true;
} }

@ -45,15 +45,16 @@ class DecibelNormalizer : public FeatureExtractorInterface {
explicit DecibelNormalizer( explicit DecibelNormalizer(
const DecibelNormalizerOptions& opts, const DecibelNormalizerOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void AcceptWaveform( virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input); const kaldi::VectorBase<kaldi::BaseFloat>& inputs_wave);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs_wave);
// normalize audio, the dim is 1.
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
private: private:
bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const; bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
DecibelNormalizerOptions opts_; DecibelNormalizerOptions opts_;
size_t dim_; size_t dim_;
std::unique_ptr<FeatureExtractorInterface> base_extractor_; std::unique_ptr<FeatureExtractorInterface> base_extractor_;
@ -65,15 +66,19 @@ class CMVN : public FeatureExtractorInterface {
public: public:
explicit CMVN(std::string cmvn_file, explicit CMVN(std::string cmvn_file,
std::unique_ptr<FeatureExtractorInterface> base_extractor); std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void AcceptWaveform( virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input); const kaldi::VectorBase<kaldi::BaseFloat>& feats);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
// the length of outputs = feature_row * feature_dim,
// the Matrix is squashed into Vector
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs);
// the dim_ is the feature dim.
virtual size_t Dim() const { return dim_; } virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); } virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
private: private:
void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const; void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
void ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats); void ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats);
kaldi::Matrix<double> stats_; kaldi::Matrix<double> stats_;
std::unique_ptr<FeatureExtractorInterface> base_extractor_; std::unique_ptr<FeatureExtractorInterface> base_extractor_;

@ -21,33 +21,25 @@ using kaldi::BaseFloat;
using kaldi::VectorBase; using kaldi::VectorBase;
using kaldi::Vector; using kaldi::Vector;
RawAudioSource::RawAudioSource(int buffer_size) RawAudioCache::RawAudioCache(int buffer_size)
: finished_(false), data_length_(0), start_(0), timeout_(1) { : finished_(false), data_length_(0), start_(0), timeout_(1) {
ring_buffer_.resize(buffer_size); ring_buffer_.resize(buffer_size);
} }
void RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) { void RawAudioCache::Accept(const VectorBase<BaseFloat>& input_audio) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
while (data_length_ + data.Dim() > ring_buffer_.size()) { while (data_length_ + input_audio.Dim() > ring_buffer_.size()) {
ready_feed_condition_.wait(lock); ready_feed_condition_.wait(lock);
} }
for (size_t idx = 0; idx < data.Dim(); ++idx) { for (size_t idx = 0; idx < input_audio.Dim(); ++idx) {
ring_buffer_[idx % ring_buffer_.size()] = data(idx); int32 buffer_idx = (idx + start_) % ring_buffer_.size();
ring_buffer_[buffer_idx] = input_audio(idx);
} }
data_length_ += data.Dim(); data_length_ += input_audio.Dim();
} }
// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) { bool RawAudioCache::Read(Vector<BaseFloat>* output_audio) {
// std::unique_lock<std::mutex> lock(mutex_); size_t chunk_size = output_audio->Dim();
// for (size_t idx = 0; idx < length; ++idx) {
// ring_buffer_[idx % ring_buffer_.size()] = data[idx];
//}
// data_length_ += length;
// finish_condition_.notify_one();
//}
bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
size_t chunk_size = feat->Dim();
kaldi::Timer timer; kaldi::Timer timer;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
while (chunk_size > data_length_) { while (chunk_size > data_length_) {
@ -69,11 +61,12 @@ bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
// read last chunk data // read last chunk data
if (chunk_size > data_length_) { if (chunk_size > data_length_) {
chunk_size = data_length_; chunk_size = data_length_;
feat->Resize(chunk_size); output_audio->Resize(chunk_size);
} }
for (size_t idx = 0; idx < chunk_size; ++idx) { for (size_t idx = 0; idx < chunk_size; ++idx) {
feat->Data()[idx] = ring_buffer_[idx]; int buff_idx = (start_ + idx) % ring_buffer_.size();
output_audio->Data()[idx] = ring_buffer_[buff_idx];
} }
data_length_ -= chunk_size; data_length_ -= chunk_size;
start_ = (start_ + chunk_size) % ring_buffer_.size(); start_ = (start_ + chunk_size) % ring_buffer_.size();

@ -20,12 +20,13 @@
namespace ppspeech { namespace ppspeech {
class RawAudioSource : public FeatureExtractorInterface { class RawAudioCache : public FeatureExtractorInterface {
public: public:
explicit RawAudioSource(int buffer_size = kint16max); explicit RawAudioCache(int buffer_size = kint16max);
virtual void AcceptWaveform(const kaldi::VectorBase<BaseFloat>& data); virtual void Accept(const kaldi::VectorBase<BaseFloat>& input_audio);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_audio);
virtual size_t Dim() const { return data_length_; } // the audio dim is 1
virtual size_t Dim() const { return 1; }
virtual void SetFinished() { virtual void SetFinished() {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
finished_ = true; finished_ = true;
@ -41,14 +42,14 @@ class RawAudioSource : public FeatureExtractorInterface {
std::condition_variable ready_feed_condition_; std::condition_variable ready_feed_condition_;
kaldi::int32 timeout_; kaldi::int32 timeout_;
DISALLOW_COPY_AND_ASSIGN(RawAudioSource); DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
}; };
// it is a datasource for testing different frontend module. // it is a datasource for testing different frontend module.
class RawDataSource : public FeatureExtractorInterface { class RawDataCache: public FeatureExtractorInterface {
public: public:
explicit RawDataSource() { finished_ = false; } explicit RawDataCache() { finished_ = false; }
virtual void AcceptWaveform( virtual void Accept(
const kaldi::VectorBase<kaldi::BaseFloat>& input) { const kaldi::VectorBase<kaldi::BaseFloat>& input) {
data_ = input; data_ = input;
} }
@ -60,6 +61,7 @@ class RawDataSource : public FeatureExtractorInterface {
data_.Resize(0); data_.Resize(0);
return true; return true;
} }
// the dim is the length of data_
virtual size_t Dim() const { return data_.Dim(); } virtual size_t Dim() const { return data_.Dim(); }
virtual void SetFinished() { finished_ = true; } virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; } virtual bool IsFinished() const { return finished_; }
@ -68,7 +70,7 @@ class RawDataSource : public FeatureExtractorInterface {
kaldi::Vector<kaldi::BaseFloat> data_; kaldi::Vector<kaldi::BaseFloat> data_;
bool finished_; bool finished_;
DISALLOW_COPY_AND_ASSIGN(RawDataSource); DISALLOW_COPY_AND_ASSIGN(RawDataCache);
}; };
} // namespace ppspeech } // namespace ppspeech

Loading…
Cancel
Save