diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h
index 852385e9..64e9db86 100644
--- a/speechx/speechx/frontend/audio/data_cache.h
+++ b/speechx/speechx/frontend/audio/data_cache.h
@@ -21,8 +21,10 @@
 
 namespace ppspeech {
 
-// A data source for testing different frontend module.
-// It accepts waves or feats.
+
+// Simulates audio/feature input by returning data from a Vector.
+// This class is mostly meant for testing the online decoder with
+// pre-recorded audio/features.
 class DataCache : public FrontendInterface {
   public:
     explicit DataCache() { finished_ = false; }
diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc
new file mode 100644
index 00000000..8273beec
--- /dev/null
+++ b/speechx/speechx/frontend/audio/fbank.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "frontend/audio/fbank.h"
+#include "kaldi/base/kaldi-math.h"
+#include "kaldi/feat/feature-common.h"
+#include "kaldi/feat/feature-functions.h"
+#include "kaldi/matrix/matrix-functions.h"
+
+namespace ppspeech {
+
+using kaldi::int32;
+using kaldi::BaseFloat;
+using kaldi::Vector;
+using kaldi::SubVector;
+using kaldi::VectorBase;
+using kaldi::Matrix;
+using std::vector;
+
+// Wraps kaldi's FbankComputer around `base_extractor`, the upstream frontend
+// that Read() pulls its samples from.
+Fbank::Fbank(const FbankOptions& opts,
+             std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts),
+      base_extractor_(std::move(base_extractor)),
+      // Built straight from the options rather than from
+      // computer_.GetFrameOptions(): window_function_ is declared before
+      // computer_, so computer_ is not constructed yet at this point.
+      window_function_(opts.fbank_opts.frame_opts),
+      computer_(opts.fbank_opts) {}
+
+// Forwards incoming samples to the upstream extractor.
+void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
+}
+
+// Asks the upstream extractor to fill a buffer of one streaming chunk,
+// prepends the samples left over from the previous call and writes the
+// features to `feats` as num_frames * Dim() values, one frame after another.
+// Returns false when the upstream extractor has nothing to deliver.
+bool Fbank::Read(Vector<BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    // streaming_chunk is given in seconds; convert it to a sample count.
+    const int32 chunk_sample_size =
+        static_cast<int32>(opts_.streaming_chunk * frame_opts.samp_freq);
+
+    Vector<BaseFloat> wav(chunk_sample_size);
+    bool flag = base_extractor_->Read(&wav);
+    if (flag == false || wav.Dim() == 0) return false;
+
+    // append the new samples to the remaining waves of the previous call
+    int32 wav_len = wav.Dim();
+    int32 left_len = remained_wav_.Dim();
+    Vector<BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // compute speech feature
+    Compute(waves, feats);
+
+    // cache the remaining waves that were not consumed by a frame shift
+    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
+    int32 frame_shift = frame_opts.WindowShift();
+    int32 left_samples = waves.Dim() - frame_shift * num_frames;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(
+        waves.Range(frame_shift * num_frames, left_samples));
+    return true;
+}
+
+// Computes fbank features for every complete frame in `waves`.
+// `feats` is resized to num_frames * Dim(); it is emptied when `waves` is
+// shorter than one frame. Always returns true.
+bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    int32 num_samples = waves.Dim();
+    int32 frame_length = frame_opts.WindowSize();
+    if (num_samples < frame_length) {
+        // not enough samples for a frame: do not hand back stale features
+        feats->Resize(0);
+        return true;
+    }
+
+    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * Dim());
+
+    Vector<BaseFloat> window;
+    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (int32 frame = 0; frame < num_frames; frame++) {
+        BaseFloat raw_log_energy = 0.0;
+        kaldi::ExtractWindow(0,
+                             waves,
+                             frame,
+                             frame_opts,
+                             window_function_,
+                             &window,
+                             need_raw_log_energy ? &raw_log_energy : nullptr);
+
+        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        // note: this online feature-extraction code does not support VTLN.
+        BaseFloat vtln_warp = 1.0;
+        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
index 96a97dfd..3b71ff84 100644
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -12,29 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// wrap the fbank feat of kaldi, todo (SmileGoat)
+#pragma once
 
+#include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
-#incldue "kaldi/matrix/kaldi-vector.h"
+#include "kaldi/matrix/kaldi-vector.h"
 
 namespace ppspeech {
 
 struct FbankOptions {
-    kaldi::FrameExtractionOptions frame_opts;
+    kaldi::FbankOptions fbank_opts;
     kaldi::BaseFloat streaming_chunk;  // second
 
-    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
+    FbankOptions() : fbank_opts(), streaming_chunk(0.1) {}
 
     void Register(kaldi::OptionsItf* opts) {
         opts->Register("streaming-chunk",
                        &streaming_chunk,
                        "streaming chunk size, default: 0.1 sec");
-        frame_opts.Register(opts);
+        fbank_opts.Register(opts);
     }
 };
 
 
-class Fbank : FrontendInterface {
+class Fbank : public FrontendInterface {
   public:
     explicit Fbank(const FbankOptions& opts,
                    unique_ptr<FrontendInterface> base_extractor);
@@ -42,7 +43,7 @@ class Fbank : FrontendInterface {
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
 
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return dim_; }
+    // the dim of a single frame of features
+    virtual size_t Dim() const { return computer_.Dim(); }
 
     virtual void SetFinished() { base_extractor_->SetFinished(); }
 
@@ -57,13 +58,18 @@ class Fbank : FrontendInterface {
     bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                  kaldi::Vector<kaldi::BaseFloat>* feats);
 
-    // kaldi::FeatureWindowFunction feature_window_funtion_;
-    // kaldi::BaseFloat hanning_window_energy_;
-    size_t dim_;
     FbankOptions opts_;
     std::unique_ptr<FrontendInterface> base_extractor_;
+
+    // NOTE: members are constructed in declaration order, so
+    // window_function_ must not be initialised from computer_.
+    kaldi::FeatureWindowFunction window_function_;
+    kaldi::FbankComputer computer_;
+    // features_ is the Mfcc or Plp or Fbank features that we have already
+    // computed.
+    kaldi::Vector<kaldi::BaseFloat> features_;
     kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    int chunk_sample_size_;
+
     DISALLOW_COPY_AND_ASSIGN(Fbank);
 };
 
diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/frontend/audio/mfcc.cc
new file mode 100644
index 00000000..115d3367
--- /dev/null
+++ b/speechx/speechx/frontend/audio/mfcc.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "frontend/audio/mfcc.h"
+#include "kaldi/base/kaldi-math.h"
+#include "kaldi/feat/feature-common.h"
+#include "kaldi/feat/feature-functions.h"
+#include "kaldi/matrix/matrix-functions.h"
+
+namespace ppspeech {
+
+using kaldi::int32;
+using kaldi::BaseFloat;
+using kaldi::Vector;
+using kaldi::SubVector;
+using kaldi::VectorBase;
+using kaldi::Matrix;
+using std::vector;
+
+// Wraps kaldi's MfccComputer around `base_extractor`, the upstream frontend
+// that Read() pulls its samples from.
+Mfcc::Mfcc(const MfccOptions& opts,
+           std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts),
+      base_extractor_(std::move(base_extractor)),
+      // Built straight from the options rather than from
+      // computer_.GetFrameOptions(): window_function_ is declared before
+      // computer_, so computer_ is not constructed yet at this point.
+      window_function_(opts.mfcc_opts.frame_opts),
+      computer_(opts.mfcc_opts) {}
+
+// Forwards incoming samples to the upstream extractor.
+void Mfcc::Accept(const VectorBase<BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
+}
+
+// Asks the upstream extractor to fill a buffer of one streaming chunk,
+// prepends the samples left over from the previous call and writes the
+// features to `feats` as num_frames * Dim() values, one frame after another.
+// Returns false when the upstream extractor has nothing to deliver.
+bool Mfcc::Read(Vector<BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    // streaming_chunk is given in seconds; convert it to a sample count.
+    const int32 chunk_sample_size =
+        static_cast<int32>(opts_.streaming_chunk * frame_opts.samp_freq);
+
+    Vector<BaseFloat> wav(chunk_sample_size);
+    bool flag = base_extractor_->Read(&wav);
+    if (flag == false || wav.Dim() == 0) return false;
+
+    // append the new samples to the remaining waves of the previous call
+    int32 wav_len = wav.Dim();
+    int32 left_len = remained_wav_.Dim();
+    Vector<BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // compute speech feature
+    Compute(waves, feats);
+
+    // cache the remaining waves that were not consumed by a frame shift
+    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
+    int32 frame_shift = frame_opts.WindowShift();
+    int32 left_samples = waves.Dim() - frame_shift * num_frames;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(
+        waves.Range(frame_shift * num_frames, left_samples));
+    return true;
+}
+
+// Computes MFCC features for every complete frame in `waves`.
+// `feats` is resized to num_frames * Dim(); it is emptied when `waves` is
+// shorter than one frame. Always returns true.
+bool Mfcc::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    int32 num_samples = waves.Dim();
+    int32 frame_length = frame_opts.WindowSize();
+    if (num_samples < frame_length) {
+        // not enough samples for a frame: do not hand back stale features
+        feats->Resize(0);
+        return true;
+    }
+
+    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * Dim());
+
+    Vector<BaseFloat> window;
+    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (int32 frame = 0; frame < num_frames; frame++) {
+        BaseFloat raw_log_energy = 0.0;
+        kaldi::ExtractWindow(0,
+                             waves,
+                             frame,
+                             frame_opts,
+                             window_function_,
+                             &window,
+                             need_raw_log_energy ? &raw_log_energy : nullptr);
+
+        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        // note: this online feature-extraction code does not support VTLN.
+        BaseFloat vtln_warp = 1.0;
+        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h
index aa369655..62b0078c 100644
--- a/speechx/speechx/frontend/audio/mfcc.h
+++ b/speechx/speechx/frontend/audio/mfcc.h
@@ -12,5 +12,65 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// wrap the mfcc feat of kaldi, todo (SmileGoat)
-#include "kaldi/feat/feature-mfcc.h"
\ No newline at end of file
+#pragma once
+
+#include "kaldi/feat/feature-mfcc.h"
+#include "kaldi/matrix/kaldi-vector.h"
+
+namespace ppspeech {
+
+struct MfccOptions {
+    kaldi::MfccOptions mfcc_opts;
+    kaldi::BaseFloat streaming_chunk;  // second
+
+    MfccOptions() : mfcc_opts(), streaming_chunk(0.1) {}
+
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("streaming-chunk",
+                       &streaming_chunk,
+                       "streaming chunk size, default: 0.1 sec");
+        mfcc_opts.Register(opts);
+    }
+};
+
+
+class Mfcc : public FrontendInterface {
+  public:
+    explicit Mfcc(const MfccOptions& opts,
+                  std::unique_ptr<FrontendInterface> base_extractor);
+
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // the dim of a single frame of features
+    virtual size_t Dim() const { return computer_.Dim(); }
+
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }
+
+  private:
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    MfccOptions opts_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+
+    // NOTE: members are constructed in declaration order, so
+    // window_function_ must not be initialised from computer_.
+    kaldi::FeatureWindowFunction window_function_;
+    kaldi::MfccComputer computer_;
+    // features_ is the Mfcc or Plp or Fbank features that we have already
+    // computed.
+    kaldi::Vector<kaldi::BaseFloat> features_;
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+
+    DISALLOW_COPY_AND_ASSIGN(Mfcc);
+};
+
+}  // namespace ppspeech