From 156ccfe4e3f0f768b96ef296a373d81afeeb2f97 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Wed, 1 Jun 2022 22:07:04 +0800 Subject: [PATCH 1/5] refactor frontend --- speechx/speechx/decoder/param.h | 5 +- speechx/speechx/frontend/audio/audio_cache.h | 4 +- .../frontend/audio/compute_fbank_main.cc | 11 +- .../audio/compute_linear_spectrogram_main.cc | 1 - speechx/speechx/frontend/audio/fbank.cc | 103 ++++-------------- speechx/speechx/frontend/audio/fbank.h | 57 +++------- .../speechx/frontend/audio/feature_common.h | 54 +++++++++ .../frontend/audio/feature_common_inl.h | 95 ++++++++++++++++ .../speechx/frontend/audio/feature_pipeline.h | 2 +- .../frontend/audio/linear_spectrogram.cc | 91 ++++------------ .../frontend/audio/linear_spectrogram.h | 57 ++++------ 11 files changed, 238 insertions(+), 242 deletions(-) create mode 100644 speechx/speechx/frontend/audio/feature_common.h create mode 100644 speechx/speechx/frontend/audio/feature_common_inl.h diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 495e5236..c8396a58 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -62,7 +62,6 @@ namespace ppspeech { FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; - opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; @@ -71,8 +70,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.frame_opts = frame_opts; } else { opts.to_float32 = true; frame_opts.remove_dc_offset = false; diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index 4ebcd947..45498e46 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -30,8 +30,8 @@ class AudioCache : public FrontendInterface { virtual bool Read(kaldi::Vector* waves); - // the audio dim is 1, one sample - virtual size_t Dim() const { return 1; } + // the audio dim is 1, one sample, we return size_ instead. + virtual size_t Dim() const { return size_; } virtual void SetFinished() { std::lock_guard lock(mutex_); diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc index 18024719..f7a42315 100644 --- a/speechx/speechx/frontend/audio/compute_fbank_main.cc +++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc @@ -49,12 +49,11 @@ int main(int argc, char* argv[]) { std::unique_ptr data_source( new ppspeech::AudioCache(3600 * 1600, false)); - ppspeech::FbankOptions opt; - opt.fbank_opts.frame_opts.frame_length_ms = 25; - opt.fbank_opts.frame_opts.frame_shift_ms = 10; - opt.streaming_chunk = FLAGS_streaming_chunk; - opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opt.fbank_opts.frame_opts.dither = 0.0; + kaldi::FbankOptions opt; + opt.frame_opts.frame_length_ms = 25; + opt.frame_opts.frame_shift_ms = 10; + opt.mel_opts.num_bins = FLAGS_num_bins; + opt.frame_opts.dither = 0.0; std::unique_ptr fbank( new ppspeech::Fbank(opt, std::move(data_source))); diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc index cc7a5e17..162c3529 100644 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -49,7 +49,6 @@ int main(int argc, char* argv[]) { ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; - opt.streaming_chunk = FLAGS_streaming_chunk; opt.frame_opts.dither = 0.0; opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index fea9032a..1f22263a 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "frontend/audio/fbank.h" #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" @@ -29,95 +28,33 @@ using kaldi::VectorBase; using kaldi::Matrix; using std::vector; -// todo refactor later:(SmileGoat) - -Fbank::Fbank(const FbankOptions& opts, - std::unique_ptr base_extractor) +FbankComputer::FbankComputer(const Options& opts) : opts_(opts), - computer_(opts.fbank_opts), - window_function_(opts.fbank_opts.frame_opts) { - base_extractor_ = std::move(base_extractor); - chunk_sample_size_ = static_cast( - opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq); -} + computer_(opts) {} -void Fbank::Accept(const VectorBase& inputs) { - base_extractor_->Accept(inputs); +int32 FbankComputer::Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); } -bool Fbank::Read(Vector* feats) { - Vector wav(chunk_sample_size_); - bool flag = base_extractor_->Read(&wav); - if (flag == false || wav.Dim() == 0) return false; - - // append remaned waves - int32 wav_len = wav.Dim(); - int32 left_len = remained_wav_.Dim(); - Vector waves(left_len + wav_len); - waves.Range(0, left_len).CopyFromVec(remained_wav_); - waves.Range(left_len, wav_len).CopyFromVec(wav); - - // compute speech feature - Compute(waves, feats); - - // cache remaned waves - kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); - int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts); - int32 frame_shift = frame_opts.WindowShift(); - int32 left_samples = waves.Dim() - frame_shift * num_frames; - remained_wav_.Resize(left_samples); - remained_wav_.CopyFromVec( - waves.Range(frame_shift * num_frames, left_samples)); - return true; +bool FbankComputer::NeedRawLogEnergy() { + return opts_.use_energy && opts_.raw_energy; } -// Compute spectrogram feat -bool Fbank::Compute(const Vector& waves, Vector* feats) { - const kaldi::FrameExtractionOptions& frame_opts = - computer_.GetFrameOptions(); - int32 num_samples = waves.Dim(); - int32 frame_length = frame_opts.WindowSize(); - int32 sample_rate = frame_opts.samp_freq; - if (num_samples < frame_length) { - return true; - } - - int32 num_frames = kaldi::NumFrames(num_samples, frame_opts); - feats->Resize(num_frames * Dim()); - - Vector window; - bool need_raw_log_energy = computer_.NeedRawLogEnergy(); - for (int32 frame = 0; frame < num_frames; frame++) { - BaseFloat raw_log_energy = 0.0; - kaldi::ExtractWindow(0, - waves, - frame, - frame_opts, - window_function_, - &window, - need_raw_log_energy ? &raw_log_energy : NULL); - - - Vector this_feature(computer_.Dim(), kaldi::kUndefined); - // note: this online feature-extraction code does not support VTLN. - RealFft(&window, true); - kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); - if (!opts_.fbank_opts.use_power) { - power_spectrum.ApplyPow(0.5); - } - int32 mel_offset = - ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 - : 0); - SubVector mel_energies( - this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); - mel_bank.Compute(power_spectrum, &mel_energies); - mel_energies.ApplyFloor(1e-07); - mel_energies.ApplyLog(); - SubVector output_row(feats->Data() + frame * Dim(), Dim()); - output_row.CopyFromVec(this_feature); +// Compute feat +bool FbankComputer::Compute(Vector* window, Vector* feat) { + RealFft(window, true); + kaldi::ComputePowerSpectrum(window); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(*window, 0, window->Dim() / 2 + 1); + if (!opts_.use_power) { + power_spectrum.ApplyPow(0.5); } + int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + SubVector mel_energies( + *feat, mel_offset, opts_.mel_opts.num_bins); + mel_bank.Compute(power_spectrum, &mel_energies); + mel_energies.ApplyFloor(1e-07); + mel_energies.ApplyLog(); return true; } diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h index 66957dc6..e513b969 100644 --- a/speechx/speechx/frontend/audio/fbank.h +++ b/speechx/speechx/frontend/audio/fbank.h @@ -16,62 +16,35 @@ #include "base/common.h" #include "frontend/audio/frontend_itf.h" +#include "frontend/audio/feature_common.h" #include "kaldi/feat/feature-fbank.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" namespace ppspeech { -struct FbankOptions { - kaldi::FbankOptions fbank_opts; - kaldi::BaseFloat streaming_chunk; // second - - FbankOptions() : streaming_chunk(0.1), fbank_opts() {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("streaming-chunk", - &streaming_chunk, - "streaming chunk size, default: 0.1 sec"); - fbank_opts.Register(opts); - } -}; - - -class Fbank : public FrontendInterface { +class FbankComputer { public: - explicit Fbank(const FbankOptions& opts, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); - virtual bool Read(kaldi::Vector* feats); + typedef kaldi::FbankOptions Options; + explicit FbankComputer(const Options& opts); - // the dim_ is the dim of single frame feature - virtual size_t Dim() const { return computer_.Dim(); } - - virtual void SetFinished() { base_extractor_->SetFinished(); } + kaldi::FrameExtractionOptions& GetFrameOptions() { + return opts_.frame_opts; + } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool Compute(kaldi::Vector* window, + kaldi::Vector* feat); + int32 Dim() const; - virtual void Reset() { - base_extractor_->Reset(); - remained_wav_.Resize(0); - } + bool NeedRawLogEnergy(); private: - bool Compute(const kaldi::Vector& waves, - kaldi::Vector* feats); + Options opts_; - FbankOptions opts_; - std::unique_ptr base_extractor_; - - kaldi::FeatureWindowFunction window_function_; kaldi::FbankComputer computer_; - // features_ is the Mfcc or Plp or Fbank features that we have already - // computed. - kaldi::Vector features_; - kaldi::Vector remained_wav_; - kaldi::int32 chunk_sample_size_; - - DISALLOW_COPY_AND_ASSIGN(Fbank); + //DISALLOW_COPY_AND_ASSIGN(FbankComputer); }; +typedef StreamingFeatureTpl Fbank; + } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h new file mode 100644 index 00000000..0763f012 --- /dev/null +++ b/speechx/speechx/frontend/audio/feature_common.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "frontend_itf.h" +#include "kaldi/feat/feature-window.h" + +namespace ppspeech { + +template +class StreamingFeatureTpl : public FrontendInterface { + public: + typedef typename F::Options Options; + StreamingFeatureTpl(const Options& opts, + std::unique_ptr base_extractor); + virtual void Accept(const kaldi::VectorBase& inputs); + virtual bool Read(kaldi::Vector* feats); + + // the dim_ is the dim of single frame feature + virtual size_t Dim() const { return computer_.Dim(); } + + virtual void SetFinished() { base_extractor_->SetFinished(); } + + virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + + virtual void Reset() { + base_extractor_->Reset(); + remained_wav_.Resize(0); + } + private: + bool Compute(const kaldi::Vector& waves, + kaldi::Vector* feats); + Options opts_; + std::unique_ptr base_extractor_; + kaldi::FeatureWindowFunction window_function_; + kaldi::Vector remained_wav_; + F computer_; +}; + +} // namespace ppspeech + +#include "frontend/audio/feature_common_inl.h" \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h new file mode 100644 index 00000000..0199ab41 --- /dev/null +++ b/speechx/speechx/frontend/audio/feature_common_inl.h @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +namespace ppspeech { + +template +StreamingFeatureTpl::StreamingFeatureTpl(const Options& opts, + std::unique_ptr base_extractor): + opts_(opts), + computer_(opts), + window_function_(opts.frame_opts) { + base_extractor_ = std::move(base_extractor); +} + +template +void StreamingFeatureTpl::Accept(const kaldi::VectorBase& inputs) { + base_extractor_->Accept(inputs); +} + +template +bool StreamingFeatureTpl::Read(kaldi::Vector* feats) { + kaldi::Vector wav(base_extractor_->Dim()); + bool flag = base_extractor_->Read(&wav); + if (flag == false || wav.Dim() == 0) return false; + + // append remaned waves + int32 wav_len = wav.Dim(); + int32 left_len = remained_wav_.Dim(); + kaldi::Vector waves(left_len + wav_len); + waves.Range(0, left_len).CopyFromVec(remained_wav_); + waves.Range(left_len, wav_len).CopyFromVec(wav); + + // compute speech feature + Compute(waves, feats); + + // cache remaned waves + kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); + int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts); + int32 frame_shift = frame_opts.WindowShift(); + int32 left_samples = waves.Dim() - frame_shift * num_frames; + remained_wav_.Resize(left_samples); + remained_wav_.CopyFromVec( + waves.Range(frame_shift * num_frames, left_samples)); + return true; +} + +// Compute feat +template +bool StreamingFeatureTpl::Compute(const kaldi::Vector& waves, + kaldi::Vector* feats) { + const kaldi::FrameExtractionOptions& frame_opts = + computer_.GetFrameOptions(); + int32 num_samples = waves.Dim(); + int32 frame_length = frame_opts.WindowSize(); + int32 sample_rate = frame_opts.samp_freq; + if (num_samples < frame_length) { + return true; + } + + int32 num_frames = kaldi::NumFrames(num_samples, frame_opts); + feats->Resize(num_frames * Dim()); + + kaldi::Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 frame = 0; frame < num_frames; frame++) { + kaldi::BaseFloat raw_log_energy = 0.0; + kaldi::ExtractWindow(0, + waves, + frame, + frame_opts, + window_function_, + &window, + need_raw_log_energy ? &raw_log_energy : NULL); + + kaldi::Vector this_feature(computer_.Dim(), kaldi::kUndefined); + computer_.Compute(&window, &this_feature); + kaldi::SubVector output_row(feats->Data() + frame * Dim(), Dim()); + output_row.CopyFromVec(this_feature); + } + return true; +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index b848f548..49b2f267 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -32,7 +32,7 @@ struct FeaturePipelineOptions { bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; - FbankOptions fbank_opts; + kaldi::FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; AssemblerOptions assembler_opts; diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc index 9ef5e766..76580fd5 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -28,81 +28,32 @@ using kaldi::VectorBase; using kaldi::Matrix; using std::vector; -LinearSpectrogram::LinearSpectrogram( - const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor) - : opts_(opts), feature_window_funtion_(opts.frame_opts) { - base_extractor_ = std::move(base_extractor); +LinearSpectrogramComputer::LinearSpectrogramComputer( + const Options& opts) + : opts_(opts) { + kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts); int32 window_size = opts.frame_opts.WindowSize(); - int32 window_shift = opts.frame_opts.WindowShift(); + frame_length_ = window_size; dim_ = window_size / 2 + 1; - chunk_sample_size_ = - static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq); - hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window, - feature_window_funtion_.window); -} - -void LinearSpectrogram::Accept(const VectorBase& inputs) { - base_extractor_->Accept(inputs); -} - -bool LinearSpectrogram::Read(Vector* feats) { - Vector input_feats(chunk_sample_size_); - bool flag = base_extractor_->Read(&input_feats); - if (flag == false || input_feats.Dim() == 0) return false; - - int32 feat_len = input_feats.Dim(); - int32 left_len = remained_wav_.Dim(); - Vector waves(feat_len + left_len); - waves.Range(0, left_len).CopyFromVec(remained_wav_); - waves.Range(left_len, feat_len).CopyFromVec(input_feats); - Compute(waves, feats); - int32 frame_shift = opts_.frame_opts.WindowShift(); - int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts); - int32 left_samples = waves.Dim() - frame_shift * num_frames; - remained_wav_.Resize(left_samples); - remained_wav_.CopyFromVec( - waves.Range(frame_shift * num_frames, left_samples)); - return true; + BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window, + feature_window_function.window); + int32 sample_rate = opts.frame_opts.samp_freq; + scale_ = 2.0 / (hanning_window_energy * sample_rate); } // Compute spectrogram feat -bool LinearSpectrogram::Compute(const Vector& waves, - Vector* feats) { - int32 num_samples = waves.Dim(); - int32 frame_length = opts_.frame_opts.WindowSize(); - int32 sample_rate = opts_.frame_opts.samp_freq; - BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate); - - if (num_samples < frame_length) { - return true; - } - - int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts); - feats->Resize(num_frames * dim_); - Vector window; - - for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) { - kaldi::ExtractWindow(0, - waves, - frame_idx, - opts_.frame_opts, - feature_window_funtion_, - &window, - NULL); - - SubVector output_row(feats->Data() + frame_idx * dim_, dim_); - window.Resize(frame_length, kaldi::kCopyData); - RealFft(&window, true); - kaldi::ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, dim_); - power_spectrum.Scale(scale); - power_spectrum(0) = power_spectrum(0) / 2; - power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2; - power_spectrum.Add(1e-14); - power_spectrum.ApplyLog(); - output_row.CopyFromVec(power_spectrum); - } +bool LinearSpectrogramComputer::Compute(Vector* window, + Vector* feat) { + window->Resize(frame_length_, kaldi::kCopyData); + RealFft(window, true); + kaldi::ComputePowerSpectrum(window); + SubVector power_spectrum(*window, 0, dim_); + power_spectrum.Scale(scale_); + power_spectrum(0) = power_spectrum(0) / 2; + power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2; + power_spectrum.Add(1e-14); + power_spectrum.ApplyLog(); + feat->CopyFromVec(power_spectrum); return true; } diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h index 2764b7cf..7ef30dbc 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.h +++ b/speechx/speechx/frontend/audio/linear_spectrogram.h @@ -18,52 +18,41 @@ #include "base/common.h" #include "frontend/audio/frontend_itf.h" #include "kaldi/feat/feature-window.h" +#include "frontend/audio/feature_common.h" namespace ppspeech { struct LinearSpectrogramOptions { kaldi::FrameExtractionOptions frame_opts; - kaldi::BaseFloat streaming_chunk; // second - - LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("streaming-chunk", - &streaming_chunk, - "streaming chunk size, default: 0.1 sec"); - frame_opts.Register(opts); - } + LinearSpectrogramOptions(): frame_opts() {} }; -class LinearSpectrogram : public FrontendInterface { +class LinearSpectrogramComputer { public: - explicit LinearSpectrogram( - const LinearSpectrogramOptions& opts, - std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); - virtual bool Read(kaldi::Vector* feats); - // the dim_ is the dim of single frame feature - virtual size_t Dim() const { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { - base_extractor_->Reset(); - remained_wav_.Resize(0); + typedef LinearSpectrogramOptions Options; + explicit LinearSpectrogramComputer(const Options& opts); + + kaldi::FrameExtractionOptions& GetFrameOptions() { + return opts_.frame_opts; } - private: - bool Compute(const kaldi::Vector& waves, - kaldi::Vector* feats); + bool Compute(kaldi::Vector* window, + kaldi::Vector* feat); - size_t dim_; - kaldi::FeatureWindowFunction feature_window_funtion_; - kaldi::BaseFloat hanning_window_energy_; - LinearSpectrogramOptions opts_; - std::unique_ptr base_extractor_; - kaldi::Vector remained_wav_; - int chunk_sample_size_; - DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); + int32 Dim() const { return dim_; } + + bool NeedRawLogEnergy() { return false; } + + private: + kaldi::BaseFloat scale_; + Options opts_; + int32 frame_length_; + int32 dim_; }; +typedef StreamingFeatureTpl LinearSpectrogram; + + //DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); + } // namespace ppspeech \ No newline at end of file From ce322a60f8105895857ee8997f8feec35be70e7f Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Wed, 1 Jun 2022 22:45:23 +0800 Subject: [PATCH 2/5] remove streaming opt, test=doc --- speechx/examples/custom_asr/run.sh | 1 - speechx/examples/ds2_ol/aishell/run.sh | 2 -- speechx/examples/ds2_ol/aishell/run_fbank.sh | 1 - speechx/examples/ds2_ol/websocket/websocket_client.sh | 2 +- speechx/examples/ds2_ol/websocket/websocket_server.sh | 1 - speechx/speechx/decoder/param.h | 1 - speechx/speechx/decoder/recognizer_main.cc | 4 ++-- 7 files changed, 3 insertions(+), 9 deletions(-) diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh index dddcf9fd..ed67a52b 100644 --- a/speechx/examples/custom_asr/run.sh +++ b/speechx/examples/custom_asr/run.sh @@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then recognizer_test_main \ --wav_rspecifier=scp:$wav_scp \ --cmvn_file=$cmvn \ - --streaming_chunk=30 \ --use_fbank=true \ --model_path=$model_dir/avg_10.jit.pdmodel \ --param_path=$model_dir/avg_10.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index e1001e25..97ce1e65 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --cmvn_file=$cmvn \ - --streaming_chunk=0.36 echo "feature make have finished!!!" fi @@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 6e131677..6712eb1c 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_5.jit.pdmodel \ - --streaming_chunk=30 \ --use_fbank=true \ --param_path=$model_dir/avg_5.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh index 2a52d2a3..7cd0fdab 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_client.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh @@ -32,4 +32,4 @@ export GLOG_logtostderr=1 # websocket client websocket_client_main \ - --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36 \ No newline at end of file + --wav_rspecifier=scp:$data/$aishell_wav_scp diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index f798dfd4..45dbf303 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -62,7 +62,6 @@ fi websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --streaming_chunk=0.1 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index c8396a58..f3560343 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); -DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); // feature sliding window DEFINE_int32(receptive_field_length, 7, diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 7aef73f7..027dadd6 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -33,7 +33,7 @@ int main(int argc, char* argv[]) { kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); int sample_rate = FLAGS_sample_rate; - float streaming_chunk = FLAGS_streaming_chunk; + float streaming_chunk = 0.1; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; @@ -96,4 +96,4 @@ int main(int argc, char* argv[]) { KALDI_LOG << " cost:" << elapsed << " s"; KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s"; KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration; -} \ No newline at end of file +} From 1a10c4cde89eafe9de7dc4c191bcabbb6ff80ca0 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Wed, 1 Jun 2022 22:58:30 +0800 Subject: [PATCH 3/5] format --- speechx/speechx/frontend/audio/fbank.h | 8 ++++---- speechx/speechx/frontend/audio/linear_spectrogram.h | 11 +++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h index e513b969..a1e65413 100644 --- a/speechx/speechx/frontend/audio/fbank.h +++ b/speechx/speechx/frontend/audio/fbank.h @@ -15,8 +15,8 @@ #pragma once #include "base/common.h" -#include "frontend/audio/frontend_itf.h" #include "frontend/audio/feature_common.h" +#include "frontend/audio/frontend_itf.h" #include "kaldi/feat/feature-fbank.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" @@ -25,11 +25,11 @@ namespace ppspeech { class FbankComputer { public: - typedef kaldi::FbankOptions Options; + typedef kaldi::FbankOptions Options; explicit FbankComputer(const Options& opts); kaldi::FrameExtractionOptions& GetFrameOptions() { - return opts_.frame_opts; + return opts_.frame_opts; } bool Compute(kaldi::Vector* window, @@ -42,7 +42,7 @@ class FbankComputer { Options opts_; kaldi::FbankComputer computer_; - //DISALLOW_COPY_AND_ASSIGN(FbankComputer); + DISALLOW_COPY_AND_ASSIGN(FbankComputer); }; typedef StreamingFeatureTpl Fbank; diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h index 7ef30dbc..de957c23 100644 --- a/speechx/speechx/frontend/audio/linear_spectrogram.h +++ b/speechx/speechx/frontend/audio/linear_spectrogram.h @@ -16,15 +16,15 @@ #pragma once #include "base/common.h" +#include "frontend/audio/feature_common.h" #include "frontend/audio/frontend_itf.h" #include "kaldi/feat/feature-window.h" -#include "frontend/audio/feature_common.h" namespace ppspeech { struct LinearSpectrogramOptions { kaldi::FrameExtractionOptions frame_opts; - LinearSpectrogramOptions(): frame_opts() {} + LinearSpectrogramOptions() : frame_opts() {} }; class LinearSpectrogramComputer { @@ -33,7 +33,7 @@ class LinearSpectrogramComputer { explicit LinearSpectrogramComputer(const Options& opts); kaldi::FrameExtractionOptions& GetFrameOptions() { - return opts_.frame_opts; + return opts_.frame_opts; } bool Compute(kaldi::Vector* window, @@ -42,17 +42,16 @@ class LinearSpectrogramComputer { int32 Dim() const { return dim_; } bool NeedRawLogEnergy() { return false; } - + private: kaldi::BaseFloat scale_; Options opts_; int32 frame_length_; int32 dim_; + DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer); }; typedef StreamingFeatureTpl LinearSpectrogram; - //DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); - } // namespace ppspeech \ No newline at end of file From 583c62db772870097692d1ce19af1b3189a21ca8 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 2 Jun 2022 09:30:32 +0800 Subject: [PATCH 4/5] add streaming chunk,test=doc --- speechx/examples/ds2_ol/websocket/websocket_client.sh | 2 +- speechx/speechx/decoder/recognizer_main.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh index 7cd0fdab..a508adfb 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_client.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh @@ -32,4 +32,4 @@ export GLOG_logtostderr=1 # websocket client websocket_client_main \ - --wav_rspecifier=scp:$data/$aishell_wav_scp + --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5 diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc index 027dadd6..23251353 100644 --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/decoder/recognizer_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { @@ -33,7 +34,7 @@ int main(int argc, char* argv[]) { kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); int sample_rate = FLAGS_sample_rate; - float streaming_chunk = 0.1; + float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; From 6dbf30816b5b83f7f35c29e20f45cd411846950e Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 2 Jun 2022 11:04:58 +0800 Subject: [PATCH 5/5] fix comment --- speechx/speechx/frontend/audio/audio_cache.h | 3 ++- speechx/speechx/frontend/audio/feature_common.h | 4 ++-- speechx/speechx/frontend/audio/feature_common_inl.h | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index 45498e46..e31a8aeb 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -30,7 +30,8 @@ class AudioCache : public FrontendInterface { virtual bool Read(kaldi::Vector* waves); - // the audio dim is 1, one sample, we return size_ instead. + // the audio dim is 1, one sample, which is useless, + // so we return size_(cache samples) instead. virtual size_t Dim() const { return size_; } virtual void SetFinished() { diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h index 0763f012..e03634d3 100644 --- a/speechx/speechx/frontend/audio/feature_common.h +++ b/speechx/speechx/frontend/audio/feature_common.h @@ -25,7 +25,7 @@ class StreamingFeatureTpl : public FrontendInterface { typedef typename F::Options Options; StreamingFeatureTpl(const Options& opts, std::unique_ptr base_extractor); - virtual void Accept(const kaldi::VectorBase& inputs); + virtual void Accept(const kaldi::VectorBase& waves); virtual bool Read(kaldi::Vector* feats); // the dim_ is the dim of single frame feature @@ -51,4 +51,4 @@ class StreamingFeatureTpl : public FrontendInterface { } // namespace ppspeech -#include "frontend/audio/feature_common_inl.h" \ No newline at end of file +#include "frontend/audio/feature_common_inl.h" diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h index 0199ab41..a482ef55 100644 --- a/speechx/speechx/frontend/audio/feature_common_inl.h +++ b/speechx/speechx/frontend/audio/feature_common_inl.h @@ -25,8 +25,8 @@ StreamingFeatureTpl::StreamingFeatureTpl(const Options& opts, } template -void StreamingFeatureTpl::Accept(const kaldi::VectorBase& inputs) { - base_extractor_->Accept(inputs); +void StreamingFeatureTpl::Accept(const kaldi::VectorBase& waves) { + base_extractor_->Accept(waves); } template @@ -92,4 +92,4 @@ bool StreamingFeatureTpl::Compute(const kaldi::Vector& wave return true; } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech