Merge pull request #2003 from SmileGoat/refactor_file_struct

[speechx] refactor frontend
pull/2005/head
Hui Zhang 2 years ago committed by GitHub
commit 182858bf88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
recognizer_test_main \ recognizer_test_main \
--wav_rspecifier=scp:$wav_scp \ --wav_rspecifier=scp:$wav_scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=30 \
--use_fbank=true \ --use_fbank=true \
--model_path=$model_dir/avg_10.jit.pdmodel \ --model_path=$model_dir/avg_10.jit.pdmodel \
--param_path=$model_dir/avg_10.jit.pdiparams \ --param_path=$model_dir/avg_10.jit.pdiparams \

@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=0.36
echo "feature make have finished!!!" echo "feature make have finished!!!"
fi fi
@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=30 \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_5.jit.pdmodel \ --model_path=$model_dir/avg_5.jit.pdmodel \
--streaming_chunk=30 \
--use_fbank=true \ --use_fbank=true \
--param_path=$model_dir/avg_5.jit.pdiparams \ --param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \

@ -32,4 +32,4 @@ export GLOG_logtostderr=1
# websocket client # websocket client
websocket_client_main \ websocket_client_main \
--wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36 --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5

@ -62,7 +62,6 @@ fi
websocket_server_main \ websocket_server_main \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=0.1 \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
// feature, or fbank"); // feature, or fbank");
DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_int32(num_bins, 161, "num bins of mel");
DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
// feature sliding window // feature sliding window
DEFINE_int32(receptive_field_length, DEFINE_int32(receptive_field_length,
7, 7,
@ -62,7 +61,6 @@ namespace ppspeech {
FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions InitFeaturePipelineOptions() {
FeaturePipelineOptions opts; FeaturePipelineOptions opts;
opts.cmvn_file = FLAGS_cmvn_file; opts.cmvn_file = FLAGS_cmvn_file;
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
frame_opts.dither = 0.0; frame_opts.dither = 0.0;
frame_opts.frame_shift_ms = 10; frame_opts.frame_shift_ms = 10;
@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts.to_float32 = false; opts.to_float32 = false;
frame_opts.window_type = "povey"; frame_opts.window_type = "povey";
frame_opts.frame_length_ms = 25; frame_opts.frame_length_ms = 25;
opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
opts.fbank_opts.fbank_opts.frame_opts = frame_opts; opts.fbank_opts.frame_opts = frame_opts;
} else { } else {
opts.to_float32 = true; opts.to_float32 = true;
frame_opts.remove_dc_offset = false; frame_opts.remove_dc_offset = false;

@ -19,6 +19,7 @@
DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate"); DEFINE_int32(sample_rate, 16000, "sample rate");
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
KALDI_LOG << " cost:" << elapsed << " s"; KALDI_LOG << " cost:" << elapsed << " s";
KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s"; KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration; KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
} }

@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// the audio dim is 1, one sample // the audio dim is 1, one sample, which is useless,
virtual size_t Dim() const { return 1; } // so we return size_(cache samples) instead.
virtual size_t Dim() const { return size_; }
virtual void SetFinished() { virtual void SetFinished() {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);

@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
std::unique_ptr<ppspeech::FrontendInterface> data_source( std::unique_ptr<ppspeech::FrontendInterface> data_source(
new ppspeech::AudioCache(3600 * 1600, false)); new ppspeech::AudioCache(3600 * 1600, false));
ppspeech::FbankOptions opt; kaldi::FbankOptions opt;
opt.fbank_opts.frame_opts.frame_length_ms = 25; opt.frame_opts.frame_length_ms = 25;
opt.fbank_opts.frame_opts.frame_shift_ms = 10; opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk; opt.mel_opts.num_bins = FLAGS_num_bins;
opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0;
opt.fbank_opts.frame_opts.dither = 0.0;
std::unique_ptr<ppspeech::FrontendInterface> fbank( std::unique_ptr<ppspeech::FrontendInterface> fbank(
new ppspeech::Fbank(opt, std::move(data_source))); new ppspeech::Fbank(opt, std::move(data_source)));

@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
ppspeech::LinearSpectrogramOptions opt; ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10; opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk;
opt.frame_opts.dither = 0.0; opt.frame_opts.dither = 0.0;
opt.frame_opts.remove_dc_offset = false; opt.frame_opts.remove_dc_offset = false;
opt.frame_opts.window_type = "hanning"; opt.frame_opts.window_type = "hanning";

@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "frontend/audio/fbank.h" #include "frontend/audio/fbank.h"
#include "kaldi/base/kaldi-math.h" #include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-common.h"
@ -29,95 +28,33 @@ using kaldi::VectorBase;
using kaldi::Matrix; using kaldi::Matrix;
using std::vector; using std::vector;
// todo refactor later:(SmileGoat) FbankComputer::FbankComputer(const Options& opts)
Fbank::Fbank(const FbankOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts), : opts_(opts),
computer_(opts.fbank_opts), computer_(opts) {}
window_function_(opts.fbank_opts.frame_opts) {
base_extractor_ = std::move(base_extractor);
chunk_sample_size_ = static_cast<int32>(
opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
}
void Fbank::Accept(const VectorBase<BaseFloat>& inputs) { int32 FbankComputer::Dim() const {
base_extractor_->Accept(inputs); return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
} }
bool Fbank::Read(Vector<BaseFloat>* feats) { bool FbankComputer::NeedRawLogEnergy() {
Vector<BaseFloat> wav(chunk_sample_size_); return opts_.use_energy && opts_.raw_energy;
bool flag = base_extractor_->Read(&wav);
if (flag == false || wav.Dim() == 0) return false;
// append remaned waves
int32 wav_len = wav.Dim();
int32 left_len = remained_wav_.Dim();
Vector<BaseFloat> waves(left_len + wav_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, wav_len).CopyFromVec(wav);
// compute speech feature
Compute(waves, feats);
// cache remaned waves
kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
int32 frame_shift = frame_opts.WindowShift();
int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
return true;
} }
// Compute spectrogram feat // Compute feat
bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) { bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
const kaldi::FrameExtractionOptions& frame_opts = RealFft(window, true);
computer_.GetFrameOptions(); kaldi::ComputePowerSpectrum(window);
int32 num_samples = waves.Dim(); const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
int32 frame_length = frame_opts.WindowSize(); SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
int32 sample_rate = frame_opts.samp_freq; if (!opts_.use_power) {
if (num_samples < frame_length) { power_spectrum.ApplyPow(0.5);
return true;
}
int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
feats->Resize(num_frames * Dim());
Vector<BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 frame = 0; frame < num_frames; frame++) {
BaseFloat raw_log_energy = 0.0;
kaldi::ExtractWindow(0,
waves,
frame,
frame_opts,
window_function_,
&window,
need_raw_log_energy ? &raw_log_energy : NULL);
Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
// note: this online feature-extraction code does not support VTLN.
RealFft(&window, true);
kaldi::ComputePowerSpectrum(&window);
const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
if (!opts_.fbank_opts.use_power) {
power_spectrum.ApplyPow(0.5);
}
int32 mel_offset =
((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
: 0);
SubVector<BaseFloat> mel_energies(
this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature);
} }
int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(
*feat, mel_offset, opts_.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
return true; return true;
} }

@ -15,6 +15,7 @@
#pragma once #pragma once
#include "base/common.h" #include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-fbank.h" #include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h"
@ -22,56 +23,28 @@
namespace ppspeech { namespace ppspeech {
struct FbankOptions { class FbankComputer {
kaldi::FbankOptions fbank_opts;
kaldi::BaseFloat streaming_chunk; // second
FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("streaming-chunk",
&streaming_chunk,
"streaming chunk size, default: 0.1 sec");
fbank_opts.Register(opts);
}
};
class Fbank : public FrontendInterface {
public: public:
explicit Fbank(const FbankOptions& opts, typedef kaldi::FbankOptions Options;
std::unique_ptr<FrontendInterface> base_extractor); explicit FbankComputer(const Options& opts);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature kaldi::FrameExtractionOptions& GetFrameOptions() {
virtual size_t Dim() const { return computer_.Dim(); } return opts_.frame_opts;
}
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
kaldi::Vector<kaldi::BaseFloat>* feat);
int32 Dim() const;
virtual void Reset() { bool NeedRawLogEnergy();
base_extractor_->Reset();
remained_wav_.Resize(0);
}
private: private:
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, Options opts_;
kaldi::Vector<kaldi::BaseFloat>* feats);
FbankOptions opts_;
std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::FeatureWindowFunction window_function_;
kaldi::FbankComputer computer_; kaldi::FbankComputer computer_;
// features_ is the Mfcc or Plp or Fbank features that we have already DISALLOW_COPY_AND_ASSIGN(FbankComputer);
// computed.
kaldi::Vector<kaldi::BaseFloat> features_;
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
kaldi::int32 chunk_sample_size_;
DISALLOW_COPY_AND_ASSIGN(Fbank);
}; };
typedef StreamingFeatureTpl<FbankComputer> Fbank;
} // namespace ppspeech } // namespace ppspeech

@ -0,0 +1,54 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "frontend_itf.h"
#include "kaldi/feat/feature-window.h"
namespace ppspeech {
// Streaming front-end stage: pulls audio from an upstream FrontendInterface
// and produces per-frame features using the computer type F.
//
// As used by the member definitions, F supplies a nested `Options` type (with
// a `frame_opts` member), a constructor taking `const Options&`, and the
// methods `Dim()`, `GetFrameOptions()`, `NeedRawLogEnergy()` and
// `Compute(window, feature)`.
template <class F>
class StreamingFeatureTpl : public FrontendInterface {
  public:
    using Options = typename F::Options;

    // Takes ownership of `base_extractor`, the stage this one reads from.
    StreamingFeatureTpl(const Options& opts,
                        std::unique_ptr<FrontendInterface> base_extractor);

    // Data flow: Accept() hands samples upstream, Read() pulls them back and
    // turns them into features.
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // Length of the feature vector of a single frame, as reported by the
    // computer.
    virtual size_t Dim() const { return computer_.Dim(); }

    // End-of-stream state lives in the upstream extractor; both calls simply
    // delegate to it.
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    // Resets the upstream extractor and drops the samples carried over from
    // the previous Read().
    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }

  private:
    // Computes features for every complete frame found in `waves`.
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);

    // NOTE: members are initialized in the order they are declared here; keep
    // this order in sync with the constructor's initializer list.
    Options opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    // Samples left over after the last Read(), prepended to the next chunk.
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    F computer_;
};
} // namespace ppspeech
#include "frontend/audio/feature_common_inl.h"

@ -0,0 +1,95 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace ppspeech {
// Builds a streaming feature stage on top of `base_extractor`.
//
// opts:           options handed to the computer F; `opts.frame_opts` also
//                 configures the analysis window function.
// base_extractor: upstream stage that Read() pulls samples from; ownership is
//                 transferred to this object.
//
// The initializer list follows the members' declaration order (opts_,
// base_extractor_, window_function_, computer_). C++ initializes members in
// declaration order no matter how the list is written, so keeping the two in
// sync avoids -Wreorder warnings and a misleading reading order.
// base_extractor_ is initialized directly instead of being default-constructed
// and then move-assigned in the body.
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(
    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      base_extractor_(std::move(base_extractor)),
      window_function_(opts.frame_opts),
      computer_(opts) {}
// Hands the incoming samples straight to the upstream extractor; this stage
// does no processing of its own here.
template <class F>
void StreamingFeatureTpl<F>::Accept(
    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}
// Pulls one chunk from the upstream extractor, prepends the samples left over
// from the previous call, and computes features for every complete frame.
//
// feats: output buffer filled by Compute() with consecutive frames.
//
// Returns false when the upstream Read() fails or yields no samples, true
// otherwise.
// NOTE(review): when fewer samples than one window are available, Compute()
// returns without touching `feats`, yet this function still returns true, so
// `feats` keeps whatever the caller passed in -- confirm callers expect that.
template <class F>
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
    if (!base_extractor_->Read(&wav) || wav.Dim() == 0) return false;

    // Append the new chunk to the samples remaining from the previous call.
    const int32 wav_len = wav.Dim();
    const int32 left_len = remained_wav_.Dim();
    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // Compute speech features.
    Compute(waves, feats);

    // Cache the remaining samples: everything past the start of the first
    // frame that was not produced yet. Bind the options by reference; copying
    // the whole options struct on every call is unnecessary.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const int32 consumed = frame_opts.WindowShift() * num_frames;
    const int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
    return true;
}
// Computes features for every complete frame contained in `waves`.
//
// waves: contiguous samples to analyse.
// feats: when at least one full window is available, resized to
//        num_frames * computer_.Dim() and filled frame by frame; left
//        untouched when `waves` is shorter than one window.
//
// Always returns true.
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const kaldi::Vector<kaldi::BaseFloat>& waves,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_samples = waves.Dim();
    if (num_samples < frame_opts.WindowSize()) {
        return true;
    }

    const int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    // Same value the public Dim() reports; read once instead of per frame.
    const int32 feat_dim = computer_.Dim();
    feats->Resize(num_frames * feat_dim);

    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    kaldi::Vector<kaldi::BaseFloat> window;
    // Scratch buffer for one frame, allocated once and reused for all frames.
    kaldi::Vector<kaldi::BaseFloat> this_feature(feat_dim, kaldi::kUndefined);
    for (int32 frame = 0; frame < num_frames; frame++) {
        // Filled by ExtractWindow when requested; it is not read afterwards
        // in this function.
        kaldi::BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : nullptr);

        computer_.Compute(&window, &this_feature);

        // Copy the frame into its slot of the flat output buffer.
        kaldi::SubVector<kaldi::BaseFloat> output_row(
            feats->Data() + frame * feat_dim, feat_dim);
        output_row.CopyFromVec(this_feature);
    }
    return true;
}
} // namespace ppspeech

@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
bool to_float32; // true, only for linear feature bool to_float32; // true, only for linear feature
bool use_fbank; bool use_fbank;
LinearSpectrogramOptions linear_spectrogram_opts; LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts; kaldi::FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts; FeatureCacheOptions feature_cache_opts;
AssemblerOptions assembler_opts; AssemblerOptions assembler_opts;

@ -28,81 +28,32 @@ using kaldi::VectorBase;
using kaldi::Matrix; using kaldi::Matrix;
using std::vector; using std::vector;
LinearSpectrogram::LinearSpectrogram( LinearSpectrogramComputer::LinearSpectrogramComputer(
const LinearSpectrogramOptions& opts, const Options& opts)
std::unique_ptr<FrontendInterface> base_extractor) : opts_(opts) {
: opts_(opts), feature_window_funtion_(opts.frame_opts) { kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
base_extractor_ = std::move(base_extractor);
int32 window_size = opts.frame_opts.WindowSize(); int32 window_size = opts.frame_opts.WindowSize();
int32 window_shift = opts.frame_opts.WindowShift(); frame_length_ = window_size;
dim_ = window_size / 2 + 1; dim_ = window_size / 2 + 1;
chunk_sample_size_ = BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq); feature_window_function.window);
hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window, int32 sample_rate = opts.frame_opts.samp_freq;
feature_window_funtion_.window); scale_ = 2.0 / (hanning_window_energy * sample_rate);
}
void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
base_extractor_->Accept(inputs);
}
bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
Vector<BaseFloat> input_feats(chunk_sample_size_);
bool flag = base_extractor_->Read(&input_feats);
if (flag == false || input_feats.Dim() == 0) return false;
int32 feat_len = input_feats.Dim();
int32 left_len = remained_wav_.Dim();
Vector<BaseFloat> waves(feat_len + left_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, feat_len).CopyFromVec(input_feats);
Compute(waves, feats);
int32 frame_shift = opts_.frame_opts.WindowShift();
int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
return true;
} }
// Compute spectrogram feat // Compute spectrogram feat
bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves, bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
Vector<BaseFloat>* feats) { Vector<BaseFloat>* feat) {
int32 num_samples = waves.Dim(); window->Resize(frame_length_, kaldi::kCopyData);
int32 frame_length = opts_.frame_opts.WindowSize(); RealFft(window, true);
int32 sample_rate = opts_.frame_opts.samp_freq; kaldi::ComputePowerSpectrum(window);
BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate); SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
power_spectrum.Scale(scale_);
if (num_samples < frame_length) { power_spectrum(0) = power_spectrum(0) / 2;
return true; power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
} power_spectrum.Add(1e-14);
power_spectrum.ApplyLog();
int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts); feat->CopyFromVec(power_spectrum);
feats->Resize(num_frames * dim_);
Vector<BaseFloat> window;
for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
kaldi::ExtractWindow(0,
waves,
frame_idx,
opts_.frame_opts,
feature_window_funtion_,
&window,
NULL);
SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
window.Resize(frame_length, kaldi::kCopyData);
RealFft(&window, true);
kaldi::ComputePowerSpectrum(&window);
SubVector<BaseFloat> power_spectrum(window, 0, dim_);
power_spectrum.Scale(scale);
power_spectrum(0) = power_spectrum(0) / 2;
power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
power_spectrum.Add(1e-14);
power_spectrum.ApplyLog();
output_row.CopyFromVec(power_spectrum);
}
return true; return true;
} }

@ -16,6 +16,7 @@
#pragma once #pragma once
#include "base/common.h" #include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-window.h" #include "kaldi/feat/feature-window.h"
@ -23,47 +24,34 @@ namespace ppspeech {
struct LinearSpectrogramOptions { struct LinearSpectrogramOptions {
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
kaldi::BaseFloat streaming_chunk; // second LinearSpectrogramOptions() : frame_opts() {}
LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("streaming-chunk",
&streaming_chunk,
"streaming chunk size, default: 0.1 sec");
frame_opts.Register(opts);
}
}; };
class LinearSpectrogram : public FrontendInterface { class LinearSpectrogramComputer {
public: public:
explicit LinearSpectrogram( typedef LinearSpectrogramOptions Options;
const LinearSpectrogramOptions& opts, explicit LinearSpectrogramComputer(const Options& opts);
std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs); kaldi::FrameExtractionOptions& GetFrameOptions() {
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); return opts_.frame_opts;
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
remained_wav_.Resize(0);
} }
private: bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, kaldi::Vector<kaldi::BaseFloat>* feat);
kaldi::Vector<kaldi::BaseFloat>* feats);
size_t dim_; int32 Dim() const { return dim_; }
kaldi::FeatureWindowFunction feature_window_funtion_;
kaldi::BaseFloat hanning_window_energy_; bool NeedRawLogEnergy() { return false; }
LinearSpectrogramOptions opts_;
std::unique_ptr<FrontendInterface> base_extractor_; private:
kaldi::Vector<kaldi::BaseFloat> remained_wav_; kaldi::BaseFloat scale_;
int chunk_sample_size_; Options opts_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); int32 frame_length_;
int32 dim_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
}; };
typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
} // namespace ppspeech } // namespace ppspeech
Loading…
Cancel
Save