[Engine] recognizer controller refactor (#3139)
* refactor recognizer_controller * clean frontend filepull/3156/head
parent
591b957b96
commit
f35a87ab89
@ -0,0 +1,13 @@
|
|||||||
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
@ -0,0 +1,13 @@
|
|||||||
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
@ -0,0 +1,91 @@
|
|||||||
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "decoder/common.h"
|
||||||
|
#include "fst/fstlib.h"
|
||||||
|
#include "fst/symbol-table.h"
|
||||||
|
#include "nnet/u2_nnet.h"
|
||||||
|
#include "nnet/nnet_producer.h"
|
||||||
|
#ifdef USE_ONNX
|
||||||
|
#include "nnet/u2_onnx_nnet.h"
|
||||||
|
#endif
|
||||||
|
#include "nnet/decodable.h"
|
||||||
|
#include "recognizer/recognizer_resource.h"
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
namespace ppspeech {
|
||||||
|
|
||||||
|
class RecognizerControllerImpl {
|
||||||
|
public:
|
||||||
|
explicit RecognizerControllerImpl(const RecognizerResource& resource);
|
||||||
|
explicit RecognizerControllerImpl(const RecognizerResource& resource,
|
||||||
|
std::shared_ptr<NnetBase> nnet);
|
||||||
|
~RecognizerControllerImpl();
|
||||||
|
void Accept(std::vector<float> data);
|
||||||
|
void InitDecoder();
|
||||||
|
void SetInputFinished();
|
||||||
|
std::string GetFinalResult();
|
||||||
|
std::string GetPartialResult();
|
||||||
|
void Rescoring();
|
||||||
|
void Reset();
|
||||||
|
void WaitDecoderFinished();
|
||||||
|
void WaitFinished();
|
||||||
|
void AttentionRescoring();
|
||||||
|
bool DecodedSomething() const {
|
||||||
|
return !result_.empty() && !result_[0].sentence.empty();
|
||||||
|
}
|
||||||
|
int FrameShiftInMs() const {
|
||||||
|
return 1; //todo
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
static void RunNnetEvaluation(RecognizerControllerImpl* me);
|
||||||
|
void RunNnetEvaluationInternal();
|
||||||
|
static void RunDecoder(RecognizerControllerImpl* me);
|
||||||
|
void RunDecoderInternal();
|
||||||
|
void UpdateResult(bool finish = false);
|
||||||
|
|
||||||
|
std::shared_ptr<Decodable> decodable_;
|
||||||
|
std::unique_ptr<DecoderBase> decoder_;
|
||||||
|
std::shared_ptr<NnetProducer> nnet_producer_;
|
||||||
|
|
||||||
|
// e2e unit symbol table
|
||||||
|
std::shared_ptr<fst::SymbolTable> symbol_table_ = nullptr;
|
||||||
|
std::vector<DecodeResult> result_;
|
||||||
|
|
||||||
|
RecognizerResource opts_;
|
||||||
|
bool abort_ = false;
|
||||||
|
// global decoded frame offset
|
||||||
|
int global_frame_offset_;
|
||||||
|
// cur decoded frame num
|
||||||
|
int num_frames_;
|
||||||
|
// timestamp gap between words in a sentence
|
||||||
|
const int time_stamp_gap_ = 100;
|
||||||
|
bool input_finished_;
|
||||||
|
|
||||||
|
std::mutex nnet_mutex_;
|
||||||
|
std::mutex decoder_mutex_;
|
||||||
|
std::condition_variable nnet_condition_;
|
||||||
|
std::condition_variable decoder_condition_;
|
||||||
|
std::thread nnet_thread_;
|
||||||
|
std::thread decoder_thread_;
|
||||||
|
|
||||||
|
DISALLOW_COPY_AND_ASSIGN(RecognizerControllerImpl);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,13 @@
|
|||||||
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
@ -0,0 +1,13 @@
|
|||||||
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
@ -1,185 +0,0 @@
|
|||||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "common/base/thread_pool.h"
|
|
||||||
#include "common/utils/file_utils.h"
|
|
||||||
#include "common/utils/strings.h"
|
|
||||||
#include "decoder/param.h"
|
|
||||||
#include "frontend/wave-reader.h"
|
|
||||||
#include "kaldi/util/table-types.h"
|
|
||||||
#include "nnet/u2_nnet.h"
|
|
||||||
#include "recognizer/u2_recognizer.h"
|
|
||||||
|
|
||||||
// Command-line flags of the batch recognizer binary.

// Path handed to SplitUtt(): a list file whose lines hold two fields,
// the utterance id and the wav path.
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
// Kaldi wspecifier used to build the TokenWriter that stores the results.
DEFINE_string(result_wspecifier, "", "test result wspecifier");
// Chunk length in seconds; multiplied by sample_rate to get samples per chunk.
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate");
// Number of worker threads and of utterance sub-lists.
DEFINE_int32(njob, 3, "njob");
|
|
||||||
|
|
||||||
using std::string;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
// Reads `wavlist_file` and distributes its entries round-robin over `njob`
// buckets.
//
// Every line must split (on space or tab) into exactly two fields: the
// utterance id and the wav path. Field 0 goes to `uttlists`, field 1 to
// `wavlists`, both in bucket `line_index % njob`.
//
// Both output vectors are resized to `njob`. `njob` must be positive.
void SplitUtt(string wavlist_file,
              vector<vector<string>>* uttlists,
              vector<vector<string>>* wavlists,
              int njob) {
    // A non-positive job count would make the modulo below undefined.
    CHECK_GT(njob, 0);
    const size_t num_jobs = static_cast<size_t>(njob);

    vector<string> wavlist;
    wavlists->resize(num_jobs);
    uttlists->resize(num_jobs);
    ppspeech::ReadFileToVector(wavlist_file, &wavlist);

    for (size_t idx = 0; idx < wavlist.size(); ++idx) {
        const vector<string> utt_wav = ppspeech::StrSplit(wavlist[idx], " \t");
        // Validate before touching utt_wav[0]: a blank or malformed line
        // must fail the check instead of indexing an empty vector.
        CHECK_EQ(utt_wav.size(), size_t(2));
        LOG(INFO) << utt_wav[0];

        const size_t bucket = idx % num_jobs;
        uttlists->at(bucket).push_back(utt_wav[0]);
        wavlists->at(bucket).push_back(utt_wav[1]);
    }
}
|
|
||||||
|
|
||||||
// Worker body: decodes every wav in `wavlist` with one U2Recognizer and
// appends one result string per utterance to `results`.
//
// resource  recognizer configuration shared by all workers.
// nnet      network instance used by this worker's recognizer.
// wavlist   wav file paths; uttlist holds the matching utterance ids.
// results   output, same order as `wavlist`. An empty recognition result is
//           stored as " " because the TokenWriter cannot write an empty
//           string.
//
// Audio is streamed in chunks of FLAGS_streaming_chunk * FLAGS_sample_rate
// samples without overlap.
void recognizer_func(const ppspeech::U2RecognizerResource& resource,
                     std::shared_ptr<ppspeech::NnetBase> nnet,
                     std::vector<string> wavlist,
                     std::vector<string> uttlist,
                     std::vector<string>* results) {
    int32 num_done = 0, num_err = 0;
    double tot_wav_duration = 0.0;
    double tot_attention_rescore_time = 0.0;
    double tot_decode_time = 0.0;
    const int chunk_sample_size = FLAGS_streaming_chunk * FLAGS_sample_rate;
    if (wavlist.empty()) return;

    // A zero-sized chunk would never advance the streaming loop below.
    CHECK_GT(chunk_sample_size, 0);
    // uttlist is indexed with wavlist's indices.
    CHECK_EQ(wavlist.size(), uttlist.size());

    std::shared_ptr<ppspeech::U2Recognizer> recognizer_ptr =
        std::make_shared<ppspeech::U2Recognizer>(resource, nnet);

    results->reserve(wavlist.size());
    for (size_t idx = 0; idx < wavlist.size(); ++idx) {
        const std::string& utt = uttlist[idx];
        const std::string& wav_file = wavlist[idx];

        // Wav data is binary; also fail with a readable message when the
        // file cannot be opened.
        std::ifstream infile(wav_file, std::ios::in | std::ios::binary);
        CHECK(infile.is_open()) << "failed to open wav file: " << wav_file;
        kaldi::WaveData wave_data;
        wave_data.Read(infile);

        recognizer_ptr->InitDecoder();
        LOG(INFO) << "utt: " << utt;
        LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec.";
        const double dur = wave_data.Duration();
        tot_wav_duration += dur;

        const int32 this_channel = 0;
        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                    this_channel);
        const int tot_samples = waveform.Dim();
        LOG(INFO) << "wav len (sample): " << tot_samples;

        int sample_offset = 0;
        kaldi::Timer local_timer;

        while (sample_offset < tot_samples) {
            const int cur_chunk_size =
                std::min(chunk_sample_size, tot_samples - sample_offset);

            std::vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
            for (int i = 0; i < cur_chunk_size; ++i) {
                wav_chunk[i] = waveform(sample_offset + i);
            }
            recognizer_ptr->Accept(wav_chunk);

            // no overlap
            sample_offset += cur_chunk_size;
        }
        CHECK(sample_offset == tot_samples);

        // Signal end of input once per utterance. Doing it only when the
        // last chunk is shorter than a full chunk misses wavs whose length
        // is an exact multiple of the chunk size (and empty wavs), leaving
        // the wait below without an end-of-input signal.
        recognizer_ptr->SetInputFinished();
        recognizer_ptr->WaitDecodeFinished();

        kaldi::Timer timer;
        recognizer_ptr->AttentionRescoring();
        tot_attention_rescore_time += timer.Elapsed();

        std::string result = recognizer_ptr->GetFinalResult();
        if (result.empty()) {
            // the TokenWriter can not write empty string.
            ++num_err;
            LOG(INFO) << " the result of " << utt << " is empty";
            result = " ";
        }

        tot_decode_time += local_timer.Elapsed();
        LOG(INFO) << utt << " " << result;
        LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur
                  << " cost: " << local_timer.Elapsed();

        results->push_back(result);
        ++num_done;
    }
    recognizer_ptr->WaitFinished();
    LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done);
    LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec";
    LOG(INFO) << "total decode cost:" << tot_decode_time << " sec";
    LOG(INFO) << "total rescore cost:" << tot_attention_rescore_time << " sec";
    LOG(INFO) << "RTF is: " << tot_decode_time / tot_wav_duration;
}
|
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
|
||||||
gflags::SetUsageMessage("Usage:");
|
|
||||||
gflags::ParseCommandLineFlags(&argc, &argv, false);
|
|
||||||
google::InitGoogleLogging(argv[0]);
|
|
||||||
google::InstallFailureSignalHandler();
|
|
||||||
FLAGS_logtostderr = 1;
|
|
||||||
|
|
||||||
int sample_rate = FLAGS_sample_rate;
|
|
||||||
float streaming_chunk = FLAGS_streaming_chunk;
|
|
||||||
int chunk_sample_size = streaming_chunk * sample_rate;
|
|
||||||
kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
|
|
||||||
int njob = FLAGS_njob;
|
|
||||||
LOG(INFO) << "sr: " << sample_rate;
|
|
||||||
LOG(INFO) << "chunk size (s): " << streaming_chunk;
|
|
||||||
LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
|
|
||||||
|
|
||||||
ppspeech::U2RecognizerResource resource =
|
|
||||||
ppspeech::U2RecognizerResource::InitFromFlags();
|
|
||||||
ThreadPool threadpool(njob);
|
|
||||||
vector<vector<string>> wavlist;
|
|
||||||
vector<vector<string>> uttlist;
|
|
||||||
vector<vector<string>> resultlist(njob);
|
|
||||||
vector<std::future<void>> futurelist;
|
|
||||||
std::shared_ptr<ppspeech::U2Nnet> nnet(
|
|
||||||
new ppspeech::U2Nnet(resource.model_opts));
|
|
||||||
SplitUtt(FLAGS_wav_rspecifier, &uttlist, &wavlist, njob);
|
|
||||||
for (size_t i = 0; i < njob; ++i) {
|
|
||||||
std::future<void> f = threadpool.enqueue(recognizer_func,
|
|
||||||
resource,
|
|
||||||
nnet->Clone(),
|
|
||||||
wavlist[i],
|
|
||||||
uttlist[i],
|
|
||||||
&resultlist[i]);
|
|
||||||
futurelist.push_back(std::move(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < njob; ++i) {
|
|
||||||
futurelist[i].get();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t idx = 0; idx < njob; ++idx) {
|
|
||||||
for (size_t utt_idx = 0; utt_idx < uttlist[idx].size(); ++utt_idx) {
|
|
||||||
string utt = uttlist[idx][utt_idx];
|
|
||||||
string result = resultlist[idx][utt_idx];
|
|
||||||
result_writer.Write(utt, result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,62 +0,0 @@
|
|||||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "frontend/audio/fbank.h"
|
|
||||||
|
|
||||||
#include "kaldi/base/kaldi-math.h"
|
|
||||||
#include "kaldi/feat/feature-common.h"
|
|
||||||
#include "kaldi/feat/feature-functions.h"
|
|
||||||
#include "kaldi/matrix/matrix-functions.h"
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
using kaldi::BaseFloat;
|
|
||||||
using kaldi::int32;
|
|
||||||
using kaldi::Matrix;
|
|
||||||
using kaldi::SubVector;
|
|
||||||
using kaldi::Vector;
|
|
||||||
using kaldi::VectorBase;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
// Keeps a copy of the options and builds the wrapped computer from the same
// options.
FbankComputer::FbankComputer(const Options& opts)
    : opts_(opts),
      computer_(opts) {
}
|
|
||||||
|
|
||||||
// Feature dimension per frame: the number of mel bins, plus one extra slot
// when use_energy is set.
int32 FbankComputer::Dim() const {
    if (opts_.use_energy) {
        return opts_.mel_opts.num_bins + 1;
    }
    return opts_.mel_opts.num_bins + 0;
}
|
|
||||||
|
|
||||||
bool FbankComputer::NeedRawLogEnergy() {
|
|
||||||
return opts_.use_energy && opts_.raw_energy;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute feat
|
|
||||||
// Turns one windowed frame into log mel-filterbank energies.
//
// window  in/out: transformed in place (real FFT, then power spectrum).
// feat    out: must already be sized by the caller; the mel energies are
//         written into num_bins entries starting at index 0, or at index 1
//         when use_energy is set and htk_compat is not.
// NOTE(review): the energy slot itself is not written in this function -
// confirm whether the caller fills it.
// Always returns true.
bool FbankComputer::Compute(Vector<BaseFloat>* window,
                            Vector<BaseFloat>* feat) {
    RealFft(window, true);
    kaldi::ComputePowerSpectrum(window);
    const kaldi::MelBanks& filters = *(computer_.GetMelBanks(1.0));

    // Only the first Dim()/2 + 1 entries are used as the spectrum.
    const int32 num_spectrum_bins = window->Dim() / 2 + 1;
    SubVector<BaseFloat> spectrum(*window, 0, num_spectrum_bins);
    if (!opts_.use_power) {
        // Magnitude instead of power: take the square root.
        spectrum.ApplyPow(0.5);
    }

    int32 first_mel_index = 0;
    if (opts_.use_energy && !opts_.htk_compat) {
        first_mel_index = 1;
    }
    SubVector<BaseFloat> mel_out(
        *feat, first_mel_index, opts_.mel_opts.num_bins);

    filters.Compute(spectrum, &mel_out);
    // Floor before the log so that zero energies do not produce -inf.
    mel_out.ApplyFloor(1e-07);
    mel_out.ApplyLog();
    return true;
}
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,109 +0,0 @@
|
|||||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
#include "frontend/audio/mfcc.h"
|
|
||||||
|
|
||||||
#include "kaldi/base/kaldi-math.h"
|
|
||||||
#include "kaldi/feat/feature-common.h"
|
|
||||||
#include "kaldi/feat/feature-functions.h"
|
|
||||||
#include "kaldi/matrix/matrix-functions.h"
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
using kaldi::BaseFloat;
|
|
||||||
using kaldi::int32;
|
|
||||||
using kaldi::Matrix;
|
|
||||||
using kaldi::SubVector;
|
|
||||||
using kaldi::Vector;
|
|
||||||
using kaldi::VectorBase;
|
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
// Builds the streaming MFCC front end on top of `base_extractor`, which
// supplies the raw waveform.
//
// opts            MFCC options plus the streaming chunk length in seconds.
// base_extractor  upstream stage; ownership is transferred to this object.
Mfcc::Mfcc(const MfccOptions& opts,
           std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      computer_(opts.mfcc_opts),
      // Built straight from the options rather than from computer_: members
      // are initialised in declaration order, so computer_ may not be
      // constructed yet when this initialiser runs.
      window_function_(opts.mfcc_opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
    // The frame options live inside mfcc_opts; ppspeech::MfccOptions has no
    // frame_opts member of its own.
    chunk_sample_size_ = static_cast<int32>(
        opts.streaming_chunk * opts.mfcc_opts.frame_opts.samp_freq);
}
|
|
||||||
|
|
||||||
void Mfcc::Accept(const VectorBase<BaseFloat>& inputs) {
|
|
||||||
base_extractor_->Accept(inputs);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pulls one chunk of samples from the upstream extractor, prepends the
// samples left over from the previous call, computes features into `feats`
// and caches the samples that remain after the last computed frame.
//
// Returns false when the upstream read fails or yields no samples, true
// otherwise.
bool Mfcc::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> chunk(chunk_sample_size_);
    const bool got_data = base_extractor_->Read(&chunk);
    if (!got_data || chunk.Dim() == 0) {
        return false;
    }

    // append remained waves
    const int32 chunk_len = chunk.Dim();
    const int32 cached_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(cached_len + chunk_len);
    waves.Range(0, cached_len).CopyFromVec(remained_wav_);
    waves.Range(cached_len, chunk_len).CopyFromVec(chunk);

    // compute speech feature
    Compute(waves, feats);

    // cache remained waves: everything from the sample at
    // frame_shift * num_frames onwards is kept for the next call.
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    const int32 frame_shift = frame_opts.WindowShift();
    const int32 consumed = frame_shift * num_frames;
    const int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
    return true;
}
|
|
||||||
|
|
||||||
// Compute MFCC feat
|
|
||||||
bool Mfcc::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
|
|
||||||
const FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
|
|
||||||
int32 num_samples = waves.Dim();
|
|
||||||
int32 frame_length = frame_opts.WindowSize();
|
|
||||||
int32 sample_rate = frame_opts.samp_freq;
|
|
||||||
if (num_samples < frame_length) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
|
|
||||||
feats->Rsize(num_frames * Dim());
|
|
||||||
|
|
||||||
Vector<BaseFloat> window;
|
|
||||||
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
|
|
||||||
for (int32 frame = 0; frame < num_frames; frame++) {
|
|
||||||
BaseFloat raw_log_energy = 0.0;
|
|
||||||
kaldi::ExtractWindow(0,
|
|
||||||
waves,
|
|
||||||
frame,
|
|
||||||
frame_opts,
|
|
||||||
window_function_,
|
|
||||||
&window,
|
|
||||||
need_raw_log_energy ? &raw_log_energy : NULL);
|
|
||||||
|
|
||||||
|
|
||||||
Vector<BaseFloat> this_feature(computer_.Dim(), kUndefined);
|
|
||||||
// note: this online feature-extraction code does not support VTLN.
|
|
||||||
BaseFloat vtln_warp = 1.0;
|
|
||||||
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
|
|
||||||
SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
|
|
||||||
output_row.CopyFromVec(this_feature);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,75 +0,0 @@
|
|||||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "kaldi/feat/feature-mfcc.h"
|
|
||||||
#include "kaldi/matrix/kaldi-vector.h"
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
struct MfccOptions {
|
|
||||||
kaldi::MfccOptions mfcc_opts;
|
|
||||||
kaldi::BaseFloat streaming_chunk; // second
|
|
||||||
|
|
||||||
MfccOptions() : streaming_chunk(0.1), mfcc_opts() {}
|
|
||||||
|
|
||||||
void Register(kaldi::OptionsItf* opts) {
|
|
||||||
opts->Register("streaming-chunk",
|
|
||||||
&streaming_chunk,
|
|
||||||
"streaming chunk size, default: 0.1 sec");
|
|
||||||
mfcc_opts.Register(opts);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
class Mfcc : public FrontendInterface {
|
|
||||||
public:
|
|
||||||
explicit Mfcc(const MfccOptions& opts,
|
|
||||||
unique_ptr<FrontendInterface> base_extractor);
|
|
||||||
|
|
||||||
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
|
|
||||||
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
|
|
||||||
|
|
||||||
// the dim_ is the dim of single frame feature
|
|
||||||
virtual size_t Dim() const { return computer_.Dim(); }
|
|
||||||
|
|
||||||
virtual void SetFinished() { base_extractor_->SetFinished(); }
|
|
||||||
|
|
||||||
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
|
|
||||||
|
|
||||||
virtual void Reset() {
|
|
||||||
base_extractor_->Reset();
|
|
||||||
remained_wav_.Resize(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
|
|
||||||
kaldi::Vector<kaldi::BaseFloat>* feats);
|
|
||||||
|
|
||||||
MfccOptions opts_;
|
|
||||||
std::unique_ptr<FrontendInterface> base_extractor_;
|
|
||||||
|
|
||||||
|
|
||||||
FeatureWindowFunction window_function_;
|
|
||||||
kaldi::MfccComputer computer_;
|
|
||||||
// features_ is the Mfcc or Plp or Fbank features that we have already
|
|
||||||
// computed.
|
|
||||||
kaldi::Vector<kaldi::BaseFloat> features_;
|
|
||||||
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
|
|
||||||
|
|
||||||
DISALLOW_COPY_AND_ASSIGN(Fbank);
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,26 +0,0 @@
|
|||||||
#include "utils/blank_process.h"
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
// Removes every space from `str`, except that one single space is kept
// between two ASCII letters that were separated by one or more spaces in the
// input (so English words stay separated while other text is compacted).
//
// Returns the processed copy; `str` is not modified.
std::string BlankProcess(const std::string& str) {
    // std::isalpha is undefined for negative char values, which is what the
    // bytes of multi-byte UTF-8 text look like when char is signed; go
    // through unsigned char.
    const auto is_alpha = [](char c) {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    };

    std::string out;
    out.reserve(str.size());
    const std::size_t end = str.size();
    std::size_t p = 0;
    bool has_last = false;  // whether any character has been emitted yet
    char last = '\0';       // last character appended to the output

    while (p != end) {
        while (p != end && str[p] == ' ') {
            ++p;
        }
        if (p == end) {
            break;
        }
        // add a space when the last and current chars are in English and
        // there have space(s) between them
        if (has_last && is_alpha(str[p]) && is_alpha(last) &&
            str[p - 1] == ' ') {
            out += ' ';
        }
        out += str[p];
        last = str[p];
        has_last = true;
        ++p;
    }
    return out;
}
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,9 +0,0 @@
|
|||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <cctype>
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
// Returns a copy of `str` with its spaces removed, keeping a single space
// between two ASCII letters that had space(s) between them in the input.
std::string BlankProcess(const std::string& str);
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,74 +0,0 @@
|
|||||||
#include "utils/text_process.h"
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
// Deletes the spaces of `str`. The only spaces that survive are single
// spaces between two ASCII letters that were separated by one or more spaces
// in the input, so that English words are not glued together.
//
// Returns the processed copy; `str` is not modified.
std::string DelBlank(const std::string& str) {
    // Cast through unsigned char: std::isalpha is undefined for the negative
    // values that signed char produces for non-ASCII (e.g. UTF-8) bytes.
    const auto is_letter = [](char c) {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    };

    std::string out;
    out.reserve(str.size());
    const std::size_t end = str.size();
    std::size_t ptr_in = 0;   // index into the input string (for traversal)
    bool wrote_any = false;   // true once the output holds a character
    char last_out = '\0';     // last character copied to the output

    while (ptr_in != end) {
        while (ptr_in != end && str[ptr_in] == ' ') {
            ++ptr_in;
        }
        if (ptr_in == end) {
            break;
        }
        // add a space when the last and current chars are in English and
        // there have space(s) between them
        const bool both_letters =
            wrote_any && is_letter(str[ptr_in]) && is_letter(last_out);
        if (both_letters && str[ptr_in - 1] == ' ') {
            out += ' ';
        }
        out += str[ptr_in];
        last_out = str[ptr_in];
        wrote_any = true;
        ++ptr_in;
    }
    return out;
}
|
|
||||||
|
|
||||||
// Surrounds every run of ASCII letters in `str` with spaces.
//
// A space is inserted before the run unless the run is already preceded by a
// space in the input, and a space is always appended after the run. All other
// characters are copied unchanged.
//
// Returns the processed copy; `str` is not modified.
std::string AddBlank(const std::string& str) {
    // std::isalpha must not receive negative values (signed char holding a
    // non-ASCII byte), hence the unsigned char cast.
    const auto is_alpha = [](char c) {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    };

    std::string out;
    const std::size_t end = str.size();
    std::size_t ptr = 0;  // index into the input string

    while (ptr != end) {
        if (!is_alpha(str[ptr])) {
            out += str[ptr];
            ++ptr;
            continue;
        }
        if (ptr == 0 || str[ptr - 1] != ' ') {
            out += ' ';  // add pre-space for an English word
        }
        // Copy the whole word; the explicit bound keeps the scan inside the
        // string instead of relying on the terminating '\0'.
        while (ptr != end && is_alpha(str[ptr])) {
            out += str[ptr];
            ++ptr;
        }
        out += ' ';  // add post-space for an English word
    }
    return out;
}
|
|
||||||
|
|
||||||
// Swaps numerator and denominator of every tagged fraction in `str`.
//
// A fraction is written as "<tag>num1/num2<tag>" (the same "<tag>" marker
// opens and closes it) and is replaced by "num2/num1" with the markers
// removed. Text outside the markers is copied unchanged.
//
// If an opening marker is not followed by a '/' and a closing marker, the
// remainder of the string (from that opening marker on) is copied verbatim.
std::string ReverseFraction(const std::string& str) {
    const char kTag[] = "<tag>";
    const std::size_t kTagLen = sizeof(kTag) - 1;  // length of "<tag>"
    const std::size_t npos = std::string::npos;

    std::string out;
    const std::size_t end = str.size();
    std::size_t ptr = 0;  // start of the part of `str` not yet copied

    while (ptr != end) {
        // find the position of left tag, '/' and right tag.
        // (xxx<tag>num1/num2<tag>)
        const std::size_t left = str.find(kTag, ptr);
        if (left == npos) {
            break;
        }
        const std::size_t frac = str.find('/', left);
        const std::size_t right =
            (frac == npos) ? npos : str.find(kTag, frac);
        if (right == npos) {
            // Malformed fraction: stop rewriting, keep the tail as it is.
            break;
        }

        out.append(str, ptr, left - ptr);  // content before left tag (xxx)
        out.append(str, frac + 1, right - frac - 1);             // num2
        out += '/';
        out.append(str, left + kTagLen, frac - left - kTagLen);  // num1
        ptr = right + kTagLen;
    }
    if (ptr != end) {
        out.append(str, ptr, end - ptr);
    }
    return out;
}
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
@ -1,13 +0,0 @@
|
|||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <cctype>
|
|
||||||
|
|
||||||
namespace ppspeech {
|
|
||||||
|
|
||||||
// Returns a copy of `str` with its spaces removed, keeping a single space
// between two ASCII letters that had space(s) between them in the input.
std::string DelBlank(const std::string& str);
|
|
||||||
|
|
||||||
// Returns a copy of `str` in which every run of ASCII letters gets a space
// after it, and a space before it unless one is already there.
std::string AddBlank(const std::string& str);
|
|
||||||
|
|
||||||
// Returns a copy of `str` in which every "<tag>num1/num2<tag>" span is
// replaced by "num2/num1".
std::string ReverseFraction(const std::string& str);
|
|
||||||
|
|
||||||
} // namespace ppspeech
|
|
Loading…
Reference in new issue