Merge pull request #1631 from zh794390558/spx

[speechx] openfst patch and glog
r0.2
YangZhou 3 years ago committed by GitHub
commit 94e5e37b06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -11,11 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import librosa
import numpy as np import numpy as np
from config import DEFAULT_TABLE
from logs import LOGGER from logs import LOGGER
from paddlespeech.cli import VectorExecutor from paddlespeech.cli import VectorExecutor

@ -13,7 +13,7 @@ ExternalProject_Add(openfst
"CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
"LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
"LIBS=-lgflags_nothreads -lglog -lpthread" "LIBS=-lgflags_nothreads -lglog -lpthread"
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
BUILD_COMMAND make -j 4 BUILD_COMMAND make -j 4
) )
link_directories(${openfst_PREFIX_DIR}/lib) link_directories(${openfst_PREFIX_DIR}/lib)

@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory(feat) add_subdirectory(feat)
add_subdirectory(nnet) add_subdirectory(nnet)
add_subdirectory(decoder) add_subdirectory(decoder)
add_subdirectory(glog)

@ -1,8 +1,9 @@
# Examples # Examples
* decoder - online decoder to work as offline * glog - glog usage
* feat - mfcc, linear * feat - mfcc, linear
* nnet - ds2 nn * nnet - ds2 nn
* decoder - online decoder to work as offline
## How to run ## How to run

@ -22,11 +22,12 @@
#include "nnet/decodable.h" #include "nnet/decodable.h"
#include "nnet/paddle_nnet.h" #include "nnet/paddle_nnet.h"
DEFINE_string(feature_respecifier, "", "test feature rspecifier"); DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
DEFINE_string(lm_path, "lm.klm", "language model"); DEFINE_string(lm_path, "lm.klm", "language model");
DEFINE_int32(chunk_size, 35, "feat chunk size");
using kaldi::BaseFloat; using kaldi::BaseFloat;
@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
std::string model_params = FLAGS_param_path; std::string model_params = FLAGS_param_path;
std::string dict_file = FLAGS_dict_file; std::string dict_file = FLAGS_dict_file;
std::string lm_path = FLAGS_lm_path; std::string lm_path = FLAGS_lm_path;
int32 chunk_size = FLAGS_chunk_size;
LOG(INFO) << "model path: " << model_graph;
LOG(INFO) << "model param: " << model_params;
LOG(INFO) << "dict path: " << dict_file;
LOG(INFO) << "lm path: " << lm_path;
LOG(INFO) << "chunk size (frame): " << chunk_size;
int32 num_done = 0, num_err = 0; int32 num_done = 0, num_err = 0;
ppspeech::CTCBeamSearchOptions opts; // frontend + nnet is decodable
opts.dict_file = dict_file;
opts.lm_path = lm_path;
ppspeech::CTCBeamSearch decoder(opts);
ppspeech::ModelOptions model_opts; ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph; model_opts.model_path = model_graph;
model_opts.params_path = model_params; model_opts.params_path = model_params;
@ -60,33 +63,50 @@ int main(int argc, char* argv[]) {
new ppspeech::RawDataCache()); new ppspeech::RawDataCache());
std::shared_ptr<ppspeech::Decodable> decodable( std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nnet, raw_data)); new ppspeech::Decodable(nnet, raw_data));
LOG(INFO) << "Init decodeable.";
int32 chunk_size = 35; // init decoder
decoder.InitDecoder(); ppspeech::CTCBeamSearchOptions opts;
opts.dict_file = dict_file;
opts.lm_path = lm_path;
ppspeech::CTCBeamSearch decoder(opts);
LOG(INFO) << "Init decoder.";
decoder.InitDecoder();
for (; !feature_reader.Done(); feature_reader.Next()) { for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key(); string utt = feature_reader.Key();
const kaldi::Matrix<BaseFloat> feature = feature_reader.Value(); const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
LOG(INFO) << "utt: " << utt;
// feat dim
raw_data->SetDim(feature.NumCols()); raw_data->SetDim(feature.NumCols());
LOG(INFO) << "dim: " << raw_data->Dim();
int32 row_idx = 0; int32 row_idx = 0;
int32 num_chunks = feature.NumRows() / chunk_size; int32 num_chunks = feature.NumRows() / chunk_size;
LOG(INFO) << "n chunks: " << num_chunks;
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
// feat chunk
kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size * kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
feature.NumCols()); feature.NumCols());
for (int row_id = 0; row_id < chunk_size; ++row_id) { for (int row_id = 0; row_id < chunk_size; ++row_id) {
kaldi::SubVector<kaldi::BaseFloat> tmp(feature, row_idx); kaldi::SubVector<kaldi::BaseFloat> feat_one_row(feature,
row_idx);
kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp( kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
feature_chunk.Data() + row_id * feature.NumCols(), feature_chunk.Data() + row_id * feature.NumCols(),
feature.NumCols()); feature.NumCols());
f_chunk_tmp.CopyFromVec(tmp); f_chunk_tmp.CopyFromVec(feat_one_row);
row_idx++; row_idx++;
} }
// feed to raw cache
raw_data->Accept(feature_chunk); raw_data->Accept(feature_chunk);
if (chunk_idx == num_chunks - 1) { if (chunk_idx == num_chunks - 1) {
raw_data->SetFinished(); raw_data->SetFinished();
} }
// decode step
decoder.AdvanceDecode(decodable); decoder.AdvanceDecode(decodable);
} }
std::string result; std::string result;
result = decoder.GetFinalBestPath(); result = decoder.GetFinalBestPath();
KALDI_LOG << " the result of " << utt << " is " << result; KALDI_LOG << " the result of " << utt << " is " << result;

@ -25,7 +25,10 @@ model_dir=../paddle_asr_model
feat_wspecifier=./feats.ark feat_wspecifier=./feats.ark
cmvn=./cmvn.ark cmvn=./cmvn.ark
# 3. run feat
export GLOG_logtostderr=1
# 3. gen linear feat
linear_spectrogram_main \ linear_spectrogram_main \
--wav_rspecifier=scp:$model_dir/wav.scp \ --wav_rspecifier=scp:$model_dir/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \ --feature_wspecifier=ark,t:$feat_wspecifier \

@ -41,7 +41,6 @@
using namespace kaldi; using namespace kaldi;
static void UnitTestReadWave() { static void UnitTestReadWave() {
std::cout << "=== UnitTestReadWave() ===\n"; std::cout << "=== UnitTestReadWave() ===\n";

@ -25,6 +25,8 @@
#include "kaldi/util/kaldi-io.h" #include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h" #include "kaldi/util/table-types.h"
#include <glog/logging.h>
DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(wav_rspecifier, "", "test wav scp path");
DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@ -149,7 +151,7 @@ void WriteMatrix() {
cmvn_stats(1, idx) = variance_[idx]; cmvn_stats(1, idx) = variance_[idx];
} }
cmvn_stats(0, mean_.size()) = count_; cmvn_stats(0, mean_.size()) = count_;
kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
@ -161,43 +163,56 @@ int main(int argc, char* argv[]) {
kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
WriteMatrix(); WriteMatrix();
// test feature linear_spectrogram: wave --> decibel_normalizer --> hanning
// window --> linear_spectrogram --> cmvn
int32 num_done = 0, num_err = 0; int32 num_done = 0, num_err = 0;
// feature pipeline: wave cache --> decibel_normalizer --> hanning
// window --> linear_spectrogram --> global cmvn --> feat cache
// std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
// ppspeech::RawDataCache()); // ppspeech::RawDataCache());
std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source( std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
new ppspeech::RawAudioCache()); new ppspeech::RawAudioCache());
ppspeech::DecibelNormalizerOptions db_norm_opt;
std::unique_ptr<ppspeech::FeatureExtractorInterface> db_norm(
new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
ppspeech::LinearSpectrogramOptions opt; ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10; opt.frame_opts.frame_shift_ms = 10;
ppspeech::DecibelNormalizerOptions db_norm_opt; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor( LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram( std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
new ppspeech::LinearSpectrogram(opt, new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
std::move(base_feature_extractor)));
std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn( std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn(
new ppspeech::CMVN(FLAGS_cmvn_write_path, new ppspeech::CMVN(FLAGS_cmvn_write_path,
std::move(linear_spectrogram))); std::move(linear_spectrogram)));
ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
LOG(INFO) << "feat dim: " << feature_cache.Dim();
float streaming_chunk = 0.36;
int sample_rate = 16000; int sample_rate = 16000;
float streaming_chunk = 0.36;
int chunk_sample_size = streaming_chunk * sample_rate; int chunk_sample_size = streaming_chunk * sample_rate;
LOG(INFO) << "sr: " << sample_rate;
LOG(INFO) << "chunk size (s): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
for (; !wav_reader.Done(); wav_reader.Next()) { for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key(); std::string utt = wav_reader.Key();
const kaldi::WaveData& wave_data = wav_reader.Value(); const kaldi::WaveData& wave_data = wav_reader.Value();
LOG(INFO) << "process utt: " << utt;
int32 this_channel = 0; int32 this_channel = 0;
kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(), kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
this_channel); this_channel);
int tot_samples = waveform.Dim(); int tot_samples = waveform.Dim();
LOG(INFO) << "wav len (sample): " << tot_samples;
int sample_offset = 0; int sample_offset = 0;
std::vector<kaldi::Vector<BaseFloat>> feats; std::vector<kaldi::Vector<BaseFloat>> feats;
int feature_rows = 0; int feature_rows = 0;
@ -209,6 +224,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < cur_chunk_size; ++i) { for (int i = 0; i < cur_chunk_size; ++i) {
wav_chunk(i) = waveform(sample_offset + i); wav_chunk(i) = waveform(sample_offset + i);
} }
kaldi::Vector<BaseFloat> features; kaldi::Vector<BaseFloat> features;
feature_cache.Accept(wav_chunk); feature_cache.Accept(wav_chunk);
if (cur_chunk_size < chunk_sample_size) { if (cur_chunk_size < chunk_sample_size) {

@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
cmvn=./cmvn.ark cmvn=./cmvn.ark
# 3. run feat # 3. run feat
export GLOG_logtostderr=1
linear_spectrogram_main \ linear_spectrogram_main \
--wav_rspecifier=scp:$model_dir/wav.scp \ --wav_rspecifier=scp:$model_dir/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \ --feature_wspecifier=ark,t:$feat_wspecifier \

@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
// Feed feats or waves // Feed feats or waves
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs); virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// feats dim = num_frames * feature_dim // feats size = num_frames * feat_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// feature cache only caches features that come from the base extractor // feat dim
virtual size_t Dim() const { return base_extractor_->Dim(); } virtual size_t Dim() const { return base_extractor_->Dim(); }
virtual void SetFinished() { virtual void SetFinished() {

@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
data_.Resize(0); data_.Resize(0);
return true; return true;
} }
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { finished_ = true; } virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; } virtual bool IsFinished() const { return finished_; }
virtual size_t Dim() const { return dim_; }
void SetDim(int32 dim) { dim_ = dim; } void SetDim(int32 dim) { dim_ = dim; }
virtual void Reset() { finished_ = true; } virtual void Reset() { finished_ = true; }

Loading…
Cancel
Save