diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py index cf5f29a4..35805784 100644 --- a/demos/audio_searching/src/encode.py +++ b/demos/audio_searching/src/encode.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os - -import librosa import numpy as np -from config import DEFAULT_TABLE from logs import LOGGER from paddlespeech.cli import VectorExecutor diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/external/openfst.cmake index dc9cdff6..9acf530a 100644 --- a/speechx/cmake/external/openfst.cmake +++ b/speechx/cmake/external/openfst.cmake @@ -13,7 +13,7 @@ ExternalProject_Add(openfst "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" "LIBS=-lgflags_nothreads -lglog -lpthread" - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} BUILD_COMMAND make -j 4 ) link_directories(${openfst_PREFIX_DIR}/lib) diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt index ef0a72b8..7f1543c2 100644 --- a/speechx/examples/CMakeLists.txt +++ b/speechx/examples/CMakeLists.txt @@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) add_subdirectory(feat) add_subdirectory(nnet) add_subdirectory(decoder) + +add_subdirectory(glog) \ No newline at end of file diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 941c4272..705ca200 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,8 +1,9 @@ # Examples -* decoder - online decoder to work as offline +* glog - glog usage * feat - mfcc, linear * nnet - ds2 nn +* decoder - online decoder to work as offline ## How to run diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index 44127c73..3a94cc94 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -22,11 +22,12 @@ #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" -DEFINE_string(feature_respecifier, "", "test feature rspecifier"); +DEFINE_string(feature_respecifier, "", "feature matrix rspecifier"); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); DEFINE_string(lm_path, "lm.klm", "language model"); +DEFINE_int32(chunk_size, 35, "feat chunk size"); using kaldi::BaseFloat; @@ -43,14 +44,16 @@ int main(int argc, char* argv[]) { std::string model_params = FLAGS_param_path; std::string dict_file = FLAGS_dict_file; std::string lm_path = FLAGS_lm_path; + int32 chunk_size = FLAGS_chunk_size; + LOG(INFO) << "model path: " << model_graph; + LOG(INFO) << "model param: " << model_params; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; + LOG(INFO) << "chunk size (frame): " << chunk_size; int32 num_done = 0, num_err = 0; - ppspeech::CTCBeamSearchOptions opts; - opts.dict_file = dict_file; - opts.lm_path = lm_path; - ppspeech::CTCBeamSearch decoder(opts); - + // frontend + nnet is decodable ppspeech::ModelOptions model_opts; model_opts.model_path = model_graph; model_opts.params_path = model_params; @@ -60,33 +63,50 @@ int main(int argc, char* argv[]) { new ppspeech::RawDataCache()); std::shared_ptr decodable( new ppspeech::Decodable(nnet, raw_data)); + LOG(INFO) << "Init decodeable."; - int32 chunk_size = 35; - decoder.InitDecoder(); + // init decoder + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + LOG(INFO) << "Init decoder."; + decoder.InitDecoder(); for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); const kaldi::Matrix feature = feature_reader.Value(); + LOG(INFO) << "utt: " << utt; + + // feat dim raw_data->SetDim(feature.NumCols()); + LOG(INFO) << "dim: " << raw_data->Dim(); + int32 row_idx = 0; int32 num_chunks = feature.NumRows() / chunk_size; + LOG(INFO) << "n chunks: " << num_chunks; for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + // feat chunk kaldi::Vector feature_chunk(chunk_size * feature.NumCols()); for (int row_id = 0; row_id < chunk_size; ++row_id) { - kaldi::SubVector tmp(feature, row_idx); + kaldi::SubVector feat_one_row(feature, + row_idx); kaldi::SubVector f_chunk_tmp( feature_chunk.Data() + row_id * feature.NumCols(), feature.NumCols()); - f_chunk_tmp.CopyFromVec(tmp); + f_chunk_tmp.CopyFromVec(feat_one_row); row_idx++; } + // feed to raw cache raw_data->Accept(feature_chunk); if (chunk_idx == num_chunks - 1) { raw_data->SetFinished(); } + // decode step decoder.AdvanceDecode(decodable); } + std::string result; result = decoder.GetFinalBestPath(); KALDI_LOG << " the result of " << utt << " is " << result; diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh index fc5e9182..ddda8970 100755 --- a/speechx/examples/decoder/run.sh +++ b/speechx/examples/decoder/run.sh @@ -25,7 +25,10 @@ model_dir=../paddle_asr_model feat_wspecifier=./feats.ark cmvn=./cmvn.ark -# 3. run feat + +export GLOG_logtostderr=1 + +# 3. gen linear feat linear_spectrogram_main \ --wav_rspecifier=scp:$model_dir/wav.scp \ --feature_wspecifier=ark,t:$feat_wspecifier \ @@ -37,4 +40,4 @@ offline_decoder_main \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdparams \ --dict_file=$model_dir/vocab.txt \ - --lm_path=$model_dir/avg_1.jit.klm \ No newline at end of file + --lm_path=$model_dir/avg_1.jit.klm diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc index ae32aba9..48a9e1c2 100644 --- a/speechx/examples/feat/feature-mfcc-test.cc +++ b/speechx/examples/feat/feature-mfcc-test.cc @@ -41,7 +41,6 @@ using namespace kaldi; - static void UnitTestReadWave() { std::cout << "=== UnitTestReadWave() ===\n"; diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index 9ed4d6f9..cde78c4d 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -25,6 +25,8 @@ #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" +#include + DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); @@ -149,7 +151,7 @@ void WriteMatrix() { cmvn_stats(1, idx) = variance_[idx]; } cmvn_stats(0, mean_.size()) = count_; - kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false); } int main(int argc, char* argv[]) { @@ -161,43 +163,56 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); WriteMatrix(); - // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning - // window -->linear_spectrogram --> cmvn + int32 num_done = 0, num_err = 0; + + // feature pipeline: wave cache --> decibel_normalizer --> hanning + // window -->linear_spectrogram --> global cmvn -> feat cache + // std::unique_ptr data_source(new // ppspeech::RawDataCache()); std::unique_ptr data_source( new ppspeech::RawAudioCache()); + ppspeech::DecibelNormalizerOptions db_norm_opt; + std::unique_ptr db_norm( + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; - ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; std::unique_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opt, - std::move(base_feature_extractor))); + new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_write_path, std::move(linear_spectrogram))); ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); + LOG(INFO) << "feat dim: " << feature_cache.Dim(); - float streaming_chunk = 0.36; int sample_rate = 16000; + float streaming_chunk = 0.36; int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "process utt: " << utt; int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + int sample_offset = 0; std::vector> feats; int feature_rows = 0; @@ -209,6 +224,7 @@ int main(int argc, char* argv[]) { for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } + kaldi::Vector features; feature_cache.Accept(wav_chunk); if (cur_chunk_size < chunk_sample_size) { diff --git a/speechx/examples/feat/run.sh b/speechx/examples/feat/run.sh index bd21bd7f..29c49d32 100755 --- a/speechx/examples/feat/run.sh +++ b/speechx/examples/feat/run.sh @@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark cmvn=./cmvn.ark # 3. run feat +export GLOG_logtostderr=1 linear_spectrogram_main \ --wav_rspecifier=scp:$model_dir/wav.scp \ --feature_wspecifier=ark,t:$feat_wspecifier \ diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index b6bbdf3c..f52b9b0f 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface { // Feed feats or waves virtual void Accept(const kaldi::VectorBase& inputs); - // feats dim = num_frames * feature_dim + // feats size = num_frames * feat_dim virtual bool Read(kaldi::Vector* feats); - // feature cache only cache feature which from base extractor + // feat dim virtual size_t Dim() const { return base_extractor_->Dim(); } virtual void SetFinished() { diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index ce75c137..7a28f2c9 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface { data_.Resize(0); return true; } - virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } + virtual size_t Dim() const { return dim_; } void SetDim(int32 dim) { dim_ = dim; } virtual void Reset() { finished_ = true; }