Merge pull request #1631 from zh794390558/spx

[speechx] openfst patch and glog
4 years ago · 94e5e37b06
parent 602b0b0da3 cb66b742ab
commit 94e5e37b06
11 changed files with 70 additions and 31 deletions
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -11,11 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import librosa
 import numpy as np
 from config import DEFAULT_TABLE
 from logs import LOGGER
 from paddlespeech.cli import VectorExecutor
--- a/speechx/cmake/external/openfst.cmake
+++ b/speechx/cmake/external/openfst.cmake
@@ -13,7 +13,7 @@ ExternalProject_Add(openfst
                      "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
                      "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
                      "LIBS=-lgflags_nothreads -lglog -lpthread"
-  COMMAND           ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
+  COMMAND           ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
  BUILD_COMMAND     make -j 4
 )
 link_directories(${openfst_PREFIX_DIR}/lib)
--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/examples/CMakeLists.txt
@@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 add_subdirectory(feat)
 add_subdirectory(nnet)
 add_subdirectory(decoder)
 add_subdirectory(glog)
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -1,8 +1,9 @@
 # Examples
-* decoder - online decoder to work as offline
+* glog - glog usage
 * feat - mfcc, linear 
 * nnet - ds2 nn
 * decoder - online decoder to work as offline
 ## How to run
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -22,11 +22,12 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "lm.klm", "language model");
 DEFINE_int32(chunk_size, 35, "feat chunk size");
 using kaldi::BaseFloat;
@@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
    std::string model_params = FLAGS_param_path;
    std::string dict_file = FLAGS_dict_file;
    std::string lm_path = FLAGS_lm_path;
    int32 chunk_size = FLAGS_chunk_size;
    LOG(INFO) << "model path: " << model_graph;
    LOG(INFO) << "model param: " << model_params;
    LOG(INFO) << "dict path: " << dict_file;
    LOG(INFO) << "lm path: " << lm_path;
    LOG(INFO) << "chunk size (frame): " << chunk_size;
    int32 num_done = 0, num_err = 0;
-    ppspeech::CTCBeamSearchOptions opts;
+    // frontend + nnet is decodable
    opts.dict_file = dict_file;
    opts.lm_path = lm_path;
    ppspeech::CTCBeamSearch decoder(opts);
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
@@ -60,33 +63,50 @@ int main(int argc, char* argv[]) {
        new ppspeech::RawDataCache());
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data));
    LOG(INFO) << "Init decodeable.";
-    int32 chunk_size = 35;
+    // init decoder
-    decoder.InitDecoder();
+    ppspeech::CTCBeamSearchOptions opts;
    opts.dict_file = dict_file;
    opts.lm_path = lm_path;
    ppspeech::CTCBeamSearch decoder(opts);
    LOG(INFO) << "Init decoder.";
    decoder.InitDecoder();
    for (; !feature_reader.Done(); feature_reader.Next()) {
        string utt = feature_reader.Key();
        const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
        LOG(INFO) << "utt: " << utt;
        // feat dim
        raw_data->SetDim(feature.NumCols());
        LOG(INFO) << "dim: " << raw_data->Dim();
        int32 row_idx = 0;
        int32 num_chunks = feature.NumRows() / chunk_size;
        LOG(INFO) << "n chunks: " << num_chunks;
        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
            // feat chunk
            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
                                                          feature.NumCols());
            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, row_idx);
+                kaldi::SubVector<kaldi::BaseFloat> feat_one_row(feature,
                                                                row_idx);
                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
                    feature_chunk.Data() + row_id * feature.NumCols(),
                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                f_chunk_tmp.CopyFromVec(feat_one_row);
                row_idx++;
            }
            // feed to raw cache
            raw_data->Accept(feature_chunk);
            if (chunk_idx == num_chunks - 1) {
                raw_data->SetFinished();
            }
            // decode step
            decoder.AdvanceDecode(decodable);
        }
        std::string result;
        result = decoder.GetFinalBestPath();
        KALDI_LOG << " the result of " << utt << " is " << result;
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -25,7 +25,10 @@ model_dir=../paddle_asr_model
 feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark
-# 3. run feat
+
 export GLOG_logtostderr=1
 # 3. gen linear feat
 linear_spectrogram_main \
    --wav_rspecifier=scp:$model_dir/wav.scp \
    --feature_wspecifier=ark,t:$feat_wspecifier \
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@
 using namespace kaldi;
 static void UnitTestReadWave() {
    std::cout << "=== UnitTestReadWave() ===\n";
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -25,6 +25,8 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
 #include <glog/logging.h>
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@@ -149,7 +151,7 @@ void WriteMatrix() {
        cmvn_stats(1, idx) = variance_[idx];
    }
    cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
 }
 int main(int argc, char* argv[]) {
@@ -161,43 +163,56 @@ int main(int argc, char* argv[]) {
    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
    WriteMatrix();
-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
+
    // window -->linear_spectrogram --> cmvn
    int32 num_done = 0, num_err = 0;
    // feature pipeline: wave cache --> decibel_normalizer --> hanning
    // window -->linear_spectrogram --> global cmvn -> feat cache
    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
    // ppspeech::RawDataCache());
    std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
        new ppspeech::RawAudioCache());
    ppspeech::DecibelNormalizerOptions db_norm_opt;
    std::unique_ptr<ppspeech::FeatureExtractorInterface> db_norm(
        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
-    std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
    std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt,
+        new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
                                        std::move(base_feature_extractor)));
    std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn(
        new ppspeech::CMVN(FLAGS_cmvn_write_path,
                           std::move(linear_spectrogram)));
    ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
    LOG(INFO) << "feat dim: " << feature_cache.Dim();
    float streaming_chunk = 0.36;
    int sample_rate = 16000;
    float streaming_chunk = 0.36;
    int chunk_sample_size = streaming_chunk * sample_rate;
    LOG(INFO) << "sr: " << sample_rate;
    LOG(INFO) << "chunk size (s): " << streaming_chunk;
    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
    for (; !wav_reader.Done(); wav_reader.Next()) {
        std::string utt = wav_reader.Key();
        const kaldi::WaveData& wave_data = wav_reader.Value();
        LOG(INFO) << "process utt: " << utt;
        int32 this_channel = 0;
        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                    this_channel);
        int tot_samples = waveform.Dim();
        LOG(INFO) << "wav len (sample): " << tot_samples;
        int sample_offset = 0;
        std::vector<kaldi::Vector<BaseFloat>> feats;
        int feature_rows = 0;
@@ -209,6 +224,7 @@ int main(int argc, char* argv[]) {
            for (int i = 0; i < cur_chunk_size; ++i) {
                wav_chunk(i) = waveform(sample_offset + i);
            }
            kaldi::Vector<BaseFloat> features;
            feature_cache.Accept(wav_chunk);
            if (cur_chunk_size < chunk_sample_size) {
--- a/speechx/examples/feat/run.sh
+++ b/speechx/examples/feat/run.sh
@@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark
 # 3. run feat
 export GLOG_logtostderr=1
 linear_spectrogram_main \
    --wav_rspecifier=scp:$model_dir/wav.scp \
    --feature_wspecifier=ark,t:$feat_wspecifier \
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
    // Feed feats or waves
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    // feats dim = num_frames * feature_dim
+    // feats size = num_frames * feat_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // feature cache only cache feature which from base extractor
+    // feat dim
    virtual size_t Dim() const { return base_extractor_->Dim(); }
    virtual void SetFinished() {
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
        data_.Resize(0);
        return true;
    }
-    virtual size_t Dim() const { return dim_; }
+
    virtual void SetFinished() { finished_ = true; }
    virtual bool IsFinished() const { return finished_; }
    virtual size_t Dim() const { return dim_; }
    void SetDim(int32 dim) { dim_ = dim; }
    virtual void Reset() { finished_ = true; }