Merge pull request #1848 from zh794390558/spx

[speechx] rm to_float32 flags, default feature fbank
4 years ago · 37c6106ee0
parent a58831ddf7 8522b82999
commit 37c6106ee0
16 changed files with 84 additions and 43 deletions
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor):
        if self.sample_rate != 16000 and self.sample_rate != 8000:
            logger.error(
                "invalid sample rate, please input --sr 8000 or --sr 16000")
-            logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}")
+            logger.error(
                f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}"
            )
            return False
        if isinstance(audio_file, (str, os.PathLike)):
--- a/paddlespeech/server/engine/vector/init.py
+++ b/paddlespeech/server/engine/vector/init.py
@ -0,0 +1,13 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/server/engine/vector/python/init.py
+++ b/paddlespeech/server/engine/vector/python/init.py
@ -0,0 +1,13 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@ -16,9 +16,9 @@ from collections import OrderedDict
 import numpy as np
 import paddle
 from paddleaudio.backends import load as load_audio
 from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.vector.infer import VectorExecutor
 from paddlespeech.server.engine.base_engine import BaseEngine
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --to_float32=true \
        --streaming_chunk=30 \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
--- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
+++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
@ -19,6 +19,7 @@
 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_int32(sample_rate, 16000, "sample rate");
 int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
@ -30,7 +31,8 @@ int main(int argc, char* argv[]) {
    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
        FLAGS_wav_rspecifier);
    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
-    int sample_rate = 16000;
+
    int sample_rate = FLAGS_sample_rate;
    float streaming_chunk = FLAGS_streaming_chunk;
    int chunk_sample_size = streaming_chunk * sample_rate;
    LOG(INFO) << "sr: " << sample_rate;
--- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
@ -69,6 +69,7 @@ int main(int argc, char* argv[]) {
    feat_cache_opts.frame_chunk_stride = 1;
    feat_cache_opts.frame_chunk_size = 1;
    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
    LOG(INFO) << "fbank: " << true;
    LOG(INFO) << "feat dim: " << feature_cache.Dim();
    int sample_rate = 16000;
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
@ -56,6 +56,7 @@ int main(int argc, char* argv[]) {
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
    opt.frame_opts.preemph_coeff = 0.0;
    LOG(INFO) << "linear feature: " << true;
    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
@ -77,7 +78,7 @@ int main(int argc, char* argv[]) {
    int sample_rate = 16000;
    float streaming_chunk = FLAGS_streaming_chunk;
    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "sample rate: " << sample_rate;
    LOG(INFO) << "chunk size (s): " << streaming_chunk;
    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@ -63,7 +63,6 @@ websocket_server_main \
    --cmvn_file=$cmvn \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --streaming_chunk=0.1 \
    --to_float32=true \
    --param_path=$model_dir/avg_1.jit.pdiparams \
    --word_symbol_table=$wfst/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -19,23 +19,24 @@
 #include "decoder/ctc_tlg_decoder.h"
 #include "frontend/audio/feature_pipeline.h"
 // feature
 DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
-DEFINE_bool(to_float32, true, "audio convert to pcm32");
+// feature sliding window
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "max active");
 DEFINE_double(beam, 15.0, "decoder beam");
 DEFINE_double(lattice_beam, 7.5, "decoder beam");
 DEFINE_int32(receptive_field_length,
             7,
             "receptive field of two CNN(kernel=5) downsampling module.");
 DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");
 // nnet
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(
    model_input_names,
    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
@ -47,8 +48,14 @@ DEFINE_string(model_cache_names,
              "chunk_state_h_box,chunk_state_c_box",
              "model cache names");
 DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
-DEFINE_bool(use_fbank, false, "use fbank or linear feature");
+
-DEFINE_int32(num_bins, 161, "num bins of mel");
+// decoder
 DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "max active");
 DEFINE_double(beam, 15.0, "decoder beam");
 DEFINE_double(lattice_beam, 7.5, "decoder beam");
 namespace ppspeech {
 // todo refactor later
@ -56,17 +63,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
    opts.to_float32 = FLAGS_to_float32;
    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
    opts.use_fbank = FLAGS_use_fbank;
    if (opts.use_fbank) {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;
        frame_opts.frame_length_ms = 20;
        frame_opts.window_type = "hanning";
--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@ -107,8 +107,11 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
        if (!opts_.fbank_opts.use_power) {
            power_spectrum.ApplyPow(0.5);
        }
-        int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0);
+        int32 mel_offset =
-        SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
+            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
                                                                           : 0);
        SubVector<BaseFloat> mel_energies(
            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
        mel_bank.Compute(power_spectrum, &mel_energies);
        mel_energies.ApplyFloor(1e-07);
        mel_energies.ApplyLog();
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@ -25,11 +25,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
    unique_ptr<FrontendInterface> base_feature;
    if (opts.use_fbank) {
-        base_feature.reset(new ppspeech::Fbank(opts.fbank_opts,
+        base_feature.reset(
-                              std::move(data_source)));
+            new ppspeech::Fbank(opts.fbank_opts, std::move(data_source)));
    } else {
-        base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
+        base_feature.reset(new ppspeech::LinearSpectrogram(
-                              std::move(data_source)));
+            opts.linear_spectrogram_opts, std::move(data_source)));
    }
    unique_ptr<FrontendInterface> cmvn(
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -18,25 +18,25 @@
 #include "frontend/audio/audio_cache.h"
 #include "frontend/audio/data_cache.h"
 #include "frontend/audio/fbank.h"
 #include "frontend/audio/feature_cache.h"
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/fbank.h"
 #include "frontend/audio/normalizer.h"
 namespace ppspeech {
 struct FeaturePipelineOptions {
    std::string cmvn_file;
-    bool to_float32;
+    bool to_float32;  // true, only for linear feature
    bool use_fbank;
    LinearSpectrogramOptions linear_spectrogram_opts;
    FbankOptions fbank_opts;
    FeatureCacheOptions feature_cache_opts;
    FeaturePipelineOptions()
        : cmvn_file(""),
-          to_float32(false),
+          to_float32(false),  // true, only for linear feature
-          use_fbank(false),
+          use_fbank(true),
          linear_spectrogram_opts(),
          fbank_opts(),
          feature_cache_opts() {}