Merge pull request #1848 from zh794390558/spx

[speechx] rm to_float32 flags, default feature fbank
pull/1853/head
Hui Zhang 3 years ago committed by GitHub
commit 37c6106ee0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor):
if self.sample_rate != 16000 and self.sample_rate != 8000: if self.sample_rate != 16000 and self.sample_rate != 8000:
logger.error( logger.error(
"invalid sample rate, please input --sr 8000 or --sr 16000") "invalid sample rate, please input --sr 8000 or --sr 16000")
logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") logger.error(
f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}"
)
return False return False
if isinstance(audio_file, (str, os.PathLike)): if isinstance(audio_file, (str, os.PathLike)):

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -16,9 +16,9 @@ from collections import OrderedDict
import numpy as np import numpy as np
import paddle import paddle
from paddleaudio.backends import load as load_audio from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.cli.log import logger from paddlespeech.cli.log import logger
from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.cli.vector.infer import VectorExecutor
from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.engine.base_engine import BaseEngine

@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--to_float32=true \
--streaming_chunk=30 \ --streaming_chunk=30 \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \

@ -19,6 +19,7 @@
DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_int32(sample_rate, 16000, "sample rate");
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, false); gflags::ParseCommandLineFlags(&argc, &argv, false);
@ -30,7 +31,8 @@ int main(int argc, char* argv[]) {
kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader( kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
FLAGS_wav_rspecifier); FLAGS_wav_rspecifier);
kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
int sample_rate = 16000;
int sample_rate = FLAGS_sample_rate;
float streaming_chunk = FLAGS_streaming_chunk; float streaming_chunk = FLAGS_streaming_chunk;
int chunk_sample_size = streaming_chunk * sample_rate; int chunk_sample_size = streaming_chunk * sample_rate;
LOG(INFO) << "sr: " << sample_rate; LOG(INFO) << "sr: " << sample_rate;

@ -69,6 +69,7 @@ int main(int argc, char* argv[]) {
feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_stride = 1;
feat_cache_opts.frame_chunk_size = 1; feat_cache_opts.frame_chunk_size = 1;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "fbank: " << true;
LOG(INFO) << "feat dim: " << feature_cache.Dim(); LOG(INFO) << "feat dim: " << feature_cache.Dim();
int sample_rate = 16000; int sample_rate = 16000;

@ -56,6 +56,7 @@ int main(int argc, char* argv[]) {
opt.frame_opts.remove_dc_offset = false; opt.frame_opts.remove_dc_offset = false;
opt.frame_opts.window_type = "hanning"; opt.frame_opts.window_type = "hanning";
opt.frame_opts.preemph_coeff = 0.0; opt.frame_opts.preemph_coeff = 0.0;
LOG(INFO) << "linear feature: " << true;
LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
@ -77,7 +78,7 @@ int main(int argc, char* argv[]) {
int sample_rate = 16000; int sample_rate = 16000;
float streaming_chunk = FLAGS_streaming_chunk; float streaming_chunk = FLAGS_streaming_chunk;
int chunk_sample_size = streaming_chunk * sample_rate; int chunk_sample_size = streaming_chunk * sample_rate;
LOG(INFO) << "sr: " << sample_rate; LOG(INFO) << "sample rate: " << sample_rate;
LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (s): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size; LOG(INFO) << "chunk size (sample): " << chunk_sample_size;

@ -63,7 +63,6 @@ websocket_server_main \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=0.1 \ --streaming_chunk=0.1 \
--to_float32=true \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

@ -19,23 +19,24 @@
#include "decoder/ctc_tlg_decoder.h" #include "decoder/ctc_tlg_decoder.h"
#include "frontend/audio/feature_pipeline.h" #include "frontend/audio/feature_pipeline.h"
// feature
DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear
// feature, or fbank");
DEFINE_int32(num_bins, 161, "num bins of mel");
DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
DEFINE_bool(to_float32, true, "audio convert to pcm32"); // feature sliding window
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
DEFINE_string(graph_path, "TLG", "decoder graph");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
DEFINE_int32(max_active, 7500, "max active");
DEFINE_double(beam, 15.0, "decoder beam");
DEFINE_double(lattice_beam, 7.5, "decoder beam");
DEFINE_int32(receptive_field_length, DEFINE_int32(receptive_field_length,
7, 7,
"receptive field of two CNN(kernel=5) downsampling module."); "receptive field of two CNN(kernel=5) downsampling module.");
DEFINE_int32(downsampling_rate, DEFINE_int32(downsampling_rate,
4, 4,
"two CNN(kernel=5) module downsampling rate."); "two CNN(kernel=5) module downsampling rate.");
// nnet
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string( DEFINE_string(
model_input_names, model_input_names,
"audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
@ -47,8 +48,14 @@ DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box", "chunk_state_h_box,chunk_state_c_box",
"model cache names"); "model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
DEFINE_bool(use_fbank, false, "use fbank or linear feature");
DEFINE_int32(num_bins, 161, "num bins of mel"); // decoder
DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
DEFINE_string(graph_path, "TLG", "decoder graph");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
DEFINE_int32(max_active, 7500, "max active");
DEFINE_double(beam, 15.0, "decoder beam");
DEFINE_double(lattice_beam, 7.5, "decoder beam");
namespace ppspeech { namespace ppspeech {
// todo refactor later // todo refactor later
@ -56,17 +63,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
FeaturePipelineOptions opts; FeaturePipelineOptions opts;
opts.cmvn_file = FLAGS_cmvn_file; opts.cmvn_file = FLAGS_cmvn_file;
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
opts.to_float32 = FLAGS_to_float32;
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
frame_opts.dither = 0.0; frame_opts.dither = 0.0;
frame_opts.frame_shift_ms = 10; frame_opts.frame_shift_ms = 10;
opts.use_fbank = FLAGS_use_fbank; opts.use_fbank = FLAGS_use_fbank;
if (opts.use_fbank) { if (opts.use_fbank) {
opts.to_float32 = false;
frame_opts.window_type = "povey"; frame_opts.window_type = "povey";
frame_opts.frame_length_ms = 25; frame_opts.frame_length_ms = 25;
opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
opts.fbank_opts.fbank_opts.frame_opts = frame_opts; opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
} else { } else {
opts.to_float32 = true;
frame_opts.remove_dc_offset = false; frame_opts.remove_dc_offset = false;
frame_opts.frame_length_ms = 20; frame_opts.frame_length_ms = 20;
frame_opts.window_type = "hanning"; frame_opts.window_type = "hanning";

@ -107,8 +107,11 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
if (!opts_.fbank_opts.use_power) { if (!opts_.fbank_opts.use_power) {
power_spectrum.ApplyPow(0.5); power_spectrum.ApplyPow(0.5);
} }
int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); int32 mel_offset =
SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
: 0);
SubVector<BaseFloat> mel_energies(
this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies); mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07); mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog(); mel_energies.ApplyLog();

@ -25,11 +25,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr<FrontendInterface> base_feature; unique_ptr<FrontendInterface> base_feature;
if (opts.use_fbank) { if (opts.use_fbank) {
base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, base_feature.reset(
std::move(data_source))); new ppspeech::Fbank(opts.fbank_opts, std::move(data_source)));
} else { } else {
base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, base_feature.reset(new ppspeech::LinearSpectrogram(
std::move(data_source))); opts.linear_spectrogram_opts, std::move(data_source)));
} }
unique_ptr<FrontendInterface> cmvn( unique_ptr<FrontendInterface> cmvn(

@ -18,25 +18,25 @@
#include "frontend/audio/audio_cache.h" #include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h" #include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h" #include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/normalizer.h" #include "frontend/audio/normalizer.h"
namespace ppspeech { namespace ppspeech {
struct FeaturePipelineOptions { struct FeaturePipelineOptions {
std::string cmvn_file; std::string cmvn_file;
bool to_float32; bool to_float32; // true, only for linear feature
bool use_fbank; bool use_fbank;
LinearSpectrogramOptions linear_spectrogram_opts; LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts; FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts; FeatureCacheOptions feature_cache_opts;
FeaturePipelineOptions() FeaturePipelineOptions()
: cmvn_file(""), : cmvn_file(""),
to_float32(false), to_float32(false), // true, only for linear feature
use_fbank(false), use_fbank(true),
linear_spectrogram_opts(), linear_spectrogram_opts(),
fbank_opts(), fbank_opts(),
feature_cache_opts() {} feature_cache_opts() {}

Loading…
Cancel
Save