diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index d693dc41..6808de5e 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -630,4 +630,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index b768c435..5fa81d4b 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -638,4 +638,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 3111badf..0a169f8b 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") - logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") + logger.error( + f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}" + ) return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 98c43c98..4bd4d873 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -83,4 +83,4 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input ``` paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav -``` \ No newline at end of file +``` diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py index e69de29b..97043fd7 100644 --- 
a/paddlespeech/server/engine/vector/__init__.py +++ b/paddlespeech/server/engine/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/python/__init__.py +++ b/paddlespeech/server/engine/vector/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 2fd8dec6..85430370 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle - from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram + from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0..650cb140 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 246e3be4..88c16857 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -5,7 +5,7 @@ set -e . path.sh nj=40 -stage=4 +stage=0 stop_stage=5 . 
utils/parse_options.sh @@ -156,10 +156,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --use_fbank=true \ --to_float32=false \ --param_path=$model_dir/avg_5.jit.pdiparams \ - --word_symbol_table=$graph_dir/words.txt \ + --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_cache_shapes="5-1-2048,5-1-2048" \ - --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:./result_fbank_recognizer diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f53..7aef73f7 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587..67683eeb 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << 
"fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4..bbf0e690 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89..fc57e326 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 @@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca..b2bf1890 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,24 @@ #include "decoder/ctc_tlg_decoder.h" #include 
"frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// to_float32 is no longer a flag; it is derived from use_fbank: +// true for the linear feature, false for fbank. +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +48,14 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, 
"decoder beam"); namespace ppspeech { // todo refactor later @@ -56,22 +63,23 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a865db59..fea9032a 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -102,13 +102,16 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { // note: this online feature-extraction code does not support VTLN. 
RealFft(&window, true); kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); if (!opts_.fbank_opts.use_power) { power_spectrum.ApplyPow(0.5); } - int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); - SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + int32 mel_offset = + ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 + : 0); + SubVector mel_energies( + this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); mel_bank.Compute(power_spectrum, &mel_energies); mel_energies.ApplyFloor(1e-07); mel_energies.ApplyLog(); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 40891871..087de0f0 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -23,13 +23,13 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); unique_ptr base_feature; - + if (opts.use_fbank) { - base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, - std::move(data_source))); + base_feature.reset( + new ppspeech::Fbank(opts.fbank_opts, std::move(data_source))); } else { - base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + base_feature.reset(new ppspeech::LinearSpectrogram( + opts.linear_spectrogram_opts, std::move(data_source))); } unique_ptr cmvn( diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e..6b9b4795 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ 
b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -18,25 +18,25 @@ #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // set to true only for the linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - use_fbank(false), + to_float32(false), // set to true only for the linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {} diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc index 5176dc89..6bd930b8 100644 --- a/speechx/speechx/websocket/websocket_client.cc +++ b/speechx/speechx/websocket/websocket_client.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h index df7395a7..ac0aed31 100644 --- a/speechx/speechx/websocket/websocket_client.h +++ b/speechx/speechx/websocket/websocket_client.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. 
-// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 2a6b8990..28c9eca4 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h index 8856f5d0..9ea88282 100644 --- a/speechx/speechx/websocket/websocket_server.h +++ b/speechx/speechx/websocket/websocket_server.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at