From 5d5266abff63a32c8f1c97351a299371b4b40abc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:36:37 +0000 Subject: [PATCH 1/5] rm to_float32 flags, default is fbank --- speechx/examples/ds2_ol/aishell/run.sh | 1 - .../ds2_ol/decoder/recognizer_test_main.cc | 4 ++- .../ds2_ol/feat/compute_fbank_main.cc | 1 + .../feat/linear-spectrogram-wo-db-norm-ol.cc | 3 +- .../ds2_ol/websocket/websocket_server.sh | 1 - speechx/speechx/decoder/param.h | 31 ++++++++++++------- .../speechx/frontend/audio/feature_pipeline.h | 6 ++-- 7 files changed, 28 insertions(+), 19 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0..650cb140 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f53..476fac05 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int 
chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587..67683eeb 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4..bbf0e690 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89..fc57e326 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 
@@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca..9905bc6e 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,23 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +47,14 @@ DEFINE_string(model_cache_names, 
"chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, "decoder beam"); namespace ppspeech { // todo refactor later @@ -56,17 +62,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { + opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { + opts.to_float32 = true; frame_opts.remove_dc_offset = false; frame_opts.frame_length_ms = 20; frame_opts.window_type = "hanning"; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e..1acf62a9 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -28,15 +28,15 @@ namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - 
use_fbank(false), + to_float32(false), // true, only for linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {} From 8522b8299971e1d86ae6e474f656ea69c25f0060 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:40:21 +0000 Subject: [PATCH 2/5] format --- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- paddlespeech/cli/vector/infer.py | 4 ++- paddlespeech/server/README_cn.md | 2 +- paddlespeech/server/engine/vector/__init__.py | 13 ++++++++++ .../server/engine/vector/python/__init__.py | 13 ++++++++++ .../engine/vector/python/vector_engine.py | 2 +- .../ds2_ol/decoder/recognizer_test_main.cc | 2 +- speechx/speechx/decoder/param.h | 25 ++++++++++--------- speechx/speechx/frontend/audio/fbank.cc | 11 +++++--- .../frontend/audio/feature_pipeline.cc | 10 ++++---- .../speechx/frontend/audio/feature_pipeline.h | 6 ++--- 12 files changed, 62 insertions(+), 30 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index d693dc41..6808de5e 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -630,4 +630,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index b768c435..5fa81d4b 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -638,4 +638,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 3111badf..0a169f8b 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 
16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") - logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") + logger.error( + f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}" + ) return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 010d3d51..a974d40f 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -82,4 +82,4 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input ``` paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav -``` \ No newline at end of file +``` diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/__init__.py +++ b/paddlespeech/server/engine/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/python/__init__.py +++ b/paddlespeech/server/engine/vector/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 2fd8dec6..85430370 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle - from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram + from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 476fac05..7aef73f7 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -31,7 +31,7 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter 
result_writer(FLAGS_result_wspecifier); - + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 9905bc6e..b2bf1890 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -21,7 +21,8 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear +// feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); @@ -67,18 +68,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; 
opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a865db59..fea9032a 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -102,13 +102,16 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { // note: this online feature-extraction code does not support VTLN. RealFft(&window, true); kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); if (!opts_.fbank_opts.use_power) { power_spectrum.ApplyPow(0.5); } - int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); - SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + int32 mel_offset = + ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 
1 + : 0); + SubVector mel_energies( + this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); mel_bank.Compute(power_spectrum, &mel_energies); mel_energies.ApplyFloor(1e-07); mel_energies.ApplyLog(); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 40891871..087de0f0 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -23,13 +23,13 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); unique_ptr base_feature; - + if (opts.use_fbank) { - base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, - std::move(data_source))); + base_feature.reset( + new ppspeech::Fbank(opts.fbank_opts, std::move(data_source))); } else { - base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + base_feature.reset(new ppspeech::LinearSpectrogram( + opts.linear_spectrogram_opts, std::move(data_source))); } unique_ptr cmvn( diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 1acf62a9..6b9b4795 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -18,24 +18,24 @@ #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; // true, only for linear feature + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions 
feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), // true, only for linear feature + to_float32(false), // true, only for linear feature use_fbank(true), linear_spectrogram_opts(), fbank_opts(), From 491f2d040b9bfc04054d80dccc3680f0ae9d21af Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:14:30 +0800 Subject: [PATCH 3/5] fix typo --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 246e3be4..88c16857 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -5,7 +5,7 @@ set -e . path.sh nj=40 -stage=4 +stage=0 stop_stage=5 . utils/parse_options.sh @@ -156,10 +156,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --use_fbank=true \ --to_float32=false \ --param_path=$model_dir/avg_5.jit.pdiparams \ - --word_symbol_table=$graph_dir/words.txt \ + --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_cache_shapes="5-1-2048,5-1-2048" \ - --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:./result_fbank_recognizer From a3eaf16f848ed216eb4e3d386ca61cc2ae812cdf Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:41:30 +0800 Subject: [PATCH 4/5] fix copyright issue --- speechx/speechx/websocket/websocket_client.cc | 4 ++-- speechx/speechx/websocket/websocket_client.h | 4 ++-- speechx/speechx/websocket/websocket_server.cc | 4 ++-- speechx/speechx/websocket/websocket_server.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc index 5176dc89..6bd930b8 100644 --- a/speechx/speechx/websocket/websocket_client.cc +++ 
b/speechx/speechx/websocket/websocket_client.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h index df7395a7..ac0aed31 100644 --- a/speechx/speechx/websocket/websocket_client.h +++ b/speechx/speechx/websocket/websocket_client.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 2a6b8990..28c9eca4 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h index 8856f5d0..9ea88282 100644 --- a/speechx/speechx/websocket/websocket_server.h +++ b/speechx/speechx/websocket/websocket_server.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. 
-// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at From e87495f04562e7dfa93fc952200bef3924e95fa4 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Fri, 6 May 2022 14:13:31 +0800 Subject: [PATCH 5/5] [server] update readme (#1851) * update readme, test=doc * update readme, test=doc * update readme, test=doc --- demos/speech_server/README.md | 11 +++++---- demos/speech_server/README_cn.md | 19 +++++++------- demos/speech_server/conf/application.yaml | 6 ++--- demos/streaming_tts_server/README.md | 6 ++++- demos/streaming_tts_server/README_cn.md | 30 +++++++++++++---------- paddlespeech/server/README.md | 4 ++- paddlespeech/server/README_cn.md | 1 + 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 0323d398..3df93238 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File @@ -18,6 +18,7 @@ The configuration file can be found in `conf/application.yaml` . Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to sppech) and cls (audio classification). 
Currently the engine type supports two forms: python and inference (Paddle Inference) +**Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -51,8 +52,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -74,8 +75,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 4a7c7447..55fc6b34 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -1,17 +1,17 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 语音服务 ## 介绍 -这个demo是一个启动语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +这个demo是一个启动离线语音服务和访问服务的实现。它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 +你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 @@ -19,9 +19,10 @@ 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) +**注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 -这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +ASR client 的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 ASR client的示例音频: ```bash @@ -52,8 +53,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -75,8 +76,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 2b1a0599..762f4af6 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] +protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python'] diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index d03b9e28..299aa3d2 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the streaming speech synthesis servic ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. @@ -29,6 +29,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. - When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan +- **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. + ### 3. 
Streaming speech synthesis server and client using http protocol @@ -120,6 +122,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: ```bash @@ -254,6 +257,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e40de11b..bb159503 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -10,25 +10,27 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- `protocol` 表示该流式 TTS 服务使用的网络协议,目前支持 **http 和 websocket** 两种。 +- `engine_list` 表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该 demo 主要介绍流式语音合成服务,因此语音任务应设置为 tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用 onnxruntime 进行推理的引擎。其中,online-onnx 的推理速度更快。 +- 流式 TTS 引擎的 AM 模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** +- 流式 am 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `am_block` 表示 chunk 中的有效帧数,`am_pad` 表示一个 chunk 中 am_block 前后各加的帧数。am_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2 不支持流式 am 推理,因此 am_pad 与 am_block 对它无效 + - fastspeech2_cnndecoder 支持流式推理,当 am_pad=12 时,流式推理合成音频与非流式合成音频一致 +- 流式 voc 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `voc_block` 表示chunk中的有效帧数,`voc_pad` 表示一个 chunk 中 voc_block 前后各加的帧数。voc_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式 voc 推理 + - 当 voc 模型为 mb_melgan,当 voc_pad=14 时,流式推理合成音频与非流式合成音频一致;voc_pad 最小可以设置为7,合成音频听感上没有异常,若 voc_pad 小于7,合成音频听感上存在异常。 + - 当 voc 模型为 hifigan,当 
voc_pad=20 时,流式推理合成音频与非流式合成音频一致;当 voc_pad=14 时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 + ### 3. 使用http协议的流式语音合成服务端及客户端使用方法 #### 3.1 服务端使用方法 @@ -119,6 +121,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: @@ -254,6 +257,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index 98ec1e28..f3dc9224 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -10,7 +10,9 @@ paddlespeech_server help ``` ### Start the server - First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started + First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started. + **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. 
+ Then start the service: ```bash paddlespeech_server start --config_file ./conf/application.yaml diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index a974d40f..4bd4d873 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -11,6 +11,7 @@ ``` ### 启动服务 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 + **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 然后启动服务: ```bash paddlespeech_server start --config_file ./conf/application.yaml