From 5d5266abff63a32c8f1c97351a299371b4b40abc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:36:37 +0000 Subject: [PATCH 1/5] rm to_float32 flags, default is fbank --- speechx/examples/ds2_ol/aishell/run.sh | 1 - .../ds2_ol/decoder/recognizer_test_main.cc | 4 ++- .../ds2_ol/feat/compute_fbank_main.cc | 1 + .../feat/linear-spectrogram-wo-db-norm-ol.cc | 3 +- .../ds2_ol/websocket/websocket_server.sh | 1 - speechx/speechx/decoder/param.h | 31 ++++++++++++------- .../speechx/frontend/audio/feature_pipeline.h | 6 ++-- 7 files changed, 28 insertions(+), 19 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0..650cb140 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f53..476fac05 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int 
chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587..67683eeb 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4..bbf0e690 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89..fc57e326 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 
@@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca..9905bc6e 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,23 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +47,14 @@ DEFINE_string(model_cache_names, 
"chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, "decoder beam"); namespace ppspeech { // todo refactor later @@ -56,17 +62,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { + opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { + opts.to_float32 = true; frame_opts.remove_dc_offset = false; frame_opts.frame_length_ms = 20; frame_opts.window_type = "hanning"; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e..1acf62a9 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -28,15 +28,15 @@ namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - 
use_fbank(false), + to_float32(false), // true, only for linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {} From 8522b8299971e1d86ae6e474f656ea69c25f0060 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:40:21 +0000 Subject: [PATCH 2/5] format --- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- paddlespeech/cli/vector/infer.py | 4 ++- paddlespeech/server/README_cn.md | 2 +- paddlespeech/server/engine/vector/__init__.py | 13 ++++++++++ .../server/engine/vector/python/__init__.py | 13 ++++++++++ .../engine/vector/python/vector_engine.py | 2 +- .../ds2_ol/decoder/recognizer_test_main.cc | 2 +- speechx/speechx/decoder/param.h | 25 ++++++++++--------- speechx/speechx/frontend/audio/fbank.cc | 11 +++++--- .../frontend/audio/feature_pipeline.cc | 10 ++++---- .../speechx/frontend/audio/feature_pipeline.h | 6 ++--- 12 files changed, 62 insertions(+), 30 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index d693dc41..6808de5e 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -630,4 +630,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index b768c435..5fa81d4b 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -638,4 +638,4 @@ bash server.sh [2022-05-02 18:29:26,566] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康。 ``` - \ No newline at end of file + diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 3111badf..0a169f8b 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -437,7 +437,9 @@ class VectorExecutor(BaseExecutor): if self.sample_rate != 
16000 and self.sample_rate != 8000: logger.error( "invalid sample rate, please input --sr 8000 or --sr 16000") - logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}") + logger.error( + f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}" + ) return False if isinstance(audio_file, (str, os.PathLike)): diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 010d3d51..a974d40f 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -82,4 +82,4 @@ paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input ``` paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav -``` \ No newline at end of file +``` diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/__init__.py +++ b/paddlespeech/server/engine/vector/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py index e69de29b..97043fd7 100644 --- a/paddlespeech/server/engine/vector/python/__init__.py +++ b/paddlespeech/server/engine/vector/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 2fd8dec6..85430370 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -16,9 +16,9 @@ from collections import OrderedDict import numpy as np import paddle - from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram + from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 476fac05..7aef73f7 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -31,7 +31,7 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter 
result_writer(FLAGS_result_wspecifier); - + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 9905bc6e..b2bf1890 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -21,7 +21,8 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear +// feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); @@ -67,18 +68,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; 
opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a865db59..fea9032a 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -102,13 +102,16 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { // note: this online feature-extraction code does not support VTLN. RealFft(&window, true); kaldi::ComputePowerSpectrum(&window); - const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); - SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); if (!opts_.fbank_opts.use_power) { power_spectrum.ApplyPow(0.5); } - int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); - SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + int32 mel_offset = + ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 
1 + : 0); + SubVector mel_energies( + this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); mel_bank.Compute(power_spectrum, &mel_energies); mel_energies.ApplyFloor(1e-07); mel_energies.ApplyLog(); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 40891871..087de0f0 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -23,13 +23,13 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); unique_ptr base_feature; - + if (opts.use_fbank) { - base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, - std::move(data_source))); + base_feature.reset( + new ppspeech::Fbank(opts.fbank_opts, std::move(data_source))); } else { - base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + base_feature.reset(new ppspeech::LinearSpectrogram( + opts.linear_spectrogram_opts, std::move(data_source))); } unique_ptr cmvn( diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 1acf62a9..6b9b4795 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -18,24 +18,24 @@ #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" -#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; // true, only for linear feature + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions 
feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), // true, only for linear feature + to_float32(false), // true, only for linear feature use_fbank(true), linear_spectrogram_opts(), fbank_opts(), From 491f2d040b9bfc04054d80dccc3680f0ae9d21af Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:14:30 +0800 Subject: [PATCH 3/5] fix typo --- speechx/examples/ds2_ol/aishell/run_fbank.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 246e3be4..88c16857 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -5,7 +5,7 @@ set -e . path.sh nj=40 -stage=4 +stage=0 stop_stage=5 . utils/parse_options.sh @@ -156,10 +156,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --use_fbank=true \ --to_float32=false \ --param_path=$model_dir/avg_5.jit.pdiparams \ - --word_symbol_table=$graph_dir/words.txt \ + --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_cache_shapes="5-1-2048,5-1-2048" \ - --graph_path=$graph_dir/TLG.fst --max_active=7500 \ + --graph_path=$wfst/TLG.fst --max_active=7500 \ --acoustic_scale=1.2 \ --result_wspecifier=ark,t:./result_fbank_recognizer From a3eaf16f848ed216eb4e3d386ca61cc2ae812cdf Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Fri, 6 May 2022 11:41:30 +0800 Subject: [PATCH 4/5] fix copyright issue --- speechx/speechx/websocket/websocket_client.cc | 4 ++-- speechx/speechx/websocket/websocket_client.h | 4 ++-- speechx/speechx/websocket/websocket_server.cc | 4 ++-- speechx/speechx/websocket/websocket_server.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc index 5176dc89..6bd930b8 100644 --- a/speechx/speechx/websocket/websocket_client.cc +++ 
b/speechx/speechx/websocket/websocket_client.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h index df7395a7..ac0aed31 100644 --- a/speechx/speechx/websocket/websocket_client.h +++ b/speechx/speechx/websocket/websocket_client.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 2a6b8990..28c9eca4 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. -// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h index 8856f5d0..9ea88282 100644 --- a/speechx/speechx/websocket/websocket_server.h +++ b/speechx/speechx/websocket/websocket_server.h @@ -1,5 +1,5 @@ -// Copyright (c) 2022 PaddlePaddle Wenet Authors. All Rights Reserved. 
-// +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// 2022 PaddlePaddle Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at From e87495f04562e7dfa93fc952200bef3924e95fa4 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Fri, 6 May 2022 14:13:31 +0800 Subject: [PATCH 5/5] [server] update readme (#1851) * update readme, test=doc * update readme, test=doc * update readme, test=doc --- demos/speech_server/README.md | 11 +++++---- demos/speech_server/README_cn.md | 19 +++++++------- demos/speech_server/conf/application.yaml | 6 ++--- demos/streaming_tts_server/README.md | 6 ++++- demos/streaming_tts_server/README_cn.md | 30 +++++++++++++---------- paddlespeech/server/README.md | 4 ++- paddlespeech/server/README_cn.md | 1 + 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 0323d398..3df93238 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File @@ -18,6 +18,7 @@ The configuration file can be found in `conf/application.yaml` . Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to sppech) and cls (audio classification). 
Currently the engine type supports two forms: python and inference (Paddle Inference) +**Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -51,8 +52,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -74,8 +75,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 4a7c7447..55fc6b34 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -1,17 +1,17 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 语音服务 ## 介绍 -这个demo是一个启动语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +这个demo是一个启动离线语音服务和访问服务的实现。它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 +你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 @@ -19,9 +19,10 @@ 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) +**注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 -这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 +ASR client 的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 可以下载此 ASR client的示例音频: ```bash @@ -52,8 +53,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete. 
- INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` @@ -75,8 +76,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup. INFO: Application startup complete. [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete. - INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) - [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) + INFO: Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) + [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://127.0.0.1:8090 (Press CTRL+C to quit) ``` diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 2b1a0599..762f4af6 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference'] +protocol: 'http' engine_list: ['asr_python', 'tts_python', 'cls_python'] diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index d03b9e28..299aa3d2 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -10,7 +10,7 @@ This demo is an implementation of starting the streaming speech synthesis servic ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -It is recommended to use **paddlepaddle 2.2.1** or above. +It is recommended to use **paddlepaddle 2.2.2** or above. You can choose one way from meduim and hard to install paddlespeech. @@ -29,6 +29,8 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. - When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan +- **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. + ### 3. 
Streaming speech synthesis server and client using http protocol @@ -120,6 +122,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: ```bash @@ -254,6 +257,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + - `spk_id, speed, volume, sample_rate` do not take effect in streaming speech synthesis service temporarily. Output: diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e40de11b..bb159503 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -10,25 +10,27 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +推荐使用 **paddlepaddle 2.2.2** 或以上版本。 你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- `protocol` 表示该流式 TTS 服务使用的网络协议,目前支持 **http 和 websocket** 两种。 +- `engine_list` 表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该 demo 主要介绍流式语音合成服务,因此语音任务应设置为 tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用 onnxruntime 进行推理的引擎。其中,online-onnx 的推理速度更快。 +- 流式 TTS 引擎的 AM 模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** +- 流式 am 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `am_block` 表示 chunk 中的有效帧数,`am_pad` 表示一个 chunk 中 am_block 前后各加的帧数。am_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2 不支持流式 am 推理,因此 am_pad 与 am_block 对它无效 + - fastspeech2_cnndecoder 支持流式推理,当 am_pad=12 时,流式推理合成音频与非流式合成音频一致 +- 流式 voc 推理中,每次会对一个 chunk 的数据进行推理以达到流式的效果。其中 `voc_block` 表示chunk中的有效帧数,`voc_pad` 表示一个 chunk 中 voc_block 前后各加的帧数。voc_pad 的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式 voc 推理 + - 当 voc 模型为 mb_melgan,当 voc_pad=14 时,流式推理合成音频与非流式合成音频一致;voc_pad 最小可以设置为7,合成音频听感上没有异常,若 voc_pad 小于7,合成音频听感上存在异常。 + - 当 voc 模型为 hifigan,当 
voc_pad=20 时,流式推理合成音频与非流式合成音频一致;当 voc_pad=14 时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 + ### 3. 使用http协议的流式语音合成服务端及客户端使用方法 #### 3.1 服务端使用方法 @@ -119,6 +121,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: @@ -254,6 +257,7 @@ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + - `spk_id, speed, volume, sample_rate` 在流式语音合成服务中暂时不生效。 输出: diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index 98ec1e28..f3dc9224 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -10,7 +10,9 @@ paddlespeech_server help ``` ### Start the server - First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started + First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started. + **Note:** If the service can be started normally in the container, but the client access IP is unreachable, you can try to replace the `host` address in the configuration file with the local IP address. 
+ Then start the service: ```bash paddlespeech_server start --config_file ./conf/application.yaml diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index a974d40f..4bd4d873 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -11,6 +11,7 @@ ``` ### 启动服务 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 + **注意:** 如果在容器里可正常启动服务,但客户端访问 ip 不可达,可尝试将配置文件中 `host` 地址换成本地 ip 地址。 然后启动服务: ```bash paddlespeech_server start --config_file ./conf/application.yaml