From 1113a68a6de08187d7311fabbbbb629fea40b134 Mon Sep 17 00:00:00 2001
From: fazledyn-or
Date: Tue, 3 Oct 2023 19:10:48 +0600
Subject: [PATCH 01/39] FIX: Added missing imports

---
 paddlespeech/audio/transform/perturb.py | 1 +
 paddlespeech/s2t/io/reader.py           | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py
index 0825caec8..78b8d2c34 100644
--- a/paddlespeech/audio/transform/perturb.py
+++ b/paddlespeech/audio/transform/perturb.py
@@ -14,6 +14,7 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 import io
 import os
+import sys
 
 import h5py
 import librosa
diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py
index 5e018befb..be643cc7b 100644
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
@@ -14,9 +14,12 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 from collections import OrderedDict
+import io
+import os
 
 import kaldiio
 import numpy as np
 import soundfile
+import h5py
 
 from .utility import feat_type
 from paddlespeech.audio.transform.transformation import Transformation

From 1a693448faaa07aa54d1a2a1b89c7c7c4d7427a2 Mon Sep 17 00:00:00 2001
From: fazledyn-or
Date: Tue, 3 Oct 2023 19:12:00 +0600
Subject: [PATCH 02/39] FIX: Fixed the implementation of a special method

---
 paddlespeech/audio/transform/perturb.py | 2 +-
 paddlespeech/s2t/io/reader.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py
index 78b8d2c34..757a2f1bf 100644
--- a/paddlespeech/audio/transform/perturb.py
+++ b/paddlespeech/audio/transform/perturb.py
@@ -99,7 +99,7 @@ class SoundHDF5File():
     def __contains__(self, item):
         return item in self.file
 
-    def __len__(self, item):
+    def __len__(self):
         return len(self.file)
 
     def __enter__(self):
diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py
index be643cc7b..d433a643f 100644
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
@@ -404,7 +404,7 @@ class SoundHDF5File():
     def __contains__(self, item):
         return item in self.file
 
-    def __len__(self, item):
+    def __len__(self):
         return len(self.file)
 
     def __enter__(self):

From f2416ff3657a55a54ef540574d6de1cdbf560202 Mon Sep 17 00:00:00 2001
From: luyao-cv <1367355728@qq.com>
Date: Mon, 4 Dec 2023 16:43:45 +0800
Subject: [PATCH 03/39] fix develop bug function:view to reshape (#3633)

---
 paddlespeech/s2t/modules/attention.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 10ab3eaea..7f040d3e2 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -79,9 +79,9 @@ class MultiHeadedAttention(nn.Layer):
         """
         n_batch = query.shape[0]
 
-        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
-        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
-        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        q = self.linear_q(query).reshape([n_batch, -1, self.h, self.d_k])
+        k = self.linear_k(key).reshape([n_batch, -1, self.h, self.d_k])
+        v = self.linear_v(value).reshape([n_batch, -1, self.h, self.d_k])
         q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
         k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
@@ -129,8 +129,8 @@
             p_attn = self.dropout(attn)
 
         x = paddle.matmul(p_attn, value)  # (batch, head, time1, d_k)
-        x = x.transpose([0, 2, 1,
-                         3]).view(n_batch, -1, self.h *
-                                  self.d_k)  # (batch, time1, d_model)
+        x = x.transpose([0, 2, 1, 3]).reshape([n_batch, -1, self.h *
+                                               self.d_k])  # (batch, time1, d_model)
 
         return self.linear_out(x)  # (batch, time1, d_model)
 
@@ -349,7 +349,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
             new_cache = paddle.concat((k, v), axis=-1)
 
         n_batch_pos = pos_emb.shape[0]
-        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
+        p = self.linear_pos(pos_emb).reshape([n_batch_pos, -1, self.h, self.d_k])
         p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
 
         # (batch, head, time1, d_k)
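Patch 03 above (like patch 14 later in this series) swaps Tensor.view for Tensor.reshape to track a behavior change on Paddle's develop branch; reshape takes the target shape as a single list argument. A minimal sketch of the reshape/transpose round-trip used in MultiHeadedAttention, with invented shapes and variable names (requires paddle installed; this is an illustration, not code from the patch):

import paddle

n_batch, time, h, d_k = 2, 5, 4, 8
x = paddle.rand([n_batch, time, h * d_k])
# split (batch, time, d_model) into per-head views, as the patched q/k/v path does
q = x.reshape([n_batch, -1, h, d_k])   # (batch, time, head, d_k)
q = q.transpose([0, 2, 1, 3])          # (batch, head, time, d_k)
# and merge heads back, as the patched output path does
y = q.transpose([0, 2, 1, 3]).reshape([n_batch, -1, h * d_k])
assert y.shape == [n_batch, time, h * d_k]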
From 1b8ca706d6a8e0a8b97ee21d93314a245d777a69 Mon Sep 17 00:00:00 2001
From: gmm <38800877+mmglove@users.noreply.github.com>
Date: Tue, 5 Dec 2023 14:57:20 +0800
Subject: [PATCH 04/39] =?UTF-8?q?=E3=80=90benchmark=E3=80=91fix=20gpu=5Fme?=
 =?UTF-8?q?m=20unit=20(#3634)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix profiler

* add max_mem_reserved for benchmark

* fix benchmark
---
 paddlespeech/t2s/training/trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/t2s/training/trainer.py b/paddlespeech/t2s/training/trainer.py
index 7f1b17de2..7631ef350 100644
--- a/paddlespeech/t2s/training/trainer.py
+++ b/paddlespeech/t2s/training/trainer.py
@@ -164,9 +164,10 @@ class Trainer(object):
                     self.updater.
                     batch_size) + "avg_ips: {:.5f} sequences/sec,".format(
                         self.updater.batch_size / avg_batch_cost)
-                max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved()} B"
-                max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated()} B"
-                msg += max_mem_reserved_str + "," + max_mem_allocated_str
+                if paddle.device.is_compiled_with_cuda():
+                    max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB"
+                    max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
+                    msg += max_mem_reserved_str + "," + max_mem_allocated_str
                 logger.info(msg)

From 39ba32fafb6fc80311ab3f4f6998e52f6583c12e Mon Sep 17 00:00:00 2001
From: Color_yr <402067010@qq.com>
Date: Tue, 16 Jan 2024 20:11:24 +0800
Subject: [PATCH 05/39] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E7=BC=96=E7=A0=81=E8=AF=BB=E5=8F=96=20(#3606)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed #3605
---
 paddlespeech/server/engine/tts/online/onnx/tts_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index 9dd31a08b..14204dde7 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -154,7 +154,7 @@ class TTSServerExecutor(TTSExecutor):
         self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf)
         logger.debug("Create voc sess successfully.")
 
-        with open(self.phones_dict, "r") as f:
+        with open(self.phones_dict, "r", encoding='utf-8') as f:
             phn_id = [line.strip().split() for line in f.readlines()]
         self.vocab_size = len(phn_id)
         logger.debug(f"vocab_size: {self.vocab_size}")

From 02a5f7bce84e331620cf8775254c601627516450 Mon Sep 17 00:00:00 2001
From: JeffLu
Date: Mon, 26 Feb 2024 10:59:55 +0800
Subject: [PATCH 06/39] bugfix: audio_len should be 1D, no 0D, which will
 raise list index out (#3490)

of range error in the following decode process

Co-authored-by:
Luzhenhui --- paddlespeech/cli/asr/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 231a00f4d..4001f957f 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) + audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio From 2147d3b565c7db99dcdaa2db9dae52bee375d0f7 Mon Sep 17 00:00:00 2001 From: satani99 <42287151+satani99@users.noreply.github.com> Date: Mon, 26 Feb 2024 08:30:28 +0530 Subject: [PATCH 07/39] Update README.md (#3532) Fixed a typo --- demos/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/README.md b/demos/README.md index a41967864..6f9cd2e41 100644 --- a/demos/README.md +++ b/demos/README.md @@ -18,4 +18,4 @@ This directory contains many speech applications in multiple scenarios. * style_fs2 - multi style control for FastSpeech2 model * text_to_speech - convert text into speech * self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2 -* Wishper - speech recognize and translate based on Whisper model +* Whisper - speech recognize and translate based on Whisper model From bcbb85af7668a17c6498200f4675a6ac41d868f6 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 23 May 2024 19:34:04 +0800 Subject: [PATCH 08/39] fixed version for paddlepaddle. (#3701) * fixed version for paddlepaddle. * fix code style --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index af7c4dc3d..8e81da6d4 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ base = [ "pandas", "paddleaudio>=1.1.0", "paddlenlp>=2.4.8", + "paddlepaddle==2.5.1", "paddleslim>=2.3.4", "ppdiffusers>=0.9.0", "paddlespeech_feat", From 03022f2170ce76d2ca8385a92aa8df3519e2366b Mon Sep 17 00:00:00 2001 From: mjxs <52824616+kk-2000@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:34:39 +0800 Subject: [PATCH 09/39] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.5?= =?UTF-8?q?=E3=80=91issue=203444=20transformation=20import=20error=20(#377?= =?UTF-8?q?9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix paddlespeech.s2t.transform.transformation import error * fix paddlespeech.s2t.transform import error --- audio/tests/features/base.py | 2 +- audio/tests/features/test_istft.py | 4 ++-- audio/tests/features/test_log_melspectrogram.py | 2 +- audio/tests/features/test_spectrogram.py | 2 +- audio/tests/features/test_stft.py | 2 +- docs/tutorial/asr/tutorial_transformer.ipynb | 4 ++-- utils/apply-cmvn.py | 2 +- utils/compute-cmvn-stats.py | 2 +- utils/copy-feats.py | 2 +- utils/feat-to-shape.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py index d183b72ad..3bb1d1dde 100644 --- a/audio/tests/features/base.py +++ b/audio/tests/features/base.py @@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase): self.waveform, self.sr = load(os.path.abspath(os.path.basename(url))) self.waveform = self.waveform.astype( np.float32 - ) # paddlespeech.s2t.transform.spectrogram only supports float32 + ) # paddlespeech.audio.transform.spectrogram only supports float32 dim = len(self.waveform.shape) assert dim in [1, 2] diff 
--git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py index 9cf8cdd65..ea1ee5cb6 100644 --- a/audio/tests/features/test_istft.py +++ b/audio/tests/features/test_istft.py @@ -18,8 +18,8 @@ import paddle from paddleaudio.functional.window import get_window from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import IStft -from paddlespeech.s2t.transform.spectrogram import Stft +from paddlespeech.audio.transform.spectrogram import IStft +from paddlespeech.audio.transform.spectrogram import Stft class TestIstft(FeatTest): diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py index 7d5680387..b2765d3be 100644 --- a/audio/tests/features/test_log_melspectrogram.py +++ b/audio/tests/features/test_log_melspectrogram.py @@ -18,7 +18,7 @@ import paddle import paddleaudio from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram +from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram class TestLogMelSpectrogram(FeatTest): diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py index 5fe5afee1..6f4609632 100644 --- a/audio/tests/features/test_spectrogram.py +++ b/audio/tests/features/test_spectrogram.py @@ -18,7 +18,7 @@ import paddle import paddleaudio from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import Spectrogram +from paddlespeech.audio.transform.spectrogram import Spectrogram class TestSpectrogram(FeatTest): diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py index 58792ffe2..9511a2926 100644 --- a/audio/tests/features/test_stft.py +++ b/audio/tests/features/test_stft.py @@ -18,7 +18,7 @@ import paddle from paddleaudio.functional.window import get_window from .base import FeatTest -from paddlespeech.s2t.transform.spectrogram import Stft +from paddlespeech.audio.transform.spectrogram import Stft class TestStft(FeatTest): diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb index dc3030061..77aed4bf8 100644 --- a/docs/tutorial/asr/tutorial_transformer.ipynb +++ b/docs/tutorial/asr/tutorial_transformer.ipynb @@ -236,8 +236,8 @@ "warnings.filterwarnings('ignore')\n", "\n", "from yacs.config import CfgNode\n", - "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n", - "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n", + "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n", + "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n", "from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n", "from paddlespeech.s2t.models.u2 import U2Model\n", "\n", diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py index cf91bdfcd..fa69ff8e0 100755 --- a/utils/apply-cmvn.py +++ b/utils/apply-cmvn.py @@ -6,7 +6,7 @@ import kaldiio import numpy from distutils.util import strtobool -from paddlespeech.s2t.transform.cmvn import CMVN +from paddlespeech.audio.transform.cmvn import CMVN from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py index 276bcd36e..763347ce8 100755 --- a/utils/compute-cmvn-stats.py +++ b/utils/compute-cmvn-stats.py @@ -5,7 +5,7 @@ import logging import kaldiio import numpy as np -from 
paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index dc7a70b45..89ea30f97 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -4,7 +4,7 @@ import logging
 
 from distutils.util import strtobool
 
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py
index bbc9242f4..e5e014ded 100755
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@@ -3,7 +3,7 @@ import argparse
 import logging
 import sys
 
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style

From 09e5d8a4ac03f29c2ce6511e1a3c39136cd3e29b Mon Sep 17 00:00:00 2001
From: Mattheliu
Date: Wed, 5 Jun 2024 10:41:32 +0800
Subject: [PATCH 10/39] =?UTF-8?q?=E3=80=90Fix=20Speech=20Issue=20No.8?=
 =?UTF-8?q?=E3=80=91issue=203652=20merge=5Fyi=20function=20has=20a=20bug?=
 =?UTF-8?q?=20(#3786)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug

* 【Fix Speech Issue No.8】issue 3652 merge_yi function has a bug
---
 paddlespeech/t2s/frontend/tone_sandhi.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 690f69aa2..3558064cd 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -237,30 +237,25 @@ class ToneSandhi():
     # output seg: [['听一听', 'v']]
     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
         new_seg = []
+        skip_next = False
         # function 1
         for i, (word, pos) in enumerate(seg):
-            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
-                    0] == seg[i + 1][0] and seg[i - 1][1] == "v":
-                if i - 1 < len(new_seg):
-                    new_seg[i -
-                            1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
-                else:
-                    new_seg.append([word, pos])
-                    new_seg.append([seg[i + 1][0], pos])
+            if skip_next:
+                skip_next = False
+                continue
+            if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+                new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
+                skip_next = True
             else:
-                if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
-                        0] == word and pos == "v":
-                    continue
-                else:
-                    new_seg.append([word, pos])
+                new_seg.append((word, pos))
         seg = new_seg
         new_seg = []
         # function 2
         for i, (word, pos) in enumerate(seg):
             if new_seg and new_seg[-1][0] == "一":
-                new_seg[-1][0] = new_seg[-1][0] + word
+                new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
             else:
-                new_seg.append([word, pos])
+                new_seg.append((word, pos))
         return new_seg
 
     # the first and the second words are all_tone_three
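The rewritten _merge_yi above stops mutating list entries in place and instead rebuilds tuples, using a skip_next flag so a merged "v 一 v" pattern consumes its right-hand neighbor. A minimal, self-contained sketch of the same logic for readers who want to try it outside the ToneSandhi class (the standalone function name and the sample input are illustrative, not from the patch):

from typing import List, Tuple


def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Mirror of the fixed ToneSandhi._merge_yi logic.

    Pass 1 merges "一" sandwiched between a reduplicated verb,
    e.g. [('听', 'v'), ('一', 'm'), ('听', 'v')] -> [('听一听', 'v')].
    Pass 2 glues a dangling "一" onto the following word.
    """
    new_seg: List[Tuple[str, str]] = []
    skip_next = False
    for i, (word, pos) in enumerate(seg):
        if skip_next:
            skip_next = False
            continue
        if (i - 1 >= 0 and word == "一" and i + 1 < len(seg) and
                seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v"):
            # fold "v 一 v" into the previous entry and skip the next token
            new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0],
                           new_seg[-1][1])
            skip_next = True
        else:
            new_seg.append((word, pos))
    merged: List[Tuple[str, str]] = []
    for word, pos in new_seg:
        if merged and merged[-1][0] == "一":
            merged[-1] = (merged[-1][0] + word, merged[-1][1])
        else:
            merged.append((word, pos))
    return merged


if __name__ == "__main__":
    print(merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))

Running this prints [('听一听', 'v')], matching the example in the docstring comment of the patched method.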
From 05660a62cb2f56c1af0773be06a75d8dbc18df20 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Wed, 5 Jun 2024 14:28:14 +0800
Subject: [PATCH 11/39] =?UTF-8?q?=E3=80=90test=E3=80=91add=20cli=20test=20?=
 =?UTF-8?q?readme=20(#3784)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style
---
 tests/unit/cli/test_cli.sh |  2 ++
 tests/unit/doc/test_cli.md | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 tests/unit/doc/test_cli.md

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index a7f7d11e4..3bc2eae2f 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -110,5 +110,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
 # whisper recognize text and translate to English
 paddlespeech whisper --task translate --input ./zh.wav
 
+# to change model English-Only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
 
 echo -e "\033[32mTest success !!!\033[0m"
diff --git a/tests/unit/doc/test_cli.md b/tests/unit/doc/test_cli.md
new file mode 100644
index 000000000..34a0c016a
--- /dev/null
+++ b/tests/unit/doc/test_cli.md
@@ -0,0 +1,29 @@
+# test CLI 测试文档
+
+ 该文档为 CLI 测试说明,该测试目前覆盖大部分 paddlespeech 中的 CLI 推理。该 CI 建立后用于快速验证修复是否正确。
+
+ # 测试流程
+ ## 1. 环境安装
+
+ CI 重建时在已有通过版本 paddlepaddle-gpu==2.5.1, paddlespeech==develop 下运行。
+
+ CI 重建后在 paddlepaddle-gpu==develop, paddlespeech==develop 下运行。
+
+ ### 其他相关依赖
+
+ gcc >= 4.8.5,
+ python >= 3.8
+
+ ## 2. 功能测试
+
+ 在 repo 的 tests/unit/cli 中运行:
+
+ ```shell
+
+ source path.sh
+ bash test_cli.sh
+
+ ```
+## 3. 预期结果
+
+ 输出 "Test success",且运行过程中无报错或 Error 即为成功。

From 72ce8861779cc7fef9eb3277217878fd65375c58 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 6 Jun 2024 15:26:16 +0800
Subject: [PATCH 12/39] =?UTF-8?q?=E3=80=90test=E3=80=91fix=20test=20cli=20?=
 =?UTF-8?q?bug=20(#3793)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add cli test readme

* fix code style

* fix bug
---
 tests/unit/cli/test_cli.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 3bc2eae2f..3903e6597 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
 paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
 
 # Speech SSL
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 paddlespeech ssl --task asr --lang en --input ./en.wav
 paddlespeech ssl --task vector --lang en --input ./en.wav
 
 # Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 paddlespeech asr --input ./zh.wav
 paddlespeech asr --model conformer_aishell --input ./zh.wav
 paddlespeech asr --model conformer_online_aishell --input ./zh.wav

From e8018a11ce73176549d92ddbac9bc4b0bbdd2157 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Fri, 7 Jun 2024 14:11:36 +0800
Subject: [PATCH 13/39] Update setup.py (#3795)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8e81da6d4..10a6502c2 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7
@@ base = [ "pandas", "paddleaudio>=1.1.0", "paddlenlp>=2.4.8", - "paddlepaddle==2.5.1", + "paddlepaddle-gpu==2.5.1", "paddleslim>=2.3.4", "ppdiffusers>=0.9.0", "paddlespeech_feat", From 91170bd2604e5a22237fcb46ebcf44f4d86914b5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 11 Jun 2024 11:12:58 +0800 Subject: [PATCH 14/39] adapt view behavior change, fix KeyError. (#3794) * adapt view behavior change, fix KeyError. * fix readme demo run error. * fixed opencc version --- paddlespeech/cli/asr/infer.py | 2 +- paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 4001f957f..231a00f4d 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor): # fbank audio = preprocessing(audio, **preprocess_args) - audio_len = paddle.to_tensor([audio.shape[0]]).unsqueeze(axis=0) + audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) self._inputs["audio"] = audio diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py index a3744d340..64195defc 100755 --- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -188,7 +188,7 @@ class Wav2vec2ASR(nn.Layer): x_lens = x.shape[1] ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) - topk_index = topk_index.view([batch_size, x_lens]) # (B, maxlen) + topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen) hyps = [hyp.tolist() for hyp in topk_index] hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] diff --git a/setup.py b/setup.py index 10a6502c2..030f7f880 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ base = [ "matplotlib", "nara_wpe", "onnxruntime>=1.11.0", - "opencc", + "opencc==1.1.6", "opencc-python-reimplemented", "pandas", "paddleaudio>=1.1.0", From 98fe6d1153e59cbaf9653e00695e061169ec683c Mon Sep 17 00:00:00 2001 From: gmm <38800877+mmglove@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:36:59 +0800 Subject: [PATCH 15/39] =?UTF-8?q?=E3=80=90benchmark=E3=80=91fix=20benchmar?= =?UTF-8?q?k=20prepare.sh=20(#3803)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix profiler * add max_mem_reserved for benchmark * fix benchmark * Update prepare.sh * Update prepare.sh --- tests/test_tipc/prepare.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index e57feda0a..7d4dd8b16 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -35,6 +35,8 @@ if [[ ${MODE} = "benchmark_train" ]];then pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple pip install . 
#-i https://pypi.tuna.tsinghua.edu.cn/simple pip install jsonlines + pip install -U scipy==1.12.0 # 高版本数据处理部分报错 + pip install -U matplotlib==3.7.1 # 高版本报错cannot import name 'get_cmap' from 'matplotlib.cm' pip list cd - if [[ ${model_name} == "conformer" ]]; then From 5e03da403b3c806a1cf1a736f17d2f16d4f61c51 Mon Sep 17 00:00:00 2001 From: funnycoder888 Date: Mon, 8 Jul 2024 15:30:56 +0800 Subject: [PATCH 16/39] Fix spelling errors (#3807) * Fix spelling errors * Update README.md fix spelling error --- audio/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audio/README.md b/audio/README.md index d42d41229..a8c47efe8 100644 --- a/audio/README.md +++ b/audio/README.md @@ -14,7 +14,7 @@ Linux test build whl environment: * gcc/g++ - 8.2.0 * cmake - 3.18.0 (need install) -MAC:test build whl envrioment: +MAC:test build whl environment: * os * gcc/g++ 12.2.0 * cpu Intel Xeon E5 x86_64 From 748a5f9d5c36ed6f1f2c8fb67aa66a366314635b Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Fri, 19 Jul 2024 18:18:16 +0800 Subject: [PATCH 17/39] fix (#3818) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 030f7f880..941639e73 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ base = [ # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x "numpy==1.23.5", "librosa==0.8.1", - "scipy>=1.4.0", + "scipy>=1.4.0, <=1.12.0", "loguru", "matplotlib", "nara_wpe", From 2e93229a9379868d2f76e1c2a113a18c5a55bece Mon Sep 17 00:00:00 2001 From: tianshuo78520a Date: Tue, 23 Jul 2024 11:08:45 +0800 Subject: [PATCH 18/39] Fix (#3821) * Fix * Test CI Docker * Test CI Docker --- tools/Dockerfile | 4 ++++ tools/pre_commit.sh | 54 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tools/Dockerfile create mode 100644 tools/pre_commit.sh diff --git a/tools/Dockerfile b/tools/Dockerfile new file mode 100644 index 000000000..18596f32c --- /dev/null +++ b/tools/Dockerfile @@ -0,0 +1,4 @@ +FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82 +RUN apt-get update -y +RUN apt-get -y install libsndfile1 +RUN pip3.8 install pytest-runner diff --git a/tools/pre_commit.sh b/tools/pre_commit.sh new file mode 100644 index 000000000..3a179782c --- /dev/null +++ b/tools/pre_commit.sh @@ -0,0 +1,54 @@ +set +x + +# use pre-commit 2.17 +if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then + pip install pre-commit==2.17.0 1>nul +fi + +# Install clang-format before git commit to avoid repeat installation due to +# pre-commit multi-thread running. +readonly VERSION="13.0.0" +version=$(clang-format -version) +if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then + echo "clang-format installation by pip need python version great equal 3.6, + please change the default python to higher version." + exit 1 +fi + +diff_files=$(git diff --name-only --diff-filter=ACMR ${BRANCH}) +num_diff_files=$(echo "$diff_files" | wc -l) +echo -e "diff files between pr and ${BRANCH}:\n${diff_files}" + +echo "Checking code style by pre-commit ..." +pre-commit run --files ${diff_files};check_error=$? + +if test ! -z "$(git diff)"; then + echo -e '\n************************************************************************************' + echo -e "These files have been formatted by code format hook. You should use pre-commit to \ +format them before git push." 
+ echo -e '************************************************************************************\n' + git diff 2>&1 +fi + +echo -e '\n************************************************************************************' +if [ ${check_error} != 0 ];then + echo "Your PR code style check failed." + echo "Please install pre-commit locally and set up git hook scripts:" + echo "" + echo " pip install pre-commit==2.17.0" + echo " pre-commit install" + echo "" + if [[ $num_diff_files -le 100 ]];then + echo "Then, run pre-commit to check codestyle issues in your PR:" + echo "" + echo " pre-commit run --files" $(echo ${diff_files} | tr "\n" " ") + echo "" + fi + echo "For more information, please refer to our codestyle check guide:" + echo "https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/dev_guides/git_guides/codestyle_check_guide_cn.html" +else + echo "Your PR code style check passed." +fi +echo -e '************************************************************************************\n' + +exit ${check_error} From d615fc33de2f340f1b6ca81c71d08b9bfcdd9b94 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 14 Aug 2024 14:17:53 +0800 Subject: [PATCH 19/39] =?UTF-8?q?=E3=80=90ASR=E3=80=91fix=20acs=20demo=20(?= =?UTF-8?q?#3826)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix demo acs * fix --- demos/audio_content_search/README.md | 15 ++++++++++++--- demos/audio_content_search/README_cn.md | 18 ++++++++++++++---- .../conf/ws_conformer_application.yaml | 4 +++- .../ws_conformer_wenetspeech_application.yaml | 1 + 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md index 4428bf389..f04ac447e 100644 --- a/demos/audio_content_search/README.md +++ b/demos/audio_content_search/README.md @@ -19,7 +19,7 @@ You can choose one way from meduim and hard to install paddlespeech. The dependency refers to the requirements.txt, and install the dependency as follows: ``` -pip install -r requriement.txt +pip install -r requirements.txt ``` ### 2. Prepare Input File @@ -30,11 +30,20 @@ Here are sample files for this demo that can be downloaded: wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` -### 3. Usage +### 3. run paddlespeech_server +Before using the client, it is necessary to start paddlespeech_servers. + +Here are sample server configuration: +```bash +bash demos/audio_content_search/run.sh +``` +The logs of the two services will be recorded in 'acs.log' and 'streaming_asr.log' in this configuration. + +### 4. Usage - Command Line(Recommended) ```bash # Chinese - paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav ``` Usage: diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md index 6f51c4cf2..16c1a3dd7 100644 --- a/demos/audio_content_search/README_cn.md +++ b/demos/audio_content_search/README_cn.md @@ -19,7 +19,7 @@ 依赖参见 requirements.txt, 安装依赖 ``` -pip install -r requriement.txt +pip install -r requirements.txt ``` ### 2. 准备输入 @@ -29,16 +29,26 @@ pip install -r requriement.txt ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ``` -### 3. 使用方法 + +### 3. 启动 server +使用 client 之前需要先启动 paddlespeech_server。 + +可以使用默认 server 配置: +```bash +bash demos/audio_content_search/run.sh +``` +该配置下两个服务的日志会被记录在 `acs.log` 和 `streaming_asr.log` 中。 + +### 4. 
使用方法 - 命令行 (推荐使用) ```bash # 中文 - paddlespeech_client acs --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav ``` 使用方法: ```bash - paddlespeech acs --help + paddlespeech asr --help ``` 参数: - `input`(必须输入):用于识别的音频文件。 diff --git a/demos/audio_content_search/conf/ws_conformer_application.yaml b/demos/audio_content_search/conf/ws_conformer_application.yaml index 97201382f..ad34ec9fd 100644 --- a/demos/audio_content_search/conf/ws_conformer_application.yaml +++ b/demos/audio_content_search/conf/ws_conformer_application.yaml @@ -26,8 +26,10 @@ asr_online: sample_rate: 16000 cfg_path: decode_method: 'attention_rescoring' + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. force_yes: True device: 'cpu' # cpu or gpu:id + continuous_decoding: False # disable continue decoding when endpoint detected am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True @@ -40,4 +42,4 @@ asr_online: window_ms: 25 # ms shift_ms: 10 # ms sample_rate: 16000 - sample_width: 2 + sample_width: 2 \ No newline at end of file diff --git a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml index c23680bd5..ef1ce8d5c 100644 --- a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml +++ b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml @@ -31,6 +31,7 @@ asr_online: force_yes: True device: 'cpu' # cpu or gpu:id decode_method: "attention_rescoring" + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. am_predictor_conf: device: # set 'gpu:id' or 'cpu' switch_ir_optim: True From 0b568136d9f777d2e702d2cc1f40781f6fee8312 Mon Sep 17 00:00:00 2001 From: zhuyipin Date: Tue, 20 Aug 2024 16:53:25 +0800 Subject: [PATCH 20/39] speedyspeech code adapt for npu (#3804) * speedyspeech code adapt for npu * fix npu inference * fix e2e synthesize * add paddle version control for memory optim config * fix code style * fix code style * fix help message * fix code style * fix help message --- examples/csmsc/tts2/local/inference_npu.sh | 46 +++++++ .../csmsc/tts2/local/synthesize_e2e_npu.sh | 124 ++++++++++++++++++ examples/csmsc/tts2/local/synthesize_npu.sh | 110 ++++++++++++++++ examples/csmsc/tts2/local/train_npu.sh | 16 +++ examples/csmsc/tts2/run_npu.sh | 42 ++++++ paddlespeech/t2s/exps/inference.py | 2 +- paddlespeech/t2s/exps/speedyspeech/train.py | 29 ++-- paddlespeech/t2s/exps/syn_utils.py | 3 +- paddlespeech/t2s/exps/synthesize.py | 19 ++- paddlespeech/t2s/exps/synthesize_e2e.py | 19 ++- 10 files changed, 392 insertions(+), 18 deletions(-) create mode 100644 examples/csmsc/tts2/local/inference_npu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_npu.sh create mode 100755 examples/csmsc/tts2/local/synthesize_npu.sh create mode 100755 examples/csmsc/tts2/local/train_npu.sh create mode 100644 examples/csmsc/tts2/run_npu.sh diff --git a/examples/csmsc/tts2/local/inference_npu.sh b/examples/csmsc/tts2/local/inference_npu.sh new file mode 100644 index 000000000..0746a0cdb --- /dev/null +++ b/examples/csmsc/tts2/local/inference_npu.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=pwgan_csmsc \ + 
--text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device npu +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device npu +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device npu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_npu.sh b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh new file mode 100755 index 000000000..1209a532b --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_npu.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nnpu=1 + + +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nnpu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + 
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nnpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nnpu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_npu.sh b/examples/csmsc/tts2/local/synthesize_npu.sh new file mode 100755 index 000000000..90fcef83d --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_npu.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + 
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nnpu=1 +fi diff --git a/examples/csmsc/tts2/local/train_npu.sh b/examples/csmsc/tts2/local/train_npu.sh new file mode 100755 index 000000000..46243e155 --- /dev/null +++ b/examples/csmsc/tts2/local/train_npu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nnpu=1 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_npu.sh b/examples/csmsc/tts2/run_npu.sh new file mode 100644 index 000000000..f36c93f74 --- /dev/null +++ b/examples/csmsc/tts2/run_npu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -e +source path.sh + +npus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_76.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run_xpu.sh --stage 0 
--stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_npus=${npus} ./local/train_npu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_npus=${npus} ./local/synthesize_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_npus=${npus} ./local/synthesize_e2e_npu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_npus=${npus} ./local/inference_npu.sh ${train_output_path} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 8a5269825..21d105ade 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu", "xpu"], + choices=["gpu", "cpu", "xpu", "npu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index c90090daa..b82d68802 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -45,15 +45,18 @@ def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly world_size = paddle.distributed.get_world_size() - if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: - if (not paddle.is_compiled_with_xpu()) or args.nxpu == 0: - paddle.set_device("cpu") - else: - paddle.set_device("xpu") - else: + if paddle.is_compiled_with_cuda() and args.ngpu > 0: paddle.set_device("gpu") if world_size > 1: paddle.distributed.init_parallel_env() + elif paddle.is_compiled_with_xpu() and args.nxpu > 0: + paddle.device.set_device("xpu") + elif args.nnpu > 0: + paddle.device.set_device("npu") + if world_size > 1: + paddle.distributed.init_parallel_env() + else: + paddle.set_device("cpu") # set the random seed, it is a must for multiprocess training seed_everything(config.seed) @@ -191,9 +194,19 @@ def main(): "--nxpu", type=int, default=0, - help="if nxpu == 0 and ngpu == 0, use cpu.") + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + ) + parser.add_argument( + "--nnpu", + type=int, + default=0, + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." 
+ ) parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu") + "--ngpu", + type=int, + default=1, + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") parser.add_argument( "--use-relative-path", diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 9a07df64d..d29dd8110 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -591,7 +591,8 @@ def get_predictor( config = inference.Config( str(Path(model_dir) / model_file), str(Path(model_dir) / params_file)) - config.enable_memory_optim() + if paddle.__version__ <= "2.5.2" and paddle.__version__ != "0.0.0": + config.enable_memory_optim() config.switch_ir_optim(True) if device == "gpu": config.enable_use_gpu(100, device_id) diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index e7cf7850e..9eb459894 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -219,12 +219,21 @@ def parse_args(): ) # other parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + "--ngpu", + type=int, + default=1, + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + ) + parser.add_argument( + "--nnpu", + type=int, + default=0, + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." ) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") @@ -245,10 +254,12 @@ def main(): paddle.set_device("gpu") elif args.nxpu > 0: paddle.set_device("xpu") - elif args.ngpu == 0 and args.nxpu == 0: + elif args.nnpu > 0: + paddle.set_device("npu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0: paddle.set_device("cpu") else: - print("ngpu or nxpu should >= 0 !") + print("ngpu, nxpu and nnpu should be >= 0") evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index c63a5fbe9..b9073124b 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -299,12 +299,21 @@ def parse_args(): default=None, help="dir to save inference models") parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.") + "--ngpu", + type=int, + default=1, + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + ) + parser.add_argument( + "--nnpu", + type=int, + default=0, + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." 
)
     parser.add_argument(
         "--text",
@@ -339,10 +348,12 @@ def main():
         paddle.set_device("gpu")
     elif args.nxpu > 0:
         paddle.set_device("xpu")
-    elif args.ngpu == 0 and args.nxpu == 0:
+    elif args.nnpu > 0:
+        paddle.set_device("npu")
+    elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0:
         paddle.set_device("cpu")
     else:
-        print("ngpu or nxpu should >= 0 !")
+        print("ngpu, nxpu and nnpu should be >= 0")
 
     evaluate(args)

From 4be005858b75c380a6a7b614108bafd1db8dddd6 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 22 Aug 2024 11:18:29 +0800
Subject: [PATCH 21/39] =?UTF-8?q?=E3=80=90DOC=E3=80=91fix=20demos=20bug=20?=
 =?UTF-8?q?(#3830)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix demos

* fix test
---
 demos/audio_searching/requirements.txt                     | 6 +++---
 demos/streaming_asr_server/README.md                       | 4 ++--
 demos/streaming_asr_server/README_cn.md                    | 4 ++--
 demos/style_fs2/run.sh                                     | 2 +-
 .../unit/server/online/tts/test_server/test_http_client.py | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt
index 9d0f6419b..3c0f05afc 100644
--- a/demos/audio_searching/requirements.txt
+++ b/demos/audio_searching/requirements.txt
@@ -1,5 +1,5 @@
-diskcache==5.2.1
-dtaidistance==2.3.1
+diskcache
+dtaidistance
 fastapi
 librosa==0.8.0
 numpy==1.22.0
@@ -10,4 +10,4 @@ python-multipart
 soundfile==0.10.3.post1
 starlette
 typing
-uvicorn
\ No newline at end of file
+uvicorn
diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 31256d151..136863b96 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -429,7 +429,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   Output:
   ```text
@@ -507,7 +507,7 @@ bash server.sh
   If `127.0.0.1` is not accessible, you need to use the actual service IP address.

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   Output:
   ```text
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index bbddd6932..f5f477ea1 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -428,7 +428,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
+  paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
   ```
   输出:
   ```text
@@ -506,7 +506,7 @@ bash server.sh
   若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址

   ```bash
-  python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+  python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
   ```
   输出:
   ```text
diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh
index 6f6d60680..45fc0c104 100755
--- a/demos/style_fs2/run.sh
+++ b/demos/style_fs2/run.sh
@@ -32,7 +32,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
         --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
         --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
-        --text=${BIN_DIR}/../sentences.txt \
+        --text=./sentences.txt \
         --output-dir=output \
         --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 fi
diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py
index 3174e85e2..685c5ca9c 100644
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--text",
         type=str,
-        default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
+        default="../../../../../../paddlespeech/t2s/assets/csmsc_test.txt",
         help="text to synthesize, a 'utt_id sentence' pair per line")
     parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
     parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
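The NPU port above (patch 20) and the MLU port below (patch 22) repeat the same device-selection cascade across the synthesize scripts. A rough standalone sketch of the precedence those scripts implement — gpu, then xpu, then npu, then mlu, then cpu — where the helper name, argument names, and defaults are illustrative assumptions rather than code taken from the patches:

def pick_device(ngpu: int = 0, nxpu: int = 0, nnpu: int = 0, nmlu: int = 0) -> str:
    """Return the string to pass to paddle.set_device(), following the
    precedence used by the synthesize scripts: gpu > xpu > npu > mlu > cpu."""
    for count, name in ((ngpu, "gpu"), (nxpu, "xpu"), (nnpu, "npu"), (nmlu, "mlu")):
        if count > 0:
            return name
    if ngpu == nxpu == nnpu == nmlu == 0:
        return "cpu"
    raise ValueError("ngpu, nxpu, nnpu and nmlu should be >= 0")


assert pick_device(ngpu=1) == "gpu"
assert pick_device() == "cpu"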
From a9ece28ba63bc0841ef5488b3dba46e8d5aa180b Mon Sep 17 00:00:00 2001
From: zhuyipin
Date: Thu, 29 Aug 2024 10:56:01 +0800
Subject: [PATCH 22/39] speedyspeech code adapt for mlu (#3828)

* speedyspeech code adapt for mlu

* fix inference

* fix help message
---
 examples/csmsc/tts2/local/inference_mlu.sh      | 33 +++++
 examples/csmsc/tts2/local/synthesize_e2e_mlu.sh | 99 +++++++++++++++
 examples/csmsc/tts2/local/synthesize_mlu.sh     | 90 +++++++++++++++
 examples/csmsc/tts2/local/train_mlu.sh          | 16 +++
 examples/csmsc/tts2/run_mlu.sh                  | 76 ++++++++++++
 paddlespeech/t2s/exps/inference.py              |  2 +-
 paddlespeech/t2s/exps/speedyspeech/train.py     | 12 ++-
 paddlespeech/t2s/exps/synthesize.py             | 21 +++-
 paddlespeech/t2s/exps/synthesize_e2e.py         | 21 +++-
 9 files changed, 357 insertions(+), 13 deletions(-)
 create mode 100755 examples/csmsc/tts2/local/inference_mlu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_e2e_mlu.sh
 create mode 100755 examples/csmsc/tts2/local/synthesize_mlu.sh
 create mode 100755
examples/csmsc/tts2/local/train_mlu.sh create mode 100755 examples/csmsc/tts2/run_mlu.sh diff --git a/examples/csmsc/tts2/local/inference_mlu.sh b/examples/csmsc/tts2/local/inference_mlu.sh new file mode 100755 index 000000000..d1bade84d --- /dev/null +++ b/examples/csmsc/tts2/local/inference_mlu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=mb_melgan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=speedyspeech_csmsc \ + --voc=hifigan_csmsc \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --device mlu +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh new file mode 100755 index 000000000..7ad2024ff --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_e2e_mlu.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + 
--am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../../assets/sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_mlu.sh b/examples/csmsc/tts2/local/synthesize_mlu.sh new file mode 100755 index 000000000..6c0b0b650 --- /dev/null +++ b/examples/csmsc/tts2/local/synthesize_mlu.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +stage=0 +stop_stage=0 + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# style melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + 
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt \ + --ngpu=0 \ + --nmlu=1 +fi diff --git a/examples/csmsc/tts2/local/train_mlu.sh b/examples/csmsc/tts2/local/train_mlu.sh new file mode 100755 index 000000000..4c1486434 --- /dev/null +++ b/examples/csmsc/tts2/local/train_mlu.sh @@ -0,0 +1,16 @@ + +#!/bin/bash + +config_path=$1 +train_output_path=$2 +# export MLU_VISIBLE_DEVICES=8 +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=0 \ + --nmlu=2 \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt \ + --use-relative-path=True diff --git a/examples/csmsc/tts2/run_mlu.sh b/examples/csmsc/tts2/run_mlu.sh new file mode 100755 index 000000000..848e54077 --- /dev/null +++ b/examples/csmsc/tts2/run_mlu.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e +source path.sh +export CUSTOM_DEVICE_BLACK_LIST=elementwise_max +mlus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_30600.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + FLAGS_selected_mlus=${mlus} ./local/train_mlu.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan by default + FLAGS_selected_mlus=${mlus} ./local/synthesize_e2e_mlu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + FLAGS_selected_mlus=${mlus} ./local/inference_mlu.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + pip install paddle2onnx --upgrade + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi + +# must run after stage 3 (which stage generated static models) +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86 + ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86 +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +fi + +# PTQ_static +if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1 +fi diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 21d105ade..e8ddd3bef 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu", "xpu", "npu"], + choices=["gpu", "cpu", "xpu", "npu", "mlu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index b82d68802..b1916fbc4 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -55,6 +55,8 @@ def train_sp(args, config): paddle.device.set_device("npu") if world_size > 1: paddle.distributed.init_parallel_env() + elif args.nmlu > 0: + 
paddle.device.set_device("mlu") else: paddle.set_device("cpu") @@ -194,13 +196,19 @@ def main(): "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=1, + help="if wish to use npu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." ) parser.add_argument( "--ngpu", diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 9eb459894..b159725e2 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -222,18 +222,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." ) parser.add_argument("--test_metadata", type=str, help="test metadata.") parser.add_argument("--output_dir", type=str, help="output dir.") @@ -256,10 +263,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index b9073124b..08a14b315 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -302,18 +302,25 @@ def parse_args(): "--ngpu", type=int, default=1, - help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu or cpu.") + help="if wish to use gpu, set ngpu > 0, otherwise use xpu, npu, mlu or cpu." + ) parser.add_argument( "--nxpu", type=int, default=0, - help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu or cpu." + help="if wish to use xpu, set ngpu == 0 and nxpu > 0, otherwise use gpu, npu, mlu or cpu." ) parser.add_argument( "--nnpu", type=int, default=0, - help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu or cpu." + help="if wish to use npu, set ngpu == 0 and nnpu > 0, otherwise use gpu, xpu, mlu or cpu." + ) + parser.add_argument( + "--nmlu", + type=int, + default=0, + help="if wish to use xpu, set ngpu == 0 and nmlu > 0, otherwise use gpu, xpu, npu or cpu." 
) parser.add_argument( "--text", @@ -350,10 +357,14 @@ def main(): paddle.set_device("xpu") elif args.nnpu > 0: paddle.set_device("npu") - elif args.ngpu == 0 and args.nxpu == 0 or args.nnpu == 0: + elif args.nmlu > 0: + paddle.set_device("mlu") + elif args.ngpu == 0 and args.nxpu == 0 and args.nnpu == 0 and args.nmlu == 0: paddle.set_device("cpu") else: - print("ngpu, nxpu and nnpu should be >= 0") + print( + "one of ngpu, nxpu, nnpu or nmlu should be greater than 0 or all of them equal to 0" + ) evaluate(args) From d9eb82a6324bdc3ab7bfd9d38ced92ae7e9693c5 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 29 Aug 2024 19:35:26 +0800 Subject: [PATCH 23/39] fix unit test (#3835) --- tests/unit/asr/deepspeech2_model_test.py | 10 ++++---- .../unit/asr/deepspeech2_online_model_test.py | 24 +++++++++---------- .../unit/server/offline/test_server_client.sh | 2 ++ tests/unit/tts/test_data_table.py | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py index 5835445d2..fd42192ea 100644 --- a/tests/unit/asr/deepspeech2_model_test.py +++ b/tests/unit/asr/deepspeech2_model_test.py @@ -48,7 +48,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -60,7 +60,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -72,7 +72,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -84,7 +84,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=True, - share_rnn_weights=True, ) + rnn_direction="bidirect", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) @@ -96,7 +96,7 @@ class TestDeepSpeech2Model(unittest.TestCase): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=False, ) + rnn_direction="forward", ) loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index f23c49263..f7ea87b12 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -19,11 +19,11 @@ import numpy as np import paddle from paddle import inference -from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline +from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel +from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -class TestDeepSpeech2ModelOnline(unittest.TestCase): +class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') @@ -45,7 +45,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.text_len = paddle.to_tensor(text_len, dtype='int64') def test_ds2_1(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, 
dict_size=10, num_conv_layers=2, @@ -58,7 +58,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_2(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -71,7 +71,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_3(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -84,7 +84,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_4(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -97,7 +97,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_5(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -110,7 +110,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_6(self): - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -125,7 +125,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_7(self): use_gru = False - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -156,7 +156,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): def test_ds2_8(self): use_gru = True - model = DeepSpeech2ModelOnline( + model = DeepSpeech2Model( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, @@ -191,7 +191,7 @@ class TestDeepSpeech2StaticModelOnline(unittest.TestCase): export_prefix = "exp/deepspeech2_online/checkpoints/test_export" if not os.path.exists(os.path.dirname(export_prefix)): os.makedirs(os.path.dirname(export_prefix), mode=0o755) - infer_model = DeepSpeech2InferModelOnline( + infer_model = DeepSpeech2InferModel( feat_size=161, dict_size=4233, num_conv_layers=2, diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh index dc52609c5..29bdd4032 100644 --- a/tests/unit/server/offline/test_server_client.sh +++ b/tests/unit/server/offline/test_server_client.sh @@ -1,5 +1,7 @@ #!/bin/bash # bash test_server_client.sh +## require lsof to get server pid +## apt-get install -y lsof StartService(){ # Start service diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py index 3ff5bc1af..773942a2e 100644 --- a/tests/unit/tts/test_data_table.py +++ b/tests/unit/tts/test_data_table.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from paddlespeech.t2s.datasets.data_tabel import DataTable +from paddlespeech.t2s.datasets.data_table import DataTable def test_audio_dataset(): From 7e52aaed74f87b02af6d03098ff9f65e3224f5ce Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 30 Aug 2024 13:09:29 +0800 Subject: [PATCH 24/39] Add tests (#3836) * Add tests * fix * Fix * Fix * disable deepspeech2_online_model_test * disable test_data_table * Fix --- tests/unit/ci.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/unit/ci.sh diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh new file mode 100644 index 000000000..9342a2685 --- /dev/null +++ b/tests/unit/ci.sh @@ -0,0 +1,31 @@ +function main(){ + set -ex + speech_ci_path=`pwd` + + echo "Start asr" + cd ${speech_ci_path}/asr + bash deepspeech2_online_model_test.sh + python error_rate_test.py + python mask_test.py + python reverse_pad_list.py + echo "End asr" + + echo "Start TTS" + cd ${speech_ci_path}/tts + python test_data_table.py + python test_enfrontend.py + python test_mixfrontend.py + echo "End TTS" + + echo "Start Vector" + cd ${speech_ci_path}/vector + python test_augment.py + echo "End Vector" + + echo "Start cli" + cd ${speech_ci_path}/cli + bash test_cli.sh + echo "End cli" +} + +main From f66d7d25c40987bf4262ae7f17b442b0d7d4f356 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 5 Sep 2024 17:05:29 +0800 Subject: [PATCH 25/39] fix matplotlib version for incompatible upgrade (#3841) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 941639e73..48c684855 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ base = [ "librosa==0.8.1", "scipy>=1.4.0, <=1.12.0", "loguru", - "matplotlib", + "matplotlib<=3.8.4", "nara_wpe", "onnxruntime>=1.11.0", "opencc==1.1.6", From 188444f77841725fd720cb1115fd700bc6363615 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Sun, 29 Sep 2024 11:22:13 +0800 Subject: [PATCH 26/39] fix (#3856) --- README.md | 2 +- paddlespeech/cls/exps/panns/deploy/predict.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 19ec61cb0..5197ff4c7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - + diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index f14b44215..866a669e8 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -17,7 +17,7 @@ import os import numpy as np from paddle import inference from paddle.audio.datasets import ESC50 -from paddle.audio.features import MelSpectrogram +from paddle.audio.features import LogMelSpectrogram from paddleaudio.backends import soundfile_load as load_audio from scipy.special import softmax @@ -53,7 +53,10 @@ def extract_features(files: str, **kwargs): pad_width = max_length - len(waveforms[i]) waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width)) - feat = MelSpectrogram(waveforms[i], sr, **kwargs).transpose() + feature_extractor = LogMelSpectrogram(sr, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveforms[i])) + feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0) + feats.append(feat) return np.stack(feats, axis=0) From bf03c9a620463962b79a5ed3b9f1dbb8c0340fb1 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 8 Oct 2024 14:30:39 +0800 Subject: [PATCH 27/39] Add server ci (#3857) * Add server ci * fix --- tests/unit/ci.sh | 5 +++++ tools/Dockerfile | 2 +- 
2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/unit/ci.sh b/tests/unit/ci.sh index 9342a2685..72b4678d6 100644 --- a/tests/unit/ci.sh +++ b/tests/unit/ci.sh @@ -26,6 +26,11 @@ function main(){ cd ${speech_ci_path}/cli bash test_cli.sh echo "End cli" + + echo "Start server" + cd ${speech_ci_path}/server/offline + bash test_server_client.sh + echo "End server" } main diff --git a/tools/Dockerfile b/tools/Dockerfile index 18596f32c..b2f2b3125 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -1,4 +1,4 @@ FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8.2-trt8.0-gcc82 RUN apt-get update -y -RUN apt-get -y install libsndfile1 +RUN apt-get -y install libsndfile1 lsof RUN pip3.8 install pytest-runner From 658d19a73e2c8af06f5e17efd5b3885eb0689018 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 9 Oct 2024 17:38:49 +0800 Subject: [PATCH 28/39] =?UTF-8?q?=E3=80=90doc=E3=80=91add=20Squeezeformer?= =?UTF-8?q?=20info=20in=20README=20(#3860)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update README.md * Update README_cn.md * Update README_cn.md --- README.md | 1 + README_cn.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 5197ff4c7..00367d787 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision ### Recent Update - 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech. +- 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training for ASR on Aishell. - 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech. - ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), with the upgrade of paddlepaddle==2.5, the problem of modifying 0-d tensor has been solved. - 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167). diff --git a/README_cn.md b/README_cn.md index 7aef30871..d70940dd2 100644 --- a/README_cn.md +++ b/README_cn.md @@ -184,6 +184,7 @@ ### 近期更新 - 👑 2023.05.31: 新增 [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), 基于WavLM的英语识别微调,使用LibriSpeech数据集 +- 🎉 2023.05.18: 新增 [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), 使用Squeezeformer进行训练,使用Aishell数据集 - 👑 2023.05.04: 新增 [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), 基于HuBERT的英语识别微调,使用LibriSpeech数据集 - ⚡ 2023.04.28: 修正 [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), 配合PaddlePaddle2.5升级修改了0-d tensor的问题。 - 👑 2023.04.25: 新增 [U2 conformer 的 AMP 训练](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167). 
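A note on the type-promotion fix that follows (#3817): `st_reverse_pad_list` multiplies an integer index tensor by a sequence mask of a different dtype, and recent PaddlePaddle releases tightened the implicit promotion rules for such mixed-dtype elementwise ops, so the mask is now cast explicitly. A minimal sketch of the pattern, using hypothetical values rather than the real decoder tensors:

```python
import paddle

# Hypothetical stand-ins for the tensors built in tensor_utils.py.
index = paddle.to_tensor([[2, 1, 0], [2, 1, 0]], dtype='int64')
seq_mask = paddle.to_tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 0.0]])  # float32

# int64 * float32 relies on implicit type promotion, whose behaviour has
# changed across Paddle versions; casting the mask keeps integer indices.
index = index * seq_mask.astype(index.dtype)
print(index.dtype)  # paddle.int64
```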
From 6316514329764151e1a409f0a31631c51924920c Mon Sep 17 00:00:00 2001 From: megemini Date: Wed, 30 Oct 2024 17:41:17 +0800 Subject: [PATCH 29/39] [Fix] type promotion (#3817) --- paddlespeech/audio/utils/tensor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py index 93883c94d..43dcac0ac 100644 --- a/paddlespeech/audio/utils/tensor_utils.py +++ b/paddlespeech/audio/utils/tensor_utils.py @@ -248,7 +248,7 @@ def st_reverse_pad_list(ys_pad: paddle.Tensor, # >>> tensor([[ 2, 1, 0], # >>> [ 2, 1, 0], # >>> [ 0, -1, -2]]) - index = index * seq_mask + index = index * seq_mask.astype(index.dtype) # >>> index # >>> tensor([[2, 1, 0], From e41a8794c4d505ff7b8e8be99cfafa184c4b6599 Mon Sep 17 00:00:00 2001 From: 131 <11222509+Netrvin@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:56:07 +0800 Subject: [PATCH 30/39] fix pydantic dependency (#3715) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix pydantic dependency pydantic>=2.0时,str不能为None * remove extra space --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 48c684855..927f7edb6 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ base = [ "paddlespeech_feat", "praatio>=5.0.0, <=5.1.1", "prettytable", + "pydantic>=1.10.14, <2.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", From d5b0020e9a492abb96286590316044d5a8b9417f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:48:23 +0800 Subject: [PATCH 31/39] =?UTF-8?q?Fix=20missing=20=E2=80=99=20(#3869)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demos/speech_ssl/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_ssl/README_cn.md b/demos/speech_ssl/README_cn.md index a18c778a7..5b209419a 100644 --- a/demos/speech_ssl/README_cn.md +++ b/demos/speech_ssl/README_cn.md @@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav # 识别文本 text = ssl_executor( - model='wav2vec2, + model='wav2vec2', task='asr', lang='en', sample_rate=16000, From 6f8438818936380dde33d445cabe0e2626e05573 Mon Sep 17 00:00:00 2001 From: undefined Date: Wed, 6 Nov 2024 10:24:29 +0800 Subject: [PATCH 32/39] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E9=87=8F=E8=AF=8D?= =?UTF-8?q?=E5=92=8C=E5=8D=95=E4=BD=8D=E7=AC=A6=E5=8F=B7=20(#3837)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 添加对公里、公引、公丈、公尺、公寸、公分、公釐的支持 > 低于100公里或超过110公里时 先前生成```低于幺零零公里或超过幺幺零公里时``` 修改后```低于一百公里或超过一百一十公里时``` * 添加更多单位 --- paddlespeech/t2s/frontend/zh_normalization/num.py | 2 +- paddlespeech/t2s/frontend/zh_normalization/quantifier.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 8a54d3e63..c56563e56 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = 
'(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分|(公(里|引|丈|尺|寸|分|釐)))' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py index 598030e43..6790d7eab 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py +++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py @@ -35,7 +35,9 @@ measure_dict = { "ml": "毫升", "m": "米", "mm": "毫米", - "s": "秒" + "s": "秒", + "h": "小时", + "mg": "毫克" } From 8279539978b29dd81afe78a58d06e6f6e3476861 Mon Sep 17 00:00:00 2001 From: SuiYunsy <104718844+SuiYunsy@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:24:32 +0800 Subject: [PATCH 33/39] Fix a bug of streaming_tts_server (#3865) * Fix a bug of streaming_tts_server Fix "Failed to get model related files" when using fastspeech2_csmsc_onnx model in streaming_tts_server. 
* Fix CodeStyle --- paddlespeech/server/engine/tts/online/onnx/tts_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py index 14204dde7..c78414163 100644 --- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py @@ -76,8 +76,8 @@ class TTSServerExecutor(TTSExecutor): version=None, # default version ) self.am_res_path = self.task_resource.res_dir - self.am_ckpt = os.path.join( - self.am_res_path, self.task_resource.res_dict['ckpt'][0]) + self.am_ckpt = os.path.join(self.am_res_path, + self.task_resource.res_dict['ckpt']) # must have phones_dict in acoustic self.phones_dict = os.path.join( self.am_res_path, From 5f40262a96f1fac361a8f8869f6fb38ff7647ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:32:27 +0800 Subject: [PATCH 34/39] Fix wav2vec error in Demos/ssl (#3872) * Update infer.py * Update wav2vec2_ASR.py * Apply suggestions from code review * Apply suggestions from code review * Update infer.py * Update wav2vec2_ASR.py * Apply suggestions from code review --- paddlespeech/cli/ssl/infer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py index 9b4b02803..33cdf7637 100644 --- a/paddlespeech/cli/ssl/infer.py +++ b/paddlespeech/cli/ssl/infer.py @@ -119,6 +119,7 @@ class SSLExecutor(BaseExecutor): '--verbose', action='store_true', help='Increase logger verbosity of current task.') + self.last_call_params = None def _init_from_path(self, model_type: str=None, @@ -453,6 +454,23 @@ class SSLExecutor(BaseExecutor): Python API to call an executor. """ + current_call_params = { + "model": model, + "task": task, + "lang": lang, + "sample_rate": sample_rate, + "config": config, + "ckpt_path": ckpt_path, + "decode_method": decode_method, + "force_yes": force_yes, + "rtf": rtf, + "device": device + } + if self.last_call_params is not None and self.last_call_params != current_call_params and hasattr( + self, 'model'): + del self.model + self.last_call_params = current_call_params + audio_file = os.path.abspath(audio_file) paddle.set_device(device) self._init_from_path(model, task, lang, sample_rate, config, From e75a3987fc55f6e01ed0b1812c3570e926e6c872 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 11 Nov 2024 11:21:35 +0800 Subject: [PATCH 35/39] update install_openblas.sh (#3876) --- tools/extras/install_openblas.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/extras/install_openblas.sh b/tools/extras/install_openblas.sh index 91b6444b8..9e2564e0a 100755 --- a/tools/extras/install_openblas.sh +++ b/tools/extras/install_openblas.sh @@ -19,18 +19,18 @@ fi tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz -rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz +rm -rf OpenMathLib-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz if [ -d "$DOWNLOAD_DIR" ]; then cp -p "$DOWNLOAD_DIR/$tarball" . 
else - url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') + url=$($WGET -qO- "https://api.github.com/repos/OpenMathLib/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])') test -n "$url" $WGET -t3 -nv -O $tarball "$url" fi tar xzf $tarball -mv xianyi-OpenBLAS-* OpenBLAS +mv OpenMathLib-OpenBLAS-* OpenBLAS make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install if [ $? -eq 0 ]; then From 21b55419c7f2c9c4ff44413d4dedeeb503686024 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 11 Nov 2024 11:22:35 +0800 Subject: [PATCH 36/39] fix scipy import error (#3874) --- paddlespeech/t2s/modules/pqmf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index 7b42409d8..cbdd6e375 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.nn.functional as F from paddle import nn -from scipy.signal import kaiser +from scipy.signal.windows import kaiser def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): From 290d161d8ac9d44bd0cb31e9b521bd37ade5c326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:29:59 +0800 Subject: [PATCH 37/39] =?UTF-8?q?=E3=80=90Hackathon=207th=E3=80=91add=20im?= =?UTF-8?q?plemention=20of=20strtobool=20(#3877)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add implemention of strtobool * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --- dataset/librispeech/librispeech.py | 3 ++- examples/ami/sd0/local/ami_prepare.py | 3 ++- paddlespeech/s2t/decoders/recog_bin.py | 3 ++- paddlespeech/s2t/exps/whisper/test_wav.py | 6 ++--- paddlespeech/s2t/training/cli.py | 4 +++- paddlespeech/s2t/utils/cli_utils.py | 5 ++-- paddlespeech/utils/argparse.py | 27 ++++++++++++++++++++-- paddlespeech/vector/cluster/diarization.py | 3 ++- utils/DER.py | 3 ++- utils/addjson.py | 3 ++- utils/apply-cmvn.py | 2 +- utils/copy-feats.py | 3 +-- utils/merge_scp2json.py | 3 +-- 13 files changed, 48 insertions(+), 20 deletions(-) diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py index 44567b0cf..2f5f9016c 100644 --- a/dataset/librispeech/librispeech.py +++ b/dataset/librispeech/librispeech.py @@ -30,6 +30,7 @@ import soundfile from paddlespeech.dataset.download import download from paddlespeech.dataset.download import unpack +from paddlespeech.utils.argparse import strtobool URL_ROOT = "http://openslr.elda.org/resources/12" #URL_ROOT = "https://openslr.magicdatatech.com/resources/12" @@ -63,7 +64,7 @@ parser.add_argument( parser.add_argument( "--full_download", default="True", - type=distutils.util.strtobool, + type=strtobool, help="Download all datasets for Librispeech." " If False, only download a minimal requirement (test-clean, dev-clean" " train-clean-100). 
(default: %(default)s)") diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py index 1f02afe00..e38eb7131 100644 --- a/examples/ami/sd0/local/ami_prepare.py +++ b/examples/ami/sd0/local/ami_prepare.py @@ -28,7 +28,8 @@ import xml.etree.ElementTree as et from ami_splits import get_AMI_split from dataio import load_pkl from dataio import save_pkl -from distutils.util import strtobool + +from paddlespeech.utils.argparse import strtobool logger = logging.getLogger(__name__) SAMPLERATE = 16000 diff --git a/paddlespeech/s2t/decoders/recog_bin.py b/paddlespeech/s2t/decoders/recog_bin.py index 37b49f3a0..829b2b4a7 100644 --- a/paddlespeech/s2t/decoders/recog_bin.py +++ b/paddlespeech/s2t/decoders/recog_bin.py @@ -21,7 +21,8 @@ import sys import configargparse import numpy as np -from distutils.util import strtobool + +from paddlespeech.utils.argparse import strtobool def get_parser(): diff --git a/paddlespeech/s2t/exps/whisper/test_wav.py b/paddlespeech/s2t/exps/whisper/test_wav.py index e04eec4f2..d9c32a406 100644 --- a/paddlespeech/s2t/exps/whisper/test_wav.py +++ b/paddlespeech/s2t/exps/whisper/test_wav.py @@ -27,6 +27,7 @@ from paddlespeech.s2t.models.whisper import transcribe from paddlespeech.s2t.models.whisper import Whisper from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.log import Log +from paddlespeech.utils.argparse import strtobool logger = Log(__name__).getlog() @@ -103,10 +104,7 @@ if __name__ == "__main__": parser.add_argument( "--audio_file", type=str, help="path of the input audio file") parser.add_argument( - "--debug", - type=distutils.util.strtobool, - default=False, - help="for debug.") + "--debug", type=strtobool, default=False, help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 741b95dff..ded2aff9f 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -16,6 +16,8 @@ import argparse import distutils from yacs.config import CfgNode +from paddlespeech.utils.argparse import strtobool + class ExtendAction(argparse.Action): """ @@ -73,7 +75,7 @@ def default_argument_parser(parser=None): '--conf', type=open, action=LoadFromFile, help="config file.") parser.add_argument( "--debug", - type=distutils.util.strtobool, + type=strtobool, default=False, help="logging with debug mode.") parser.add_argument( diff --git a/paddlespeech/s2t/utils/cli_utils.py b/paddlespeech/s2t/utils/cli_utils.py index ccb0d3c97..ab93723b4 100644 --- a/paddlespeech/s2t/utils/cli_utils.py +++ b/paddlespeech/s2t/utils/cli_utils.py @@ -16,11 +16,12 @@ import sys from collections.abc import Sequence import numpy -from distutils.util import strtobool as dist_strtobool + +from paddlespeech.utils.argparse import strtobool as dist_strtobool def strtobool(x): - # distutils.util.strtobool returns integer, but it's confusing, + # paddlespeech.utils.argparse.strtobool returns integer, but it's confusing, return bool(dist_strtobool(x)) diff --git a/paddlespeech/utils/argparse.py b/paddlespeech/utils/argparse.py index aad3801ea..3ebefb826 100644 --- a/paddlespeech/utils/argparse.py +++ b/paddlespeech/utils/argparse.py @@ -18,7 +18,9 @@ from typing import Text import distutils -__all__ = ["print_arguments", "add_arguments", "get_commandline_args"] +__all__ = [ + "print_arguments", "add_arguments", "get_commandline_args", "strtobool" +] def get_commandline_args(): @@ -80,6 +82,27 @@ def 
print_arguments(args, info=None):
     print("-----------------------------------------------------------")
 
 
+def strtobool(value):
+    """Convert a string value to an integer boolean (1 for True, 0 for False).
+
+    The function recognizes the following strings as True (case insensitive):
+    - "yes"
+    - "true"
+    - "1"
+
+    All other values are considered False.
+
+    NOTE: After Python 3.10, the distutils module, particularly distutils.util, has been partially deprecated. To maintain compatibility with existing code, the strtobool function is implemented here.
+    """
+    if isinstance(value, bool):
+        return int(value)
+    value = value.strip().lower()
+    if value in ('yes', 'true', '1'):
+        return 1
+    else:
+        return 0
+
+
 def add_arguments(argname, type, default, help, argparser, **kwargs):
     """Add argparse's argument.
 
@@ -91,7 +114,7 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
         add_argument("name", str, "Jonh", "User name.", parser)
         args = parser.parse_args()
     """
-    type = distutils.util.strtobool if type == bool else type
+    type = strtobool if type == bool else type
     argparser.add_argument(
         "--" + argname,
         default=default,
diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py
index b47b3f248..1de055c85 100644
--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@@ -24,7 +24,6 @@ import warnings
 import numpy as np
 import scipy
 import sklearn
-from distutils.util import strtobool
 from scipy import linalg
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
@@ -34,6 +33,8 @@ from sklearn.cluster import SpectralClustering
 from sklearn.cluster._kmeans import k_means
 from sklearn.neighbors import kneighbors_graph
 
+from paddlespeech.utils.argparse import strtobool
+
 
 def _graph_connected_component(graph, node_id):
     """
diff --git a/utils/DER.py b/utils/DER.py
index 59bcbec47..d12620199 100755
--- a/utils/DER.py
+++ b/utils/DER.py
@@ -28,7 +28,8 @@ import re
 import subprocess
 
 import numpy as np
-from distutils.util import strtobool
+
+from paddlespeech.utils.argparse import strtobool
 
 FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
 SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
diff --git a/utils/addjson.py b/utils/addjson.py
index e1be7ab31..f90f7afab 100755
--- a/utils/addjson.py
+++ b/utils/addjson.py
@@ -11,9 +11,10 @@ import json
 import logging
 import sys
 
-from distutils.util import strtobool
 from espnet.utils.cli_utils import get_commandline_args
 
+from paddlespeech.utils.argparse import strtobool
+
 is_python2 = sys.version_info[0] == 2
 
diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py
index fa69ff8e0..872d69608 100755
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@@ -4,13 +4,13 @@ import logging
 
 import kaldiio
 import numpy
-from distutils.util import strtobool
 
 from paddlespeech.audio.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
 from paddlespeech.s2t.utils.cli_writers import file_writer_helper
+from paddlespeech.utils.argparse import strtobool
 
 
 def get_parser():
diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index 89ea30f97..8f38dc8ee 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -2,13 +2,12 @@
 import argparse
 import logging
 
-from distutils.util import strtobool
-
 from paddlespeech.audio.transform.transformation import
Transformation from paddlespeech.s2t.utils.cli_readers import file_reader_helper from paddlespeech.s2t.utils.cli_utils import get_commandline_args from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style from paddlespeech.s2t.utils.cli_writers import file_writer_helper +from paddlespeech.utils.argparse import strtobool def get_parser(): diff --git a/utils/merge_scp2json.py b/utils/merge_scp2json.py index 99db6bac8..98f6cae84 100755 --- a/utils/merge_scp2json.py +++ b/utils/merge_scp2json.py @@ -7,9 +7,8 @@ import logging import sys from io import open -from distutils.util import strtobool - from paddlespeech.s2t.utils.cli_utils import get_commandline_args +from paddlespeech.utils.argparse import strtobool PY2 = sys.version_info[0] == 2 sys.stdin = codecs.getreader("utf-8")(sys.stdin if PY2 else sys.stdin.buffer) From 99d4b7061a7e3cca97afa9f60ec350a450d0722e Mon Sep 17 00:00:00 2001 From: wanx7130 Date: Wed, 13 Nov 2024 11:04:53 +0800 Subject: [PATCH 38/39] Add GCU Backend (#3875) --- paddlespeech/cls/exps/panns/deploy/predict.py | 2 +- paddlespeech/t2s/exps/inference.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index 866a669e8..1dd0fb531 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -24,7 +24,7 @@ from scipy.special import softmax # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") -parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'gcu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index e8ddd3bef..3edc4b63b 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -112,7 +112,7 @@ def parse_args(): parser.add_argument( "--device", default="gpu", - choices=["gpu", "cpu", "xpu", "npu", "mlu"], + choices=["gpu", "cpu", "xpu", "npu", "mlu", "gcu"], help="Device selected for inference.", ) parser.add_argument('--cpu_threads', type=int, default=1) From 419af4503cd47f74dc2597d66f31c34f770ffc9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Thu, 14 Nov 2024 11:55:43 +0800 Subject: [PATCH 39/39] =?UTF-8?q?=E3=80=90Hackathon=207th=E3=80=91Remove?= =?UTF-8?q?=20parser.add=5Fargument=20(#3878)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update test_wav.py * Update export.py * Update test_export.py * Update model.py * Update README.md * Apply suggestions from code review * Apply suggestions from code review * Update README.md * Update README.md * Update test.py * Update README.md --- examples/aishell/asr0/README.md | 29 ++++++++++++------- .../s2t/exps/deepspeech2/bin/export.py | 3 -- paddlespeech/s2t/exps/deepspeech2/bin/test.py | 3 -- 
.../s2t/exps/deepspeech2/bin/test_export.py | 6 ---- .../s2t/exps/deepspeech2/bin/test_wav.py | 4 --- paddlespeech/s2t/exps/deepspeech2/model.py | 7 ++++- 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/examples/aishell/asr0/README.md b/examples/aishell/asr0/README.md index 131de36e3..a9469c460 100644 --- a/examples/aishell/asr0/README.md +++ b/examples/aishell/asr0/README.md @@ -103,12 +103,19 @@ If you want to train the model, you can use the script below to execute stage 0 ```bash bash run.sh --stage 0 --stop_stage 1 ``` -or you can run these scripts in the command line (only use CPU). +Or you can run these scripts in the command line (only use CPU). ```bash source path.sh bash ./local/data.sh -CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2 +CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2 ``` +If you want to use GPU, you can run these scripts in the command line (suppose you have only 1 GPU). +```bash +source path.sh +bash ./local/data.sh +CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/deepspeech2.yaml deepspeech2 +``` + ## Stage 2: Top-k Models Averaging After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below: ```bash @@ -148,7 +155,7 @@ source path.sh bash ./local/data.sh CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2 avg.sh best exp/deepspeech2/checkpoints 1 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10 ``` ## Pretrained Model You can get the pretrained models from [this](../../../docs/source/released_model.md). @@ -157,14 +164,14 @@ using the `tar` scripts to unpack the model and then you can use the script to t For example: ``` -wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz -tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz +wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz +tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz source path.sh # If you have process the data and get the manifest file, you can skip the following 2 steps bash local/data.sh --stage -1 --stop_stage -1 bash local/data.sh --stage 2 --stop_stage 2 -CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 +CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10 ``` The performance of the released models are shown in [this](./RESULTS.md) ## Stage 4: Static graph model Export @@ -178,7 +185,7 @@ This stage is to transform dygraph to static graph. If you already have a dynamic graph model, you can run this script: ```bash source path.sh -./local/export.sh deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 exp/deepspeech2/checkpoints/avg_1.jit offline +./local/export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10 exp/deepspeech2/checkpoints/avg_10.jit ``` ## Stage 5: Static graph Model Testing Similar to stage 3, the static graph model can also be tested. @@ -190,7 +197,7 @@ Similar to stage 3, the static graph model can also be tested. 
 ```
 If you already have exported the static graph, you can run this script:
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1.jit offline
+CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10.jit
 ```
 ## Stage 6: Single Audio File Inference
 In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below
@@ -202,8 +209,8 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
 ```
 you can train the model by yourself, or you can download the pretrained model by the script below:
 ```bash
-wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
-tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
+tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
 ```
 You can download the audio demo:
 ```bash
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav
 ```
 You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_01_03.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10 data/demo_01_03.wav
 ```
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py
index 07228e98b..762aa6f2c 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py
@@ -32,9 +32,6 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save jit model to
-    parser.add_argument(
-        "--export_path", type=str, help="path of the jit model to save")
     args = parser.parse_args()
     print_arguments(args)
 
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py
index a8e20ff93..0839cf453 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py
@@ -32,9 +32,6 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save asr result to
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
 
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
index 1e07aa800..71ffa6613 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
@@ -32,12 +32,6 @@ def main(config, args):
 
 if __name__ == "__main__":
     parser = default_argument_parser()
-    # save asr result to
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
-    #load jit model from
-    parser.add_argument(
-        "--export_path", type=str, help="path of the jit model to save")
     parser.add_argument(
         "--enable-auto-log", action="store_true", help="use auto log")
     args = parser.parse_args()
 
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
index
32a583b6a..d087405d5 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -171,10 +171,6 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - parser.add_argument("--audio_file", type=str, help='audio file path') - # save asr result to - parser.add_argument( - "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) if not os.path.isfile(args.audio_file): diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index d007a9e39..710757115 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -335,7 +335,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() static_model = infer_model.export() - logger.info(f"Export code: {static_model.forward.code}") + try: + logger.info(f"Export code: {static_model.forward.code}") + except: + logger.info( + f"Fail to print Export code, static_model.forward.code can not be run." + ) paddle.jit.save(static_model, self.args.export_path)
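A closing caveat on the `model.py` hunk above: the bare `except:` that now guards the export-code logging will also swallow `KeyboardInterrupt` and `SystemExit`. If the intent is only to tolerate inspection failures of the jit-saved program, a narrower guard does the same job; the following is a sketch, not the committed code:

```python
try:
    logger.info(f"Export code: {static_model.forward.code}")
except Exception:
    # Exception still covers failures when inspecting the static graph,
    # but lets KeyboardInterrupt/SystemExit propagate to the caller.
    logger.info(
        "Fail to print Export code: static_model.forward.code can not be run.")
```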